# Simple Feature  Engineering with BaggingClassification

Predict on fields: `ProviderId`, `ProductId`, `ProductCategory`, `ChannelId`, `Amount`, `Value`, `TransactionStartTime`, `PricingStrategy`

0. Example `gropby` with `agg`
1. Extract Hour-of-Day, Day-of-Week, Day-of-Month from `TransactionStartTime`
2. Drop unnecessary fetures
3. Aggregate fetures
4. Count Encoding
5. Sum over `FraudResult` Encoding
6. Min, Max and Mean over `Value` Encoding
7. Z-score over `Value` Encoding
8. Label Encoding
9. Recursive feature elimination
10. BaggingClassification
11. *Light GBM*

In [1]:
import numpy  as np
import pandas as pd

In [2]:
df_trn = pd.read_csv('../data/training.csv')
df_tst = pd.read_csv('../data/test.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [3]:
print('Number of features:', df_trn.shape[1]-1)
df_trn.head()

Number of features: 15


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [4]:
features_original = list(df_trn.columns)
# print(fetures_original)

In [5]:
# Detect missing values
if not df_trn.isna().sum().sum() and not df_tst.isna().sum().sum():
    print('No missing values into original features')
else:
    print('Warning! Missing values into original features!')

No missing values into original features


## 0. Example `gropby` with `agg`

In [None]:
grouped = df_trn[['ProviderId', 'Value', 'ProductCategory']][:20].groupby('ProductCategory')

for name, group in grouped:
    print('---', name, '---')
    print(group)

In [None]:
print(grouped['Value'].agg([np.sum, np.mean, np.std]))

## 1. Extract Hour-of-Day, Day-of-Week, Day-of-Month from `TransactionStartTime`

In [6]:
import datetime

########################### TransactionDT
# Let's add "time variables" for aggregations

df_trn['TransactionStartTime'] = pd.to_datetime(df_trn['TransactionStartTime'])
df_tst['TransactionStartTime'] = pd.to_datetime(df_tst['TransactionStartTime'])

for df in [df_trn, df_tst]:
    df['HourDay']  = df['TransactionStartTime'].dt.hour
    df['DayWeek']  = df['TransactionStartTime'].dt.dayofweek
    df['DayMonth'] = df['TransactionStartTime'].dt.day

features_datetime = ['HourDay', 'DayWeek', 'DayMonth']
# print(features_datetime)
print('3 new features added!')

print('Number of features:', df_trn.shape[1]-1)
df_trn[['TransactionStartTime', 'HourDay', 'DayWeek', 'DayMonth']][730:740]

3 new features added!
Number of features: 18


Unnamed: 0,TransactionStartTime,HourDay,DayWeek,DayMonth
730,2018-11-15 23:03:05,23,3,15
731,2018-11-15 23:09:06,23,3,15
732,2018-11-15 23:09:40,23,3,15
733,2018-11-15 23:11:36,23,3,15
734,2018-11-15 23:12:09,23,3,15
735,2018-11-15 23:12:47,23,3,15
736,2018-11-15 23:13:00,23,3,15
737,2018-11-16 00:03:09,0,4,16
738,2018-11-16 00:05:33,0,4,16
739,2018-11-16 00:06:13,0,4,16


In [7]:
# Detect missing values
if not df_trn[features_datetime].isna().sum().sum() and not df_tst[features_datetime].isna().sum().sum():
    print('No missing values into new features')
else:
    print('Warning! Missing values into new features!')

No missing values into new features


## 2. Drop unnecessary fields

In [8]:
columns4drop = [
    'TransactionId',
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'CurrencyCode',
    'CountryCode',
    'TransactionStartTime'
]

df_trn = df_trn.drop(columns=columns4drop, axis=1)
df_tst = df_tst.drop(columns=columns4drop, axis=1)

features_dropped = columns4drop.copy()
# print(features_dropped)
print(len(columns4drop), 'features deleted!')
print('Number of features:', df_trn.shape[1]-1)

df_trn.head()

8 features deleted!
Number of features: 10


Unnamed: 0,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,FraudResult,HourDay,DayWeek,DayMonth
0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2,0,2,3,15
1,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2,0,2,3,15
2,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2,0,2,3,15
3,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2,0,3,3,15
4,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2,0,3,3,15


## 3. Aggregate fetures

In [9]:
columns4agg = list(df_trn.columns)
columns4agg.remove('Amount')
columns4agg.remove('Value')
columns4agg.remove('FraudResult')

# print('Create new aggregated fetures:')
num_new_fetures = 0
features_aggregated = []
for i, field_name in enumerate(columns4agg):
    for j in range(i+1, len(columns4agg)):
        num_new_fetures += 1
        new_feature_name = field_name + '_' + columns4agg[j]
        features_aggregated.append(new_feature_name)
#         print(num_new_fetures, '-', new_feature_name)
        
        df_trn[new_feature_name] = df_trn[field_name].astype(str) + '_' + df_trn[columns4agg[j]].astype(str)
        df_tst[new_feature_name] = df_tst[field_name].astype(str) + '_' + df_tst[columns4agg[j]].astype(str)
print(num_new_fetures, 'new features added!')
print('Number of features:', df_trn.shape[1]-1)

df_trn[['ProviderId',
        'ProductId',
        'ProductCategory',
        'ChannelId',
        'PricingStrategy',
        'HourDay',
        'DayWeek',
        'DayMonth',
        'ProviderId_ProductId',
        'ProviderId_ProductCategory',
        'ProviderId_ChannelId',
        'ProviderId_HourDay',
        'ProviderId_DayWeek',
        'ProviderId_DayMonth'
       ]].head()

28 new features added!
Number of features: 38


Unnamed: 0,ProviderId,ProductId,ProductCategory,ChannelId,PricingStrategy,HourDay,DayWeek,DayMonth,ProviderId_ProductId,ProviderId_ProductCategory,ProviderId_ChannelId,ProviderId_HourDay,ProviderId_DayWeek,ProviderId_DayMonth
0,ProviderId_6,ProductId_10,airtime,ChannelId_3,2,2,3,15,ProviderId_6_ProductId_10,ProviderId_6_airtime,ProviderId_6_ChannelId_3,ProviderId_6_2,ProviderId_6_3,ProviderId_6_15
1,ProviderId_4,ProductId_6,financial_services,ChannelId_2,2,2,3,15,ProviderId_4_ProductId_6,ProviderId_4_financial_services,ProviderId_4_ChannelId_2,ProviderId_4_2,ProviderId_4_3,ProviderId_4_15
2,ProviderId_6,ProductId_1,airtime,ChannelId_3,2,2,3,15,ProviderId_6_ProductId_1,ProviderId_6_airtime,ProviderId_6_ChannelId_3,ProviderId_6_2,ProviderId_6_3,ProviderId_6_15
3,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,2,3,3,15,ProviderId_1_ProductId_21,ProviderId_1_utility_bill,ProviderId_1_ChannelId_3,ProviderId_1_3,ProviderId_1_3,ProviderId_1_15
4,ProviderId_4,ProductId_6,financial_services,ChannelId_2,2,3,3,15,ProviderId_4_ProductId_6,ProviderId_4_financial_services,ProviderId_4_ChannelId_2,ProviderId_4_3,ProviderId_4_3,ProviderId_4_15


In [10]:
# Detect missing values
if not df_trn[features_aggregated].isna().sum().sum() and not df_tst[features_aggregated].isna().sum().sum():
    print('No missing values into new features')
else:
    print('Warning! Missing values into new features!')

No missing values into new features


### Aggregate aggregated features over `Value`

In [11]:
columns4agg_w_value = columns4agg + features_aggregated

# print('Create new aggregated fetures with Value:')
num_new_fetures = 0
features_aggregated_w_value = []
for i, field_name in enumerate(columns4agg_w_value):
    num_new_fetures += 1
    new_feature_name = field_name + '_Value'
    features_aggregated_w_value.append(new_feature_name)
#     print(num_new_fetures, '-', new_feature_name)

    df_trn[new_feature_name] = df_trn[field_name].astype(str) + '_' + df_trn['Value'].astype(str)
    df_tst[new_feature_name] = df_tst[field_name].astype(str) + '_' + df_tst['Value'].astype(str)

print(num_new_fetures, 'new features added!')
print('Number of features:', df_trn.shape[1]-1)

df_trn[['ProviderId', 'ProviderId_Value',
        'ProductId', 'ProductId_Value',
        'ProductCategory', 'ProductCategory_Value',
        'ChannelId', 'ChannelId_Value',
        'PricingStrategy', 'PricingStrategy_Value',
        'HourDay', 'HourDay_Value',
        'DayWeek', 'DayWeek_Value',
        'DayMonth', 'DayMonth_Value',
        'ProviderId_ProductId',
        'ProviderId_ProductCategory',
        'ProviderId_ChannelId',
        'ProviderId_HourDay',
        'ProviderId_DayWeek',
        'ProviderId_DayMonth'
       ]].head()

36 new features added!
Number of features: 74


Unnamed: 0,ProviderId,ProviderId_Value,ProductId,ProductId_Value,ProductCategory,ProductCategory_Value,ChannelId,ChannelId_Value,PricingStrategy,PricingStrategy_Value,...,DayWeek,DayWeek_Value,DayMonth,DayMonth_Value,ProviderId_ProductId,ProviderId_ProductCategory,ProviderId_ChannelId,ProviderId_HourDay,ProviderId_DayWeek,ProviderId_DayMonth
0,ProviderId_6,ProviderId_6_1000,ProductId_10,ProductId_10_1000,airtime,airtime_1000,ChannelId_3,ChannelId_3_1000,2,2_1000,...,3,3_1000,15,15_1000,ProviderId_6_ProductId_10,ProviderId_6_airtime,ProviderId_6_ChannelId_3,ProviderId_6_2,ProviderId_6_3,ProviderId_6_15
1,ProviderId_4,ProviderId_4_20,ProductId_6,ProductId_6_20,financial_services,financial_services_20,ChannelId_2,ChannelId_2_20,2,2_20,...,3,3_20,15,15_20,ProviderId_4_ProductId_6,ProviderId_4_financial_services,ProviderId_4_ChannelId_2,ProviderId_4_2,ProviderId_4_3,ProviderId_4_15
2,ProviderId_6,ProviderId_6_500,ProductId_1,ProductId_1_500,airtime,airtime_500,ChannelId_3,ChannelId_3_500,2,2_500,...,3,3_500,15,15_500,ProviderId_6_ProductId_1,ProviderId_6_airtime,ProviderId_6_ChannelId_3,ProviderId_6_2,ProviderId_6_3,ProviderId_6_15
3,ProviderId_1,ProviderId_1_21800,ProductId_21,ProductId_21_21800,utility_bill,utility_bill_21800,ChannelId_3,ChannelId_3_21800,2,2_21800,...,3,3_21800,15,15_21800,ProviderId_1_ProductId_21,ProviderId_1_utility_bill,ProviderId_1_ChannelId_3,ProviderId_1_3,ProviderId_1_3,ProviderId_1_15
4,ProviderId_4,ProviderId_4_644,ProductId_6,ProductId_6_644,financial_services,financial_services_644,ChannelId_2,ChannelId_2_644,2,2_644,...,3,3_644,15,15_644,ProviderId_4_ProductId_6,ProviderId_4_financial_services,ProviderId_4_ChannelId_2,ProviderId_4_3,ProviderId_4_3,ProviderId_4_15


In [12]:
# Detect missing values
if not df_trn[features_aggregated_w_value].isna().sum().sum() and not df_tst[features_aggregated_w_value].isna().sum().sum():
    print('No missing values into new features')
else:
    print('Warning! Missing values into new features!')

No missing values into new features


## 4. Count Encoding

In [15]:
df_trn.columns

Index(['ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount',
       'Value', 'PricingStrategy', 'FraudResult', 'HourDay', 'DayWeek',
       'DayMonth', 'ProviderId_ProductId', 'ProviderId_ProductCategory',
       'ProviderId_ChannelId', 'ProviderId_PricingStrategy',
       'ProviderId_HourDay', 'ProviderId_DayWeek', 'ProviderId_DayMonth',
       'ProductId_ProductCategory', 'ProductId_ChannelId',
       'ProductId_PricingStrategy', 'ProductId_HourDay', 'ProductId_DayWeek',
       'ProductId_DayMonth', 'ProductCategory_ChannelId',
       'ProductCategory_PricingStrategy', 'ProductCategory_HourDay',
       'ProductCategory_DayWeek', 'ProductCategory_DayMonth',
       'ChannelId_PricingStrategy', 'ChannelId_HourDay', 'ChannelId_DayWeek',
       'ChannelId_DayMonth', 'PricingStrategy_HourDay',
       'PricingStrategy_DayWeek', 'PricingStrategy_DayMonth',
       'HourDay_DayWeek', 'HourDay_DayMonth', 'DayWeek_DayMonth',
       'ProviderId_Value', 'ProductId_Value', 'ProductCa

In [16]:
############################## Count Encoding (CE)
columns4ce = list(df_trn.columns)
columns4ce.remove('Amount')
columns4ce.remove('FraudResult')

# print('Count Encoding of these fields:')
features_ce = []
for i, clm in enumerate(columns4ce):
    df_tmp = pd.concat([df_trn[[clm]], df_tst[[clm]]])
    ce_dct = df_tmp[clm].value_counts().to_dict()
#     print(ce_dct)

    new_feature_name = clm + '_CE'
    features_ce.append(new_feature_name)
#     print(i+1, '-', new_feature_name)
    
    df_trn[new_feature_name] = df_trn[clm].map(ce_dct)
    df_tst[new_feature_name] = df_tst[clm].map(ce_dct)
    
print(i+1, 'new features added!')
print('Number of features:', df_trn.shape[1]-1)

df_trn[['ProviderId', 'ProviderId_CE',
        'ProductId', 'ProductId_CE',
        'ProductCategory', 'ProductCategory_CE',
        'ChannelId', 'ChannelId_CE',
        'Value', 'Value_CE',
        'PricingStrategy', 'PricingStrategy_CE',
        'HourDay', 'HourDay_CE',
        'DayWeek', 'DayWeek_CE',
        'DayMonth', 'DayMonth_CE',
        'ProviderId_ProductId', 'ProviderId_ProductId_CE'
       ]].head()

73 new features added!
Number of features: 147


Unnamed: 0,ProviderId,ProviderId_CE,ProductId,ProductId_CE,ProductCategory,ProductCategory_CE,ChannelId,ChannelId_CE,Value,Value_CE,PricingStrategy,PricingStrategy_CE,HourDay,HourDay_CE,DayWeek,DayWeek_CE,DayMonth,DayMonth_CE,ProviderId_ProductId,ProviderId_ProductId_CE
0,ProviderId_6,50007,ProductId_10,23866,airtime,65950,ChannelId_3,95025,1000,27193,2,117426,2,432,3,19584,15,5494,ProviderId_6_ProductId_10,16518
1,ProviderId_4,56445,ProductId_6,47821,financial_services,67105,ChannelId_2,43339,20,3634,2,117426,2,432,3,19584,15,5494,ProviderId_4_ProductId_6,47814
2,ProviderId_6,50007,ProductId_1,4097,airtime,65950,ChannelId_3,95025,500,8144,2,117426,2,432,3,19584,15,5494,ProviderId_6_ProductId_1,2708
3,ProviderId_1,8034,ProductId_21,2078,utility_bill,2660,ChannelId_3,95025,21800,171,2,117426,3,1084,3,19584,15,5494,ProviderId_1_ProductId_21,145
4,ProviderId_4,56445,ProductId_6,47821,financial_services,67105,ChannelId_2,43339,644,341,2,117426,3,1084,3,19584,15,5494,ProviderId_4_ProductId_6,47814


In [17]:
# Detect missing values
if not df_trn[features_ce].isna().sum().sum() and not df_tst[features_ce].isna().sum().sum():
    print('No missing values into new features')
else:
    print('Warning! Missing values into new features!')

No missing values into new features


## 5. Sum over `FraudResult` Encoding

In [18]:
############################## FraudResult Sum Encoding (FSE)
columns4fse = columns4ce.copy()

# print('FraudResult Sum Encoding of these fields:')
features_fse = []
for i, clm in enumerate(columns4fse):
    new_feature_name = clm + '_FSE'
    features_fse.append(new_feature_name)
#     print(i+1, '-', new_feature_name)

    fse_dct = df_trn.groupby([clm])['FraudResult'].agg(['sum']).reset_index().rename(columns={'sum': new_feature_name})
    fse_dct.index = fse_dct[clm].values
    fse_dct = fse_dct[new_feature_name].to_dict()
#     print(fme_dct)

    df_trn[new_feature_name] = df_trn[clm].map(fse_dct)
    df_tst[new_feature_name] = df_tst[clm].map(fse_dct)
print(i+1, 'new features added!')
print('Number of features:', df_trn.shape[1]-1)

df_trn[['ProviderId', 'ProviderId_FSE',
        'ProductId', 'ProductId_FSE',
        'ProductCategory', 'ProductCategory_FSE',
        'ChannelId', 'ChannelId_FSE',
        'Value', 'Value_FSE',
        'PricingStrategy', 'PricingStrategy_FSE',
        'HourDay', 'HourDay_FSE',
        'DayWeek', 'DayWeek_FSE',
        'DayMonth', 'DayMonth_FSE',
        'ProviderId_ProductId', 'ProviderId_ProductId_FSE'
       ]].head()

73 new features added!
Number of features: 220


Unnamed: 0,ProviderId,ProviderId_FSE,ProductId,ProductId_FSE,ProductCategory,ProductCategory_FSE,ChannelId,ChannelId_FSE,Value,Value_FSE,PricingStrategy,PricingStrategy_FSE,HourDay,HourDay_FSE,DayWeek,DayWeek_FSE,DayMonth,DayMonth_FSE,ProviderId_ProductId,ProviderId_ProductId_FSE
0,ProviderId_6,3,ProductId_10,6,airtime,18,ChannelId_3,184,1000,0,2,139,2,0,3,34,15,5,ProviderId_6_ProductId_10,1
1,ProviderId_4,5,ProductId_6,1,financial_services,161,ChannelId_2,5,20,0,2,139,2,0,3,34,15,5,ProviderId_4_ProductId_6,1
2,ProviderId_6,3,ProductId_1,0,airtime,18,ChannelId_3,184,500,1,2,139,2,0,3,34,15,5,ProviderId_6_ProductId_1,0
3,ProviderId_1,57,ProductId_21,4,utility_bill,12,ChannelId_3,184,21800,0,2,139,3,7,3,34,15,5,ProviderId_1_ProductId_21,0
4,ProviderId_4,5,ProductId_6,1,financial_services,161,ChannelId_2,5,644,0,2,139,3,7,3,34,15,5,ProviderId_4_ProductId_6,1


In [19]:
for clm in features_fse:
    df_tst[clm].fillna(-1, inplace=True)

In [20]:
# Detect missing values
if not df_trn[features_fse].isna().sum().sum() and not df_tst[features_fse].isna().sum().sum():
    print('No missing values into new features')
else:
    print('Warning! Missing values into new features!')

No missing values into new features


## 6. Min, Max and Mean over `Value` Encoding

In [21]:
# clm = 'ProviderId'
# new_feature_name = clm + '_VME'
# vme_dct = df_trn.groupby([clm])['Value'].agg(['max']).reset_index().rename(columns={'max': new_feature_name})
# vme_dct.index = vme_dct[clm].values
# vme_dct = vme_dct[new_feature_name].to_dict()
# vme_dct

### Min

In [22]:
columns4agg_w_value

['ProviderId',
 'ProductId',
 'ProductCategory',
 'ChannelId',
 'PricingStrategy',
 'HourDay',
 'DayWeek',
 'DayMonth',
 'ProviderId_ProductId',
 'ProviderId_ProductCategory',
 'ProviderId_ChannelId',
 'ProviderId_PricingStrategy',
 'ProviderId_HourDay',
 'ProviderId_DayWeek',
 'ProviderId_DayMonth',
 'ProductId_ProductCategory',
 'ProductId_ChannelId',
 'ProductId_PricingStrategy',
 'ProductId_HourDay',
 'ProductId_DayWeek',
 'ProductId_DayMonth',
 'ProductCategory_ChannelId',
 'ProductCategory_PricingStrategy',
 'ProductCategory_HourDay',
 'ProductCategory_DayWeek',
 'ProductCategory_DayMonth',
 'ChannelId_PricingStrategy',
 'ChannelId_HourDay',
 'ChannelId_DayWeek',
 'ChannelId_DayMonth',
 'PricingStrategy_HourDay',
 'PricingStrategy_DayWeek',
 'PricingStrategy_DayMonth',
 'HourDay_DayWeek',
 'HourDay_DayMonth',
 'DayWeek_DayMonth']

In [23]:
############################## Value Min Encoding (VminE)
columns4vmine = columns4agg_w_value.copy()

# print('Value Min Encoding of these fields:')
features_vmine = []
for i, clm in enumerate(columns4vmine):
    new_feature_name = clm + '_VminE'
    features_vmine.append(new_feature_name)
#     print(i+1, '-', new_feature_name)
    
#     vmine_dct = df_trn[df_trn['FraudResult']==0].groupby([clm])['Value'].agg(['min']).reset_index().rename(columns={'min': new_feature_name})
    vmine_dct = df_trn.groupby([clm])['Value'].agg(['min']).reset_index().rename(columns={'min': new_feature_name})
    vmine_dct.index = vmine_dct[clm].values
    vmine_dct = vmine_dct[new_feature_name].to_dict()
#     print(vme_dct)

    df_trn[new_feature_name] = df_trn[clm].map(vmine_dct)
    df_tst[new_feature_name] = df_tst[clm].map(vmine_dct)
print(i+1, 'new features added!')
print('Number of features:', df_trn.shape[1]-1)

df_trn[['ProviderId', 'ProviderId_VminE',
        'ProductId', 'ProductId_VminE',
        'ProductCategory', 'ProductCategory_VminE',
        'ChannelId', 'ChannelId_VminE',
        'PricingStrategy', 'PricingStrategy_VminE',
        'HourDay', 'HourDay_VminE',
        'DayWeek', 'DayWeek_VminE',
        'DayMonth', 'DayMonth_VminE',
        'ProviderId_ProductId', 'ProviderId_ProductId_VminE'
       ]].head()

36 new features added!
Number of features: 256


Unnamed: 0,ProviderId,ProviderId_VminE,ProductId,ProductId_VminE,ProductCategory,ProductCategory_VminE,ChannelId,ChannelId_VminE,PricingStrategy,PricingStrategy_VminE,HourDay,HourDay_VminE,DayWeek,DayWeek_VminE,DayMonth,DayMonth_VminE,ProviderId_ProductId,ProviderId_ProductId_VminE
0,ProviderId_6,20,ProductId_10,300,airtime,33,ChannelId_3,10,2,2,2,10,3,2,15,5,ProviderId_6_ProductId_10,300
1,ProviderId_4,2,ProductId_6,2,financial_services,2,ChannelId_2,2,2,2,2,10,3,2,15,5,ProviderId_4_ProductId_6,2
2,ProviderId_6,20,ProductId_1,33,airtime,33,ChannelId_3,10,2,2,2,10,3,2,15,5,ProviderId_6_ProductId_1,33
3,ProviderId_1,100,ProductId_21,1115,utility_bill,1115,ChannelId_3,10,2,2,3,7,3,2,15,5,ProviderId_1_ProductId_21,1115
4,ProviderId_4,2,ProductId_6,2,financial_services,2,ChannelId_2,2,2,2,3,7,3,2,15,5,ProviderId_4_ProductId_6,2


In [24]:
for clm in features_vmine:
    df_trn[clm].fillna(0, inplace=True)
    df_tst[clm].fillna(0, inplace=True)

In [25]:
# Detect missing values
if df_trn[features_vmine].isna().sum().sum() == 0 and df_tst[features_vmine].isna().sum().sum() == 0:
    print('No missing values into new features')
else:
    print('Warning! Missing values into new features!')

No missing values into new features


### Max

In [26]:
############################## Value Max Encoding (VmaxE)
columns4vmaxe = columns4agg_w_value.copy()

# print('Value Max Encoding of these fields:')
features_vmaxe = []
for i, clm in enumerate(columns4vmaxe):
    new_feature_name = clm + '_VmaxE'
    features_vmaxe.append(new_feature_name)
#     print(i+1, '-', new_feature_name)

#     vmaxe_dct = df_trn[df_trn['FraudResult']==0].groupby([clm])['Value'].agg(['max']).reset_index().rename(columns={'max': new_feature_name})
    vmaxe_dct = df_trn.groupby([clm])['Value'].agg(['max']).reset_index().rename(columns={'max': new_feature_name})
    vmaxe_dct.index = vmaxe_dct[clm].values
    vmaxe_dct = vmaxe_dct[new_feature_name].to_dict()
#     print(vme_dct)

    df_trn[new_feature_name] = df_trn[clm].map(vmaxe_dct)
    df_tst[new_feature_name] = df_tst[clm].map(vmaxe_dct)
print(i+1, 'new features added!')
print('Number of features:', df_trn.shape[1]-1)

df_trn[['ProviderId', 'ProviderId_VminE','ProviderId_VmaxE',
        'ProductId', 'ProductId_VminE', 'ProductId_VmaxE',
        'ProductCategory', 'ProductCategory_VminE', 'ProductCategory_VmaxE',
        'ChannelId', 'ChannelId_VminE', 'ChannelId_VmaxE',
        'PricingStrategy', 'PricingStrategy_VminE', 'PricingStrategy_VmaxE',
        'HourDay', 'HourDay_VminE', 'HourDay_VmaxE',
        'DayWeek', 'DayWeek_VminE', 'DayWeek_VmaxE',
        'DayMonth', 'DayMonth_VminE', 'DayMonth_VmaxE',
        'ProviderId_ProductId', 'ProviderId_ProductId_VminE', 'ProviderId_ProductId_VmaxE'
       ]].head()

36 new features added!
Number of features: 292


Unnamed: 0,ProviderId,ProviderId_VminE,ProviderId_VmaxE,ProductId,ProductId_VminE,ProductId_VmaxE,ProductCategory,ProductCategory_VminE,ProductCategory_VmaxE,ChannelId,...,HourDay_VmaxE,DayWeek,DayWeek_VminE,DayWeek_VmaxE,DayMonth,DayMonth_VminE,DayMonth_VmaxE,ProviderId_ProductId,ProviderId_ProductId_VminE,ProviderId_ProductId_VmaxE
0,ProviderId_6,20,600000,ProductId_10,300,2000000,airtime,33,2000000,ChannelId_3,...,200000,3,2,9880000,15,5,1000000,ProviderId_6_ProductId_10,300,600000
1,ProviderId_4,2,1000000,ProductId_6,2,25000,financial_services,2,9880000,ChannelId_2,...,200000,3,2,9880000,15,5,1000000,ProviderId_4_ProductId_6,2,25000
2,ProviderId_6,20,600000,ProductId_1,33,300000,airtime,33,2000000,ChannelId_3,...,200000,3,2,9880000,15,5,1000000,ProviderId_6_ProductId_1,33,50000
3,ProviderId_1,100,9880000,ProductId_21,1115,521000,utility_bill,1115,733000,ChannelId_3,...,9880000,3,2,9880000,15,5,1000000,ProviderId_1_ProductId_21,1115,106300
4,ProviderId_4,2,1000000,ProductId_6,2,25000,financial_services,2,9880000,ChannelId_2,...,9880000,3,2,9880000,15,5,1000000,ProviderId_4_ProductId_6,2,25000


In [27]:
for clm in features_vmaxe:
    df_trn[clm].fillna(0, inplace=True)
    df_tst[clm].fillna(0, inplace=True)

In [28]:
# Detect missing values
if not df_trn[features_vmaxe].isna().sum().sum() and not df_tst[features_vmaxe].isna().sum().sum():
    print('No missing values into new features')
else:
    print('Warning! Missing values into new features!')

No missing values into new features


### Mean

In [29]:
############################## Value Mean Encoding (VmeanE)
columns4vmeane = columns4agg_w_value.copy()

# print('Value Mean Encoding of these fields:')
features_vmeane = []
for i, clm in enumerate(columns4vmeane):
    new_feature_name = clm + '_VmeanE'
    features_vmeane.append(new_feature_name)
#     print(i+1, '-', new_feature_name)

#     vmeane_dct = df_trn[df_trn['FraudResult']==0].groupby([clm])['Value'].agg(['mean']).reset_index().rename(columns={'mean': new_feature_name})
    vmeane_dct = df_trn.groupby([clm])['Value'].agg(['mean']).reset_index().rename(columns={'mean': new_feature_name})
    vmeane_dct.index = vmeane_dct[clm].values
    vmeane_dct = vmeane_dct[new_feature_name].to_dict()
#     print(vmeane_dct)

    df_trn[new_feature_name] = df_trn[clm].map(vmeane_dct)
    df_tst[new_feature_name] = df_tst[clm].map(vmeane_dct)
print(i+1, 'new features added!')
print('Number of features:', df_trn.shape[1]-1)

df_trn[['ProviderId', 'ProviderId_VmeanE',
        'ProductId', 'ProductId_VmeanE',
        'ProductCategory', 'ProductCategory_VmeanE',
        'ChannelId', 'ChannelId_VmeanE',
        'PricingStrategy', 'PricingStrategy_VmeanE',
        'HourDay', 'HourDay_VmeanE',
        'DayWeek', 'DayWeek_VmeanE',
        'DayMonth', 'DayMonth_VmeanE',
        'ProviderId_ProductId', 'ProviderId_ProductId_VmeanE'
       ]].head()

36 new features added!
Number of features: 328


Unnamed: 0,ProviderId,ProviderId_VmeanE,ProductId,ProductId_VmeanE,ProductCategory,ProductCategory_VmeanE,ChannelId,ChannelId_VmeanE,PricingStrategy,PricingStrategy_VmeanE,HourDay,HourDay_VmeanE,DayWeek,DayWeek_VmeanE,DayMonth,DayMonth_VmeanE,ProviderId_ProductId,ProviderId_ProductId_VmeanE
0,ProviderId_6,3831.240449,ProductId_10,7607.050507,airtime,6049.799609,ChannelId_3,13683.301484,2,7972.704514,2,5055.463668,3,9572.68874,15,9999.166342,ProviderId_6_ProductId_10,4486.857251
1,ProviderId_4,3937.146613,ProductId_6,903.650835,financial_services,12734.46757,ChannelId_2,3901.456477,2,7972.704514,2,5055.463668,3,9572.68874,15,9999.166342,ProviderId_4_ProductId_6,903.752643
2,ProviderId_6,3831.240449,ProductId_1,4202.856499,airtime,6049.799609,ChannelId_3,13683.301484,2,7972.704514,2,5055.463668,3,9572.68874,15,9999.166342,ProviderId_6_ProductId_1,2898.933019
3,ProviderId_1,44201.269183,ProductId_21,15498.160053,utility_bill,20946.692188,ChannelId_3,13683.301484,2,7972.704514,3,68671.390756,3,9572.68874,15,9999.166342,ProviderId_1_ProductId_21,13769.868421
4,ProviderId_4,3937.146613,ProductId_6,903.650835,financial_services,12734.46757,ChannelId_2,3901.456477,2,7972.704514,3,68671.390756,3,9572.68874,15,9999.166342,ProviderId_4_ProductId_6,903.752643


In [30]:
for clm in features_vmeane:
    df_trn[clm].fillna(0, inplace=True)
    df_tst[clm].fillna(0, inplace=True)

In [31]:
# Detect missing values
if not df_trn[features_vmeane].isna().sum().sum() and not df_tst[features_vmeane].isna().sum().sum():
    print('No missing values into new features')
else:
    print('Warning! Missing values into new features!')

No missing values into new features


## 8. Label Encoding

In [32]:
# print('features_original:', features_original, '\n')
# print('features_datetime:', features_datetime, '\n')
# print('features_dropped:', features_dropped, '\n')
# print('features_aggregated:', features_aggregated, '\n')
# print('features_ce:', features_ce, '\n')
# print('features_fse:', features_fse, '\n')
# print('features_vmine:', features_vmine, '\n')
# print('features_vmaxe:', features_vmaxe, '\n')
# print('features_vmeane:', features_vmeane, '\n')

In [33]:
df_trn[['ProviderId',
        'ProductId',
        'ProductCategory',
        'ChannelId',
        'PricingStrategy',
        'ProviderId_ProductId'
       ]].head()

Unnamed: 0,ProviderId,ProductId,ProductCategory,ChannelId,PricingStrategy,ProviderId_ProductId
0,ProviderId_6,ProductId_10,airtime,ChannelId_3,2,ProviderId_6_ProductId_10
1,ProviderId_4,ProductId_6,financial_services,ChannelId_2,2,ProviderId_4_ProductId_6
2,ProviderId_6,ProductId_1,airtime,ChannelId_3,2,ProviderId_6_ProductId_1
3,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,2,ProviderId_1_ProductId_21
4,ProviderId_4,ProductId_6,financial_services,ChannelId_2,2,ProviderId_4_ProductId_6


In [34]:
from sklearn.preprocessing import LabelEncoder

############################## Label Encoding (LE)
columns4le = list(set(features_original) - set(features_dropped) | set(features_aggregated) | set(features_aggregated_w_value))
columns4le.remove('Amount')
columns4le.remove('Value')
columns4le.remove('FraudResult')
columns4le.sort()

# print('Label Encoding of these fields:')
for clm in columns4le:
    df_trn[clm] = df_trn[clm].astype(str)
    df_tst[clm] = df_tst[clm].astype(str)
        
    le = LabelEncoder()
    le.fit(list(df_trn[clm]) + list(df_tst[clm]))
    
    df_trn[clm] = le.transform(df_trn[clm])
    df_tst[clm] = le.transform(df_tst[clm])
        
    df_trn[clm] = df_trn[clm].astype('category')
    df_tst[clm] = df_tst[clm].astype('category')

#rename columns
rnm_dct = {}
features_le = []
for i, clm in enumerate(columns4le):
    new_feature_name = clm + '_LE'
    features_le.append(new_feature_name)
#     print(i+1, '-', new_feature_name)
    rnm_dct[clm] = clm + '_LE'

# print(rnm_dct)
df_trn.rename(columns=rnm_dct, inplace=True)
df_tst.rename(columns=rnm_dct, inplace=True)

print(len(columns4le), 'features encoded')
print('Number of features:', df_trn.shape[1]-1)

df_trn[['ProviderId_LE',
        'ProductId_LE',
        'ProductCategory_LE',
        'ChannelId_LE',
        'PricingStrategy_LE',
        'ProviderId_ProductId_LE'
       ]].head()

69 features encoded
Number of features: 328


Unnamed: 0,ProviderId_LE,ProductId_LE,ProductCategory_LE,ChannelId_LE,PricingStrategy_LE,ProviderId_ProductId_LE
0,5,1,0,2,2,89
1,3,23,2,1,2,62
2,5,0,0,2,2,88
3,0,13,9,2,2,11
4,3,23,2,1,2,62


In [35]:
# Detect missing values
if not df_trn[features_le].isna().sum().sum() and not df_tst[features_le].isna().sum().sum():
    print('No missing values into new features')
else:
    print('Warning! Missing values into new features!')

No missing values into new features


In [None]:
def z_score(df):
    df.columns = [x + "_zscore" for x in df.columns.tolist()]
    return ((df - df.mean())/df.std(ddof=0))

In [None]:
# now iterate over the remaining columns and create a new zscore column
for col in cols:
    col_zscore = col + '_zscore'
    df[col_zscore] = (df[col] - df[col].mean())/df[col].std(ddof=0)

## 9. Extra Trees Classifier

In [36]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

In [37]:
X_trn = df_trn.drop(columns=['FraudResult'], axis=1)
y_trn = df_trn['FraudResult']

X_tst = df_tst

In [38]:
etc     = ExtraTreesClassifier(n_estimators=800, n_jobs=-1, random_state=24)
rfe_etc = RFE(estimator=etc, n_features_to_select=1, step=1)
rfe_etc = rfe_etc.fit(X_trn, y_trn)

In [39]:
feature_importances_etc = pd.DataFrame(rfe_etc.ranking_,
                                       index = X_trn.columns,
                                       columns=['importance']
                                      ).sort_values('importance', ascending=True)
feature_importances_etc

Unnamed: 0,importance
HourDay_DayMonth_Value_FSE,1
HourDay_DayWeek_Value_FSE,2
DayWeek_DayMonth_Value_FSE,3
ProviderId_DayMonth_Value_FSE,4
ProductId_DayMonth_Value_FSE,5
ProviderId_HourDay_Value_FSE,6
ProductCategory_DayMonth_Value_FSE,7
Value_CE,8
ChannelId_DayMonth_Value_FSE,9
ProductId_HourDay_Value_FSE,10


In [40]:
top50etc = list(feature_importances_etc.index[:50])

In [41]:
top50etc

['HourDay_DayMonth_Value_FSE',
 'HourDay_DayWeek_Value_FSE',
 'DayWeek_DayMonth_Value_FSE',
 'ProviderId_DayMonth_Value_FSE',
 'ProductId_DayMonth_Value_FSE',
 'ProviderId_HourDay_Value_FSE',
 'ProductCategory_DayMonth_Value_FSE',
 'Value_CE',
 'ChannelId_DayMonth_Value_FSE',
 'ProductId_HourDay_Value_FSE',
 'DayMonth_Value_CE',
 'PricingStrategy_DayMonth_Value_FSE',
 'PricingStrategy_HourDay_Value_FSE',
 'ChannelId_HourDay_Value_FSE',
 'DayMonth_Value_FSE',
 'ProviderId_DayWeek_Value_FSE',
 'HourDay_Value_FSE',
 'PricingStrategy_Value_CE',
 'ProductCategory_HourDay_Value_FSE',
 'ProductCategory_DayWeek_Value_FSE',
 'HourDay_Value_CE',
 'Value_FSE',
 'DayWeek_Value_CE',
 'ProductId_DayWeek_Value_FSE',
 'ChannelId_Value_CE',
 'ProviderId_ChannelId_Value_FSE',
 'ProviderId_Value_CE',
 'PricingStrategy_HourDay_Value_CE',
 'ChannelId_Value_FSE',
 'ProductCategory_Value_CE',
 'ProviderId_ProductCategory_Value_FSE',
 'Value',
 'ChannelId_DayWeek_Value_FSE',
 'Amount',
 'PricingStrategy_DayWe

In [None]:
['HourDay_DayMonth_Value_FSE',
 'HourDay_DayWeek_Value_FSE',
 'DayWeek_DayMonth_Value_FSE',
 'ProviderId_DayMonth_Value_FSE',
 'ProductId_DayMonth_Value_FSE',
 'ProviderId_HourDay_Value_FSE',
 'ProductCategory_DayMonth_Value_FSE',
 'Value_CE',
 'ChannelId_DayMonth_Value_FSE',
 'ProductId_HourDay_Value_FSE',
 'DayMonth_Value_CE',
 'PricingStrategy_DayMonth_Value_FSE',
 'PricingStrategy_HourDay_Value_FSE',
 'ChannelId_HourDay_Value_FSE',
 'DayMonth_Value_FSE',
 'ProviderId_DayWeek_Value_FSE',
 'HourDay_Value_FSE',
 'PricingStrategy_Value_CE',
 'ProductCategory_HourDay_Value_FSE',
 'ProductCategory_DayWeek_Value_FSE',
 'HourDay_Value_CE',
 'Value_FSE',
 'DayWeek_Value_CE',
 'ProductId_DayWeek_Value_FSE',
 'ChannelId_Value_CE',
 'ProviderId_ChannelId_Value_FSE',
 'ProviderId_Value_CE',
 'PricingStrategy_HourDay_Value_CE',
 'ChannelId_Value_FSE',
 'ProductCategory_Value_CE',
 'ProviderId_ProductCategory_Value_FSE',
 'Value',
 'ChannelId_DayWeek_Value_FSE',
 'Amount',
 'PricingStrategy_DayWeek_Value_FSE',
 'PricingStrategy_DayWeek_Value_CE',
 'ProductCategory_PricingStrategy_Value_CE',
 'ProviderId_ProductId_Value_FSE',
 'ProductCategory_Value_FSE',
 'HourDay_DayWeek_Value_CE',
 'ProviderId_Value_FSE',
 'ProductId_Value_CE',
 'ProductId_ChannelId_Value_FSE',
 'ProductId_ProductCategory_Value_CE',
 'DayWeek_Value_FSE',
 'DayWeek_DayMonth_Value_CE',
 'ChannelId_PricingStrategy_Value_CE',
 'ProductId_Value_FSE',
 'ChannelId_DayMonth_Value_CE',
 'ProviderId_PricingStrategy_Value_FSE']

## 10. BaggingClassification

### Random Seed

In [None]:
import random
import os

## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=24):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
## ------------------- 

In [None]:
SEED = 24
seed_everything(SEED)

In [None]:
import os
from collections import Counter
from sklearn.ensemble import BaggingClassifier

### перебираем features по возрастанию их важности

In [None]:
classifier = BaggingClassifier(n_estimators=800, n_jobs=-1, random_state=24)

for k in range(1, 30+1):
    # prepare dataset on k columns
    X_trn_drp = X_trn[top30etc[:k]]
    X_tst_drp = X_tst[top30etc[:k]]
    
    predict = classifier.fit(X_trn_drp, y_trn).predict(X_tst_drp)
    print(k, ':', Counter(predict))
    df_sbm['FraudResult'] = predict
    
    # определяем был ли ранее точно такой же результат
    current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
    # просматриваем все файлы в папке submitted
    is_exist = False
    files = os.listdir('../submitted')
    files.sort()
    for f in files:
        f_csv = pd.read_csv('../submitted/' + f)
        if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
            is_exist = True
            print('It is the same as in: ' + f)
    if not is_exist:
        print('New result!')
        df_sbm.to_csv('../submitted/AlBo0920_top' + str(k) + 'RFE_ETC_Bagging.csv', encoding='utf-8', index=False)

### Submit

In [None]:
classifier = BaggingClassifier(n_estimators=800, n_jobs=-1, random_state=24)
#X_trn_drp = X_trn.drop(columns=['ProviderId_LE', 'ProductId_LE', 'ProductCategory_LE', 'ChannelId_LE', 'PricingStrategy_LE'], axis=1)
#X_tst_drp = X_tst.drop(columns=['ProviderId_LE', 'ProductId_LE', 'ProductCategory_LE', 'ChannelId_LE', 'PricingStrategy_LE'], axis=1)
X_trn_drp = X_trn
X_tst_drp = X_tst
X_tst_drp = X_tst_drp.fillna(0)

predict = classifier.fit(X_trn_drp, y_trn).predict(X_tst_drp)
print(k, ':', Counter(predict))
df_sbm['FraudResult'] = predict

# определяем был ли ранее точно такой же результат
current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
# просматриваем все файлы в папке submitted
is_exist = False
files = os.listdir('../submitted')
files.sort()
for f in files:
    f_csv = pd.read_csv('../submitted/' + f)
    if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
        is_exist = True
        print('It is the same as in: ' + f)
if not is_exist:
    print('New result! Write it')
    df_sbm.to_csv('../submitted/AlBo0917_w_categ_BC.csv', encoding='utf-8', index=False)

**Results:**

`k      #Frauds  F1-score`
 
`1-4    51       0.51063829787234`

`5      50       0.51063829787234`

`6      60       0.576923076923077`

`7-11   62       0.566037735849057`

`12-19  61       0.576923076923077`


## 4. Light GBM

In [None]:
import lightgbm as lgb
import random
import gc

In [None]:
########################### Model params
lgb_params = {
    'objective'             : 'binary',
    'boosting_type'         : 'gbdt',
    'metric'                : 'auc',
    'n_jobs'                : -1,
    'learning_rate'         : 0.01,
    'num_leaves'            : 2**8,
    'max_depth'             : -1,
    'tree_learner'          :'serial',
    'colsample_bytree'      : 0.7,
    'subsample_freq'        : 1,
    'subsample'             : 1,
    'n_estimators'          : 800,
    'max_bin'               : 255,
    'verbose'               : -1,
    'seed'                  : SEED,
    'early_stopping_rounds' : 100,
}

In [None]:
from sklearn.model_selection import train_test_split, KFold

def make_predictions(X_trn, y_trn, X_tst, Xy_sbm, lgb_params, NFOLDS=2):
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    predictions_fin = np.zeros(len(Xy_sbm)) # final
    
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_trn, y_trn)):
        print('Fold:', fold_)
        
        X_trn_cur, y_trn_cur = X_trn.iloc[trn_idx, :], y_trn[trn_idx] # current
        X_val_cur, y_val_cur = X_trn.iloc[val_idx, :], y_trn[val_idx]

        print(len(X_trn_cur), len(X_val_cur))
        trn_data = lgb.Dataset(X_trn_cur, label=y_trn_cur)
        val_data = lgb.Dataset(X_val_cur, label=y_val_cur)  

        estimator = lgb.train(
            lgb_params,
            trn_data,
            valid_sets = [trn_data, val_data],
            verbose_eval = 200,
        )
        
        predictions_cur = estimator.predict(X_tst)
        predictions_fin += predictions_cur / NFOLDS

        # feature importance
        feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(), X_trn.columns)), columns=['Weight', 'Feature'])
        print(feature_imp)
        
        # clear memory
        del X_trn_cur, y_trn_cur, X_val_cur, y_val_cur, trn_data, val_data
        gc.collect()
    
    return predictions_fin
## -------------------

In [None]:
lgb_params['learning_rate'] = 0.005
lgb_params['n_estimators']  = 1000
lgb_params['early_stopping_rounds'] = 100

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
probability_predictions = make_predictions(df_trn.drop(columns=['FraudResult'], axis=1),
                                           df_trn['FraudResult'],
                                           df_tst,
                                           df_sbm,
                                           lgb_params,
                                           NFOLDS=10
                                          )

df_sbm['FraudResult_probability'] = probability_predictions

In [None]:
def save_result():    
    # определяем был ли ранее точно такой же результат
    current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
    # просматриваем все файлы в папке submitted
    is_exist = False
    files = os.listdir('../submitted')
    files.sort()
    for f in files:
        f_csv = pd.read_csv('../submitted/' + f)
        if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
            is_exist = True
            print('It is the same as in: ' + f)
    if not is_exist:
        print('New result!')
        df_sbm[['TransactionId', 'FraudResult']].to_csv('../submitted/AlBo0917_simplest_FE_LGBM.csv', encoding='utf-8', index=False)
        df_sbm[['TransactionId', 'FraudResult_probability']].to_csv('../submitted/AlBo0917_simplest_FE_LGBM_probability.csv', encoding='utf-8', index=False)

In [None]:
df_sbm['FraudResult'] = np.where(df_sbm['FraudResult_probability'] > 0.5, 1, 0)
save_result()

In [None]:
df_sbm['FraudResult'].value_counts()

In [None]:
df_sbm['FraudResult_probability'].value_counts()

### оставим только те features, которые увеличивают F1-score на sumbit'е

In [None]:
print(top20etc)

In [None]:
classifier = BaggingClassifier(n_estimators=800, n_jobs=-1, random_state=24)

X_tst = X_tst.fillna(0)
X_trn_drp = X_trn.drop(columns=['Value_FME', 'PricingStrategy_FME', 'ChannelId_LE'], axis=1)
X_tst_drp = X_tst.drop(columns=['Value_FME', 'PricingStrategy_FME', 'ChannelId_LE'], axis=1)
    
predict = classifier.fit(X_trn_drp, y_trn).predict(X_tst_drp)
print(Counter(predict))
df_sbm['FraudResult'] = predict
    
# определяем был ли ранее точно такой же результат
current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
# просматриваем все файлы в папке submitted
is_exist = False
files = os.listdir('../submitted')
files.sort()
for f in files:
    f_csv = pd.read_csv('../submitted/' + f)
    if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
        is_exist = True
        print('It is the same as in: ' + f)
if not is_exist:
    print('New result! Write it')
    df_sbm.to_csv('../submitted/AlBo0917_top_1_6_12_RFE_ETC_Bagging.csv', encoding='utf-8', index=False)

**Result:**

`Filds:'Value_FME', 'PricingStrategy_FME', 'ChannelId_LE' frauds=72 F1-score=0.711864406779661`