# Simplest Feature Engineering with LightGBM and BaggingClassifier

Predict on fields: `ProviderId`, `ProductId`, `ProductCategory`, `ChannelId`, `Amount`, `Value`, `PricingStrategy`

0. Drop unnecessary fields
1. Count Encoding
2. Mean Encoding over `FraudResult`
3. Label Encoding
4. Light GBM

In [1]:
import numpy  as np
import pandas as pd

In [2]:
# Read the three competition files: training rows, test rows and the
# sample submission that is later filled with predictions.
df_trn, df_tst, df_sbm = (
    pd.read_csv('../data/' + file_name)
    for file_name in ('training.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Preview the first rows of the raw training frame.
df_trn.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


## 0. Drop unnecessary fields

In [4]:
# Columns that are excluded from modelling in this notebook.
columns4drop = [
    'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
    'CurrencyCode', 'CountryCode', 'TransactionStartTime',
]

# `columns=` already names the axis, so no separate `axis` argument is passed.
df_trn, df_tst = (frame.drop(columns=columns4drop) for frame in (df_trn, df_tst))

df_trn.head()

Unnamed: 0,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,FraudResult
0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2,0
1,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2,0
2,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2,0
3,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2,0
4,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2,0


## 1. Count Encoding

In [5]:
############################## Count Encoding (CE)
# For every listed column add `<column>_CE`: how many rows, over train and
# test combined, carry the same value as the current row.
ce_columns = ['ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'PricingStrategy', 'Value']

print('Count Encoding of these fields:')
for clm in ce_columns:
    print('---', clm, '---')
    # Count over both frames so that every value seen in test has a frequency.
    ce_dct = pd.concat([df_trn[clm], df_tst[clm]]).value_counts().to_dict()

    for frame in (df_trn, df_tst):
        frame[clm + '_CE'] = frame[clm].map(ce_dct)
print('Ok!')

df_trn.head()

Count Encoding of these fields:
--- ProviderId ---
--- ProductId ---
--- ProductCategory ---
--- ChannelId ---
--- PricingStrategy ---
--- Value ---
Ok!


Unnamed: 0,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,FraudResult,ProviderId_CE,ProductId_CE,ProductCategory_CE,ChannelId_CE,PricingStrategy_CE,Value_CE
0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2,0,50007,23866,65950,95025,117426,27193
1,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2,0,56445,47821,67105,43339,117426,3634
2,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2,0,50007,4097,65950,95025,117426,8144
3,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2,0,8034,2078,2660,95025,117426,171
4,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2,0,56445,47821,67105,43339,117426,341


## 2. Mean Encoding over `FraudResult`

In [6]:
############################## FraudResult Mean Encoding (FME)
# For every listed column add `<column>_FME`: the mean of `FraudResult` over
# the training rows that share the current row's value.
# NOTE(review): the means are computed on the same training rows they are
# mapped back onto, so the target leaks into the training features, and test
# values that never occur in train are mapped to NaN. Consider an
# out-of-fold encoding; left unchanged here to keep the recorded results.
fme_columns = ['ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'PricingStrategy', 'Value']

print('FraudResult Mean Encoding of these fields:')
for clm in fme_columns:
    print('---', clm, '---')
    # value -> mean(FraudResult) lookup built from the training frame only
    fme_dct = df_trn.groupby(clm)['FraudResult'].mean().to_dict()

    df_trn[clm + '_FME'] = df_trn[clm].map(fme_dct)
    df_tst[clm + '_FME'] = df_tst[clm].map(fme_dct)
print('Ok!')

df_trn.head()

FraudResult Mean Encoding of these fields:
--- ProviderId ---
--- ProductId ---
--- ProductCategory ---
--- ChannelId ---
--- PricingStrategy ---
--- Value ---
Ok!


Unnamed: 0,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,FraudResult,ProviderId_CE,ProductId_CE,ProductCategory_CE,ChannelId_CE,PricingStrategy_CE,Value_CE,ProviderId_FME,ProductId_FME,ProductCategory_FME,ChannelId_FME,PricingStrategy_FME,Value_FME
0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2,0,50007,23866,65950,95025,117426,27193,8.8e-05,0.00039,0.0004,0.003232,0.001741,0.0
1,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2,0,56445,47821,67105,43339,117426,3634,0.000131,3.1e-05,0.003546,0.000135,0.001741,0.0
2,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2,0,50007,4097,65950,95025,117426,8144,8.8e-05,0.0,0.0004,0.003232,0.001741,0.000172
3,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2,0,8034,2078,2660,95025,117426,171,0.010101,0.002646,0.00625,0.003232,0.001741,0.0
4,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2,0,56445,47821,67105,43339,117426,341,0.000131,3.1e-05,0.003546,0.000135,0.001741,0.0


## 3. Label Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder

############################## Label Encoding (LE)
# Replace each listed column by integer codes stored as a pandas `category`,
# then rename the column to `<column>_LE`.
le_columns = ['ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'PricingStrategy']

print('Label Encoding of these fields:')
for clm in le_columns:
    print('---', clm, '---')
    df_trn[clm] = df_trn[clm].astype(str)
    df_tst[clm] = df_tst[clm].astype(str)

    # Fit on the values of both frames so train and test share one code table.
    le = LabelEncoder()
    le.fit(list(df_trn[clm]) + list(df_tst[clm]))

    for frame in (df_trn, df_tst):
        frame[clm] = le.transform(frame[clm])
        frame[clm] = frame[clm].astype('category')
print('Ok!')

# Rename the encoded columns: <column> -> <column>_LE
rnm_dct = {name: name + '_LE' for name in le_columns}
df_trn = df_trn.rename(columns=rnm_dct)
df_tst = df_tst.rename(columns=rnm_dct)

print('Shape train and test:', df_trn.shape, df_tst.shape)
df_trn.head()

Label Encoding of these fields:
--- ProviderId ---
--- ProductId ---
--- ProductCategory ---
--- ChannelId ---
--- PricingStrategy ---
Ok!
Shape train and test: (95662, 20) (45019, 19)


Unnamed: 0,ProviderId_LE,ProductId_LE,ProductCategory_LE,ChannelId_LE,Amount,Value,PricingStrategy_LE,FraudResult,ProviderId_CE,ProductId_CE,ProductCategory_CE,ChannelId_CE,PricingStrategy_CE,Value_CE,ProviderId_FME,ProductId_FME,ProductCategory_FME,ChannelId_FME,PricingStrategy_FME,Value_FME
0,5,1,0,2,1000.0,1000,2,0,50007,23866,65950,95025,117426,27193,8.8e-05,0.00039,0.0004,0.003232,0.001741,0.0
1,3,23,2,1,-20.0,20,2,0,56445,47821,67105,43339,117426,3634,0.000131,3.1e-05,0.003546,0.000135,0.001741,0.0
2,5,0,0,2,500.0,500,2,0,50007,4097,65950,95025,117426,8144,8.8e-05,0.0,0.0004,0.003232,0.001741,0.000172
3,0,13,9,2,20000.0,21800,2,0,8034,2078,2660,95025,117426,171,0.010101,0.002646,0.00625,0.003232,0.001741,0.0
4,3,23,2,1,-644.0,644,2,0,56445,47821,67105,43339,117426,341,0.000131,3.1e-05,0.003546,0.000135,0.001741,0.0


## 4. Light GBM

In [None]:
import lightgbm as lgb
import random
import os
import gc

In [None]:
## Seeder
def seed_everything(seed=0):
    """Seed the random sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed in the ``PYTHONHASHSEED`` environment variable.

    :param seed: seed applied to every source     # type: int
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
## -------------------

In [None]:
# Notebook-wide seed; also reused by the LightGBM params and the K-fold split.
SEED = 24
seed_everything(SEED)

In [None]:
########################### Model params
# Base LightGBM configuration; it is handed to `lgb.train` inside
# `make_predictions`. `learning_rate`, `n_estimators` and
# `early_stopping_rounds` are assigned again in a later cell before training,
# and `seed` reuses the notebook-wide SEED.
lgb_params = {
    'objective'             : 'binary',
    'boosting_type'         : 'gbdt',
    'metric'                : 'auc',
    'n_jobs'                : -1,
    'learning_rate'         : 0.01,
    'num_leaves'            : 2**8,
    'max_depth'             : -1,
    'tree_learner'          :'serial',
    'colsample_bytree'      : 0.7,
    'subsample_freq'        : 1,
    'subsample'             : 1,
    'n_estimators'          : 800,
    'max_bin'               : 255,
    'verbose'               : -1,
    'seed'                  : SEED,
    'early_stopping_rounds' : 100,
}

In [None]:
from sklearn.model_selection import train_test_split, KFold

def make_predictions(X_trn, y_trn, X_tst, Xy_sbm, lgb_params, NFOLDS=2):
    """Train one LightGBM model per K-fold split and average their test predictions.

    :param X_trn: training features as a pandas DataFrame (rows are taken with ``.iloc``)
    :param y_trn: training target, aligned row by row with ``X_trn``
    :param X_tst: features to predict on
    :param Xy_sbm: submission frame; only its length is used, to size the result
    :param lgb_params: parameter dict forwarded to ``lgb.train``
    :param NFOLDS: number of shuffled K-fold splits
    :return: numpy array holding the mean of the per-fold predictions for ``X_tst``
    """
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    # KFold yields positional indices. Indexing a Series with them is a label
    # lookup, which is only correct for a default RangeIndex, so the target is
    # converted to a plain array where indexing is always positional.
    y_values = np.asarray(y_trn)

    # Evaluation logging is a callback in LightGBM >= 3.3 and the
    # `verbose_eval` keyword was removed in 4.0; older releases only know the
    # keyword, so pick whichever the installed version supports.
    if hasattr(lgb, 'log_evaluation'):
        log_kwargs = {'callbacks': [lgb.log_evaluation(period=200)]}
    else:
        log_kwargs = {'verbose_eval': 200}

    predictions_fin = np.zeros(len(Xy_sbm)) # final

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_trn, y_values)):
        print('Fold:', fold_)

        X_trn_cur, y_trn_cur = X_trn.iloc[trn_idx, :], y_values[trn_idx] # current
        X_val_cur, y_val_cur = X_trn.iloc[val_idx, :], y_values[val_idx]

        print(len(X_trn_cur), len(X_val_cur))
        trn_data = lgb.Dataset(X_trn_cur, label=y_trn_cur)
        val_data = lgb.Dataset(X_val_cur, label=y_val_cur)

        estimator = lgb.train(
            lgb_params,
            trn_data,
            valid_sets = [trn_data, val_data],
            **log_kwargs
        )

        # Every fold contributes an equal share to the final average.
        predictions_cur = estimator.predict(X_tst)
        predictions_fin += predictions_cur / NFOLDS

        # feature importance, sorted from least to most important
        feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(), X_trn.columns)), columns=['Weight', 'Feature'])
        print(feature_imp)

        # clear memory
        del X_trn_cur, y_trn_cur, X_val_cur, y_val_cur, trn_data, val_data
        gc.collect()

    return predictions_fin
## -------------------

In [None]:
# Final training schedule: smaller learning rate with more boosting rounds;
# the early-stopping patience is set to 100 rounds.
lgb_params.update({
    'learning_rate': 0.005,
    'n_estimators': 1000,
    'early_stopping_rounds': 100,
})

In [None]:
import warnings
# Silence every Python warning from here on.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# in use — consider a narrower filter.
warnings.filterwarnings('ignore')

In [None]:
# Train on every column except the target and average the test predictions
# of 10 LightGBM folds.
probability_predictions = make_predictions(
    X_trn=df_trn.drop(columns=['FraudResult']),
    y_trn=df_trn['FraudResult'],
    X_tst=df_tst,
    Xy_sbm=df_sbm,
    lgb_params=lgb_params,
    NFOLDS=10,
)

df_sbm['FraudResult_probability'] = probability_predictions

In [None]:
def save_result():
    """Write the current submission unless an identical one was already saved.

    Reads the module-level ``df_sbm``. Two submissions count as identical when
    the same row positions are flagged with ``FraudResult == 1``. When no saved
    file matches, both the hard labels and the probabilities are written to
    ``../submitted``.
    """
    # Row positions flagged as fraud in the current submission.
    current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
    # Compare against every previously saved submission.
    is_exist = False
    files = os.listdir('../submitted')
    files.sort()
    for f in files:
        # Skip anything that is not a CSV file (sub-folders, checkpoints, ...).
        if not f.endswith('.csv'):
            continue
        f_csv = pd.read_csv('../submitted/' + f)
        # The probability export written below has no 'FraudResult' column;
        # indexing it would raise KeyError on the next call.
        if 'FraudResult' not in f_csv.columns:
            continue
        if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
            is_exist = True
            print('It is the same as in: ' + f)
    if not is_exist:
        print('New result!')
        df_sbm[['TransactionId', 'FraudResult']].to_csv('../submitted/AlBo0917_simplest_FE_LGBM.csv', encoding='utf-8', index=False)
        df_sbm[['TransactionId', 'FraudResult_probability']].to_csv('../submitted/AlBo0917_simplest_FE_LGBM_probability.csv', encoding='utf-8', index=False)

In [None]:
# Turn the probabilities into hard 0/1 labels with a 0.5 cut-off, then save
# the submission unless an identical one already exists.
df_sbm['FraudResult'] = np.where(df_sbm['FraudResult_probability'] > 0.5, 1, 0)
save_result()

In [None]:
# Number of test rows assigned to each predicted label.
df_sbm['FraudResult'].value_counts()

In [None]:
# Frequency of each distinct predicted probability.
# NOTE(review): for a continuous score `.describe()` is probably more
# informative than one row per distinct value.
df_sbm['FraudResult_probability'].value_counts()

In [8]:
# Target / feature split used by the scikit-learn models below.
y_trn = df_trn['FraudResult']
X_trn = df_trn.drop(columns=['FraudResult'])

# The test frame is used as-is (same object, not a copy).
X_tst = df_tst

In [9]:
from sklearn.feature_selection import RFE

## Extra Trees Classifier

In [10]:
from sklearn.ensemble import ExtraTreesClassifier

In [11]:
# Rank the features with recursive feature elimination driven by an
# Extra-Trees model; `fit` returns the fitted selector itself.
etc = ExtraTreesClassifier(n_estimators=800, n_jobs=-1, random_state=24)
rfe_etc = RFE(estimator=etc, n_features_to_select=1, step=1).fit(X_trn, y_trn)

In [12]:
# One row per feature. The column is called 'importance' but holds the RFE
# rank, so the ascending sort puts rank 1 first.
feature_importances_etc = (
    pd.DataFrame({'importance': rfe_etc.ranking_}, index=X_trn.columns)
    .sort_values(by='importance')
)
feature_importances_etc

Unnamed: 0,importance
Value_FME,1
Value,2
Amount,3
Value_CE,4
ProviderId_FME,5
PricingStrategy_FME,6
ProductId_FME,7
ProviderId_LE,8
PricingStrategy_LE,9
ProviderId_CE,10


In [13]:
# Names of (at most) the 20 best-ranked features, best first.
top20etc = feature_importances_etc.index[:20].tolist()

### Submit

In [14]:
import os
from collections import Counter
from sklearn.ensemble import BaggingClassifier

In [46]:
# Baseline: bagging classifier fitted on every engineered feature.
classifier = BaggingClassifier(n_estimators=800, n_jobs=-1, random_state=24)
X_trn_drp = X_trn
# Test values that never occur in train have NaN mean-encodings; replace them
# with 0 before predicting (presumably because the estimator rejects NaN).
X_tst_drp = X_tst.fillna(0)

predict = classifier.fit(X_trn_drp, y_trn).predict(X_tst_drp)
# No feature-count prefix here: `k` is only defined by the top-k loop in a
# later cell, so printing it raised NameError on a fresh kernel.
print(Counter(predict))
df_sbm['FraudResult'] = predict

# Row positions flagged as fraud in the current submission.
current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
# Compare against every previously saved submission.
is_exist = False
files = os.listdir('../submitted')
files.sort()
for f in files:
    # Skip anything that is not a CSV file (sub-folders, checkpoints, ...).
    if not f.endswith('.csv'):
        continue
    f_csv = pd.read_csv('../submitted/' + f)
    # Probability-only exports have no 'FraudResult' column; indexing them
    # would raise KeyError.
    if 'FraudResult' not in f_csv.columns:
        continue
    if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
        is_exist = True
        print('It is the same as in: ' + f)
if not is_exist:
    print('New result! Write it')
    df_sbm.to_csv('../submitted/AlBo0917_w_categ_BC.csv', encoding='utf-8', index=False)

1 : Counter({0: 44958, 1: 61})
It is the same as in: AlBo0917_RFE_ETC_Bagging.csv


### перебираем features по возрастанию их важности

In [52]:
# Fit the bagging classifier on the k best-ranked features for growing k and
# save every submission that differs from all previously saved ones.
classifier = BaggingClassifier(n_estimators=800, n_jobs=-1, random_state=24)
X_tst = X_tst.fillna(0)
# Follow the actual number of ranked features instead of a hard-coded bound.
for k in range(1, len(top20etc) + 1):
    # prepare dataset on k columns
    X_trn_drp = X_trn[top20etc[:k]]
    X_tst_drp = X_tst[top20etc[:k]]

    predict = classifier.fit(X_trn_drp, y_trn).predict(X_tst_drp)
    print(k, ':', Counter(predict))
    df_sbm['FraudResult'] = predict

    # Row positions flagged as fraud in the current submission.
    current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
    # Compare against every previously saved submission; the folder is listed
    # again on each pass because earlier iterations may have added files.
    is_exist = False
    files = os.listdir('../submitted')
    files.sort()
    for f in files:
        # Skip anything that is not a CSV file (sub-folders, checkpoints, ...).
        if not f.endswith('.csv'):
            continue
        f_csv = pd.read_csv('../submitted/' + f)
        # Probability-only exports have no 'FraudResult' column; indexing
        # them would raise KeyError.
        if 'FraudResult' not in f_csv.columns:
            continue
        if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
            is_exist = True
            print('It is the same as in: ' + f)
    if not is_exist:
        print('New result! Write it')
        df_sbm.to_csv('../submitted/AlBo0917_top' + str(k) + 'RFE_ETC_Bagging.csv', encoding='utf-8', index=False)

1 : Counter({0: 44968, 1: 51})
New result! Write it
2 : Counter({0: 44968, 1: 51})
It is the same as in: AlBo0917_top1RFE_ETC_Bagging.csv
3 : Counter({0: 44968, 1: 51})
It is the same as in: AlBo0917_top1RFE_ETC_Bagging.csv
4 : Counter({0: 44968, 1: 51})
It is the same as in: AlBo0917_top1RFE_ETC_Bagging.csv
5 : Counter({0: 44969, 1: 50})
New result! Write it
6 : Counter({0: 44959, 1: 60})
New result! Write it
7 : Counter({0: 44957, 1: 62})
New result! Write it
8 : Counter({0: 44957, 1: 62})
It is the same as in: AlBo0917_top7RFE_ETC_Bagging.csv
9 : Counter({0: 44957, 1: 62})
It is the same as in: AlBo0917_top7RFE_ETC_Bagging.csv
10 : Counter({0: 44957, 1: 62})
It is the same as in: AlBo0917_top7RFE_ETC_Bagging.csv
11 : Counter({0: 44957, 1: 62})
It is the same as in: AlBo0917_top7RFE_ETC_Bagging.csv
12 : Counter({0: 44958, 1: 61})
It is the same as in: AlBo0917_RFE_ETC_Bagging.csv
13 : Counter({0: 44958, 1: 61})
It is the same as in: AlBo0917_RFE_ETC_Bagging.csv
14 : Counter({0: 44958

**Results:**

`k      #Frauds  F1-score`
 
`1-4    51       0.51063829787234`

`5      50       0.51063829787234`

`6      60       0.576923076923077`

`7-11   62       0.566037735849057`

`12-19  61       0.576923076923077`


### оставим только те features, которые увеличивают F1-score на submit'е

In [55]:
# Show the ranked feature names, best first.
print(top20etc)

['Value_FME', 'Value', 'Amount', 'Value_CE', 'ProviderId_FME', 'PricingStrategy_FME', 'ProductId_FME', 'ProviderId_LE', 'PricingStrategy_LE', 'ProviderId_CE', 'ProductId_CE', 'ChannelId_LE', 'ProductId_LE', 'PricingStrategy_CE', 'ProductCategory_LE', 'ChannelId_CE', 'ProductCategory_FME', 'ProductCategory_CE', 'ChannelId_FME']


In [56]:
classifier = BaggingClassifier(n_estimators=800, n_jobs=-1, random_state=24)

# NOTE(review): the section heading says to keep only the features that raise
# the F1-score and the output file is named after ranks 1, 6 and 12, yet these
# three columns are removed here. The recorded result was produced with this
# drop, so the behaviour is left unchanged — confirm the intent.
columns_to_exclude = ['Value_FME', 'PricingStrategy_FME', 'ChannelId_LE']

X_tst = X_tst.fillna(0)
X_trn_drp = X_trn.drop(columns=columns_to_exclude)
X_tst_drp = X_tst.drop(columns=columns_to_exclude)

predict = classifier.fit(X_trn_drp, y_trn).predict(X_tst_drp)
print(Counter(predict))
df_sbm['FraudResult'] = predict

# Row positions flagged as fraud in the current submission.
current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
# Compare against every previously saved submission.
is_exist = False
files = os.listdir('../submitted')
files.sort()
for f in files:
    # Skip anything that is not a CSV file (sub-folders, checkpoints, ...).
    if not f.endswith('.csv'):
        continue
    f_csv = pd.read_csv('../submitted/' + f)
    # Probability-only exports have no 'FraudResult' column; indexing them
    # would raise KeyError.
    if 'FraudResult' not in f_csv.columns:
        continue
    if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
        is_exist = True
        print('It is the same as in: ' + f)
if not is_exist:
    print('New result! Write it')
    df_sbm.to_csv('../submitted/AlBo0917_top_1_6_12_RFE_ETC_Bagging.csv', encoding='utf-8', index=False)

Counter({0: 44947, 1: 72})
New result! Write it


**Result:**

`Dropped fields: 'Value_FME', 'PricingStrategy_FME', 'ChannelId_LE' frauds=72 F1-score=0.711864406779661`