In [1]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix, log_loss

# classifiers
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier

# for hyperparameter tuning
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import re

In [2]:
train = pd.read_csv("Data/ML_train_sample_ada.csv")
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
train = train.reset_index(drop=True)

test = pd.read_csv("Data/ML_test_no_sample.csv")
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.reset_index(drop=True)

In [3]:
# constant = 1e-10
train_num_char_benefits_min_value = train["num_char_benefits"].min()
train_num_char_benefits_max_value = train["num_char_benefits"].max()
train_num_words_benefits_min_value = train["num_words_benefits"].min()
train_num_words_benefits_max_value = train["num_words_benefits"].max()
train_avg_word_length_benefits_min_value = train["avg_word_length_benefits"].min()
train_avg_word_length_benefits_max_value = train["avg_word_length_benefits"].max()

train["minmax_num_char_benefits"] = (train["num_char_benefits"] - train_num_char_benefits_min_value) / (train_num_char_benefits_max_value - train_num_char_benefits_min_value)
train['minmax_num_words_benefits'] = (train["num_words_benefits"] - train_num_words_benefits_min_value) / (train_num_words_benefits_max_value - train_num_words_benefits_min_value)
train['minmax_avg_word_length_benefits'] = (train["avg_word_length_benefits"] - train_avg_word_length_benefits_min_value) / (train_avg_word_length_benefits_max_value - train_avg_word_length_benefits_min_value)

train = train.drop(columns = ["num_char_benefits", "num_words_benefits", "avg_word_length_benefits"])

test = test.drop(columns = ["Country", "City", "State", "sentence_length"])
test_num_char_benefits_min_value = test["num_char_benefits"].min()
test_num_char_benefits_max_value = test["num_char_benefits"].max()
test_num_words_benefits_min_value = test["num_words_benefits"].min()
test_num_words_benefits_max_value = test["num_words_benefits"].max()
test_avg_word_length_benefits_min_value = test["avg_word_length_benefits"].min()
test_avg_word_length_benefits_max_value = test["avg_word_length_benefits"].max()

test["minmax_num_char_benefits"] = (test["num_char_benefits"] - test_num_char_benefits_min_value) / (test_num_char_benefits_max_value - test_num_char_benefits_min_value)
test['minmax_num_words_benefits'] = (test["num_words_benefits"] - test_num_words_benefits_min_value) / (test_num_words_benefits_max_value - test_num_words_benefits_min_value)
test['minmax_avg_word_length_benefits'] = (test["avg_word_length_benefits"] - test_avg_word_length_benefits_min_value) / (test_avg_word_length_benefits_max_value - test_avg_word_length_benefits_min_value)

test = test.drop(columns = ["num_char_benefits", "num_words_benefits", "avg_word_length_benefits"])

In [4]:
def get_overall_evaluation_score(y_true, y_pred, y_pred_prob):
    #accuracy
    print('Accuracy score: ', accuracy_score(y_true, y_pred))
    #precision
    print('Precision score: ', precision_score(y_true, y_pred))
    #recall
    print('Recall score: ', recall_score(y_true, y_pred))
    #f1
    print('F1 score: ', f1_score(y_true, y_pred))
    #log loss
    print('Logarithmic Loss: ', log_loss(y_true, y_pred_prob))
    #confusion matix
    confusion = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix: ")
    print(confusion)

In [5]:
y_train = train.copy()["fraudulent"]
X_train = train.copy().drop(columns = ["fraudulent"])
y_test = test.copy()["fraudulent"]
X_test = test.copy().drop(columns = ["fraudulent"])

### Baseline LightGBM (No Tuning)

#### On Training Data

In [6]:
classifier_gb = LGBMClassifier(random_state=0)
# train
classifier_gb.fit(X_train, y_train)
# predict
predictions_gb_train = classifier_gb.predict(X_train)

[LightGBM] [Info] Number of positive: 11894, number of negative: 11911
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054582 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55090
[LightGBM] [Info] Number of data points in the train set: 23805, number of used features: 281
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499643 -> initscore=-0.001428
[LightGBM] [Info] Start training from score -0.001428


In [7]:
# predict probability
predictions_proba_gb_train = classifier_gb.predict_proba(X_train)

In [8]:
# model evaluation overall score
get_overall_evaluation_score(y_train, predictions_gb_train, predictions_proba_gb_train)

Accuracy score:  0.9955891619407687
Precision score:  0.9933048790693781
Recall score:  0.9978980998822936
F1 score:  0.9955961917543934
Logarithmic Loss:  0.02114808051531237
Confusion Matrix: 
[[11831    80]
 [   25 11869]]


#### On Test Data

In [9]:
# predict
predictions_gb_test = classifier_gb.predict(X_test)

In [10]:
# predict probability
predictions_proba_gb_test = classifier_gb.predict_proba(X_test)

In [11]:
# model evaluation overall score
get_overall_evaluation_score(y_test, predictions_gb_test, predictions_proba_gb_test)

Accuracy score:  0.9798657718120806
Precision score:  0.7593220338983051
Recall score:  0.8582375478927203
F1 score:  0.8057553956834531
Logarithmic Loss:  0.052782141456181306
Confusion Matrix: 
[[5032   71]
 [  37  224]]


### On Test Data (Hyperparameter Tuning)

In [12]:
param_test_lgbm = {
    'scale_pos_weight':range(1,18,4),
    'n_estimators':range(20,81,10),
    'min_split_gain':np.arange(0,0.4,0.1),
    'min_child_samples': range(20,71,10),
    'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
    'learning_rate':[0.1, 0.2, 0.3, 0.4, 0.5],
    'num_leaves': [31, 63, 127]
}

In [13]:
lgb_tuned = LGBMClassifier(random_state=0)
lgb_search = RandomizedSearchCV(estimator=lgb_tuned, param_distributions=param_test_lgbm, scoring='recall', cv=5, verbose=3, random_state=0)
lgb_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Number of positive: 9516, number of negative: 9528
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036747 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55079
[LightGBM] [Info] Number of data points in the train set: 19044, number of used features: 281
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499685 -> initscore=-0.001260
[LightGBM] [Info] Start training from score -0.001260
[CV 1/5] END learning_rate=0.5, min_child_samples=50, min_split_gain=0.0, n_estimators=40, num_leaves=63, scale_pos_weight=13, subsample=0.6;, score=0.926 total time=   1.7s
[LightGBM] [Info] Number of positive: 9515, number of negative: 9529
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034448 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55089
[Ligh

In [14]:
# get the best hyperparameters and estimator
lgb_best_params = lgb_search.best_params_
lgb_best_estimator = lgb_search.best_estimator_

# train the final model with the best hyperparameters
lgb_final_tuned = lgb_best_estimator.fit(X_train, y_train)

# eval on test set
lgb_y_pred = lgb_final_tuned.predict(X_test)
lgb_y_pred_proba = lgb_final_tuned.predict_proba(X_test)

[LightGBM] [Info] Number of positive: 11894, number of negative: 11911
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.056886 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55090
[LightGBM] [Info] Number of data points in the train set: 23805, number of used features: 281
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499643 -> initscore=-0.001428
[LightGBM] [Info] Start training from score -0.001428


In [15]:
# results
get_overall_evaluation_score(y_test, lgb_y_pred, lgb_y_pred_proba)

Accuracy score:  0.9703579418344519
Precision score:  0.6349206349206349
Recall score:  0.9195402298850575
F1 score:  0.7511737089201878
Logarithmic Loss:  0.07672630770817271
Confusion Matrix: 
[[4965  138]
 [  21  240]]


### Baseline XGBoost (No Tuning)

#### On Training Data

In [16]:
classifier_xgb = XGBClassifier(random_state=0)
# train
classifier_xgb.fit(X_train, y_train)
# predict
predictions_xgb_train = classifier_xgb.predict(X_train)

In [17]:
# predict probability
predictions_proba_xgb_train = classifier_xgb.predict_proba(X_train)

In [18]:
# model evaluation overall score
get_overall_evaluation_score(y_train, predictions_xgb_train, predictions_proba_xgb_train)

Accuracy score:  0.9984877126654065
Precision score:  0.9976498237367802
Recall score:  0.999327391962334
F1 score:  0.9984879032258065
Logarithmic Loss:  0.011108492146793185
Confusion Matrix: 
[[11883    28]
 [    8 11886]]


#### On Test Data

In [19]:
# predict
predictions_xgb_test = classifier_xgb.predict(X_test)

In [20]:
# predict probability
predictions_proba_xgb_test = classifier_xgb.predict_proba(X_test)

In [21]:
# model evaluation overall score
get_overall_evaluation_score(y_test, predictions_xgb_test, predictions_proba_xgb_test)

Accuracy score:  0.9834079045488442
Precision score:  0.8208955223880597
Recall score:  0.842911877394636
F1 score:  0.8317580340264651
Logarithmic Loss:  0.047246070699490474
Confusion Matrix: 
[[5055   48]
 [  41  220]]


### On Test Data (Hyperparameter Tuning)

In [22]:
xgb_param_test = {
 'scale_pos_weight':range(1,18,4),
 'max_depth':range(4,10,2),
 'min_child_weight':range(1,6,2),
 'gamma':[i/10.0 for i in range(0,5)],
 'subsample':[i/100.0 for i in range(70,105,10)],
 'colsample_bytree':[i/100.0 for i in range(70,105,10)],
}

In [23]:
xgb_tuned = XGBClassifier(random_state=0)
xgb_search = RandomizedSearchCV(estimator=xgb_tuned, param_distributions=xgb_param_test, scoring='recall', cv=5, verbose=3, random_state=0)
xgb_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[CV 1/5] END colsample_bytree=1.0, gamma=0.0, max_depth=4, min_child_weight=3, scale_pos_weight=13, subsample=0.7;, score=0.959 total time=   5.8s
[CV 2/5] END colsample_bytree=1.0, gamma=0.0, max_depth=4, min_child_weight=3, scale_pos_weight=13, subsample=0.7;, score=0.999 total time=   5.0s
[CV 3/5] END colsample_bytree=1.0, gamma=0.0, max_depth=4, min_child_weight=3, scale_pos_weight=13, subsample=0.7;, score=1.000 total time=   5.3s
[CV 4/5] END colsample_bytree=1.0, gamma=0.0, max_depth=4, min_child_weight=3, scale_pos_weight=13, subsample=0.7;, score=1.000 total time=   5.1s
[CV 5/5] END colsample_bytree=1.0, gamma=0.0, max_depth=4, min_child_weight=3, scale_pos_weight=13, subsample=0.7;, score=1.000 total time=   5.4s
[CV 1/5] END colsample_bytree=0.9, gamma=0.4, max_depth=6, min_child_weight=3, scale_pos_weight=5, subsample=1.0;, score=0.942 total time=   6.1s
[CV 2/5] END colsample_bytree=0.9, gamma=0.4, max_depth=6, min_child_weight=3, scale_pos_weight=5, subsample=1.0;, scor

In [24]:
# get the best hyperparameters and estimator
xgb_best_params = xgb_search.best_params_
xgb_best_estimator = xgb_search.best_estimator_

# train the final model with the best hyperparameters
xgb_final_tuned = xgb_best_estimator.fit(X_train, y_train)

# eval on test set
xgb_y_pred = xgb_final_tuned.predict(X_test)
xgb_y_pred_proba = xgb_final_tuned.predict_proba(X_test)

In [25]:
# results
get_overall_evaluation_score(y_test, xgb_y_pred, xgb_y_pred_proba)

Accuracy score:  0.9586129753914989
Precision score:  0.5436241610738255
Recall score:  0.9310344827586207
F1 score:  0.6864406779661016
Logarithmic Loss:  0.111319135481601
Confusion Matrix: 
[[4899  204]
 [  18  243]]


### AdaBoost

#### On Training Data

In [26]:
classifier_ab = AdaBoostClassifier(random_state=0)
# train
classifier_ab.fit(X_train, y_train)
# predict
predictions_ab_train = classifier_ab.predict(X_train)

In [27]:
# predict probability
predictions_proba_ab_train = classifier_ab.predict_proba(X_train)

In [28]:
# model evaluation overall score
get_overall_evaluation_score(y_train, predictions_ab_train, predictions_proba_ab_train)

Accuracy score:  0.9486242386053351
Precision score:  0.9232172602522408
Recall score:  0.9785606187993946
F1 score:  0.9500836700542834
Logarithmic Loss:  0.6151632190599021
Confusion Matrix: 
[[10943   968]
 [  255 11639]]


#### On Test Data

In [29]:
# predict
predictions_ab_test = classifier_ab.predict(X_test)

In [30]:
# predict probability
predictions_proba_ab_test = classifier_ab.predict_proba(X_test)

In [31]:
# model evaluation overall score
get_overall_evaluation_score(y_test, predictions_ab_test, predictions_proba_ab_test)

Accuracy score:  0.9159209545115585
Precision score:  0.34824281150159747
Recall score:  0.8352490421455939
F1 score:  0.49154453213077787
Logarithmic Loss:  0.5778279387471257
Confusion Matrix: 
[[4695  408]
 [  43  218]]


### On Test Data (Hyperparameter Tuning)

In [32]:
adab_param_test = {
 'n_estimators': [50, 100, 500],
 'learning_rate': np.arange(0.6, 1.5, 0.2),
}

In [33]:
adab_tuned = AdaBoostClassifier(random_state=0)
adab_search = RandomizedSearchCV(estimator=adab_tuned, param_distributions=adab_param_test, scoring='recall', cv=5, verbose=3, random_state=0)
adab_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END learning_rate=0.6, n_estimators=100;, score=0.937 total time=  36.3s
[CV 2/5] END learning_rate=0.6, n_estimators=100;, score=0.984 total time=  38.9s
[CV 3/5] END learning_rate=0.6, n_estimators=100;, score=0.987 total time=  43.0s
[CV 4/5] END learning_rate=0.6, n_estimators=100;, score=0.974 total time=  38.9s
[CV 5/5] END learning_rate=0.6, n_estimators=100;, score=0.989 total time=  39.0s
[CV 1/5] END learning_rate=1.0, n_estimators=50;, score=0.916 total time=  19.9s
[CV 2/5] END learning_rate=1.0, n_estimators=50;, score=0.976 total time=  18.9s
[CV 3/5] END learning_rate=1.0, n_estimators=50;, score=0.966 total time=  19.8s
[CV 4/5] END learning_rate=1.0, n_estimators=50;, score=0.957 total time=  19.7s
[CV 5/5] END learning_rate=1.0, n_estimators=50;, score=0.978 total time=  19.6s
[CV 1/5] END learning_rate=1.0, n_estimators=500;, score=0.884 total time= 3.2min
[CV 2/5] END learning_rate=1.0, n_estimato

In [34]:
# get the best hyperparameters and estimator
adab_best_params = adab_search.best_params_
adab_best_estimator = adab_search.best_estimator_

# train the final model with the best hyperparameters
adab_final_tuned = adab_best_estimator.fit(X_train, y_train)

# eval on test set
adab_y_pred = adab_final_tuned.predict(X_test)
adab_y_pred_proba = adab_final_tuned.predict_proba(X_test)

In [35]:
# results
get_overall_evaluation_score(y_test, adab_y_pred, adab_y_pred_proba)

Accuracy score:  0.9366144668158091
Precision score:  0.42300194931773877
Recall score:  0.8314176245210728
F1 score:  0.5607235142118863
Logarithmic Loss:  0.6090054884730933
Confusion Matrix: 
[[4807  296]
 [  44  217]]


### CatBoost

#### On Training Data

In [36]:
classifier_catb = CatBoostClassifier(random_state=0)
# train
classifier_catb.fit(X_train, y_train)
# predict
predictions_catb_train = classifier_catb.predict(X_train)

Learning rate set to 0.039881
0:	learn: 0.6499909	total: 321ms	remaining: 5m 20s
1:	learn: 0.6112170	total: 397ms	remaining: 3m 18s
2:	learn: 0.5884560	total: 569ms	remaining: 3m 9s
3:	learn: 0.5581293	total: 653ms	remaining: 2m 42s
4:	learn: 0.5344385	total: 851ms	remaining: 2m 49s
5:	learn: 0.5098563	total: 1.07s	remaining: 2m 58s
6:	learn: 0.4878037	total: 1.22s	remaining: 2m 53s
7:	learn: 0.4675589	total: 1.34s	remaining: 2m 46s
8:	learn: 0.4489735	total: 1.43s	remaining: 2m 37s
9:	learn: 0.4349149	total: 1.56s	remaining: 2m 34s
10:	learn: 0.4151182	total: 1.64s	remaining: 2m 27s
11:	learn: 0.4045193	total: 1.73s	remaining: 2m 22s
12:	learn: 0.3883041	total: 1.87s	remaining: 2m 22s
13:	learn: 0.3742825	total: 1.94s	remaining: 2m 16s
14:	learn: 0.3634506	total: 2.02s	remaining: 2m 12s
15:	learn: 0.3545026	total: 2.18s	remaining: 2m 13s
16:	learn: 0.3447229	total: 2.23s	remaining: 2m 9s
17:	learn: 0.3355061	total: 2.3s	remaining: 2m 5s
18:	learn: 0.3279631	total: 2.47s	remaining: 2m 

In [37]:
# predict probability
predictions_proba_catb_train = classifier_catb.predict_proba(X_train)

In [38]:
# model evaluation overall score
get_overall_evaluation_score(y_train, predictions_catb_train, predictions_proba_catb_train)

Accuracy score:  0.996555345515648
Precision score:  0.993977919036467
Recall score:  0.9991592399529174
F1 score:  0.9965618448637317
Logarithmic Loss:  0.02114953280274731
Confusion Matrix: 
[[11839    72]
 [   10 11884]]


#### On Test Data

In [39]:
# predict
predictions_catb_test = classifier_catb.predict(X_test)

In [40]:
# predict probability
predictions_proba_catb_test = classifier_catb.predict_proba(X_test)

In [41]:
# model evaluation overall score
get_overall_evaluation_score(y_test, predictions_catb_test, predictions_proba_catb_test)

Accuracy score:  0.977255779269202
Precision score:  0.7293729372937293
Recall score:  0.8467432950191571
F1 score:  0.7836879432624112
Logarithmic Loss:  0.05641850288376288
Confusion Matrix: 
[[5021   82]
 [  40  221]]


### On Test Data (Hyperparameter Tuning)

In [42]:
catb_param_test = {
    'scale_pos_weight': [1,5,9,13,17],
    'depth': [4, 6, 8],
    'min_child_samples': [1,2,3,4,5,6,7,8],
    'l2_leaf_reg': [0.1, 1.1, 2.1, 3.1, 4.1, 5.1],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'rsm': [0.6,0.7,0.8,0.9],
}

In [43]:
catb_tuned = CatBoostClassifier(random_state=0)
catb_search = RandomizedSearchCV(estimator=catb_tuned, param_distributions=catb_param_test, scoring='recall', cv=5, verbose=3, random_state=0)
catb_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


0:	learn: 0.6491966	total: 180ms	remaining: 3m
1:	learn: 0.6097567	total: 237ms	remaining: 1m 58s
2:	learn: 0.5721916	total: 422ms	remaining: 2m 20s
3:	learn: 0.5365485	total: 573ms	remaining: 2m 22s
4:	learn: 0.5054216	total: 630ms	remaining: 2m 5s
5:	learn: 0.4800269	total: 682ms	remaining: 1m 52s
6:	learn: 0.4556434	total: 764ms	remaining: 1m 48s
7:	learn: 0.4307357	total: 887ms	remaining: 1m 49s
8:	learn: 0.4072670	total: 948ms	remaining: 1m 44s
9:	learn: 0.3859321	total: 1.01s	remaining: 1m 39s
10:	learn: 0.3696010	total: 1.12s	remaining: 1m 40s
11:	learn: 0.3532906	total: 1.24s	remaining: 1m 42s
12:	learn: 0.3388619	total: 1.3s	remaining: 1m 38s
13:	learn: 0.3253129	total: 1.36s	remaining: 1m 36s
14:	learn: 0.3123356	total: 1.5s	remaining: 1m 38s
15:	learn: 0.3005556	total: 1.55s	remaining: 1m 35s
16:	learn: 0.2892609	total: 1.63s	remaining: 1m 34s
17:	learn: 0.2784614	total: 1.72s	remaining: 1m 33s
18:	learn: 0.2668610	total: 1.88s	remaining: 1m 36s
19:	learn: 0.2577015	total: 1

In [44]:
# get the best hyperparameters and estimator
catb_best_params = catb_search.best_params_
catb_best_estimator = catb_search.best_estimator_

# train the final model with the best hyperparameters
catb_final_tuned = catb_best_estimator.fit(X_train, y_train)

# eval on test set
catb_y_pred = catb_final_tuned.predict(X_test)
catb_y_pred_proba = catb_final_tuned.predict_proba(X_test)

0:	learn: 0.6514037	total: 56.5ms	remaining: 56.4s
1:	learn: 0.6119039	total: 118ms	remaining: 58.8s
2:	learn: 0.5744316	total: 155ms	remaining: 51.5s
3:	learn: 0.5411075	total: 207ms	remaining: 51.4s
4:	learn: 0.5096275	total: 251ms	remaining: 50s
5:	learn: 0.4840395	total: 302ms	remaining: 50s
6:	learn: 0.4590624	total: 356ms	remaining: 50.5s
7:	learn: 0.4340777	total: 402ms	remaining: 49.8s
8:	learn: 0.4103958	total: 455ms	remaining: 50.1s
9:	learn: 0.3895488	total: 493ms	remaining: 48.8s
10:	learn: 0.3713201	total: 555ms	remaining: 49.9s
11:	learn: 0.3528471	total: 627ms	remaining: 51.6s
12:	learn: 0.3371263	total: 668ms	remaining: 50.7s
13:	learn: 0.3233996	total: 718ms	remaining: 50.6s
14:	learn: 0.3103290	total: 766ms	remaining: 50.3s
15:	learn: 0.2976347	total: 821ms	remaining: 50.5s
16:	learn: 0.2867877	total: 869ms	remaining: 50.3s
17:	learn: 0.2752826	total: 913ms	remaining: 49.8s
18:	learn: 0.2652429	total: 961ms	remaining: 49.6s
19:	learn: 0.2566427	total: 1.01s	remaining:

In [45]:
# results
get_overall_evaluation_score(y_test, catb_y_pred, catb_y_pred_proba)

Accuracy score:  0.9183445190156599
Precision score:  0.36927621861152143
Recall score:  0.9578544061302682
F1 score:  0.5330490405117271
Logarithmic Loss:  0.2399526603664135
Confusion Matrix: 
[[4676  427]
 [  11  250]]
