In [2]:
import re

# importing libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix, log_loss
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RandomizedSearchCV

# classifiers
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
train = pd.read_csv("../Data/ML_train_sample_ada.csv")
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
train = train.reset_index(drop=True)

test = pd.read_csv("../Data/ML_test_no_sample.csv")
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.reset_index(drop=True)

In [4]:
train_num_char_benefits_min_value = train["num_char_benefits"].min()
train_num_char_benefits_max_value = train["num_char_benefits"].max()
train_num_words_benefits_min_value = train["num_words_benefits"].min()
train_num_words_benefits_max_value = train["num_words_benefits"].max()
train_avg_word_length_benefits_min_value = train["avg_word_length_benefits"].min()
train_avg_word_length_benefits_max_value = train["avg_word_length_benefits"].max()

train["minmax_num_char_benefits"] = (train["num_char_benefits"] - train_num_char_benefits_min_value) / (train_num_char_benefits_max_value - train_num_char_benefits_min_value)
train['minmax_num_words_benefits'] = (train["num_words_benefits"] - train_num_words_benefits_min_value) / (train_num_words_benefits_max_value - train_num_words_benefits_min_value)
train['minmax_avg_word_length_benefits'] = (train["avg_word_length_benefits"] - train_avg_word_length_benefits_min_value) / (train_avg_word_length_benefits_max_value - train_avg_word_length_benefits_min_value)

train = train.drop(columns = ["num_char_benefits", "num_words_benefits", "avg_word_length_benefits"])

test = test.drop(columns = ["Country", "City", "State", "sentence_length"])
test_num_char_benefits_min_value = test["num_char_benefits"].min()
test_num_char_benefits_max_value = test["num_char_benefits"].max()
test_num_words_benefits_min_value = test["num_words_benefits"].min()
test_num_words_benefits_max_value = test["num_words_benefits"].max()
test_avg_word_length_benefits_min_value = test["avg_word_length_benefits"].min()
test_avg_word_length_benefits_max_value = test["avg_word_length_benefits"].max()

test["minmax_num_char_benefits"] = (test["num_char_benefits"] - test_num_char_benefits_min_value) / (test_num_char_benefits_max_value - test_num_char_benefits_min_value)
test['minmax_num_words_benefits'] = (test["num_words_benefits"] - test_num_words_benefits_min_value) / (test_num_words_benefits_max_value - test_num_words_benefits_min_value)
test['minmax_avg_word_length_benefits'] = (test["avg_word_length_benefits"] - test_avg_word_length_benefits_min_value) / (test_avg_word_length_benefits_max_value - test_avg_word_length_benefits_min_value)

test = test.drop(columns = ["num_char_benefits", "num_words_benefits", "avg_word_length_benefits"])

In [5]:
X_train = train.drop(["fraudulent"], axis = 1)
y_train = train["fraudulent"]

X_test = test.drop(["fraudulent"], axis = 1)
y_test = test["fraudulent"]

In [6]:
tuned_xgb = XGBClassifier(random_state=0, scale_pos_weight=9, max_depth=4, min_child_weight=1, gamma=0.4, subsample=1, colsample_bytree=1)
tuned_lgbm = LGBMClassifier(random_state=0, scale_pos_weight=13, n_estimators=60, min_split_gain=0.2, subsample=0.9, learning_rate=0.3, num_leaves=63, min_child_samples=60)
tuned_catb = CatBoostClassifier(random_state=0, subsample=0.6, scale_pos_weight=13, rsm=0.6, min_child_samples=3, l2_leaf_reg=4.1, depth=4)
tuned_rfc = RandomForestClassifier(random_state=0, n_estimators=800, min_samples_split=2, min_samples_leaf=1, max_depth=10)

In [7]:
def get_overall_evaluation_score(y_true, y_pred, y_pred_prob):
    #accuracy
    print('Accuracy score: ', accuracy_score(y_true, y_pred))
    #precision
    print('Precision score: ', precision_score(y_true, y_pred))
    #recall
    print('Recall score: ', recall_score(y_true, y_pred))
    #f1
    print('F1 score: ', f1_score(y_true, y_pred))
    #log loss
    print('Logarithmic Loss: ', log_loss(y_true, y_pred_prob))
    #confusion matix
    confusion = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix: ")
    print(confusion)

## Ensemble 1 (XGBoost, LightGBM, RFC)

### Average Weights (Equal)

In [116]:
models = [('XGB', tuned_xgb), ('LGBM', tuned_lgbm), ('RandomForest', tuned_rfc)]

# equal weights (average weighted)
weights = [1/len(models)] * len(models)

ensemble_model = VotingClassifier(estimators=models, voting='soft', weights=weights)

#train
ensemble_model.fit(X_train, y_train)

# predict
predictions = ensemble_model.predict(X_test)

# predict probability
predictions_proba = ensemble_model.predict_proba(X_test)

[LightGBM] [Info] Number of positive: 3065, number of negative: 11911
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007483 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 54308
[LightGBM] [Info] Number of data points in the train set: 14976, number of used features: 281
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.204661 -> initscore=-1.357415
[LightGBM] [Info] Start training from score -1.357415


In [117]:
ensemble_predictions_proba = predictions_proba / predictions_proba.sum(axis=1, keepdims=True)

In [118]:
# model evaluation overall score
get_overall_evaluation_score(y_test, predictions, ensemble_predictions_proba)

Accuracy score:  0.9796793437733035
Precision score:  0.7345679012345679
Recall score:  0.9118773946360154
F1 score:  0.8136752136752138
Logarithmic Loss:  0.06758405284051248
Confusion Matrix: 
[[5017   86]
 [  23  238]]


### Different Weights

In [8]:
models = [('XGB', tuned_xgb), ('LGBM', tuned_lgbm), ('RandomForest', tuned_rfc)]

# Define the parameter grid
param_grid = {
    'weights': [[w1, w2, w3] for w1 in np.linspace(0, 1, 11)
                              for w2 in np.linspace(0, 1, 11)
                              for w3 in np.linspace(0, 1, 11)
                              if w1 + w2 + w3 == 1]
}

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=models, voting='soft')

# Create RandomSearch
random_search = RandomizedSearchCV(ensemble_model, param_grid, scoring='recall', cv=5, verbose=3, random_state=0)

# Fit the random search to the data
random_search.fit(X_train, y_train)

# Get the best weights
best_weights = random_search.best_params_['weights']
print("Best Weights:", best_weights)

# Use the best weights to train the final ensemble model
ensemble_model_best_weights = VotingClassifier(estimators=models, voting='soft', weights=best_weights)
ensemble_model_best_weights.fit(X_train, y_train)

# Make predictions
predictions_best_weights = ensemble_model_best_weights.predict(X_test)
predictions_proba_best_weights = ensemble_model_best_weights.predict_proba(X_test)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Number of positive: 2452, number of negative: 9528
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005927 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 54306
[LightGBM] [Info] Number of data points in the train set: 11980, number of used features: 280
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.204674 -> initscore=-1.357331
[LightGBM] [Info] Start training from score -1.357331
[CV 1/5] END weights=[0.30000000000000004, 0.1, 0.6000000000000001];, score=0.765 total time=  23.1s
[LightGBM] [Info] Number of positive: 2452, number of negative: 9529
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006793 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 54247
[LightGBM] [Info] Number of data points in the train set: 11981, number of use

In [10]:
# model evaluation overall score
get_overall_evaluation_score(y_test, predictions_best_weights, predictions_proba_best_weights)

Accuracy score:  0.9688665175242357
Precision score:  0.6236842105263158
Recall score:  0.9080459770114943
F1 score:  0.7394695787831512
Logarithmic Loss:  0.07340959145600934
Confusion Matrix: 
[[4960  143]
 [  24  237]]




## Ensemble 2 (LightGBM, CatBoost, RFC)

In [106]:
models = [('CatBoost', tuned_catb), ('LGBM', tuned_lgbm), ('RandomForest', tuned_rfc)]

# equal weights (average weighted)
weights = [1/len(models)] * len(models)

ensemble_model = VotingClassifier(estimators=models, voting='soft', weights=weights)

#train
ensemble_model.fit(X_train, y_train)

# predict
predictions = ensemble_model.predict(X_test)

# predict probability
predictions_proba = ensemble_model.predict_proba(X_test)

0:	learn: 0.6683691	total: 5.43ms	remaining: 5.42s
1:	learn: 0.6448280	total: 11.3ms	remaining: 5.65s
2:	learn: 0.6233293	total: 16.5ms	remaining: 5.49s
3:	learn: 0.6012889	total: 22ms	remaining: 5.47s
4:	learn: 0.5824315	total: 27.7ms	remaining: 5.51s
5:	learn: 0.5649135	total: 32.8ms	remaining: 5.44s
6:	learn: 0.5479852	total: 39.1ms	remaining: 5.55s
7:	learn: 0.5313124	total: 46.7ms	remaining: 5.79s
8:	learn: 0.5153134	total: 52.5ms	remaining: 5.78s
9:	learn: 0.5010821	total: 63.3ms	remaining: 6.27s
10:	learn: 0.4890252	total: 69.3ms	remaining: 6.24s
11:	learn: 0.4735010	total: 75.7ms	remaining: 6.23s
12:	learn: 0.4622592	total: 81.7ms	remaining: 6.2s
13:	learn: 0.4527811	total: 87.5ms	remaining: 6.16s
14:	learn: 0.4428111	total: 94ms	remaining: 6.17s
15:	learn: 0.4313274	total: 99.1ms	remaining: 6.1s
16:	learn: 0.4219553	total: 105ms	remaining: 6.08s
17:	learn: 0.4139997	total: 110ms	remaining: 6.03s
18:	learn: 0.4034458	total: 117ms	remaining: 6.02s
19:	learn: 0.3949172	total: 122

In [107]:
ensemble_predictions_proba = predictions_proba / predictions_proba.sum(axis=1, keepdims=True)

In [108]:
# model evaluation overall score
get_overall_evaluation_score(y_test, predictions, ensemble_predictions_proba)

Accuracy score:  0.9739000745712155
Precision score:  0.6648501362397821
Recall score:  0.9348659003831418
F1 score:  0.7770700636942676
Logarithmic Loss:  0.08017319687898482
Confusion Matrix: 
[[4980  123]
 [  17  244]]


## Ensemble 3 (CatBoost, LightGBM, XGBoost, RFC)

In [109]:
models = [('CatBoost', tuned_catb), ('LGBM', tuned_lgbm), ('RandomForest', tuned_rfc), ('XGBoost', tuned_xgb)]

# equal weights (average weighted)
weights = [1/len(models)] * len(models)

ensemble_model = VotingClassifier(estimators=models, voting='soft', weights=weights)

#train
ensemble_model.fit(X_train, y_train)

# predict
predictions = ensemble_model.predict(X_test)

# predict probability
predictions_proba = ensemble_model.predict_proba(X_test)

0:	learn: 0.6683691	total: 6.91ms	remaining: 6.91s
1:	learn: 0.6448280	total: 14.4ms	remaining: 7.19s
2:	learn: 0.6233293	total: 20.3ms	remaining: 6.75s
3:	learn: 0.6012889	total: 26.6ms	remaining: 6.63s
4:	learn: 0.5824315	total: 32.2ms	remaining: 6.4s
5:	learn: 0.5649135	total: 37.9ms	remaining: 6.29s
6:	learn: 0.5479852	total: 44ms	remaining: 6.25s
7:	learn: 0.5313124	total: 50.2ms	remaining: 6.23s
8:	learn: 0.5153134	total: 57ms	remaining: 6.28s
9:	learn: 0.5010821	total: 63ms	remaining: 6.24s
10:	learn: 0.4890252	total: 68.9ms	remaining: 6.2s
11:	learn: 0.4735010	total: 75ms	remaining: 6.17s
12:	learn: 0.4622592	total: 80.9ms	remaining: 6.14s
13:	learn: 0.4527811	total: 86.9ms	remaining: 6.12s
14:	learn: 0.4428111	total: 92.8ms	remaining: 6.09s
15:	learn: 0.4313274	total: 98.7ms	remaining: 6.07s
16:	learn: 0.4219553	total: 105ms	remaining: 6.06s
17:	learn: 0.4139997	total: 111ms	remaining: 6.05s
18:	learn: 0.4034458	total: 116ms	remaining: 6s
19:	learn: 0.3949172	total: 123ms	rema

In [110]:
ensemble_predictions_proba = predictions_proba / predictions_proba.sum(axis=1, keepdims=True)

In [111]:
# model evaluation overall score
get_overall_evaluation_score(y_test, predictions, ensemble_predictions_proba)

Accuracy score:  0.9686800894854586
Precision score:  0.6183206106870229
Recall score:  0.9310344827586207
F1 score:  0.743119266055046
Logarithmic Loss:  0.07930753229398554
Confusion Matrix: 
[[4953  150]
 [  18  243]]
