# Process data

In [29]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Read the raw table; the label column is separated from the predictors further down.
file_path = 'Base.csv'
data = pd.read_csv(file_path)
target_col = 'fraud_bool'

# Partition columns by dtype: object columns are handled as categorical,
# 64-bit integer/float columns as numeric. The label is removed from the
# numeric set so it is never fed through the feature pipeline.
categorical_cols = data.select_dtypes(include='object').columns
numerical_cols = (
    data.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop(target_col)
)

# Numeric branch: fill gaps with the column mean, then standardise.
# Step names are looked up later when recovering feature names, so keep them.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the modal value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on unseen categories.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch; output columns follow
# the order of this list (numeric block first, categorical block second).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Predictors / label
X = data.drop(columns=[target_col])
y = data[target_col]

# 70/30 hold-out split, stratified on the label so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Fit the preprocessing on the training part only, then reuse the fitted
# transformer on the test part so no test statistics reach the fit.
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)


# Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier
# Baseline Random Forest on the full prepared feature matrix (fit() returns the estimator).
rf_model = RandomForestClassifier(random_state=42).fit(X_train_prepared, y_train)

# Score on the held-out split.
# NOTE(review): the LightGBM training log in this notebook reports 7720 positives
# out of 700000 training rows, so plain accuracy sits close to what an
# always-negative predictor would score — consider also reporting recall / ROC AUC.
rf_predictions = rf_model.predict(X_test_prepared)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_predictions)
print('Random Forest Accuracy:', rf_accuracy)

Random Forest Accuracy: 0.9889766666666666


In [31]:
import pickle
# Serialise the fitted Random Forest to disk.
# NOTE(review): only the estimator is saved here, not the fitted `preprocessor`.
with open('random_forest_model.pkl', mode='wb') as model_file:
    pickle.dump(rf_model, model_file)

# XGBoost

In [32]:

from xgboost import XGBClassifier
# Gradient-boosted trees with the same seed as the other models.
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb_model.fit(X_train_prepared, y_train)

# Held-out accuracy.
xgb_predictions = xgb_model.predict(X_test_prepared)
xgb_accuracy = accuracy_score(y_true=y_test, y_pred=xgb_predictions)
print('XGBoost Accuracy:', xgb_accuracy)

XGBoost Accuracy: 0.9888566666666667


In [33]:
import pickle
# Serialise the fitted XGBoost classifier to disk.
with open('xgboost_model.pkl', mode='wb') as model_file:
    pickle.dump(xgb_model, model_file)

# LightGBM

In [34]:
from lightgbm import LGBMClassifier

# LightGBM with library defaults apart from the fixed seed.
lgbm_model = LGBMClassifier(random_state=42)
lgbm_model.fit(X_train_prepared, y_train)

# Held-out accuracy.
lgbm_predictions = lgbm_model.predict(X_test_prepared)
lgbm_accuracy = accuracy_score(y_true=y_test, y_pred=lgbm_predictions)
print('LightGBM Accuracy:', lgbm_accuracy)

[LightGBM] [Info] Number of positive: 7720, number of negative: 692280
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027672 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3249
[LightGBM] [Info] Number of data points in the train set: 700000, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011029 -> initscore=-4.496176
[LightGBM] [Info] Start training from score -4.496176
LightGBM Accuracy: 0.98868


In [35]:
# Serialise the fitted LightGBM classifier to disk.
with open('lightgbm_model.pkl', mode='wb') as model_file:
    pickle.dump(lgbm_model, model_file)

# Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression
# Linear baseline; max_iter is raised to 1000 to give the solver more iterations.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_prepared, y_train)

# Held-out accuracy.
lr_predictions = lr_model.predict(X_test_prepared)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_predictions)
print('Logistic Regression Accuracy:', lr_accuracy)

Logistic Regression Accuracy: 0.98903


In [37]:
# Serialise the fitted logistic-regression model to disk.
with open('logistic_regression_model.pkl', mode='wb') as model_file:
    pickle.dump(lr_model, model_file)

# Feature Importance

In [38]:
import numpy as np
import matplotlib.pyplot as plt

# Rebuild the column names of the prepared matrix in ColumnTransformer order:
# the scaled numeric columns first, then the one-hot encoded categorical columns.
numeric_pipeline = preprocessor.named_transformers_['num']
categorical_pipeline = preprocessor.named_transformers_['cat']
scaled_names = numeric_pipeline.named_steps['scaler'].get_feature_names_out(numerical_cols)
onehot_names = categorical_pipeline.named_steps['onehot'].get_feature_names_out(categorical_cols)
feature_names = [*scaled_names, *onehot_names]

# Per-model importance vectors. The three tree ensembles expose
# `feature_importances_`; for the linear model the absolute coefficient
# of the single fitted row is used as the importance proxy.
rf_importances = rf_model.feature_importances_
xgb_importances = xgb_model.feature_importances_
lgbm_importances = lgbm_model.feature_importances_
lr_coefficients = lr_model.coef_[0]
lr_importances = np.abs(lr_coefficients)

In [39]:
# Rescale every importance vector to sum to 1 so the four models are comparable.
rf_normalized = rf_importances / rf_importances.sum()
xgb_normalized = xgb_importances / xgb_importances.sum()
lgbm_normalized = lgbm_importances / lgbm_importances.sum()
lr_normalized = lr_importances / lr_importances.sum()

# Unweighted mean of the four normalised vectors.
importance_total = rf_normalized + xgb_normalized + lgbm_normalized + lr_normalized
average_importances = importance_total / 4

def get_top_n_features(importances, feature_names, n=5):
    """Return the ``n`` highest-scoring features as ``(name, score)`` pairs.

    Parameters
    ----------
    importances : array-like
        One score per feature.
    feature_names : sequence
        Names aligned positionally with ``importances``.
    n : int, default 5
        Number of features to return. If ``n`` exceeds the number of
        features, all of them are returned. A non-positive ``n`` yields an
        empty list.

    Returns
    -------
    list of tuple
        ``(feature_names[i], importances[i])`` pairs ordered from the highest
        score to the lowest.
    """
    if n <= 0:
        # A `[-n:]` slice with n == 0 would select the whole array, so
        # "no features requested" is handled explicitly.
        return []
    # Reverse the ascending argsort, then take the head: same selection as
    # "last n, reversed" for every n >= 1.
    indices = np.argsort(importances)[::-1][:n]
    return [(feature_names[i], importances[i]) for i in indices]

# Five highest entries of the averaged importance vector, best first.
top5_overall = get_top_n_features(
    average_importances,
    feature_names,
    n=5,
)
top5_overall

[('housing_status_BA', 0.09746069950179924),
 ('name_email_similarity', 0.0449337856273705),
 ('credit_risk_score', 0.038271385787126404),
 ('device_os_windows', 0.036167628283688094),
 ('days_since_request', 0.03597900197247583)]

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count', 'month'],
      dtype='object')

In [70]:
# Names only, taken from the ranked (name, score) pairs in `top5_overall`.
# The previous version iterated `top5_features`, which is not defined
# anywhere in this notebook and raised NameError on a fresh kernel.
top5_feature_names = [name for name, _ in top5_overall]

In [71]:
# Share of the averaged importance carried by the five top-ranked columns.
top5_sum = sum(score for _, score in top5_overall)
top5_sum

0.25281250117246007

In [82]:
# Columns of the prepared matrix kept for the reduced models, grouped by the
# raw feature they come from. The final order (housing_status, device_os,
# then the numeric columns) determines the column order of the reduced matrices.
housing_status_columns = [
    'housing_status_BA',
    'housing_status_BB',
    'housing_status_BC',
    'housing_status_BD',
    'housing_status_BE',
    'housing_status_BF',
    'housing_status_BG',
]
device_os_columns = [
    'device_os_linux',
    'device_os_macintosh',
    'device_os_other',
    'device_os_windows',
    'device_os_x11',
]
numeric_columns = [
    'name_email_similarity',
    'credit_risk_score',
    'days_since_request',
]

top5_feature_names = [*housing_status_columns, *device_os_columns, *numeric_columns]

# Retrain the four models using only the top 5 features (15 encoded columns)



In [83]:
from sklearn.ensemble import RandomForestClassifier

# Translate the selected column names to positions once, then slice both
# splits with the same index list so their columns stay aligned.
top5_column_idx = [feature_names.index(name) for name in top5_feature_names]
X_train_top5 = X_train_prepared[:, top5_column_idx]
X_test_top5 = X_test_prepared[:, top5_column_idx]

# Fresh, unfitted estimators configured exactly like the full-feature ones.
rf_model_top5 = RandomForestClassifier(random_state=42)
xgb_model_top5 = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
lgbm_model_top5 = LGBMClassifier(random_state=42)
lr_model_top5 = LogisticRegression(max_iter=1000, random_state=42)

In [84]:
# Train every reduced-feature estimator before scoring any of them.
for reduced_model in (rf_model_top5, xgb_model_top5, lgbm_model_top5, lr_model_top5):
    reduced_model.fit(X_train_top5, y_train)

# Held-out accuracy per model.
# NOTE(review): the LightGBM log for this data reports 7720 positives out of
# 700000 training rows, so accuracy alone barely separates these models from
# an always-negative predictor — consider recall / ROC AUC as well.
rf_top5_predictions = rf_model_top5.predict(X_test_top5)
rf_accuracy_top5 = accuracy_score(y_test, rf_top5_predictions)

xgb_top5_predictions = xgb_model_top5.predict(X_test_top5)
xgb_accuracy_top5 = accuracy_score(y_test, xgb_top5_predictions)

lgbm_top5_predictions = lgbm_model_top5.predict(X_test_top5)
lgbm_accuracy_top5 = accuracy_score(y_test, lgbm_top5_predictions)

lr_top5_predictions = lr_model_top5.predict(X_test_top5)
lr_accuracy_top5 = accuracy_score(y_test, lr_top5_predictions)

# Report in the same order the models were trained.
top5_report = (
    ('RF Accuracy with Top 5 Features:', rf_accuracy_top5),
    ('XGB Accuracy with Top 5 Features:', xgb_accuracy_top5),
    ('LGBM Accuracy with Top 5 Features:', lgbm_accuracy_top5),
    ('LR Accuracy with Top 5 Features:', lr_accuracy_top5),
)
for report_label, report_value in top5_report:
    print(report_label, report_value)

[LightGBM] [Info] Number of positive: 7720, number of negative: 692280
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001551 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 789
[LightGBM] [Info] Number of data points in the train set: 700000, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011029 -> initscore=-4.496176
[LightGBM] [Info] Start training from score -4.496176
RF Accuracy with Top 5 Features: 0.9888833333333333
XGB Accuracy with Top 5 Features: 0.9889433333333333
LGBM Accuracy with Top 5 Features: 0.9889166666666667
LR Accuracy with Top 5 Features: 0.98897


In [85]:
import pickle
# Persist each reduced-feature estimator under its own file name,
# in the same order they were trained.
top5_artifacts = (
    ('random_forest_model_top5.pkl', rf_model_top5),
    ('xgboost_model_top5.pkl', xgb_model_top5),
    ('lightgbm_model_top5.pkl', lgbm_model_top5),
    ('logistic_regression_model_top5.pkl', lr_model_top5),
)
for artifact_path, fitted_model in top5_artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(fitted_model, f)