In [None]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm

In [None]:
# Load the labelled training data from the CSV file in the working directory.
df_bank = pd.read_csv('train.csv')

# Report the shape of the loaded frame as a (rows, columns) tuple.
print("Total row and column in the data set is:", df_bank.shape)

In [None]:
# Display the loaded training frame for a quick visual check.
df_bank

**DATA PREPROCESSING**
- Write a preprocessing function so that the same steps can be applied to both the training and the test dataset.
- Use StandardScaler to standardize the numerical variables and PCA to transform them. In this case, we did not use PCA to reduce the dimensionality of the numerical variables. These steps increase the performance of the model compared to "not using PCA" and "using PCA to reduce dimensionality".
- Use get_dummies to one-hot encode the categorical variables.
- Account for the needs of the test data in the preprocessing step, such as returning the **id** column so that it can be written to the prediction file when the trained model is applied to the test dataset.

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

In [None]:
def preprocessing(df, transforms=None, has_label=True, return_id=False):
    """Turn a raw data frame into a model-ready feature matrix.

    Steps:
      1. Remember the ``id`` column, then drop ``id``, ``day`` and ``employees``.
      2. Split the remaining feature columns into numeric (``int64``/``float64``)
         and non-numeric columns, keeping the frame's own column order.
      3. Standardize the numeric columns with a ``StandardScaler`` and name the
         resulting columns ``pca_0``, ``pca_1``, ...
      4. One-hot encode the non-numeric columns with ``pd.get_dummies``.
      5. Drop every resulting column whose name contains ``"unknown"``.

    A ``PCA`` object is created and returned alongside the scaler, but it is
    neither fitted nor applied here (the PCA step is currently disabled).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``day`` and ``employees``; must also
        contain ``outcome`` when ``has_label`` is True.
    transforms : sequence of (scaler, pca), optional
        Objects returned by a previous call. When given, the scaler is only
        applied (``transform``); when None, a new scaler is fitted on ``df``.
    has_label : bool, default True
        Whether ``df`` carries the ``outcome`` target column.
    return_id : bool, default False
        Only used when ``has_label`` is False: also return the ``id`` column.

    Returns
    -------
    ``(X, y, transforms)`` when ``has_label`` is True;
    ``(id, X, transforms)`` when ``has_label`` is False and ``return_id`` is True;
    ``(X, transforms)`` otherwise. ``transforms`` is the list ``[scaler, pca]``.
    """
    ids = df.loc[:, "id"]
    df = df.drop(columns=["id", "day", "employees"])
    X = df.drop(columns="outcome") if has_label else df

    # Keep the frame's column order (not a set difference) so that the scaler
    # sees the numeric columns in the same order at fit and at transform time,
    # and so that each "pca_i" name maps to the same source column on every run.
    non_number_cols = [
        k for k in X.columns if str(X[k].dtype) not in ("int64", "float64")
    ]
    number_cols = [k for k in X.columns if k not in non_number_cols]

    X_numeric = X.loc[:, number_cols]
    if non_number_cols:
        X_non_numeric = pd.get_dummies(X[non_number_cols], drop_first=False)
    else:
        # Nothing to encode: use an empty frame that still carries the row index.
        X_non_numeric = pd.DataFrame(index=X.index)

    n_components = min(8, len(number_cols))
    if transforms is None:
        std_scaler = StandardScaler()
        pca = PCA(n_components=n_components)
        X_numeric = std_scaler.fit_transform(X_numeric)
        # PCA step intentionally disabled: X_numeric = pca.fit_transform(X_numeric)
    else:
        std_scaler, pca = transforms
        X_numeric = std_scaler.transform(X_numeric)
        # PCA step intentionally disabled: X_numeric = pca.transform(X_numeric)

    # Label by the actual number of scaled columns; with PCA disabled this can
    # exceed n_components. Reuse the original row index so the column-wise
    # concat below lines rows up even when df does not have a 0..n-1 index.
    X_numeric_labels = [f"pca_{i}" for i in range(X_numeric.shape[1])]
    X_numeric = pd.DataFrame(X_numeric, columns=X_numeric_labels, index=X.index)
    X = pd.concat([X_numeric, X_non_numeric], axis=1)

    # Discard the indicator columns generated for "unknown" category values.
    X = X.loc[:, ~X.columns.str.contains("unknown")]

    transforms = [std_scaler, pca]

    if has_label:
        y = df["outcome"]
        return X, y, transforms
    if return_id:
        return ids, X, transforms
    return X, transforms

In [None]:
# Preprocess the training frame. `transforms` holds the fitted objects so the
# same ones can be passed back in when the test data is preprocessed later.
X, y, transforms = preprocessing(df_bank)

In [None]:
# Display the preprocessed feature matrix.
X

In [None]:
# List the column names of the preprocessed feature matrix.
X.columns

In [None]:
# Seed passed to the train/validation split and to the shuffle of the training set.
random_seed = 42
# Fraction of the labelled rows held out as the validation set.
test_size = 0.2

In [None]:
# Hold out a shuffled validation subset of the labelled data; the remainder is
# used for training. The split is seeded so it is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=test_size, shuffle=True, random_state=random_seed
)

**HYPERPARAMETER SEARCH WITH RANDOM FOREST MODEL**

In [None]:
# Class balance of the full labelled set:
# (rows with label 0, rows with label 1, share of rows with label 1).
int((y == 0).sum()), int((y == 1).sum()), int((y == 1).sum()) / len(y)

In [None]:
# Class balance of the validation split:
# (rows with label 0, rows with label 1, share of rows with label 1).
int((y_val == 0).sum()), int((y_val == 1).sum()), int((y_val == 1).sum()) / len(y_val)

In [None]:
# Class balance of the training split:
# (rows with label 0, rows with label 1, share of rows with label 1).
int((y_train == 0).sum()), int((y_train == 1).sum()), int((y_train == 1).sum()) / len(y_train)

In [None]:
# Pool of candidate feature names (every column of the preprocessed matrix),
# kept as a set so already-selected features can be subtracted from it.
features = set(X.columns)
features

**1. Resampling the training classes (undersampling the majority class, optional oversampling of the minority class)**

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
from sklearn.utils.random import sample_without_replacement

In [None]:
# Separate the training rows by label so each class can be resampled on its own.
label_is_1 = y_train == 1
label_is_0 = y_train == 0
X_train_1, y_train_1 = X_train[label_is_1], y_train[label_is_1]
X_train_0, y_train_0 = X_train[label_is_0], y_train[label_is_0]

In [None]:
# Resampling settings.
# `ratio` is the fraction of label-0 training rows to keep, chosen so that the
# kept label-0 rows number about `ratio_scaler` times the label-1 rows.
# `oversample_ratio` is how many copies of the label-1 rows go into the final
# training set (1 = no duplication).
ratio_scaler = 3
oversample_ratio = 1
ratio = (ratio_scaler * len(X_train_1)) / len(X_train_0)
# train_test_split only accepts a float train_size strictly between 0 and 1,
# so clamp at 1.0 as well as above it.
ratio = 0.999 if ratio >= 1.0 else ratio

In [None]:
# Keep a random `ratio` fraction of the label-0 rows (undersampling). The draw
# is seeded so the same subset is selected on every run.
X_train_0, _, y_train_0, _ = train_test_split(
    X_train_0, y_train_0, train_size=ratio, random_state=random_seed
)

In [None]:
# Rebuild the training set: the kept label-0 rows followed by
# `oversample_ratio` copies of the label-1 rows.
X_train = pd.concat([X_train_0, *([X_train_1] * oversample_ratio)])
y_train = pd.concat([y_train_0, *([y_train_1] * oversample_ratio)])

In [None]:
# X_train = pd.concat([X_train_0, X_train_1])
# y_train = pd.concat([y_train_0, y_train_1])

In [None]:
# Shuffle features and labels together (seeded), since the concatenation above
# leaves the rows grouped by class.
X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

In [None]:
# Oversample the minority class with SMOTE so that both classes have the same size. The model trained with this step had lower performance.
# sm1 = SMOTE(random_state=random_seed)
# X_train, y_train = sm1.fit_resample(X_train, y_train)
# sm2 = SMOTE(random_state=random_seed)
# X_val, y_val = sm2.fit_resample(X_val, y_val)

**2. Feature selection and train classifier models**

**a. Train models with full features**

In [None]:
# Alternative model factories that were tried (kept for reference):
# get_model = lambda: RandomForestClassifier(n_estimators=200, random_state=random_seed)
# get_model = lambda: DecisionTreeClassifier(max_depth=100, random_state=random_seed)
# get_model = lambda: AdaBoostClassifier(n_estimators=600, random_state=random_seed)
# get_model = lambda: KNeighborsClassifier()
# get_model = lambda: SVC(probability=True)
# get_model = lambda: MLPClassifier(hidden_layer_sizes=500, max_iter=1000, verbose=False, learning_rate_init=1e-4, learning_rate="adaptive", early_stopping=True)
def get_model():
    """Return a new, unfitted gradient boosting classifier with the chosen settings."""
    return GradientBoostingClassifier(
        n_estimators=150,
        max_depth=3,
        validation_fraction=0.01,
        random_state=1,
        learning_rate=0.05,
    )

In [None]:
# Baseline: fit the configured model on ALL available features.
base_model = get_model().fit(X_train, y_train)

# Score the validation set with the probability of the second class
# (column 1 of predict_proba) and report the ROC AUC.
val_proba = base_model.predict_proba(X_val)
y_pred = val_proba[:, 1]
auc_ = roc_auc_score(y_val, y_pred)
print(auc_)

*Plot ROC curve*

In [None]:
# get_model1 = lambda: RandomForestClassifier(n_estimators=200, random_state=random_seed)
# get_model2 = lambda: DecisionTreeClassifier(max_depth=100, random_state=random_seed)
# get_model3 = lambda: AdaBoostClassifier(n_estimators=600, random_state=random_seed)
# get_model4 = lambda: KNeighborsClassifier()
# get_model5 = lambda: GradientBoostingClassifier(n_estimators=150, max_depth=3, validation_fraction=0.01, random_state=1, learning_rate=0.05)
# get_model6 = lambda: MLPClassifier(hidden_layer_sizes=500, max_iter=1000, verbose=False, learning_rate_init=1e-4, learning_rate="adaptive", early_stopping=True)


# #random forest
# rf_model = get_model1()
# rf_model.fit(X_train, y_train)
# y_pred_rf = rf_model.predict_proba(X_val)[:,1]
# auc_rf = roc_auc_score(y_val, y_pred_rf)

# #decision tree
# dt_model = get_model2()
# dt_model.fit(X_train, y_train)
# y_pred_dt = dt_model.predict_proba(X_val)[:,1]
# auc_dt = roc_auc_score(y_val, y_pred_dt)

# #adaboost
# ada_model = get_model3()
# ada_model.fit(X_train, y_train)
# y_pred_ada = ada_model.predict_proba(X_val)[:,1]
# auc_ada = roc_auc_score(y_val, y_pred_ada)

# #Kneighbor
# kn_model = get_model4()
# kn_model.fit(X_train, y_train)
# y_pred_kn = kn_model.predict_proba(X_val)[:,1]
# auc_kn = roc_auc_score(y_val, y_pred_kn)

# #gradient boosting
# gb_model = get_model5()
# gb_model.fit(X_train, y_train)
# y_pred_gb = gb_model.predict_proba(X_val)[:,1]
# auc_gb = roc_auc_score(y_val, y_pred_gb)

# #MLP
# mlp_model = get_model6()
# mlp_model.fit(X_train, y_train)
# y_pred_mlp = mlp_model.predict_proba(X_val)[:,1]
# auc_mlp = roc_auc_score(y_val, y_pred_mlp)

# #plot ROC with AUC of each models
# from sklearn.metrics import plot_roc_curve

# # Define the models and their names
# models = [rf_model, dt_model, ada_model, kn_model, gb_model, mlp_model]
# model_names = ["Random Forest", "Decision Tree", "AdaBoost", "KNN", "Gradient Boosting", "MLP"]
# auc_scores = [auc_rf, auc_dt, auc_ada, auc_kn, auc_gb, auc_mlp]

# # Plot the ROC curves
# fig, ax = plt.subplots()
# for i, model in enumerate(models):
#     plot_roc_curve(model, X_val, y_val, ax=ax, name=model_names[i])

# # Add labels and title
# ax.set_xlabel("False Positive Rate")
# ax.set_ylabel("True Positive Rate")
# ax.set_title("ROC Curves of Different Models")

# # Add the AUC scores to the legend
# ax.legend(loc="lower right")
# for i, model in enumerate(models):
#     ax.text(1.0, 0.9-i*0.1, f"{model_names[i]} (AUC={auc_scores[i]:.3f})", transform=ax.transAxes, ha="right")

# # Show the plot
# plt.show()

**b. Feature selection**

In [None]:
# Randomized greedy forward feature selection.
# Each iteration draws `select_k` not-yet-selected feature(s) at random, fits a
# fresh model from get_model() on the selected features plus the candidates,
# and keeps the candidates only if the validation ROC AUC beats the best so far.

# features_selected_list = ["age", "cconf", "employment"]
features_selected_list = []
select_k = 1
auc = 0          # best validation AUC seen so far
max_iter = 50

prev_selected = None  # not referenced anywhere in this cell

# A dedicated, seeded generator makes the sequence of candidates reproducible.
rng = random.Random(random_seed)

# The context manager closes the progress bar even when the loop exits early.
with tqdm(total=max_iter) as pbar:
    for it in range(max_iter):
        # Sort the remaining names: set iteration order is not stable between
        # runs, which would defeat the seeded sampling below.
        available_feature = sorted(features - set(features_selected_list))
        if not available_feature:
            break

        # Never ask for more candidates than are left.
        new_feature = rng.sample(
            available_feature, k=min(select_k, len(available_feature))
        )
        selected_features = list(features_selected_list) + new_feature

        X_train_feature = X_train.loc[:, selected_features]
        X_val_feature = X_val.loc[:, selected_features]

        # Fit the configured model on the candidate feature subset.
        model = get_model()
        model.fit(X_train_feature, y_train)

        # Evaluate ROC AUC on the validation set.
        y_pred = model.predict_proba(X_val_feature)[:, 1]
        auc_ = roc_auc_score(y_val, y_pred)

        if auc_ > auc:
            features_selected_list.extend(new_feature)
            print(f"Feature '{new_feature}' is selected. AUC={auc_}")
            auc = auc_

        pbar.update(1)

print(f"Final AUC={auc}")

In [None]:
# Display the features kept by the selection loop.
features_selected_list

**c. Train models with selected features**

In [None]:
# Refit the configured model using only the features in features_selected_list.
X_train_feature = X_train[features_selected_list]
X_val_feature = X_val[features_selected_list]

model = get_model().fit(X_train_feature, y_train)

# Validation ROC AUC from the probability of the second class
# (column 1 of predict_proba); shown as the cell output.
val_proba = model.predict_proba(X_val_feature)
y_pred = val_proba[:, 1]
auc_ = roc_auc_score(y_val, y_pred)
auc_

In [None]:
# model.get_depth()

*Plot ROC curve*

In [None]:
# get_model1 = lambda: RandomForestClassifier(n_estimators=200, random_state=random_seed)
# get_model2 = lambda: DecisionTreeClassifier(max_depth=100, random_state=random_seed)
# get_model3 = lambda: AdaBoostClassifier(n_estimators=600, random_state=random_seed)
# get_model4 = lambda: KNeighborsClassifier()
# get_model5 = lambda: GradientBoostingClassifier(n_estimators=150, max_depth=3, validation_fraction=0.01, random_state=1, learning_rate=0.05)
# get_model6 = lambda: MLPClassifier(hidden_layer_sizes=500, max_iter=1000, verbose=False, learning_rate_init=1e-4, learning_rate="adaptive", early_stopping=True)


# #random forest
# rf_model = get_model1()
# rf_model.fit(X_train_feature, y_train)
# y_pred_rf = rf_model.predict_proba(X_val_feature)[:,1]
# auc_rf = roc_auc_score(y_val, y_pred_rf)

# #decision tree
# dt_model = get_model2()
# dt_model.fit(X_train_feature, y_train)
# y_pred_dt = dt_model.predict_proba(X_val_feature)[:,1]
# auc_dt = roc_auc_score(y_val, y_pred_dt)

# #adaboost
# ada_model = get_model3()
# ada_model.fit(X_train_feature, y_train)
# y_pred_ada = ada_model.predict_proba(X_val_feature)[:,1]
# auc_ada = roc_auc_score(y_val, y_pred_ada)

# #Kneighbor
# kn_model = get_model4()
# kn_model.fit(X_train_feature, y_train)
# y_pred_kn = kn_model.predict_proba(X_val_feature)[:,1]
# auc_kn = roc_auc_score(y_val, y_pred_kn)

# #gradient boosting
# gb_model = get_model5()
# gb_model.fit(X_train_feature, y_train)
# y_pred_gb = gb_model.predict_proba(X_val_feature)[:,1]
# auc_gb = roc_auc_score(y_val, y_pred_gb)

# #MLP
# mlp_model = get_model6()
# mlp_model.fit(X_train_feature, y_train)
# y_pred_mlp = mlp_model.predict_proba(X_val_feature)[:,1]
# auc_mlp = roc_auc_score(y_val, y_pred_mlp)

# #plot ROC with AUC of each models
# from sklearn.metrics import plot_roc_curve

# # Define the models and their names
# models = [rf_model, dt_model, ada_model, kn_model, gb_model, mlp_model]
# model_names = ["Random Forest", "Decision Tree", "AdaBoost", "KNN", "Gradient Boosting", "MLP"]
# auc_scores = [auc_rf, auc_dt, auc_ada, auc_kn, auc_gb, auc_mlp]

# # Plot the ROC curves
# fig, ax = plt.subplots()
# for i, model in enumerate(models):
#     plot_roc_curve(model, X_val_feature, y_val, ax=ax, name=model_names[i])

# # Add labels and title
# ax.set_xlabel("False Positive Rate")
# ax.set_ylabel("True Positive Rate")
# ax.set_title("ROC Curves of Different Models")

# # Add the AUC scores to the legend
# ax.legend(loc="lower right")
# for i, model in enumerate(models):
#     ax.text(1.0, 0.9-i*0.1, f"{model_names[i]} (AUC={auc_scores[i]:.3f})", transform=ax.transAxes, ha="right")

# # Show the plot
# plt.show()

**USE TRAINED MODEL**

In [None]:
# Load the unlabelled test data and preprocess it with the objects fitted on the
# training data. The ids are kept so they can be written next to the predictions.
# NOTE: the name `id` shadows the Python builtin for the rest of the notebook.
df_test = pd.read_csv('test.csv')

id, X_test, _ = preprocessing(df_test, transforms=transforms, has_label=False, return_id=True)

In [None]:
# Columns present in the training matrix but absent from the test matrix
# (e.g. a category value that never occurs in the test file).
missing_cols = set(X_train.columns) - set(X_test.columns)

# Align the test matrix with the training matrix: same columns, same order.
# Missing columns are created and filled with 0; columns that exist only in the
# test data are dropped, since the fitted models never saw them.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Class predictions of the all-features model on the test data, followed by the
# mean predicted label (the predicted share of label 1 when labels are 0/1).
y_test_pred = base_model.predict(X_test)
y_test_pred.mean()

In [None]:
# Class predictions of the selected-features model on the test data, followed by
# the mean predicted label (the predicted share of label 1 when labels are 0/1).
X_test_selected = X_test[features_selected_list]
y_test_pred = model.predict(X_test_selected)
y_test_pred.mean()

In [None]:
# Final submission scores: probability of the second class (column 1 of
# predict_proba) from the selected-features model.
# Alternative using the all-features model:
# # y_test_pred = base_model.predict_proba(X_test)[:,1]
test_proba = model.predict_proba(X_test[features_selected_list])
y_test_pred = test_proba[:, 1]

In [None]:
# Deliberately raises to halt execution at this point — presumably so that a
# "Run All" does not reach the CSV export cell below; confirm intent.
raise InterruptedError

**CREATE CSV FILE WITH ID & PREDICTIONS**

In [None]:
# Write one "id,outcome" row per test record.
# The loop variable is named `pred` rather than `y` so that it does not
# overwrite the notebook-level training labels `y`.
with open("test_result_gbc_v14.csv", "w") as f:
    f.write("id,outcome\n")
    for id_, pred in zip(id, y_test_pred):
        f.write(f"{id_},{pred}\n")
