# ML Assignment 1

Data set 1 - Bank Account Fraud Dataset \
https://www.kaggle.com/datasets/sgpjesus/bank-account-fraud-dataset-neurips-2022 \
Data set 2 - Customer Segmentation \
https://www.kaggle.com/datasets/abisheksudarshan/customer-segmentation?select=train.csv


In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import tree
from datetime import datetime
import seaborn
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from statistics import mean
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from time import process_time
import itertools 

# Load data

In [None]:
def load_data(path, nrows = 5000):
    """Read the first ``nrows`` rows of a CSV file into a DataFrame.

    Parameters
    ----------
    path : str, path-like or file-like
        Source handed directly to ``pandas.read_csv``.
    nrows : int, default 5000
        Maximum number of data rows to read.

    Returns
    -------
    pandas.DataFrame
        The rows as read; no cleaning, renaming or column dropping is done.
    """
    return pd.read_csv(path, nrows=nrows)

# df = load_data('Base.csv')
# display(df)

In [None]:
# check missing value in each column
# nacol = pd.DataFrame(df.isnull().sum().sort_values(ascending=False)/len(df))
# nacol = nacol[nacol[0]>0]
# nacol

# Train test split

In [9]:
def train_valid_test_split(df, target = 'fraud_bool'):
    """Split ``df`` into stratified train, validation and test partitions.

    The frame is first cut 80/20 into a development part and a test part;
    the development part is then cut 80/20 again into train and validation.
    Both cuts are stratified on the target and use ``random_state=0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the ``target`` column.
    target : str, default 'fraud_bool'
        Name of the label column.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    features = df.drop(target, axis=1)
    labels = df[target]

    # First cut: hold out 20% of the rows as the test set.
    X_dev, X_test, y_dev, y_test = train_test_split(
        features, labels, train_size=0.8, stratify=labels, random_state=0)

    # Second cut: 80% of the remaining rows train, the rest validate.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_dev, y_dev, train_size=0.8, stratify=y_dev, random_state=0)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

def train_valid_test_split2(df, target = 'Segmentation'):
    """Stratified train / validation / test split keyed on ``target``.

    Two successive 80/20 stratified splits (both with ``random_state=0``)
    are applied: the first sets the test rows aside, the second divides
    what is left into training and validation rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the ``target`` column.
    target : str, default 'Segmentation'
        Name of the label column.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    def _split_80_20(features, labels):
        # One stratified 80/20 cut with a fixed seed.
        return train_test_split(features, labels, stratify=labels,
                                train_size=0.8, random_state=0)

    X_rest, X_test, y_rest, y_test = _split_80_20(df.drop(target, axis=1), df[target])
    X_train, X_valid, y_train, y_valid = _split_80_20(X_rest, y_rest)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

# Feature engineering

In [None]:
# def impute_missing(X_train, X_valid, X_test):
#     from sklearn.impute import KNNImputer
#     numcols = X_train.select_dtypes(include=['float','int']).columns
#     imputer = KNNImputer(n_neighbors=3, weights="uniform")
#     imputer.fit(X_train[numcols])
#     X_train[numcols] = pd.DataFrame(imputer.transform(X_train[numcols]), columns=numcols)
#     X_valid[numcols] = pd.DataFrame(imputer.transform(X_train[numcols]), columns=numcols)
#     X_test[numcols] = pd.DataFrame(imputer.transform(X_train[numcols]), columns=numcols)
    
#     return X_train, X_valid, X_test

In [None]:
# knn impute numeric columns and one hot encode category columns
def ohe(X_train, X_valid, X_test):
    """KNN-impute numeric columns, then one-hot encode the three splits.

    The imputer is fitted on the training split only and then applied to
    each split's own rows. After encoding, the validation and test frames
    are aligned to the training frame's columns: dummy columns that only
    occur in validation/test are dropped, and dummy columns missing from
    them are added and filled with 0.

    Parameters
    ----------
    X_train, X_valid, X_test : pandas.DataFrame
        Feature frames for the three splits. They are not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_valid, X_test)`` imputed, encoded and sharing the
        training frame's column layout.
    """
    # Work on copies so the caller's frames are left untouched.
    X_train, X_valid, X_test = X_train.copy(), X_valid.copy(), X_test.copy()

    # impute numeric columns
    numcols = X_train.select_dtypes(include=['float','int']).columns
    if len(numcols) > 0:  # nothing to fit when every column is categorical
        imputer = KNNImputer(n_neighbors=3, weights="uniform")
        imputer.fit(X_train[numcols])
        for frame in (X_train, X_valid, X_test):
            # Transform each split's OWN rows. Assigning the raw array places
            # values by position; wrapping it in a fresh DataFrame would make
            # pandas align on a 0..n-1 index that does not match the shuffled
            # split index.
            frame[numcols] = imputer.transform(frame[numcols])

    # one-hot encode categorical columns
    train_enc = pd.get_dummies(X_train)
    valid_enc = pd.get_dummies(X_valid)
    test_enc = pd.get_dummies(X_test)

    # join='left' keeps exactly the training columns in the other two frames.
    _, valid_enc = train_enc.align(valid_enc, join='left', axis=1)
    train_enc, test_enc = train_enc.align(test_enc, join='left', axis=1)

    # Columns introduced by the alignment are all-NaN; a category that is
    # absent from a split is encoded as 0.
    return train_enc.fillna(0), valid_enc.fillna(0), test_enc.fillna(0)

# X_train, X_valid, X_test = ohe(X_train, X_valid, X_test)

In [None]:
def drop_catcols(X_train, X_valid, X_test):
    """Remove object-dtype columns from all three splits, in place.

    The columns to drop are determined from ``X_train`` only and the same
    column names are then dropped from every split.

    Parameters
    ----------
    X_train, X_valid, X_test : pandas.DataFrame
        Feature frames; each one is modified in place.

    Returns
    -------
    tuple of pandas.DataFrame
        The same three (now modified) objects, ``(X_train, X_valid, X_test)``.
    """
    object_columns = X_train.select_dtypes(include=['object']).columns
    for split in (X_train, X_valid, X_test):
        split.drop(columns=object_columns, inplace=True)
    return X_train, X_valid, X_test

In [None]:
def oversampling(X_train, y_train):
    """Randomly over-sample the training data with ``RandomOverSampler``.

    The sampler is created with ``sampling_strategy='auto'`` and
    ``random_state=0``, so repeated calls on the same input give the same
    resampled output.

    Parameters
    ----------
    X_train : array-like or pandas.DataFrame
        Training features.
    y_train : array-like or pandas.Series
        Training labels.

    Returns
    -------
    tuple
        ``(X_train_rs, y_train_rs)`` as returned by ``fit_resample``.
    """
    # Only the sampler actually used is imported.
    from imblearn.over_sampling import RandomOverSampler

    sampler = RandomOverSampler(sampling_strategy='auto', random_state=0)
    X_train_rs, y_train_rs = sampler.fit_resample(X_train, y_train)
    return X_train_rs, y_train_rs

# Modelling function

In [None]:
def model(model, X_train, y_train, X_valid, y_valid):
    """Fit a binary classifier and report its scores as a one-row DataFrame.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. When it also exposes
        ``predict_proba`` or ``decision_function``, those continuous scores
        are used for the ROC AUC columns.
    X_train, y_train
        Data the estimator is fitted on (also used for the 5-fold
        cross-validation score).
    X_valid, y_valid
        Held-out data used for the accuracy, recall, F1 and ``test_roc``
        columns.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``acc``, ``cv_score``, ``train_roc``,
        ``test_roc``, ``recall_score``, ``f1_score`` and ``run_time``.
        ``run_time`` is the CPU time of fitting plus label prediction on
        both sets; scores are rounded to 3 decimals, ``run_time`` to 5.
    """
    from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score

    def _ranking_scores(estimator, X, hard_labels):
        # ROC AUC is a ranking metric: feed it continuous scores when the
        # estimator can produce them, and fall back to the hard labels
        # otherwise.
        if hasattr(estimator, "predict_proba"):
            # Column 1 is the probability of the greater class label, which
            # is the class roc_auc_score treats as positive.
            return estimator.predict_proba(X)[:, 1]
        if hasattr(estimator, "decision_function"):
            return estimator.decision_function(X)
        return hard_labels

    # Timed region is limited to fit + predict so run_time keeps its meaning.
    t_start = process_time()
    model = model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    valid_pred = model.predict(X_valid)
    run_time = process_time() - t_start

    # calculate score (results get their own names so the imported metric
    # functions are not shadowed)
    train_roc = roc_auc_score(y_train, _ranking_scores(model, X_train, train_pred))
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    valid_roc = roc_auc_score(y_valid, _ranking_scores(model, X_valid, valid_pred))
    f1 = f1_score(y_valid, valid_pred, average='weighted')
    acc = accuracy_score(y_valid, valid_pred)
    recall = recall_score(y_valid, valid_pred, average='weighted')

    perf = pd.DataFrame({'acc':[round(acc,3)],
                         'cv_score':[round(mean(cv_scores),3)],
                         'train_roc':[round(train_roc,3)],
                         'test_roc':[round(valid_roc,3)],
                         'recall_score':[round(recall,3)],
                         'f1_score':[round(f1,3)],
                         'run_time':[round(run_time,5)]
                        })
    return perf

In [8]:
def model2(model, X_train, y_train, X_valid, y_valid):
    """Fit a multi-class classifier and report its scores as a one-row frame.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train
        Data the estimator is fitted on (also used for the 5-fold
        cross-validation score).
    X_valid, y_valid
        Held-out data used for the accuracy, recall, F1 and ``test_roc``
        columns.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``acc``, ``cv_score``, ``train_roc``,
        ``test_roc``, ``recall_score``, ``f1_score`` and ``run_time``, each
        rounded to 3 decimals. The ROC AUC values are one-vs-rest,
        weighted averages computed from ``predict_proba``.
    """
    from sklearn import metrics as skm

    # Timed region: fitting plus label and probability prediction.
    started = process_time()
    model = model.fit(X_train, y_train)
    # The training-set labels are not used below; the call is kept so the
    # timed region does the same work as before.
    train_labels = pd.DataFrame(model.predict(X_train))
    valid_labels = pd.DataFrame(model.predict(X_valid))
    train_proba = model.predict_proba(X_train)
    valid_proba = model.predict_proba(X_valid)
    elapsed = process_time() - started

    auc_options = {'multi_class': 'ovr', 'average': 'weighted'}
    train_auc = skm.roc_auc_score(y_train, train_proba, **auc_options)
    fold_scores = cross_val_score(model, X_train, y_train, cv=5)
    valid_auc = skm.roc_auc_score(y_valid, valid_proba, **auc_options)
    weighted_f1 = skm.f1_score(y_valid, valid_labels, average='weighted')
    accuracy = skm.accuracy_score(y_valid, valid_labels)
    weighted_recall = skm.recall_score(y_valid, valid_labels, average='weighted')

    raw = {
        'acc': accuracy,
        'cv_score': mean(fold_scores),
        'train_roc': train_auc,
        'test_roc': valid_auc,
        'recall_score': weighted_recall,
        'f1_score': weighted_f1,
        'run_time': elapsed,
    }
    return pd.DataFrame({name: [round(value, 3)] for name, value in raw.items()})

In [None]:
# from sklearn import tree
# from sklearn.model_selection import cross_val_score

# clf = tree.DecisionTreeClassifier()
# clf = clf.fit(X_train, y_train)
# print(cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5))
# pred = clf.predict(X_valid)
# pd.crosstab(pred,y_valid)
# # pred
# from sklearn.metrics import roc_curve,roc_auc_score, auc
# print(roc_auc_score(y_train,clf.predict(X_train)))
# print(roc_auc_score(y_valid,clf.predict(X_valid)))
# print(roc_auc_score(y_test,clf.predict(X_test)))

# Graph function

In [3]:
def learning_curve_graph(perf_out):
    """Plot recall, train/test ROC and F1 against the training-set ratio.

    Parameters
    ----------
    perf_out : pandas.DataFrame or mapping
        Must provide the keys ``train_ratio``, ``recall_score``,
        ``train_roc``, ``test_roc`` and ``f1_score``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    train_sizes = perf_out['train_ratio']
    # One line per metric, labelled with the metric's own column name.
    for metric in ('recall_score', 'train_roc', 'test_roc', 'f1_score'):
        plt.plot(train_sizes, perf_out[metric], label=metric)
    plt.xlabel("train data size")
    plt.ylabel("score")
    plt.legend()
    plt.show()

In [None]:
def importance_graph(clf, X_train, top_n=10):
    """Draw a horizontal bar plot of the most important features.

    Parameters
    ----------
    clf : estimator
        Fitted model exposing ``feature_importances_``.
    X_train : pandas.DataFrame
        Frame whose columns name the features, in the same order as
        ``clf.feature_importances_``.
    top_n : int, default 10
        Number of highest-importance features to plot.

    Notes
    -----
    Calls ``sns.set`` with a 25x15 figure size, which changes seaborn's
    global plotting configuration. Nothing is returned.
    """
    import seaborn as sns

    importances = pd.DataFrame({'features': X_train.columns,
                                'importance': np.round(clf.feature_importances_, 3)})
    # Sort once, highest importance first, so head() picks the top features.
    importances = importances.sort_values('importance', ascending=False)

    sns.set(rc={'figure.figsize':(25,15)})
    sns.barplot(x='importance', y='features', data=importances.head(top_n))

In [1]:
def tree_graph(clf, X_train, y_train):
    """Plot a decision tree with feature and class names.

    Parameters
    ----------
    clf : estimator
        Decision tree passed to ``tree.plot_tree``. It is not fitted here,
        so it must already be fitted by the caller.
    X_train : pandas.DataFrame
        Frame whose column names label the split features.
    y_train : pandas.Series
        Labels whose distinct values provide the class names.

    A 25x20 figure is created on the current matplotlib state; nothing is
    returned.
    """
    plt.figure(figsize=(25,20))

    # plot_tree matches class_names to the classes in ascending order, while
    # Series.unique() yields order of first appearance, so sort before
    # converting to strings.
    class_names = [str(label) for label in sorted(y_train.unique())]

    # Plain lists are passed for both name arguments.
    tree.plot_tree(clf,
                   feature_names=list(X_train.columns),
                   class_names=class_names,
                   filled=True)