In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !pip install category_encoders

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import category_encoders as ce

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score, classification_report, confusion_matrix, balanced_accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# df = pd.read_csv("/content/drive/MyDrive/ML/BNP/train.csv.zip", compression="zip")
# train, test = train_test_split(df, train_size=0.7)

In [None]:
# Load the zipped training CSV and hold out 30% of the rows for evaluation.
# A fixed random_state makes the split -- and therefore every downstream
# metric -- reproducible across kernel restarts; without it each run is scored
# on a different random hold-out set.
df = pd.read_csv("./dataset/train.csv.zip", compression="zip")
train, test = train_test_split(df, train_size=0.7, random_state=42)

In [None]:
# Function to convert to hexavigesimal base
def az_to_int(az, nanVal=None):
    """Read a letter string as a base-26 number ('a' = 1, ..., 'z' = 26).

    Letters are case-insensitive and the leftmost letter is the most
    significant digit, so "a" -> 1, "z" -> 26 and "aa" -> 27.

    Parameters
    ----------
    az : str or NaN
        Value to convert. A value that does not compare equal to itself
        (i.e. NaN) is treated as missing.
    nanVal : optional
        Replacement returned for a missing ``az``. When left as ``None`` the
        missing value is handed back unchanged.

    Returns
    -------
    The integer encoding for a string input; ``nanVal`` (or the original
    missing value) otherwise.
    """
    if az != az:  # only NaN compares unequal to itself
        return az if nanVal is None else nanVal

    # Horner's scheme: shift the running total one base-26 digit to the left,
    # then add the value of the next letter.
    total = 0
    for letter in az:
        total = total * 26 + (ord(letter.lower()) - ord('a') + 1)
    return total

In [None]:
def clean_data(df, cat_cols):
    """Return a cleaned copy of ``df`` ready for categorical encoding.

    Steps:
      1. drop the ``ID`` column;
      2. convert ``v22`` element-wise with ``az_to_int``;
      3. replace missing values in every column listed in ``cat_cols`` with
         the sentinel string ``"__MISS__"``.

    The input frame is not modified; callers must use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``ID``, ``v22`` and every column in ``cat_cols``.
    cat_cols : iterable of str
        Names of the columns whose missing values are filled.

    Returns
    -------
    pandas.DataFrame
        New frame with the steps above applied.
    """
    # drop() without inplace=True returns a new frame, so the caller's data
    # (typically a slice produced by train_test_split) is never mutated and
    # no SettingWithCopyWarning is triggered.
    cleaned = df.drop(columns='ID')
    cleaned['v22'] = cleaned['v22'].apply(az_to_int)
    for cat_col in cat_cols:
        # Assign the filled column back explicitly. The chained form
        # `df[col].fillna(..., inplace=True)` operates on a temporary Series
        # and silently does nothing under pandas copy-on-write.
        cleaned[cat_col] = cleaned[cat_col].fillna("__MISS__")
    return cleaned

In [None]:
# Columns stored with object dtype in the raw frame are treated as the
# categorical features.
cat_cols = list(df.select_dtypes(include=['object']).columns)
# Clean both partitions the same way (train first, then test).
train, test = (clean_data(part, cat_cols) for part in (train, test))

In [None]:
def dplot(data, column, target):
    """Draw two side-by-side KDE plots of ``column`` split by ``target``.

    The left panel uses the column as stored in ``data``; the right panel uses
    the same column after its missing values are replaced by the column
    median, which shows how median imputation changes the per-class
    distribution.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``column`` and ``target``.
    column : str
        Name of the feature to plot.
    target : str
        Name of the column used for the hue grouping.
    """
    median_filled = data[column].fillna(data[column].median())
    _, (ax_raw, ax_filled) = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
    sns.kdeplot(data=data, x=column, hue=target, ax=ax_raw)
    sns.kdeplot(median_filled, hue=data[target], ax=ax_filled)

In [None]:
def catplot(data, column, target):
    """Print, per ``target`` class, the share of rows in each ``column`` level.

    A contingency table of ``column`` (rows) against ``target`` (columns) is
    built and every target column is divided by its own total, so each printed
    column sums to 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``column`` and ``target``.
    column : str
        Name of the categorical feature.
    target : str
        Name of the label column.
    """
    counts = pd.crosstab(data[column], data[target])
    # Column totals are matched to the table's columns by label.
    shares = counts.div(counts.sum(axis=0), axis='columns')
    print(shares)

In [None]:
# Split the label column off each partition; pop() also removes it from the
# feature frame in place, so train/test hold features only afterwards.
y_train, y_test = train.pop('target'), test.pop('target')

In [None]:
# Target-encode the categorical columns. The encoder is fitted on the
# training features together with the training labels only.
cat_encoder = ce.TargetEncoder(cols=cat_cols, smoothing=100)
train = cat_encoder.fit_transform(train, y=y_train)
# Held-out rows are encoded with the already-fitted encoder and WITHOUT their
# labels: evaluation targets must never be handed to the feature pipeline
# (category_encoders documents y=None as the way to transform a test set).
test = cat_encoder.transform(test)

In [None]:
# imputer = KNNImputer(weights="distance", copy=False)
# imputer.fit_transform(train)

In [None]:
# Fill the remaining gaps with per-column medians learned from the training
# split, then apply those same medians to the test split.
simple_imputer = SimpleImputer(copy=False, strategy="median")
simple_imputer.fit(train)
train, test = simple_imputer.transform(train), simple_imputer.transform(test)

In [None]:
def plot_prob_dist(yhat_prob, y):
    """Plot the density of the predicted scores in column 1, split by label.

    Parameters
    ----------
    yhat_prob : 2-D array
        Per-class scores; only ``yhat_prob[:, 1]`` is plotted.
    y : array-like
        Labels used to colour the densities, one per row of ``yhat_prob``.
    """
    plt.figure()
    second_class_scores = yhat_prob[:, 1]
    frame = pd.DataFrame({"yhat_prob_test": second_class_scores, "y_test": y})
    sns.kdeplot(x=frame.yhat_prob_test, hue=frame.y_test)
    plt.show()
    
def print_classification_report(clf, test, y_test):
    """Print hold-out metrics for a fitted classifier and plot its scores.

    Printed, in order: accuracy, F1, ROC AUC, the confusion matrix and the
    full classification report. Finally the distribution of the predicted
    probabilities is plotted per true label via ``plot_prob_dist``.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``predict`` and ``predict_proba``.
    test : array-like
        Feature matrix to score.
    y_test : array-like
        True labels for ``test``.
    """
    yhat_test = clf.predict(test)
    yhat_prob_test = clf.predict_proba(test)
    separator = "********************************************************"
    print(separator)
    print("Accuracy on Test data ", accuracy_score(y_test, yhat_test))
    print(separator)
    print("F1 on Test data ", f1_score(y_test, yhat_test))
    print(separator)
    # ROC AUC measures how well the model *ranks* samples, so it must be fed
    # the continuous score for the second class (column 1, the same column
    # plot_prob_dist uses). Passing hard 0/1 predictions collapses the curve
    # to a single operating point.
    print("AUC ROC on Test data ", roc_auc_score(y_test, yhat_prob_test[:, 1]))
    print(separator)
    print("Confusion matrix \n", confusion_matrix(y_test, yhat_test))
    print(separator)
    print(classification_report(y_test, yhat_test))
    print(separator)
    plot_prob_dist(yhat_prob_test, y_test)

## KNN 

In [None]:
# Standardise the features before the nearest-neighbour search. The scaling
# statistics are learned from the training split only.
scaler = StandardScaler().fit(train)
train_nn = scaler.transform(train)

In [None]:
# Candidate neighbourhood sizes for the KNN grid search.
n_neighbors = [5, 10, 15, 20, 25]
params = {"n_neighbors": n_neighbors}
# "roc_auc" is scikit-learn's built-in scorer and evaluates the classifier's
# continuous scores. make_scorer(roc_auc_score) would instead score the hard
# predict() labels, which is not a real ROC AUC -- and this is the metric the
# search refits on. The dict keys are unchanged, so cv_results_ still exposes
# mean_{train,test}_AUC / _Accuracy / _F1 Score.
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score), "F1 Score": make_scorer(f1_score)}
clf = KNeighborsClassifier(n_jobs=-2)
# 5-fold search; the model with the best mean validation AUC is refit on the
# full training data and exposed as gs.best_estimator_.
gs = GridSearchCV(clf, cv=5, n_jobs=1, param_grid=params, scoring=scoring, return_train_score=True, refit="AUC")
gs.fit(train_nn, y_train)

In [None]:
# One panel per metric, each showing the mean train and mean validation score
# from the grid search against the number of neighbours.
fig = make_subplots(rows=1, cols=3, subplot_titles=("Accuracy vs Num neighbors", "F1 score vs Num neighbors",
                                                    "AUC ROC vs Num neighbors"))
metric_panels = [
    # (key suffix in gs.cv_results_, legend prefix, y-axis title)
    ("Accuracy", "Accuracy", "Accuracy"),
    ("F1 Score", "f1 Score", "F1 Score"),
    ("AUC", "AUC ROC", "AUC ROC"),
]
for panel_col, (result_key, legend_label, axis_title) in enumerate(metric_panels, start=1):
    # GridSearchCV reports held-out fold scores under "test"; they are
    # validation scores here, hence the legend wording.
    for split_key, split_label in (("train", "Train"), ("test", "Validation")):
        fig.add_trace(
            go.Scatter(x=n_neighbors, y=gs.cv_results_[f"mean_{split_key}_{result_key}"], mode='lines',
                       # The legend name is derived from the metric, so the
                       # accuracy curves are no longer labelled "AUC ROC".
                       name=f"{legend_label} on {split_label} Data"),
            row=1, col=panel_col)
    # Categorical x-axis: evenly spaced ticks at the tried neighbour counts.
    fig.update_xaxes(title_text="Num neighbors", row=1, col=panel_col, type='category')
    fig.update_yaxes(title_text=axis_title, row=1, col=panel_col)
# update layout
fig.update_layout(title='Model Performance wrt Num neighbors')
fig.show()

In [None]:
# Scale the hold-out features with the statistics fitted on the training
# split, then report the refit best KNN model's performance on them.
test_nn = scaler.transform(test)
print_classification_report(clf=gs.best_estimator_, test=test_nn, y_test=y_test)