In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils.utils import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, plot_roc_curve

## Load data

In [None]:
# Directory holding the input CSVs. The trailing slash matters because file
# names are appended to this string directly.
file_path = r'/home/jovyan/descartes_tech_interview/Data/'
train_num, test_num = (pd.read_csv(f'{file_path}{csv_name}')
                       for csv_name in ('train_num.csv', 'test_num.csv'))

In [None]:
# Preview only the first rows rather than rendering the whole frame.
train_num.head()

In [None]:
# Preview only the first rows rather than rendering the whole frame.
test_num.head()

In [None]:
# Labels keep the INDEX column alongside TARGET_FLAG; features are every
# column except TARGET_FLAG (INDEX is still present here and dropped later).
train_y = train_num.loc[:, ['INDEX', 'TARGET_FLAG']]
train_x = train_num.drop(columns='TARGET_FLAG')

In [None]:
# Take an explicit copy so that later column writes on test_y operate on an
# independent frame instead of a slice of test_num (avoids pandas'
# SettingWithCopyWarning and any ambiguity about which frame is modified).
test_y = test_num[['INDEX', 'TARGET_FLAG']].copy()
# Features are every column except TARGET_FLAG (INDEX is dropped later).
test_x = test_num.drop(columns='TARGET_FLAG')

## Data exploration

### Correlation heatmap

In [None]:
# INDEX is excluded before plotting. plot_corr_heatmap comes from the
# utils.utils star import -- presumably it draws the pairwise correlation
# matrix; confirm against the helper's source.
plot_corr_heatmap(train_num.drop(columns='INDEX'))

## Data preprocessing

### Standardization

In [None]:
# Standardizer with its default settings spelled out explicitly.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Fit the scaler on the training features (INDEX removed) and scale them.
# NOTE(review): the scaler is fitted before the train/validation split below,
# so validation rows contribute to the scaling statistics.
train_x_std = scaler.fit_transform(train_x.drop(columns=['INDEX']))

In [None]:
# Scale the test features with the statistics learned from the training set.
# The scaler must NOT be re-fitted here: calling fit_transform on the test set
# would standardize it with its own mean/std, so the model would see features
# on a different scale than it was trained on (and the scaler's training fit
# would be overwritten).
test_x_std = scaler.transform(test_x.drop('INDEX', axis=1))

### Train/validation split

In [None]:
# Hold out 20% of the scaled training rows for validation; the label frame
# passed in contains only TARGET_FLAG once INDEX is dropped.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_x_std,
    train_y.drop(columns='INDEX'),
    test_size=0.2,
    random_state=0,
)

## Models, hyperparameter tuning & evaluations

### Random forest
1. Default -> accuracy ~ 0.787 <br/>
2. params = {'n_estimators': 200,<br/>
          'min_samples_split': 10,<br/>
          'min_samples_leaf': 1,<br/>
          'max_features': 'sqrt',<br/>
          'max_depth': 10} -> accuracy ~ 0.794

In [None]:
# Hand-picked random forest hyperparameters (see the notes above this cell).
params = {'n_estimators': 200,
          'min_samples_split': 10,
          'min_samples_leaf': 1,
          'max_features': 'sqrt',
          'max_depth': 10}
# Seed the forest so the reported validation metrics and the final
# predictions are reproducible across kernel restarts.
rfr = RandomForestClassifier(random_state=42, **params)
# Estimators expect a 1-D label array, hence the ravel of the one-column frame.
rfr.fit(X_train, y_train.values.ravel())

In [None]:
# Random forest class predictions on the validation split.
y_pred = rfr.predict(X=X_valid)

In [None]:
# Confusion matrix of validation labels vs. the current y_pred.
confusion_matrix(y_true=y_valid, y_pred=y_pred)

In [None]:
# Text classification report for the validation split.
print(classification_report(y_true=y_valid, y_pred=y_pred))

In [None]:
# Validation accuracy of the current y_pred.
accuracy_score(y_true=y_valid, y_pred=y_pred)

In [None]:
# ROC curve of the random forest on the validation split.
rfr_disp = plot_roc_curve(estimator=rfr, X=X_valid, y=y_valid)
plt.show()

In [None]:
# Candidate values sampled by the randomized hyperparameter search.
# 'auto' was dropped from max_features: for classifiers it is an alias of
# 'sqrt' (and is deprecated in recent scikit-learn), so listing both made half
# of the search space redundant. 'log2' gives a genuinely different option.
random_grid = {'n_estimators': [50, 100, 200],
               'max_features': ['sqrt', 'log2'],
               'max_depth': [5, 8, 10],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4]}

In [None]:
# Randomized search over random_grid: 100 sampled settings, 3-fold CV,
# using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rfr,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

In [None]:
# Run the search on the training split (labels flattened to 1-D).
rf_random.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Hyperparameter setting selected by the randomized search.
rf_random.best_params_

### KNN
1. n_neighbors=3 -> accuracy ~ 0.746 <br/>
2. n_neighbors=5 -> accuracy ~ 0.754 <br/>
3. n_neighbors=10 -> accuracy ~ 0.768 <br/>

In [None]:
# k-nearest-neighbours classifier with k = 10, fitted on the training split.
neigh = KNeighborsClassifier(n_neighbors=10)
neigh.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# KNN class predictions on the validation split (overwrites y_pred).
y_pred = neigh.predict(X=X_valid)

In [None]:
# Confusion matrix for the KNN predictions.
confusion_matrix(y_true=y_valid, y_pred=y_pred)

In [None]:
# Classification report for the KNN predictions.
print(classification_report(y_true=y_valid, y_pred=y_pred))

In [None]:
# Validation accuracy of the KNN predictions.
accuracy_score(y_true=y_valid, y_pred=y_pred)

In [None]:
# ROC curve of the KNN model on the validation split.
knn_disp = plot_roc_curve(estimator=neigh, X=X_valid, y=y_valid)
plt.show()

### SVM

In [None]:
# Support vector classifier (gamma='auto', seeded), fitted on the training split.
svm = SVC(random_state=42, gamma='auto')
svm.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# SVM class predictions on the validation split (overwrites y_pred).
y_pred = svm.predict(X=X_valid)

In [None]:
# Confusion matrix for the SVM predictions.
confusion_matrix(y_true=y_valid, y_pred=y_pred)

In [None]:
# Classification report for the SVM predictions.
print(classification_report(y_true=y_valid, y_pred=y_pred))

In [None]:
# Validation accuracy of the SVM predictions.
accuracy_score(y_true=y_valid, y_pred=y_pred)

In [None]:
# ROC curve of the SVM on the validation split.
svm_disp = plot_roc_curve(estimator=svm, X=X_valid, y=y_valid)
plt.show()

### XGBoost

In [None]:
# Gradient-boosted classifier with a binary logistic objective, fitted on the
# training split. Note that this rebinds the notebook-level `params` name.
params = dict(objective='binary:logistic')
xgbm = xgb.XGBClassifier(**params)
xgbm.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# XGBoost class predictions on the validation split (overwrites y_pred).
y_pred = xgbm.predict(X_valid)

In [None]:
# Confusion matrix for the XGBoost predictions.
confusion_matrix(y_true=y_valid, y_pred=y_pred)

In [None]:
# Classification report for the XGBoost predictions.
print(classification_report(y_true=y_valid, y_pred=y_pred))

In [None]:
# Validation accuracy of the XGBoost predictions.
accuracy_score(y_true=y_valid, y_pred=y_pred)

In [None]:
# ROC curve of the XGBoost model on the validation split.
xgbm_disp = plot_roc_curve(estimator=xgbm, X=X_valid, y=y_valid)
plt.show()

## Prediction

In [None]:
# Store the random forest's predictions for the test set in TARGET_FLAG.
# `assign` builds a new frame rather than writing into test_y in place, which
# avoids pandas' SettingWithCopyWarning when test_y is a column slice of
# test_num, and keeps the cell safe to re-run.
test_y = test_y.assign(TARGET_FLAG=rfr.predict(test_x_std))

In [None]:
# Preview only the first predicted rows rather than rendering the whole frame.
test_y.head()

In [None]:
# Write INDEX plus predicted TARGET_FLAG next to the input data, without the
# pandas row index.
test_y.to_csv(f'{file_path}prediction.csv', index=False)