# My imports

In [1]:
import os
import sys
import pandas as pd
import re
import joblib
from IPython.display import clear_output

# Kernel imports

In [2]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Model imports

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE

# Initialisation path scripts

In [4]:
# Folder (relative to the working directory) that is added to sys.path so the
# P7_* helper modules can be imported. Built with os.path.join so the
# separator matches the host OS; on Windows the value is still '..\02_SCRIPTS'.
path_to_scripts = os.path.join('..', '02_SCRIPTS')

In [5]:
# Make the helper modules importable. Guarded so that re-running this cell
# does not keep prepending the same entry to sys.path.
if path_to_scripts not in sys.path:
    sys.path.insert(0, path_to_scripts)

# Import functions

In [6]:
from P7_dataprep_function import main

In [7]:
from P7_other_functions import TrainTestGrid, \
    heatmap_print, \
    classify_with_proba

In [None]:
# Run the data-preparation entry point with test_importance=True and
# nrows=10000; in this mode the call is unpacked into three values.
# NOTE(review): the exact content of feat_importance / top_feat is defined in
# P7_dataprep_function.main — confirm there. Below, top_feat is used as the
# list of retained feature names and df as the prepared DataFrame.
feat_importance, top_feat, df = main(test_importance=True, nrows=10000)

Train samples: 10000, test samples: 10000
Bureau df shape: (2011, 108)
Process bureau and bureau_balance - done in 0s
Previous applications df shape: (9734, 242)
Process previous_applications - done in 1s
Pos-cash balance df shape: (9494, 15)
Process POS-CASH balance - done in 0s
Installments payments df shape: (8893, 26)
Process installments payments - done in 0s
Credit card balance df shape: (9520, 131)
Process credit card balance - done in 0s
Starting LightGBM. Train shape: (10000, 768), test shape: (10000, 768)
Fold  1 AUC : 0.725420
Fold  2 AUC : 0.716144
Fold  3 AUC : 0.709659
Fold  4 AUC : 0.731546
Fold  5 AUC : 0.727236
Fold  6 AUC : 0.739950
Fold  7 AUC : 0.716610
Fold  8 AUC : 0.677266


In [None]:
# Re-run the preparation with test_importance=False and nrows=100000.
# This rebinds `df`, replacing the frame returned by the previous main() call.
df = main(test_importance=False, nrows=100000)

# Univariate analysis

In [None]:
# Row position used by the plotting loop to slice `df` into a leading part
# (plotted as "Train") and a trailing part (plotted as "Test").
# NOTE(review): duplicates the nrows=100000 literal passed to main() — keep
# the two values in sync.
num_rows = 100000

In [None]:
# For every retained feature draw a 1x4 panel: box plot and histogram of the
# first `num_rows` rows (train part) and of the remaining rows (test part).
for feat in top_feat:
    # Missing values are dropped before plotting: they would otherwise blank
    # the box plot and leave the histogram's automatic range undefined.
    train_values = df[feat][:num_rows].dropna()
    test_values = df[feat][num_rows:].dropna()

    plt.figure(figsize=(20, 5))
    plt.subplot(1, 4, 1)
    plt.boxplot(train_values)
    plt.title('{}: box Train'.format(feat))
    plt.subplot(1, 4, 2)
    plt.boxplot(test_values)
    plt.title('{}: box Test'.format(feat))
    # Third panel is the TRAIN distribution; it previously repeated the test
    # histogram under a "Test" title (copy-paste slip).
    plt.subplot(1, 4, 3)
    plt.hist(train_values)
    plt.title('{}: repartition Train'.format(feat))
    plt.subplot(1, 4, 4)
    plt.hist(test_values)
    plt.title('{}: repartition Test'.format(feat))
    plt.show()

In [None]:
# Display the retained feature names.
top_feat

In [None]:
# Keep only the retained features plus the TARGET column, then display the frame.
df_modele = df[top_feat + ['TARGET']]
df_modele

In [None]:
# Split on label availability: rows with a TARGET value form the training
# frame, rows without one form the test frame.
has_target = df_modele['TARGET'].notnull()
df_modele_train = df_modele.loc[has_target]
df_modele_test = df_modele.loc[~has_target]

In [None]:
# Fill missing feature values with a 2-nearest-neighbour imputer.
# The imputer is fitted on the feature columns only: TARGET is the label, so
# it must not take part in the neighbour distances (label leakage), and it has
# no missing values in this subset (rows were selected with TARGET.notnull()).
imputer = KNNImputer(n_neighbors=2)
fitted_modele_train = imputer.fit_transform(df_modele_train[top_feat])
target_values = df_modele_train['TARGET'].to_numpy()
# Rebuild the frame with the same column order as before (features, then
# TARGET) and a fresh 0..n-1 index, as the previous construction did.
df_modele_train = pd.DataFrame(fitted_modele_train, columns=list(top_feat))
df_modele_train['TARGET'] = target_values

# Target observations

In [None]:
# Show how many training rows fall in each TARGET class.
df_modele_train.TARGET.value_counts()

In [None]:
# Number of training rows in each TARGET class (0 and 1).
target_column = df_modele_train['TARGET']
val0 = int((target_column == 0).sum())
val1 = int((target_column == 1).sum())

In [None]:
# Pie chart of the class balance: first wedge is the TARGET == 0 count,
# second wedge (pulled out of the pie) is the TARGET == 1 count.
# NOTE(review): class 0 is labelled 'Not granted' and class 1 'Granted' —
# confirm this matches the meaning of TARGET in the source data.
labels = ['Not granted', 'Granted']
values = [val0, val1]
colors = ['red', 'green']
explode = (0, 0.3)
pie_fig, pie_ax = plt.subplots(figsize=(10, 5))
pie_ax.pie(
    values,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%.2f%%',
    shadow=True,
    startangle=140,
)
plt.show()

# Classification
#### KNN, SVC

### Undersampling

In [None]:
#TrainTestGrid(df_modele_train, top_feat, prepro_mthd="under", method='dummy')

In [None]:
#under_lreg_results = TrainTestGrid(df_modele_train, top_feat, prepro_mthd="under", method='lreg')

In [None]:
#under_lgbm_results = TrainTestGrid(df_modele_train, top_feat, prepro_mthd="under", method='lgbm')

In [None]:
#under_rfc_results = TrainTestGrid(df_modele_train, top_feat, prepro_mthd="under", method='rfc')

### Oversampling

In [None]:
#TrainTestGrid(df_modele_train, top_feat, prepro_mthd="over", method='dummy')

In [None]:
#over_lgbm_results = TrainTestGrid(df_modele_train, top_feat, prepro_mthd="over", method='lgbm')

In [None]:
#over_rfc_results = TrainTestGrid(df_modele_train, top_feat, prepro_mthd="over", method='rfc')

### SMOTE

In [None]:
#TrainTestGrid(df_modele_train, top_feat, prepro_mthd="smote", method='dummy')

In [None]:
# Run the project helper TrainTestGrid with prepro_mthd="smote" and
# method='lgbm' on the imputed sample. The `.cls` attribute of the returned
# object is used further down as the classifier to fit and persist.
# NOTE(review): what TrainTestGrid searches over is defined in P7_other_functions.
smote_lgbm_results = TrainTestGrid(df_modele_train, top_feat, prepro_mthd="smote", method='lgbm')

In [None]:
#smote_lreg_results = TrainTestGrid(df_modele_train, top_feat, prepro_mthd="smote", method='lreg')

In [None]:
#smote_rfc_results = TrainTestGrid(df_modele_train, top_feat, prepro_mthd="smote", method='rfc')

## Test SGD over entire dataset

In [None]:
# Prepare the data again, this time without passing nrows
# (presumably processes every row — confirm main()'s default).
df_glo = main(test_importance=False)

In [None]:
# Rows with a known TARGET are used for training/evaluation; rows without
# one are kept aside as the application set.
is_labelled = df_glo['TARGET'].notnull()
df_glo_traintest = df_glo.loc[is_labelled]
df_glo_appli = df_glo.loc[~is_labelled]

In [None]:
# Restrict the labelled rows to the retained features plus TARGET
# (rebinds df_glo_traintest), then display the result.
df_glo_traintest = df_glo_traintest[top_feat + ['TARGET']]
df_glo_traintest

In [None]:
# Fill missing feature values with a 2-nearest-neighbour imputer. This
# `imputer` object is the one persisted with joblib at the end of the notebook.
# It is fitted on the feature columns only: fitting it with TARGET as an extra
# column would leak the label into the neighbour distances and would make the
# saved imputer unusable on data that has no TARGET column.
imputer = KNNImputer(n_neighbors=2)
fitted_modele_train = imputer.fit_transform(df_glo_traintest[top_feat])
target_values = df_glo_traintest['TARGET'].to_numpy()
# Rebuild the frame with the same column order as before (features, then
# TARGET) and a fresh 0..n-1 index, as the previous construction did.
df_glo_traintest = pd.DataFrame(fitted_modele_train, columns=list(top_feat))
df_glo_traintest['TARGET'] = target_values

In [None]:
#smote_sgd_results = TrainTestGrid(df_glo_traintest, top_feat, prepro_mthd="smote", method='sgd')

##### On peut donc retenir comme méthode :
##### préprocessing/balancing = smote
##### classifier = Random Forest Classifier, KNN classifier
##### bagged = Non

## Prediction score and prediction using score

#### Min Max scaling data

In [None]:
# Scale every feature to [0, 1]. The scaler is fitted on the feature columns
# only, so the fitted object can later transform feature-only data. TARGET is
# carried over as-is (min-max scaling a 0/1 label is the identity, so the
# resulting values are the same as when it was scaled with the features).
scaler = MinMaxScaler()
feature_cols = list(top_feat)
df_glo_train = pd.DataFrame(
    scaler.fit_transform(df_glo_traintest[feature_cols]),
    columns=feature_cols)
df_glo_train['TARGET'] = df_glo_traintest['TARGET'].to_numpy()
# NOTE(review): the scaler is fitted before the train/test split, so held-out
# rows influence the scaling — consider fitting on the training part only.
x = df_glo_train[top_feat]
y = df_glo_train.TARGET

#### Split data

In [None]:
# Hold out 33 % of the rows for evaluation; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=0)

In [None]:
# Training features and label side by side in one new frame (X_train itself
# is left untouched); the label is aligned on the row index.
df_train = X_train.assign(TARGET=y_train)

In [None]:
# Class-balancing strategy: 'under', 'over' or 'smote'.
# 'under' and 'over' are handled here by random sampling; 'smote' is handled
# in the next cell, so nothing happens here in that case.
resamp = 'smote'
if resamp in ('under', 'over'):
    # Both strategies resample the TRAINING split only. The 'over' branch
    # previously drew from df_glo_train (all rows, including the held-out test
    # rows), which leaked test data into the fitted model.
    class_0 = df_train[df_train['TARGET'] == 0]
    class_1 = df_train[df_train['TARGET'] == 1]
    # Counts are taken per label rather than from value_counts() ordering,
    # which depends on which class happens to be the most frequent.
    class_count_0, class_count_1 = len(class_0), len(class_1)
    if resamp == 'under':
        # Shrink class 0 to the size of class 1.
        class_0_under = class_0.sample(class_count_1)
        df_samp = pd.concat([class_0_under, class_1])
    else:
        # Grow class 1 to the size of class 0 by sampling with replacement.
        class_1_over = class_1.sample(class_count_0, replace=True)
        df_samp = pd.concat([class_1_over, class_0])
    x = df_samp[top_feat]
    y = df_samp['TARGET']

#### Smote on the training set

In [None]:
# Build the data actually used for fitting (x_fit, y_fit).
if resamp == 'smote':
    # Seeded, like the train/test split, so the synthetic samples — and hence
    # the fitted model — are the same on every run.
    smote = SMOTE(random_state=0)

    # Oversample the minority class on the training split only.
    x_fit, y_fit = smote.fit_resample(X_train, y_train)
else:
    # 'under' / 'over': use the frame resampled in the previous cell.
    x_fit = x
    y_fit = y
print('Original dataset shape', X_train.shape)
print('Resample dataset shape', x_fit.shape)

### ML

In [None]:
# Pick the classifier held by the TrainTestGrid result that matches the
# resampling strategy.
# Only smote_lgbm_results is computed in this notebook as it stands; the
# 'under' and 'over' TrainTestGrid cells are commented out and must be
# re-enabled before selecting those strategies.
if resamp == 'under':
    cls = under_lgbm_results.cls
elif resamp == 'over':
    cls = over_lgbm_results.cls
elif resamp == 'smote':
    cls = smote_lgbm_results.cls
else:
    # Fail here rather than silently reusing a stale `cls` from an earlier run.
    raise ValueError('Unknown resampling strategy: {!r}'.format(resamp))

In [None]:
# Fit on the resampled training data, then compare predictions with the
# held-out labels in a confusion-matrix heatmap titled with cls.score().
cls.fit(x_fit, y_fit)
true = y_test.values
pred = cls.predict(X_test)
test_score = cls.score(X_test, y_test)
title_hm = 'Confusion matrix (score = {})'.format(test_score)
heatmap_print(true, pred, title_hm)

In [None]:
# Scan decision thresholds 0.10 .. 0.99 and keep the one with the best
# F-beta score (beta=0.5) on the held-out set.
proba_pred = cls.predict_proba(X_test)

range_values = list(range(10, 100))
fbeta_values = []
for proba_range in range_values:
    proba = proba_range / 100
    # classify_with_proba is applied to each row of class probabilities with
    # the candidate threshold passed as proba_0.
    pred_with_proba = np.apply_along_axis(classify_with_proba, 1, proba_pred, proba_0=proba)
    # (The confusion matrix that used to be computed here on every iteration
    # was never read, so it has been dropped.)
    fbeta_values.append(fbeta_score(y_test, pred_with_proba, beta=0.5))

# One row per threshold, indexed by the threshold value; the single column is 0.
fbeta_df = pd.DataFrame(fbeta_values, np.divide(range_values, 100))
proba = fbeta_df[0].idxmax()
plt.plot(fbeta_df)
plt.title('Value of fbeta_score according to probability \n Top proba = {}'.format(proba))
plt.show()

In [None]:
# Re-classify the held-out rows with the selected threshold and show the
# resulting confusion-matrix heatmap.
title_hm = 'Results with proba = {}'.format(proba)
proba_pred = cls.predict_proba(X_test)
pred_with_proba = np.apply_along_axis(
    classify_with_proba,
    1,
    proba_pred,
    proba_0=proba,
)
heatmap_print(true, pred_with_proba, title_hm)

##### unsatisfying results

In [None]:
# Horizontal bar chart of the `top_x` largest feature importances of the
# fitted classifier, one colour per feature, saved as a PNG.
top_x = 30  # Top X importance (top 10 for example)
color_list = sns.color_palette("dark", len(top_feat))
feat_imp = np.asarray(cls.feature_importances_)
# Indices of the top_x largest importances, in ascending order of importance.
ind = np.argsort(feat_imp)[-top_x:]
fig, axs = plt.subplots(1, 1, figsize=(15, 5), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace=0.5, wspace=0.8)
bars = axs.barh(range(len(ind)), feat_imp[ind], color='b', align='center')
axs.set_title("Feature importance", fontweight="normal", fontsize=16)
plt.sca(axs)
plt.yticks(range(len(ind)), [top_feat[j] for j in ind], fontweight="normal", fontsize=16)

# Colour each tick label and its bar with the colour assigned to that feature.
for i, ticklabel in enumerate(plt.gca().get_yticklabels()):
    ticklabel.set_color(color_list[ind[i]])

for i, bar in enumerate(bars):
    bar.set_color(color_list[ind[i]])

# Values printed next to the bars; feat_imp[ind] is already in ascending
# order, so no second sort of the importances is needed.
sorted_val_imp = np.around(feat_imp[ind], 3)
for i, v in enumerate(sorted_val_imp):
    axs.text(v + 0.001, i - 0.15, str(v), color=color_list[ind[i]], fontweight='bold')

# Remove the frame BEFORE saving so the file matches what is displayed, and
# save with a tight bounding box so long feature names are not clipped.
plt.box(False)
plt.savefig(os.path.join('..', '08_WEBSITE_AND_MODELS', 'featureimportance.png'),
            bbox_inches='tight')

In [None]:
# Persist the fitted KNN imputer. The path is built with os.path.join so the
# separator matches the host OS (on Windows it is still '..\06_MODEL\knn_inputer.sav').
inputer_file_name = os.path.join('..', '06_MODEL', 'knn_inputer.sav')
joblib.dump(imputer, inputer_file_name)

In [None]:
# Persist the fitted classifier and export the identifier, label and retained
# features of every prepared row. Paths are built with os.path.join so the
# separator matches the host OS (unchanged on Windows).
filename = os.path.join('..', '06_MODEL', 'final_model.sav')
joblib.dump(cls, filename)
# NOTE(review): the exported columns come straight from df_glo, i.e. before the
# imputation and min-max scaling applied to the training data — confirm the
# consumer of this CSV applies the same preprocessing.
df_glo[['SK_ID_CURR', 'TARGET'] + top_feat].to_csv(
    os.path.join('..', '06_MODEL', 'all_data.csv'), index=False)

In [None]:
# Generate the Python module parameters.py: write a module docstring, an import
# of MinMaxScaler and the start of a PredictParams class whose __init__ stores
# the retained feature list (the repr of top_feat) in self.topfeat.
# NOTE(review): the file handle `f` is not closed in the lines shown here —
# confirm it is closed afterwards, or the content may never be flushed to disk.
# NOTE(review): the path uses a Windows-style backslash separator.
f = open(r'..\06_MODEL\parameters.py', 'w')
f.write(
    '"""\nParameter file for api\n"""\n\
from sklearn.preprocessing import MinMaxScaler\n\n\n\
class PredictParams:\n\
    """\n\
    class containing every parameter\n\
    """\n\n\
    def __init__(self):\n\
        self.topfeat = {}\
    '.format(
        top_feat)
)