### Imported libraries and Scripts

In [299]:
### imported Libraries
import pandas as pd
import numpy as np
import importlib
import sqlite3
import datetime

### Sklearn imported libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler

from pprint import pprint
from sklearn.cluster import KMeans

# Pipeline for Oversampling
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn import metrics
import matplotlib.pyplot as plt

### Imported Scripts
import utils

%load_ext autoreload
%autoreload 2

importlib.reload(utils)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'utils' from 'c:\\Users\\utilizador\\Desktop\\ac-feup\\jupyters\\utils.py'>

### Choose Macros

In [300]:
OVERSAMPLE = True
DEBUG = True
WRITE = True
DUMMIES = True
CATEGORY_ENCODING = False
MIN_MAX_SCALER = False
SPLIT_RATIO = 0.8
N_COLUMNS = 15
N_SPLITS = 3

## Defined Methods

### Choose Dataset Options

In [301]:
def add_dummy(df,columns):
    copy= df.copy()

    for column in columns:
        dummies = pd.get_dummies(copy[column])
        copy = copy.drop(column,axis=1)
        copy = copy.join(dummies)
    
    return copy

def convert_dates(df):
    copy = df.copy()
    columns = ["loan_date","account_creation","birth_number"]

    for column in columns:
        copy[column] = copy[column].apply(lambda x: datetime.datetime.strptime(x, '%d-%m-%Y').strftime('%Y')).astype(int)

    copy["age_on_loan"] = copy["loan_date"] - copy["birth_number"]
    copy = copy.drop(columns = ["loan_date","account_creation","birth_number"])

    copy['card_issued'] = pd.to_numeric(copy["card_issued"].astype(str), errors='coerce').fillna(1).astype(int)

    return copy

def get_df(test=False):
    con = sqlite3.connect("../database/banking_data")
    if test:
        df = pd.read_sql_query("SELECT * FROM loan_united_test", con)
    else:
        df = pd.read_sql_query("SELECT * FROM loan_united_train", con)

    df = convert_dates(df)
    con.close()

    if DUMMIES:
        columns = ["account_frequency","gender","card_type"]
        df = add_dummy(df, columns)

    if CATEGORY_ENCODING:
        df = utils.normalize_category(df)

    
    if MIN_MAX_SCALER:
        scaler = MinMaxScaler()
        copy = df.copy()
        y = copy["status"]
        X = copy.drop(columns=["status"])
        transf = scaler.fit_transform(X)
        copy = pd.DataFrame(transf,index=X.index,columns=X.columns)
        copy["status"] = y
        df = copy
        
    
    return df


### Data splitting

In [302]:
### Split the data
def split_dataset(df):

    ### Seperate the precition columns from output
    
    X = df.drop(columns=['status'])
    y = df['status']

    select = SelectKBest(f_classif, k= N_COLUMNS)
    X_new = select.fit_transform(X, y)

    split_filter = select.get_support()
    features = X.columns

    ### Apply splitting
    X_train, X_test, y_train, y_test = train_test_split(X_new,y,train_size=SPLIT_RATIO,test_size=1-SPLIT_RATIO)

    return X_train,X_test,y_train,y_test, features[split_filter]

### Algorithms

In [303]:
def get_random_forest():
    return RandomForestClassifier(bootstrap = False,
                                    max_depth =110,
                                    max_features = 2,
                                    min_samples_leaf = 3,
                                    min_samples_split = 8,
                                    n_estimators = 300)

In [304]:
def get_logistic_regression():
    return LogisticRegression(random_state=30,
                                solver='sag',
                                max_iter=400,
                                dual=False,
                                multi_class='auto',
                                penalty='none',
                                tol=0.1)

In [305]:
def get_decision_tree():
    return DecisionTreeClassifier(class_weight='balanced',
                                    criterion='gini',
                                    max_depth=30,
                                    max_features='log2',
                                    min_impurity_decrease=0.05,
                                    min_samples_leaf=6,
                                    min_samples_split=6,
                                    splitter='best')

In [306]:
def get_knn():
    return KNeighborsClassifier(algorithm="ball_tree",
                                    leaf_size=150,
                                    metric="chebyshev",
                                    n_neighbors=10,
                                    p=3,
                                    weights="distance")

### Use *Grid Search Cross Validation* to find the best grid for an algorithm

In [307]:
def create_rf():
    return  RandomForestClassifier()
def create_knn():
    return  KNeighborsClassifier()
def create_dt():
    return  DecisionTreeClassifier()
def create_lr():
    return  LogisticRegression()

In [308]:
### Uses a grid search to generate random parameters to find the best grid model
def getBestSearch(algorithm,grid):
    train = get_df()

    X = train.drop(columns=['status'])
    y = train['status']

    if algorithm == "RF":
        alg = build_pipeline(create_rf())
    elif algorithm == "KNN":
        alg = build_pipeline(create_knn())
    elif algorithm == "DT":
        alg = build_pipeline(create_dt())
    elif algorithm == "LR":
        alg = build_pipeline(create_lr())
    print(alg.get_params().keys())
    
    grid_search = GridSearchCV(estimator = alg,
                               param_grid = grid, 
                               scoring=metrics.make_scorer(utils.get_auc, greater_is_better=True),
                               cv=StratifiedKFold(N_SPLITS,random_state=True,shuffle=True),
                               n_jobs = -1,
                               verbose = 2)

    model = grid_search.fit(X,y)

    if DEBUG:
        print('Best Score: ', model.best_score_)
        print('Best Params: ', model.best_params_)
    
    return model.best_score_, model.best_params_


### Build a Pipeline to apply a sampling and a classification algorithm

In [309]:
def build_pipeline(algorithm):

    if(OVERSAMPLE):
        return Pipeline([
            ('sampling',SMOTE()),
            ('classification',algorithm)
        ])
    else:
        return  Pipeline([
            ('classification',algorithm)
        ])

### Final Stratified Cross Validation

In [310]:
def final_CV(algorithm):
    train = get_df()

    X = train.drop(columns=['status'])
    y = train['status']

    select = SelectKBest(f_classif, k=N_COLUMNS)
    X_new = select.fit_transform(X, y)

    split_filter = select.get_support()
    features = X.columns[split_filter]

    skf = StratifiedKFold(n_splits=N_SPLITS, random_state=True, shuffle=True)

    model_list = []
    auc_list = []
    
    for train_index, test_index in skf.split(X_new, y):

        X_train, X_test = X_new[train_index], X_new[test_index]
        y_train, y_test = y[train_index], y[test_index]

        ### Train the model
        pipe = build_pipeline(algorithm())
        model = pipe.fit(X_train, y_train)

        ### Predict the outcome with the test data
        y_pred = model.predict_proba(X_test)
        y_final = y_pred.transpose()[0]
        
        auc = utils.get_auc(y_test, y_final)
        auc_list.append(auc)
        model_list.append(model)
        print(f"AUC={auc}")
        
    ### Get the best model

    best_score = max(auc_list)
    best_model = model_list[auc_list.index(best_score)]

    avg = sum(auc_list)/len(auc_list)
    print(f"Average AUC = {avg}")
    print(f"Max AUC = {best_score}")
    
    
    ### Use the best model to get a prediction
    test = get_df(test=True)
    
    X2 = test.drop(columns=['status'])
    X2 = X2[features]

    y_predicted = best_model.predict_proba(X2)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final
    
    
    if DEBUG:
        print(f"Predictions:\n {final_df}")
    
    if WRITE:
        final_df.to_csv('../csvs/results/final.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'final.csv'")

    ### TODO: display statistics?
    


### Running an algorithm

In [311]:
def training_algorithm(algorithm):
    ### Getting the dataset
    train = get_df(test=False)
    ### Getting a Model from training
    
    X_train,X_test,y_train,y_test, features = split_dataset(train)

    
    pipe = build_pipeline(algorithm())
    
    model = pipe.fit(X_train,y_train)

    y_predicted = model.predict_proba(X_test)

    y_final = y_predicted.transpose()[0]

    if DEBUG:
        score = model.score(X_test,y_test)
        auc = utils.get_auc(y_test,y_final,label=-1)
        # utils.plot_auc(y_test,y_final)
       
        print(f"Score: {score}")
        print(f"Auc: {auc}")
    
    return model, features

In [330]:
def testing_model(model,features):

    test = get_df(test=True)

    X = test.drop(columns=['status'])
    X = X[features]

    y_predicted = model.predict_proba(X)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final

    if DEBUG:
        print(f"Predictions:\n {final_df}")
        
    if WRITE:
        final_df.to_csv('../csvs/results/testing_model.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'testing_model.csv'")

In [313]:
def run_algorithm(algorithm):
    
    if(DEBUG):
        print("Running the provided algorithm")
    
    model, features = training_algorithm(algorithm)
    testing_model(model,features)

## Running the algorithms

### Choose your algorithm:

In [324]:
### Leave the one you want to run uncommented
algorithm = get_random_forest
# algorithm = get_logistic_regression
# algorithm = get_decision_tree
# algorithm = get_knn

### Run with train_test_split

In [341]:
run_algorithm(algorithm)

Running the provided algorithm
Score: 0.9242424242424242
Auc: 0.9637096774193549
Predictions:
        Id  Predicted
0    5895   0.105437
1    7122   0.744565
2    6173   0.196680
3    6142   0.446458
4    5358   0.507302
5    6095   0.213911
6    6878   0.140506
7    6554   0.319711
8    6793   0.241757
9    7286   0.229596
10   6076   0.164224
11   5134   0.175513
12   5419   0.408942
13   6255   0.673891
14   5656   0.237962
15   6934   0.253840
16   6028   0.336536
17   6490   0.142610
18   6415   0.445658
19   7087   0.133462
20   5420   0.206671
21   5977   0.145566
22   6824   0.574320
23   5207   0.127411
24   7115   0.278123
25   7250   0.052074
26   6010   0.331449
27   6088   0.398058
28   5682   0.266699
29   7201   0.145712
..    ...        ...
324  5698   0.247907
325  5169   0.124972
326  7294   0.090037
327  5318   0.193514
328  5368   0.120305
329  6923   0.079239
330  5463   0.157664
331  5265   0.390384
332  6321   0.175833
333  5226   0.395730
334  6868   0.075914
33

  f"X has feature names, but {self.__class__.__name__} was fitted without"


### Run with StratifiedKFold

In [342]:
final_CV(algorithm)

AUC=0.6569148936170214
AUC=0.8290780141843972
AUC=0.7397163120567376
Average AUC = 0.741903073286052
Max AUC = 0.8290780141843972
Predictions:
        Id  Predicted
0    5895   0.100317
1    7122   0.630873
2    6173   0.183399
3    6142   0.354970
4    5358   0.386480
5    6095   0.074617
6    6878   0.180115
7    6554   0.188026
8    6793   0.250431
9    7286   0.239305
10   6076   0.176855
11   5134   0.093537
12   5419   0.455328
13   6255   0.450574
14   5656   0.201285
15   6934   0.283235
16   6028   0.257836
17   6490   0.117443
18   6415   0.345857
19   7087   0.067602
20   5420   0.168757
21   5977   0.092511
22   6824   0.524551
23   5207   0.108596
24   7115   0.186087
25   7250   0.029649
26   6010   0.274482
27   6088   0.269816
28   5682   0.126083
29   7201   0.053980
..    ...        ...
324  5698   0.374375
325  5169   0.137740
326  7294   0.139082
327  5318   0.180384
328  5368   0.187613
329  6923   0.102423
330  5463   0.161504
331  5265   0.432510
332  6321   0.20

  f"X has feature names, but {self.__class__.__name__} was fitted without"


### Finding the best grid

#### Decision Tree

In [318]:
param_grid = {
    'classification__criterion':['gini', 'entropy'],
    'classification__splitter': ["best", "random"],
    'classification__max_depth': [5, 10, 20, 30, 40],
    'classification__min_samples_split': [2, 4, 6, 8],
    'classification__min_samples_leaf': [1, 2, 4, 6],
    'classification__max_features': ["auto", "sqrt", "log2"],
    'classification__min_impurity_decrease': [0.05, 0.1, 0.2, 0.3],
    'classification__class_weight': ["balanced", None]
}

getBestSearch("DT",param_grid)

#### Random Forest

In [None]:
param_grid = {
    'classification__max_depth': [80, 90, 100, 110],
    'classification__max_features': [2, 3],
    'classification__min_samples_leaf': [3, 4, 5],
    'classification__min_samples_split': [8, 10, 12],
    'classification__n_estimators': [100, 200, 300, 1000]
}

getBestSearch("RF",param_grid)

#### Logistic Regression

In [None]:
param_grid = {
    'classification__penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'classification__dual': [True, False],
    'classification__tol': [0.1, 0.01, 0.001, 1e-4, 1e-5, 1e-6],
    'classification__random_state': [8, 10, 12, 20, 30, 50],
    'classification__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'classification__max_iter': [100, 200, 300, 400],
    'classification__multi_class': ['auto', 'ovr', 'multinomial']
}

getBestSearch("LR",param_grid)

#### K-Nearest Neighbours

In [None]:
param_grid = {
    'classification__n_neighbors': [2, 3, 4, 5, 8, 10],
    'classification__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'classification__weights': ['uniform', 'distance'],
    'classification__metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski', 'wminkowski', 'seuclidean', 'mahalanobis', 'haversine', 'hamming', 'canberra', 'braycurtis', 'cityblock', 'infinity', 'l1', 'l2', 'p'],
    'classification__leaf_size': [10, 20, 30, 50, 80, 150],
    'classification__p': [1, 2, 3, 5, 10]
}

getBestSearch("KNN",param_grid)