### Imported libraries and Scripts

In [190]:
### imported Libraries
import pandas as pd
import numpy as np
import importlib
import sqlite3
import datetime

### Sklearn imported libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler

from pprint import pprint
from sklearn.cluster import KMeans

# Pipeline for Oversampling
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

### Imported Scripts
import utils

%load_ext autoreload
%autoreload 2

importlib.reload(utils)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'utils' from 'd:\\escola\\fac\\4o_ano\\1_semestre\\ac-feup\\jupyters\\utils.py'>

### Choose Macros

In [170]:
# Run-wide switches and constants read by the functions defined below.
OVERSMAPLE = True  # build_pipeline() adds a SMOTE step when True (the name is spelled this way everywhere it is read)
DEBUG = True  # print scores, best parameters and prediction tables
WRITE = True  # write the prediction CSVs under ../csvs/results/
DUMMIES = True  # get_df() one-hot encodes account_frequency, gender and card_type
CATEGORY_ENCONDING = False  # get_df() passes the frame through utils.normalize_category
MIN_MAX_SCALER = False  # get_df() rescales every column except status with MinMaxScaler
SPLIT_RATIO = 0.8  # train fraction given to train_test_split in split_dataset()
N_COLUMNS = 15  # k passed to SelectKBest
N_SPLITS = 3  # number of StratifiedKFold folds in final_CV()


## Defined Methods

### Choose Dataset Options

In [171]:
def add_dummy(df,columns):
    """Return a copy of ``df`` with each listed column replaced by indicator columns.

    Every column named in ``columns`` is removed and the result of
    ``pd.get_dummies`` for it is joined back on the index. The indicator
    columns are named after the category values (no prefix is added).
    ``df`` itself is left untouched.
    """
    encoded = df.copy()

    for name in columns:
        # pop() removes the column from the working copy and hands it back
        indicators = pd.get_dummies(encoded.pop(name))
        encoded = encoded.join(indicators)

    return encoded

def convert_dates(df):
    """Return a copy of ``df`` with the date columns reduced to an age feature.

    ``loan_date``, ``account_creation`` and ``birth_number`` are parsed as
    ``%d-%m-%Y`` strings and reduced to their year. ``age_on_loan`` is the
    loan year minus the birth year, after which the three date columns are
    dropped. ``card_issued`` is converted to int; any value that does not
    parse as a number becomes 1.
    """
    date_columns = ["loan_date","account_creation","birth_number"]
    result = df.copy()

    def year_of(text):
        # Parse the day-month-year string and keep only the year digits
        return datetime.datetime.strptime(text, '%d-%m-%Y').strftime('%Y')

    for name in date_columns:
        result[name] = result[name].apply(year_of).astype(int)

    result["age_on_loan"] = result["loan_date"] - result["birth_number"]
    result = result.drop(columns = date_columns)

    # Stringify first so to_numeric always sees text; unparsable entries -> NaN -> 1
    issued = pd.to_numeric(result["card_issued"].astype(str), errors='coerce')
    result['card_issued'] = issued.fillna(1).astype(int)

    return result

def get_df(test=False):
    """Load the merged loan table from SQLite and apply the configured preprocessing.

    Args:
        test: when True read ``loan_united_test``, otherwise ``loan_united_train``.

    Returns:
        The table after ``convert_dates`` and, depending on the module-level
        flags, one-hot encoding (``DUMMIES``), ``utils.normalize_category``
        (``CATEGORY_ENCONDING``) and min-max scaling of every column except
        ``status`` (``MIN_MAX_SCALER``).
    """
    con = sqlite3.connect("../database/banking_data")
    try:
        if test:
            df = pd.read_sql_query("SELECT * FROM loan_united_test", con)
        else:
            df = pd.read_sql_query("SELECT * FROM loan_united_train", con)
    finally:
        # Always release the connection, even when the query fails
        con.close()

    df = convert_dates(df)

    if DUMMIES:
        columns = ["account_frequency","gender","card_type"]
        df = add_dummy(df, columns)

    if CATEGORY_ENCONDING:
        df = utils.normalize_category(df)

    if MIN_MAX_SCALER:
        # NOTE(review): the scaler is fitted on whichever table is being loaded,
        # so train and test are scaled independently, and id columns such as
        # loan_id are rescaled too — confirm this is intended before enabling.
        scaler = MinMaxScaler()
        copy = df.copy()
        y = copy["status"]
        X = copy.drop(columns=["status"])
        transf = scaler.fit_transform(X)
        copy = pd.DataFrame(transf,index=X.index,columns=X.columns)
        copy["status"] = y
        df = copy

    return df


### Data splitting

In [172]:
### Split the data
def split_dataset(df, random_state=None):
    """Split ``df`` into train/test parts and keep the ``N_COLUMNS`` best features.

    The rows are split first and ``SelectKBest`` is fitted on the training
    part only, so the held-out rows do not influence which features are chosen.

    Args:
        df: frame holding the predictors and a ``status`` target column.
        random_state: optional seed forwarded to ``train_test_split``; the
            default ``None`` keeps the split unseeded.

    Returns:
        ``(X_train, X_test, y_train, y_test, features)``: the X values contain
        only the selected columns and ``features`` is the Index of their names.
    """
    ### Separate the prediction columns from the output
    X = df.drop(columns=['status'])
    y = df['status']

    ### Apply splitting
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=SPLIT_RATIO,
        test_size=1-SPLIT_RATIO,
        random_state=random_state)

    ### Choose the features from the training rows only, then apply to both parts
    select = SelectKBest(f_classif, k= N_COLUMNS)
    X_train_new = select.fit_transform(X_train, y_train)
    X_test_new = select.transform(X_test)

    features = X.columns[select.get_support()]

    return X_train_new, X_test_new, y_train, y_test, features

### Algorithms

In [173]:
def get_random_forest():
    """Build a random forest classifier with the fixed hyperparameters below."""
    hyperparameters = {
        "bootstrap": False,
        "max_depth": 80,
        "max_features": 2,
        "min_samples_leaf": 3,
        "min_samples_split": 8,
        "n_estimators": 100,
    }
    return RandomForestClassifier(**hyperparameters)

In [174]:
def get_logistic_regression():
    """Build a logistic regression using the lbfgs solver, 200 iterations at most, seed 10."""
    classifier = LogisticRegression(
        solver='lbfgs',
        max_iter=200,
        random_state=10,
    )
    return classifier

In [175]:
def get_decision_tree():
    """Build a decision tree classifier seeded with 0."""
    classifier = DecisionTreeClassifier(random_state=0)
    return classifier

In [176]:
def get_knn():
    """Build a k-nearest-neighbours classifier that votes over 3 neighbours."""
    classifier = KNeighborsClassifier(n_neighbors=3)
    return classifier

### Use *Grid Search Cross Validation* to find the best grid for an algorithm

In [177]:
def create_rf():
    """Build a random forest with library defaults, seeded with 20."""
    forest = RandomForestClassifier(random_state=20)
    return forest

In [178]:
### Runs an exhaustive grid search over the pipeline to find the best parameter combination
def getBestSearch(algorithm,grid):
    """Grid-search ``grid`` over the pipeline built around ``algorithm``.

    Args:
        algorithm: a zero-argument factory returning an estimator (for example
            ``get_random_forest``), an estimator instance, or ``None`` to fall
            back to ``create_rf()``.
        grid: parameter grid for ``GridSearchCV``; keys address pipeline steps,
            e.g. ``classification__max_depth``.

    Returns:
        ``(best_score, best_params)`` of the fitted search.
    """
    # Imported locally: the notebook only imports `metrics` in a later cell,
    # so relying on the global name fails on a fresh top-to-bottom run.
    from sklearn import metrics

    train = get_df()

    X = train.drop(columns=['status'])
    y = train['status']

    # Honour the requested algorithm instead of always using create_rf()
    if algorithm is None:
        estimator = create_rf()
    elif callable(algorithm):
        estimator = algorithm()
    else:
        estimator = algorithm

    alg = build_pipeline(estimator)

    # NOTE(review): make_scorer without a probability/threshold option hands the
    # scorer hard class predictions — confirm utils.get_auc is meant to receive those.
    grid_search = GridSearchCV(estimator = alg,
                               param_grid = grid,
                               scoring=metrics.make_scorer(utils.get_auc, greater_is_better=True),
                               cv=StratifiedKFold(2,random_state=30,shuffle=True),
                               n_jobs = -1,
                               verbose = 2)

    model = grid_search.fit(X,y)

    if DEBUG:
        print('Best Score: ', model.best_score_)
        print('Best Params: ', model.best_params_)

    return model.best_score_, model.best_params_


### Build a Pipeline to apply a sampling and a classification algorithm

In [179]:
### TODO: Add undersample before final delivery
def build_pipeline(algorithm):
    """Wrap ``algorithm`` in a Pipeline, preceded by a SMOTE step when ``OVERSMAPLE`` is set.

    The estimator is always registered under the step name ``classification``;
    the optional oversampler uses the step name ``sampling``.
    """
    steps = [('classification',algorithm)]

    if OVERSMAPLE:
        # Oversampling has to run before the classifier sees the data
        steps.insert(0, ('sampling',SMOTE()))

    return Pipeline(steps)

### Final Stratified Cross Validation

In [180]:
def final_CV(algorithm):
    """Cross-validate ``algorithm`` on the training table, then predict the test table.

    A pipeline is fitted per stratified fold (``N_SPLITS`` folds) on the
    ``N_COLUMNS`` best features. The fold model with the highest AUC is used
    to predict the test table; predictions are printed when ``DEBUG`` is set
    and written to ``../csvs/results/final.csv`` when ``WRITE`` is set.

    Args:
        algorithm: zero-argument factory returning a fresh estimator.

    Returns:
        None. The per-fold and average AUC are printed.
    """
    train = get_df()
    # train = utils.normalization(train,'status')

    X = train.drop(columns=['status'])
    y = train['status']

    # NOTE(review): the selector is fitted on all training rows before the folds
    # are made, so each fold's held-out rows took part in choosing the features.
    select = SelectKBest(f_classif, k=N_COLUMNS)
    X_new = select.fit_transform(X, y)

    split_filter = select.get_support()
    features = X.columns[split_filter]

    # Explicit integer seed (the previous value True is the integer 1)
    skf = StratifiedKFold(n_splits=N_SPLITS, random_state=1, shuffle=True)

    model_list = []
    auc_list = []

    for train_index, test_index in skf.split(X_new, y):

        X_train, X_test = X_new[train_index], X_new[test_index]
        # split() yields row positions, so index the Series positionally;
        # label lookup only works while the index is a default RangeIndex
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        ### Train the model
        pipe = build_pipeline(algorithm())
        model = pipe.fit(X_train, y_train)

        ### Predict the outcome with the test data
        ### (column 0 is the probability of the first entry of model.classes_)
        y_pred = model.predict_proba(X_test)
        y_final = y_pred[:, 0]

        auc = utils.get_auc(y_test, y_final)
        auc_list.append(auc)
        model_list.append(model)
        print(f"AUC={auc}")

    ### Get the best model
    best_score = max(auc_list)
    best_model = model_list[auc_list.index(best_score)]

    ### Use the best model to get a prediction
    test = get_df(test=True)

    X2 = test.drop(columns=['status'])
    X2 = X2[features]

    y_predicted = best_model.predict_proba(X2)
    y_final = y_predicted[:, 0]

    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final

    if DEBUG:
        print(f"Predictions:\n {final_df}")

    if WRITE:
        final_df.to_csv('../csvs/results/final.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'final.csv'")

    ### TODO: display statistics?
    avg = sum(auc_list)/len(auc_list)
    print(f"Average AUC = {avg}")


### Running an algorithm

In [181]:
def training_algorithm(algorithm):
    """Fit the pipeline for ``algorithm`` on a train/test split of the training table.

    Args:
        algorithm: zero-argument factory returning a fresh estimator.

    Returns:
        ``(model, features)``: the fitted pipeline and the names of the
        columns chosen by ``split_dataset``.
    """
    ### Load the training table and split it
    X_train, X_test, y_train, y_test, features = split_dataset(get_df(test=False))

    ### Fit the pipeline on the training part
    model = build_pipeline(algorithm()).fit(X_train, y_train)

    ### Keep the first probability column for the held-out part
    y_final = model.predict_proba(X_test)[:, 0]

    if DEBUG:
        score = model.score(X_test, y_test)
        auc = utils.get_auc(y_test, y_final, label=-1)
        # plot_auc(y_test,y_final)
        # conf_matrix(model,y_test,y_final)
        print(f"Score: {score}")
        print(f"Auc: {auc}")

    return model, features

In [182]:
def testing_model(model,features):
    """Predict the test table with ``model`` using only the ``features`` columns.

    The selected column names are always printed. The prediction table
    (``Id`` from ``loan_id``, ``Predicted`` from the first probability column)
    is printed when ``DEBUG`` is set and written to
    ``../csvs/results/testing_model.csv`` when ``WRITE`` is set.
    """
    test = get_df(test=True)

    X = test.drop(columns=['status'])[features]
    print(X.columns)

    y_final = model.predict_proba(X)[:, 0]

    final_df = pd.DataFrame({'Id': test["loan_id"], 'Predicted': y_final})

    if DEBUG:
        print(f"Predictions:\n {final_df}")

    if WRITE:
        final_df.to_csv('../csvs/results/testing_model.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'testing_model.csv'")

In [183]:
def run_algorithm(algorithm):
    """Train ``algorithm`` on the training table, then predict the test table with it."""
    if DEBUG:
        print("Running the provided algorithm")

    # training_algorithm returns (model, features), exactly what testing_model takes
    trained = training_algorithm(algorithm)
    testing_model(*trained)

## Running the algorithms

### Choose your algorithm:

In [184]:
### Leave exactly one line uncommented. `algorithm` holds the factory function itself
### (not an estimator instance); the run functions call it to build a fresh estimator.
algorithm = get_random_forest
# algorithm = get_logistic_regression
# algorithm = get_decision_tree
# algorithm = get_knn

### Run with train_test_split

In [185]:
# Train on a random split of the training table, then predict the test table
run_algorithm(algorithm)

Running the provided algorithm
Score: 0.8636363636363636
Auc: 0.8050682261208577
Index(['index', 'loan_id', 'loan_amount', 'payments',
       'client_district no. of municipalities with inhabitants < 499',
       'client_district no. of cities', 'client_district unemploymant_growth',
       'card_issued', 'no. movements', 'min no. trans', 'min balance',
       'avg balance', 'monthly issuance', 'None', 'classic'],
      dtype='object')
Predictions:
        Id  Predicted
0    5895   0.095948
1    7122   0.785190
2    6173   0.237561
3    6142   0.462903
4    5358   0.561065
..    ...        ...
349  4989   0.368826
350  5221   0.267762
351  6402   0.206597
352  5346   0.298799
353  6748   0.242755

[354 rows x 2 columns]
Sucessfully stored the predictions in a file named 'testing_model.csv'


### Run with StratifiedKFold

In [186]:
# Stratified k-fold evaluation; the best-scoring fold model predicts the test table
final_CV(algorithm)

AUC=0.6808510638297872
AUC=0.8262411347517731
AUC=0.7163120567375887
Predictions:
        Id  Predicted
0    5895   0.102153
1    7122   0.736509
2    6173   0.176320
3    6142   0.298133
4    5358   0.364714
..    ...        ...
349  4989   0.251572
350  5221   0.192419
351  6402   0.246849
352  5346   0.371988
353  6748   0.334161

[354 rows x 2 columns]
Sucessfully stored the predictions in a file named 'final.csv'
Average AUC = 0.7411347517730497


### Finding the best grid

In [187]:
# Candidate values for the 'classification' step of the pipeline (4*2*3*3*4 = 288 combinations)
param_grid = {
    'classification__max_depth': [80, 90, 100, 110],
    'classification__max_features': [2, 3],
    'classification__min_samples_leaf': [3, 4, 5],
    'classification__min_samples_split': [8, 10, 12],
    'classification__n_estimators': [100, 200, 300, 1000]
}
# Smaller grid for a quick check:
# param_grid = {
#     'classification__max_depth': [80, 90, 100, 110],
# }
### Uncomment to run (WARNING: takes about 5 minutes)
# getBestSearch(get_random_forest,param_grid)

In [188]:
from sklearn import metrics
import matplotlib.pyplot as plt


def plot_auc(y_test,y_predicted):
    """Plot the ROC curve of ``y_predicted`` against ``y_test`` and show it.

    The label ``-1`` is treated as the positive class. The curve is drawn in
    blue over a shaded area, the AUC appears in the legend, and a white dashed
    diagonal is drawn on top.
    """
    false_pos_rate, true_pos_rate, _thresholds = metrics.roc_curve(
        y_test, y_predicted, pos_label=-1)
    area = metrics.auc(false_pos_rate, true_pos_rate)
    legend_text = 'AUC = %0.2f' % area

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, 'b', label=legend_text)
    plt.fill_between(false_pos_rate, true_pos_rate, color="lightskyblue")
    plt.legend(loc='lower right')
    # Diagonal drawn after the fill so it stays visible on top of it
    plt.plot([0, 1], [0, 1], 'w--')
    plt.xlim([0, 1])
    plt.ylim([0, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

### Clustering

In [192]:
# Read the raw training table directly; get_df() and its preprocessing are not used here
con = sqlite3.connect("../database/banking_data")

train = pd.read_sql_query("SELECT * FROM loan_united_train", con)

con.close()

# presumably converts the categorical columns to numeric codes — confirm in utils
df = utils.normalize_category(train)
# df = utils.normalization(df,'status')

# Cluster on every column except the target
X = df.drop(columns=['status'])

# Three clusters; no random_state is passed, so the fit is not seeded
kmeans_model = KMeans(n_clusters=3).fit(X)
# labels = kmeans_model.labels_
# metrics.silhouette_score(X, labels, metric='euclidean')

pprint(kmeans_model)    


KMeans(n_clusters=3)


In [193]:
# List the columns of the preprocessed training table returned by get_df()
get_df().columns

Index(['index', 'loan_id', 'loan_amount', 'loan_duration', 'payments',
       'status', 'no. of inhabitants',
       'account_district no. of municipalities with inhabitants < 499',
       'account_district no. of municipalities with inhabitants 500-1999',
       'account_district no. of municipalities with inhabitants 2000-9999',
       'account_district no. of municipalities with inhabitants >10000',
       'account_district no. of cities',
       'account_district ratio of urban inhabitants',
       'account_district average salary',
       'account_district unemploymant rate '95',
       'account_district unemploymant rate '96',
       'account_district unemploymant_growth',
       'account_district no. of enterpreneurs per 1000 inhabitants',
       'account_district no. of commited crimes '95',
       'account_district no. of commited crimes '96',
       'account_district crime_growth', 'account_district total_crime', 'code',
       'client_district no. of inhabitants',
       'cl