### Imported libraries and Scripts

In [71]:
### imported Libraries
import pandas as pd
import numpy as np
import importlib
from pprint import pprint
from datetime import datetime

### Sklearn imported libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_classif


# Pipeline for Oversampling
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

### Imported Scripts
import utils

%load_ext autoreload
%autoreload 2

importlib.reload(utils)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'utils' from 'c:\\Users\\utilizador\\Desktop\\ac-feup\\jupyters\\hugo\\utils.py'>

### Choose Dataset Options

In [72]:
def add_dummy(df,columns):
    copy= df.copy()

    for column in columns:
        print(copy.shape)
        dummies = pd.get_dummies(copy[column])
        copy = copy.drop(column,axis=1)
        copy = copy.join(dummies)
    
    return copy

def convert_dates(df):
    copy = df.copy()
    columns = ["loan_date","account_creation","birth_number"]

    for column in columns:
        copy[column] = copy[column].apply(lambda x: datetime.strptime(x, '%d-%m-%Y').strftime('%Y')).astype(int)

    copy["age_on_loan"] = copy["loan_date"] - copy["birth_number"]
    copy = copy.drop(columns = ["loan_date","account_creation","birth_number"])

    copy['card_issued'] = pd.to_numeric(copy["card_issued"].astype(str), errors='coerce').fillna(1).astype(int)

    return copy

def get_df(test=False, dummies=False, category_encoding=False):
    if test:
        df = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    else:
        df = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    
    columns = ["account_frequency","gender","card_type"]

    if dummies:
        df = add_dummy(df, columns)
    
    if category_encoding:
        df = utils.normalize_category(df)
    
    df = convert_dates(df)

### Data splitting

In [73]:
### Split the data
def split_dataset(df,ratio=0.7,debug=False,n_columns = 15):

    ### Seperate the precition columns from output
    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    select = SelectKBest(f_classif, k= n_columns)
    X_new = select.fit_transform(X, y)

    filter = select.get_support()
    features = X.columns

    if debug:
        print("Selected best 3:")
        print(features[filter])
        print(X_new) 

    ### Apply splitting
    X_train, X_test, y_train, y_test = train_test_split(X_new,y,train_size=ratio,test_size=1-ratio)

    return X_train,X_test,y_train,y_test, features[filter]

### Algorithms

In [74]:
def get_random_forest():
    return RandomForestClassifier(bootstrap = False,
                                    max_depth = 80,
                                    max_features = 3,
                                    min_samples_leaf = 3,
                                    min_samples_split = 12,
                                    n_estimators = 100)

In [75]:
def get_logistic_regression():
    return LogisticRegression(random_state=10,solver='lbfgs',max_iter=200)

In [76]:
def get_decision_tree():
    return DecisionTreeClassifier(random_state=0)

In [77]:
def get_knn():
    return KNeighborsClassifier(n_neighbors=3)

### Use *Grid Search Cross Validation* to find the best grid for an algorithm

In [78]:
### Uses a grid search to generate random parameters to find the best grid model
def getBestSearch(algorithm,grid,debug=True):

    # train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    train = utils.normalize_category(train)

    X = train.drop(columns=['loan_success'])
    y = train['loan_success']

    alg = algorithm()

    grid_search = GridSearchCV(estimator = alg, param_grid = grid, cv = 2, n_jobs = -1, verbose = 2)

    model = grid_search.fit(X,y)

    if debug:
        print('Best Score: ', model.best_score_)
        print('Best Params: ', model.best_params_)
    
    return model.best_score_, model.best_params_


### Build a Pipeline to apply a sampling and a classification algorithm

In [79]:
### Sampling (This should work if the ASHDASJDSAKD LIBRARY IS IMPORTED)
### TODO: Add undersample before final delivery
def build_pipeline(algorithm,oversample=True,undersample=True):

    if(oversample):
        return Pipeline([
            ('sampling',SMOTE(random_state = 20)),
            ('classification',algorithm)
        ])
    else:
        return  Pipeline([
            ('classification',algorithm)
        ])

    return pipeline

### Final Stratified Cross Validation

In [80]:
def final_CV(algorithm,n_splits = 3):
    # train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    df = utils.normalize_category(train)
    # df = utils.normalization(df,'loan_success')

    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    select = SelectKBest(f_classif, k=15)
    X_new = select.fit_transform(X, y)

    algorithm = algorithm()

    skf = StratifiedKFold(n_splits=n_splits, random_state=True, shuffle=True)
    skf.get_n_splits(X_new, y)

    model_list = []
    auc_list = []


    for train_index, test_index in skf.split(X, y):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        ### Train the model
        model = algorithm.fit(X_train, y_train)

        ### Predict the outcome with the test data
        y_pred = algorithm.predict_proba(X_test)
        y_final = y_pred.transpose()[0]

        auc = utils.get_auc(y_test, y_final)
        auc_list.append(auc)
        model_list.append(model)
        print(f"AUC={auc}")
        
    ### Get the best model
    best_score = max(auc_list)
    best_model = model_list[auc_list.index(best_score)]
    
    
    ### Use the best model to get a prediction
    test = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    test = utils.normalize_category(test)
    
    X = test.drop(columns=['loan_success'])
    y_predicted = best_model.predict_proba(X)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final
    
    
    if True:
        print(f"Predictions:\n {final_df}")
    
    if True:
        final_df.to_csv('CV.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'CV.csv'")

    ### TODO: display statistics?
    avg = sum(auc_list)/len(auc_list)
    print(f"Average AUC = {avg}")


### Running an algorithm

In [81]:
def training_algorithm(algorithm,n_columns=15,debug=False):
    ### Getting the dataset
    # train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    
    # train = reverse_dates()
    
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    train = utils.normalize_category(train)
    
    ### Getting a Model from training
    
    X_train,X_test,y_train,y_test, features = split_dataset(train,n_columns = n_columns)

    
    pipe = build_pipeline(algorithm())
    model = pipe.fit(X_train,y_train)

    y_predicted = model.predict_proba(X_test)

    y_final = y_predicted.transpose()[0]

    if debug:
        score = model.score(X_test,y_test)
        auc = utils.get_auc(y_test,y_final)
        print(f"Score: {score}")
        print(f"Auc: {auc}")
    
    return model, features

In [82]:
def testing_model(model,features,debug=False,write=False):
    test = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    # test = pd.read_csv('../../project/banking_data/loanUnitedTest.csv', sep=',')
    test = utils.normalize_category(test)

    X = test.drop(columns=['loan_success'])
    X = X[features]
    # print(X)
    # X_new = SelectKBest(f_classif, k=20).fit_transform(X)

    y_predicted = model.predict_proba(X)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final

    if debug:
        print(f"Predictions:\n {final_df}")
    
    if write:
        final_df.to_csv('out.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'out.csv'")

In [83]:
def run_algorithm(algorithm,debug = True,write=False):
    
    if(debug):
        print("Running the provided algorithm")
    
    model, features = training_algorithm(algorithm,debug=debug)
    # print(features)
    testing_model(model,features,debug=debug,write=write)

### Choose your algorithm:

In [84]:
### Leave the one you want to run uncommented
algorithm = get_random_forest
# algorithm = get_logistic_regression
# algorithm = get_decision_tree
# algorithm = get_knn

### Run the Chosen Algorithm

In [97]:
run_algorithm(algorithm,debug=True,write=True)

Running the provided algorithm
Score: 0.8282828282828283
Auc: 0.6370481927710844
Predictions:
        Id  Predicted
0    5895   0.114356
1    7122   0.738131
2    6173   0.094978
3    6142   0.416769
4    5358   0.464194
5    6095   0.106075
6    6878   0.047405
7    6554   0.469121
8    6793   0.277629
9    7286   0.162249
10   6076   0.045818
11   5134   0.185271
12   5419   0.131212
13   6255   0.442132
14   5656   0.135685
15   6934   0.351881
16   6028   0.102327
17   6490   0.239786
18   6415   0.440972
19   7087   0.052100
20   5420   0.100003
21   5977   0.062076
22   6824   0.486648
23   5207   0.199961
24   7115   0.134744
25   7250   0.010317
26   6010   0.095024
27   6088   0.534584
28   5682   0.067758
29   7201   0.040806
..    ...        ...
324  5698   0.490873
325  5169   0.052117
326  7294   0.083641
327  5318   0.368653
328  5368   0.174990
329  6923   0.066996
330  5463   0.194523
331  5265   0.188591
332  6321   0.112409
333  5226   0.415569
334  6868   0.106837
33

  f"X has feature names, but {self.__class__.__name__} was fitted without"


### Execute Final Cross Validation

In [86]:
final_CV(algorithm)

AUC=0.7227393617021276
AUC=0.7666666666666666
AUC=0.7333333333333334
Predictions:
        Id  Predicted
0    5895   0.033163
1    7122   0.427416
2    6173   0.058175
3    6142   0.102339
4    5358   0.493692
5    6095   0.084053
6    6878   0.075648
7    6554   0.140303
8    6793   0.120538
9    7286   0.145471
10   6076   0.074315
11   5134   0.127673
12   5419   0.402554
13   6255   0.202907
14   5656   0.151114
15   6934   0.353400
16   6028   0.130194
17   6490   0.117759
18   6415   0.222100
19   7087   0.054107
20   5420   0.109486
21   5977   0.084928
22   6824   0.245381
23   5207   0.169652
24   7115   0.233245
25   7250   0.022611
26   6010   0.130106
27   6088   0.203942
28   5682   0.041326
29   7201   0.037770
..    ...        ...
324  5698   0.218632
325  5169   0.104790
326  7294   0.064533
327  5318   0.146638
328  5368   0.141914
329  6923   0.124275
330  5463   0.167760
331  5265   0.044610
332  6321   0.037867
333  5226   0.135229
334  6868   0.069110
335  4967   0.

### Finding the best grid

In [87]:
param_grid = {
    'bootstrap': [True,False],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
### Uncomment to run (WARNING: Takes like 5 minutes)
# getBestSearch(get_random_forest,param_grid)

In [88]:
def convert_date(df,column):
    copy = df.copy()
    date = copy[column]
    
    copy[column] =  date.apply(lambda x: datetime.datetime.strptime(x, '%d-%m-%Y').strftime('%Y%m%d'))

    return copy

import datetime

def reverse_dates():
    
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    columns = ['loan_date','account_creation']
    df = train.copy()

    for column in columns:
        df = convert_date(df,column)
        
    return df

### Clustering

In [89]:
import numpy as np
from sklearn.cluster import KMeans
from pprint import pprint

train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
df = utils.normalize_category(train)
# df = utils.normalization(df,'loan_success')

X = df.drop(columns=['loan_success'])

kmeans_model = KMeans(n_clusters=3).fit(X)
# labels = kmeans_model.labels_
# metrics.silhouette_score(X, labels, metric='euclidean')

pprint(kmeans_model)


KMeans(n_clusters=3)


In [90]:
def add_dummy(df,columns):
    copy= df.copy()

    for column in columns:
        print(copy.shape)
        dummies = pd.get_dummies(copy[column])
        copy = copy.drop(column,axis=1)
        copy = copy.join(dummies)
    
    return copy

def df_dummies(test=False):
    if test:
        df = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    else:
        df = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')

    columns = ["account_frequency","gender","card_type"]
    return add_dummy(df,columns)


In [91]:
### Cconverts dates to numerical values
def convert_year(df):

    copy = df.copy()
    columns = ["loan_date","account_creation","birth_number"]

    for column in columns:
        copy[column] = copy[column].apply(lambda x: datetime.strptime(x, '%d-%m-%Y').strftime('%Y')).astype(int)

    copy["age_on_loan"] = copy["loan_date"] - copy["birth_number"]
    copy = copy.drop(columns = ["loan_date","account_creation","birth_number"])

    return copy

### Replace the card issed column : 0-> no issue 1 -> issue
def convert_date_issues(df):
    copy = df.copy()
    copy['card_issued'] = pd.to_numeric(copy["card_issued"].astype(str), errors='coerce').fillna(1).astype(int)

    return copy

In [92]:
train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')

train2 = convert_year(train)

convert_date_issues(train2)

AttributeError: module 'datetime' has no attribute 'strptime'

In [None]:
from sklearn.model_selection import cross_val_score


train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
test = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')



# iris = load_iris()
# cross_val_score(clf, iris.data, iris.target, cv=10)

run_algorithm(knn,debug=True,write=False)

Running the provided algorithm
Score: 0.6464646464646465
Auc: 0.5603174603174603
Predictions:
        Id  Predicted
0    5895   0.000000
1    7122   0.333333
2    6173   0.000000
3    6142   0.333333
4    5358   0.000000
5    6095   1.000000
6    6878   0.000000
7    6554   0.333333
8    6793   0.000000
9    7286   0.666667
10   6076   0.000000
11   5134   0.333333
12   5419   0.333333
13   6255   0.000000
14   5656   0.000000
15   6934   0.333333
16   6028   1.000000
17   6490   0.000000
18   6415   1.000000
19   7087   0.000000
20   5420   1.000000
21   5977   0.000000
22   6824   1.000000
23   5207   0.000000
24   7115   0.333333
25   7250   0.000000
26   6010   0.000000
27   6088   0.000000
28   5682   0.666667
29   7201   0.333333
..    ...        ...
324  5698   1.000000
325  5169   0.000000
326  7294   0.000000
327  5318   0.000000
328  5368   0.333333
329  6923   0.000000
330  5463   0.000000
331  5265   1.000000
332  6321   0.333333
333  5226   1.000000
334  6868   0.000000
33

  f"X has feature names, but {self.__class__.__name__} was fitted without"
