### Imported libraries and Scripts

In [279]:
### imported Libraries
import pandas as pd
import numpy as np
import importlib
from pprint import pprint
from datetime import datetime


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# from sklearn.pipeline import Pipeline

### Imported Scripts
import utils

%load_ext autoreload
%autoreload 2

importlib.reload(utils)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'utils' from 'c:\\Users\\utilizador\\Desktop\\ac-feup\\jupyters\\hugo\\utils.py'>

### Data splitting

In [216]:
### Split the data
def split_dataset(df,ratio=0.7,debug=False):

    ### Seperate the precition columns from output
    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    select = SelectKBest(f_classif, k=15)
    X_new = select.fit_transform(X, y)

    filter = select.get_support()
    features = X.columns

    if debug:
        print("Selected best 3:")
        print(features[filter])
        print(X_new) 

    ### Apply splitting
    X_train, X_test, y_train, y_test = train_test_split(X_new,y,train_size=ratio,test_size=1-ratio)

    return X_train,X_test,y_train,y_test, features[filter]

### Algorithms

In [180]:
def get_random_forest():
    return RandomForestClassifier(bootstrap = False,
                                    max_depth = 80,
                                    max_features = 3,
                                    min_samples_leaf = 3,
                                    min_samples_split = 12,
                                    n_estimators = 100)

In [181]:
def get_logistic_regression():
    return LogisticRegression(random_state=10,solver='lbfgs',max_iter=200)

### Auc Curve (Pass this to utils later)

In [182]:
def get_auc(y_test,y_predicted):
    fpr, tpr, _ = metrics.roc_curve(y_test, y_predicted,pos_label=-1)    
    return metrics.auc(fpr, tpr)


### Use *Grid Search Cross Validation* to find the best grid for an algorithm

In [183]:
### Uses a grid search to generate random parameters to find the best grid model
def getBestSearch(algorithm,grid,debug=True):

    # train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    train = utils.normalize_category(train)

    X = train.drop(columns=['loan_success'])
    y = train['loan_success']

    alg = algorithm()

    grid_search = GridSearchCV(estimator = alg, param_grid = grid, cv = 2, n_jobs = -1, verbose = 2)

    model = grid_search.fit(X,y)

    if debug:
        print('Best Score: ', model.best_score_)
        print('Best Params: ', model.best_params_)
    
    return model.best_score_, model.best_params_


### Build a Pipeline to apply a sampling and a classification algorithm

In [184]:
### Sampling (This should work if the ASHDASJDSAKD LIBRARY IS IMPORTED)
### Don't do this
def build_pipeline(algorithm,oversample=True,undersample=True):

    if(oversample):
        return Pipeline([
            ('sampling',SMOTE(random_state = 20)),
            ('classification',algorithm)
        ])
    else:
        return  Pipeline([
            ('classification',algorithm)
        ])

    return pipeline

### Final Stratified Cross Validation

In [185]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

def final_CV(algorithm):
    # train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    df = utils.normalize_category(train)
    # df = utils.normalization(df,'loan_success')

    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    algorithm = algorithm()

    skf = StratifiedKFold(n_splits=3, random_state=True, shuffle=True)
    skf.get_n_splits(X, y)

    model_list = []
    auc_list = []

    for train_index, test_index in skf.split(X, y):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        ### Train the model
        model = algorithm.fit(X_train, y_train)

        ### Predict the outcome with the test data
        y_pred = algorithm.predict_proba(X_test)
        y_final = y_pred.transpose()[0]

        auc = get_auc(y_test, y_final)
        auc_list.append(auc)
        model_list.append(model)
        print(f"AUC={auc}")
        
    ### Get the best model
    best_score = max(auc_list)
    best_model = model_list[auc_list.index(best_score)]
    
    
    ### Use the best model to get a prediction
    test = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    test = utils.normalize_category(test)
    
    X = test.drop(columns=['loan_success'])
    y_predicted = best_model.predict_proba(X)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final
    
    
    if True:
        print(f"Predictions:\n {final_df}")
    
    if True:
        final_df.to_csv('CV.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'CV.csv'")

    ### TODO: display statistics?
    avg = sum(auc_list)/len(auc_list)
    print(f"Average AUC = {avg}")


### Running an algorithm

In [217]:
def training_algorithm(algorithm,debug=False):
    ### Getting the dataset
    # train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    
    # train = reverse_dates()
    
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    train = utils.normalize_category(train)
    
    ### Getting a Model from training
    
    X_train,X_test,y_train,y_test, features = split_dataset(train)

    
    pipe = build_pipeline(algorithm())
    model = pipe.fit(X_train,y_train)

    y_predicted = model.predict_proba(X_test)

    y_final = y_predicted.transpose()[0]

    if debug:
        score = model.score(X_test,y_test)
        auc = get_auc(y_test,y_final)
        print(f"Score: {score}")
        print(f"Auc: {auc}")
    
    return model, features

In [218]:
def testing_model(model,features,debug=False,write=False):
    test = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    # test = pd.read_csv('../../project/banking_data/loanUnitedTest.csv', sep=',')
    test = utils.normalize_category(test)

    X = test.drop(columns=['loan_success'])
    X = X[features]
    # print(X)
    # X_new = SelectKBest(f_classif, k=20).fit_transform(X)

    y_predicted = model.predict_proba(X)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final

    if debug:
        print(f"Predictions:\n {final_df}")
    
    if write:
        final_df.to_csv('out.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'out.csv'")

In [219]:
def run_algorithm(algorithm,debug = True,write=False):
    
    if(debug):
        print("Running the provided algorithm")
    
    model, features = training_algorithm(algorithm,debug=debug)
    # print(features)
    testing_model(model,features,debug=debug,write=write)

### Choose your algorithm:

In [189]:
### Leave the one you want to run uncommented
algorithm = get_random_forest
# algorithm = get_logistic_regression

### Run the Chosen Algorithm

In [223]:
run_algorithm(algorithm,debug=True,write=True)

Running the provided algorithm
Score: 0.8686868686868687
Auc: 0.9008264462809917
Predictions:
        Id  Predicted
0    5895   0.081188
1    7122   0.597977
2    6173   0.305047
3    6142   0.327332
4    5358   0.267666
5    6095   0.093468
6    6878   0.040429
7    6554   0.449514
8    6793   0.094993
9    7286   0.198902
10   6076   0.037992
11   5134   0.139453
12   5419   0.648082
13   6255   0.738017
14   5656   0.156983
15   6934   0.268815
16   6028   0.172215
17   6490   0.065953
18   6415   0.470684
19   7087   0.045905
20   5420   0.148334
21   5977   0.087326
22   6824   0.578600
23   5207   0.023149
24   7115   0.158817
25   7250   0.044833
26   6010   0.520837
27   6088   0.631793
28   5682   0.174350
29   7201   0.078365
..    ...        ...
324  5698   0.402218
325  5169   0.057571
326  7294   0.040198
327  5318   0.200401
328  5368   0.088958
329  6923   0.055778
330  5463   0.063514
331  5265   0.155171
332  6321   0.114177
333  5226   0.270032
334  6868   0.106096
33

  f"X has feature names, but {self.__class__.__name__} was fitted without"


### Execute Final Cross Validation

In [191]:
final_CV(algorithm)

AUC=0.6941489361702128
AUC=0.7758865248226949
AUC=0.7553191489361702
Predictions:
        Id  Predicted
0    5895   0.023387
1    7122   0.354556
2    6173   0.047091
3    6142   0.101518
4    5358   0.413213
5    6095   0.096489
6    6878   0.087152
7    6554   0.155903
8    6793   0.128714
9    7286   0.180019
10   6076   0.079755
11   5134   0.139328
12   5419   0.409593
13   6255   0.212873
14   5656   0.158139
15   6934   0.318219
16   6028   0.146354
17   6490   0.147312
18   6415   0.251320
19   7087   0.037345
20   5420   0.159592
21   5977   0.103782
22   6824   0.266071
23   5207   0.164319
24   7115   0.174611
25   7250   0.014873
26   6010   0.112976
27   6088   0.163156
28   5682   0.045520
29   7201   0.040156
..    ...        ...
324  5698   0.242536
325  5169   0.082994
326  7294   0.069026
327  5318   0.154086
328  5368   0.144734
329  6923   0.119647
330  5463   0.155613
331  5265   0.011667
332  6321   0.031879
333  5226   0.111872
334  6868   0.085320
335  4967   0.

### Finding the best grid

In [192]:
param_grid = {
    'bootstrap': [True,False],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
### Uncomment to run (WARNING: Takes like 5 minutes)
# getBestSearch(get_random_forest,param_grid)

In [193]:
def convert_date(df,column):
    copy = df.copy()
    date = copy[column]
    
    copy[column] =  date.apply(lambda x: datetime.datetime.strptime(x, '%d-%m-%Y').strftime('%Y%m%d'))

    return copy

import datetime

def reverse_dates():
    
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    columns = ['loan_date','account_creation']
    df = train.copy()

    for column in columns:
        df = convert_date(df,column)
        
    return df

### Clustering

In [194]:
import numpy as np
from sklearn.cluster import KMeans
from pprint import pprint

train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
df = utils.normalize_category(train)
# df = utils.normalization(df,'loan_success')

X = df.drop(columns=['loan_success'])

kmeans_model = KMeans(n_clusters=3).fit(X)
# labels = kmeans_model.labels_
# metrics.silhouette_score(X, labels, metric='euclidean')

pprint(kmeans_model)


KMeans(n_clusters=3)


In [256]:
def add_dummy(df,columns):
    copy= df.copy()

    for column in columns:
        print(copy.shape)
        dummies = pd.get_dummies(copy[column])
        copy = copy.drop(column,axis=1)
        copy = copy.join(dummies)
    
    return copy

def df_dummies(test=False):
    if test:
        df = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    else:
        df = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')

    columns = ["account_frequency","gender","card_type"]
    return add_dummy(df,columns)


In [318]:
### Cconverts dates to numerical values
def convert_year(df):

    copy = df.copy()
    columns = ["loan_date","account_creation","birth_number"]

    for column in columns:
        copy[column] = copy[column].apply(lambda x: datetime.strptime(x, '%d-%m-%Y').strftime('%Y')).astype(int)

    copy["age_on_loan"] = copy["loan_date"] - copy["birth_number"]
    copy = copy.drop(columns = ["loan_date","account_creation","birth_number"])

    return copy

### Replace the card issed column : 0-> no issue 1 -> issue
def convert_date_issues(df):
    copy = df.copy()
    copy['card_issued'] = pd.to_numeric(copy["card_issued"].astype(str), errors='coerce').fillna(1).astype(int)

    return copy

In [325]:
train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')

train2 = convert_year(train)

convert_date_issues(train2)

Unnamed: 0,loan_id,loan_amount,loan_duration,payments,loan_success,account_frequency,no. of inhabitants,account_district no. of municipalities with inhabitants < 499,account_district no. of municipalities with inhabitants 500-1999,account_district no. of municipalities with inhabitants 2000-9999,...,card_type,no. movements,min no. trans,max no. trans,avg no. trans,min balance,max balance,avg balance,age_on_loan,card_issed
0,5314,96396,12,8033,-1,weekly issuance,94812,15,13,8,...,,4.0,1100.0,9900.0,5025.000000,1100.0,20100.0,12250.000000,46,0
1,5316,165960,36,4610,1,monthly issuance,112709,48,20,7,...,,37.0,2.9,54300.0,11015.635135,700.0,120512.8,52083.859459,25,0
2,6863,127080,60,2118,1,monthly issuance,77917,85,19,6,...,,24.0,48.6,19065.0,5417.458333,800.0,49590.4,30060.954167,57,0
3,5325,105804,36,2939,1,monthly issuance,107870,84,29,6,...,,25.0,14.6,26448.0,8253.080000,1000.0,65898.5,41297.480000,53,0
4,7240,274740,60,4579,1,weekly issuance,1204953,0,0,0,...,,27.0,30.0,63366.0,18945.966667,600.0,122893.1,57188.211111,15,0
5,6687,87840,24,3660,1,monthly issuance,53921,61,22,1,...,,17.0,51.3,32938.0,9496.929412,500.0,65847.8,46318.552941,13,0
6,7284,52788,12,4399,1,monthly issuance,58796,22,16,7,...,,43.0,14.6,17920.0,3714.897674,1000.0,41469.1,22198.179070,20,0
7,6111,174744,24,7281,-1,monthly issuance,122603,25,21,6,...,,32.0,14.6,52600.0,11557.950000,1000.0,105628.0,37485.456250,45,0
8,7235,154416,48,3217,1,weekly issuance,70699,60,13,2,...,,32.0,13.6,39098.0,9740.150000,8897.0,81723.5,46715.800000,24,0
9,5997,117024,24,4876,1,monthly issuance,177686,69,27,10,...,,13.0,45.1,29255.0,14122.576923,800.0,76985.2,44131.169231,58,0
