In [88]:
### imported Libraries
import pandas as pd
import numpy as np
import importlib
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE 

### Imported Scripts
import utils

%load_ext autoreload
%autoreload 2

importlib.reload(utils)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'utils' from 'c:\\Users\\utilizador\\Desktop\\ac-feup\\jupyters\\hugo\\utils.py'>

In [89]:
### Split the data
def split_dataset(df,ratio=0.7,debug=False,random_state=None):
    """Split a loan dataframe into train/test features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'loan_success' column; it is used as the target and
        every other column is kept as a feature.
    ratio : float
        Fraction of the rows assigned to the training split.
    debug : bool
        Currently unused; kept so existing callers keep working.
    random_state : int or None
        Forwarded to train_test_split. None (the default) keeps the previous
        behaviour of a different shuffle on every call.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test)
    """

    ### Separate the prediction columns from the output
    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    ### Apply splitting
    # Only train_size is given: the test split is then the complement.
    # Passing test_size=1-ratio as well is fragile because 1-ratio is inexact
    # in floating point (1-0.7 == 0.30000000000000004) and the test count is
    # rounded up, which can push train+test above the number of rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=ratio, random_state=random_state
    )

    return X_train,X_test,y_train,y_test

In [90]:
def get_random_forest():
    """Return a fresh, unfitted RandomForestClassifier with the notebook's fixed hyperparameters."""
    hyperparameters = {
        'bootstrap': False,
        'max_depth': 80,
        'max_features': 3,
        'min_samples_leaf': 3,
        'min_samples_split': 12,
        'n_estimators': 100,
    }
    return RandomForestClassifier(**hyperparameters)

In [91]:
def get_auc(y_test,y_predicted):
    """Area under the ROC curve with label -1 taken as the positive class.

    y_test holds the true labels; y_predicted holds the scores that are
    ranked against the -1 label (pos_label=-1).
    """
    roc_points = metrics.roc_curve(y_test, y_predicted, pos_label=-1)
    # roc_curve returns (fpr, tpr, thresholds); the thresholds are not needed.
    false_positive_rate, true_positive_rate = roc_points[0], roc_points[1]
    return metrics.auc(false_positive_rate, true_positive_rate)


In [92]:
def run_random_forest(debug = False,write=False):
    """Train the random forest, then predict on the test CSV.

    debug is forwarded to both stages; write is forwarded to the prediction
    stage only.
    """
    print("Running Random Forest Algorithm")
    fitted_model = training_forest(debug)
    testing_forest(fitted_model, debug=debug, write=write)
    

In [93]:
def training_forest(debug=False):
    """Fit the sampling + random forest pipeline on a split of the training CSV.

    The training CSV is loaded, passed through utils.normalize_category and
    split with split_dataset; the pipeline is fitted on the training part.

    Parameters
    ----------
    debug : bool
        When True, the held-out part of the split is scored and the accuracy
        and AUC are printed. When False no hold-out prediction is made.

    Returns
    -------
    The fitted pipeline.
    """
    ### Getting the dataset
    train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = utils.normalize_category(train)

    ### Getting a Model from training
    X_train,X_test,y_train,y_test = split_dataset(train)

    # NOTE: build_pipeline is defined in a later cell of this notebook, so that
    # cell has to be executed before this function is called.
    pipe = build_pipeline(get_random_forest())

    model = pipe.fit(X_train,y_train)

    if debug:
        # Hold-out predictions are only needed for the metrics below, so they
        # are computed here instead of on every call.
        # Column 0 is the probability of the first class; get_auc ranks scores
        # against label -1 -- assumes -1 is that first class, TODO confirm.
        y_final = model.predict_proba(X_test)[:, 0]
        score = model.score(X_test,y_test)
        auc = get_auc(y_test,y_final)
        print(f"Score: {score}")
        print(f"Auc: {auc}")

    return model

In [94]:
def testing_forest(model,debug=False,write=False):
    """Predict on the test CSV with a fitted model and optionally report/store the result.

    Parameters
    ----------
    model
        Fitted estimator exposing predict_proba.
    debug : bool
        When True, print the prediction table.
    write : bool
        When True, save the prediction table to 'out.csv' (without the index).
    """
    test = utils.normalize_category(
        pd.read_csv('../../project/banking_data/loanUnitedTest.csv', sep=',')
    )

    features = test.drop(columns=['loan_success'])

    # Keep only the first probability column returned by the model.
    first_class_proba = model.predict_proba(features)[:, 0]

    final_df = pd.DataFrame({
        'Id': test["loan_id"],
        'Predicted': first_class_proba,
    })

    if debug:
        print(f"Predictions:\n {final_df}")

    if not write:
        return

    final_df.to_csv('out.csv', index=False)
    print("Sucessfully stored the predictions in a file named 'out.csv'")

In [95]:
# Train, print the hold-out score/AUC and the test-set predictions; write=False
# means no 'out.csv' is written.
# NOTE: this call reaches build_pipeline, which is defined in a later cell of
# this notebook -- that cell must have been executed first.
run_random_forest(debug=True,write=False)

Running Random Forest Algorithm
Score: 0.8888888888888888
Auc: 0.7920634920634921
Predictions:
        Id  Predicted
0    5895   0.140141
1    7122   0.692797
2    6173   0.149821
3    6142   0.257725
4    5358   0.334182
5    6095   0.096698
6    6878   0.149149
7    6554   0.190455
8    6793   0.195571
9    7286   0.240025
10   6076   0.117654
11   5134   0.344882
12   5419   0.303600
13   6255   0.344102
14   5656   0.157006
15   6934   0.553405
16   6028   0.134954
17   6490   0.172095
18   6415   0.207190
19   7087   0.068705
20   5420   0.321655
21   5977   0.097540
22   6824   0.502326
23   5207   0.219626
24   7115   0.261188
25   7250   0.055418
26   6010   0.241762
27   6088   0.265848
28   5682   0.175951
29   7201   0.185201
..    ...        ...
324  5698   0.285307
325  5169   0.157695
326  7294   0.116969
327  5318   0.327777
328  5368   0.226131
329  6923   0.162034
330  5463   0.258408
331  5265   0.153610
332  6321   0.083801
333  5226   0.545264
334  6868   0.180696
3

In [96]:
### Runs an exhaustive grid search over the given parameter grid and reports the best result
def getBestSearch(algorithm,grid,debug=True):
    """Grid-search hyperparameters on the full training CSV.

    Parameters
    ----------
    algorithm : callable
        Zero-argument factory returning the estimator to tune.
    grid : dict
        Passed to GridSearchCV as param_grid.
    debug : bool
        When True, print the best score and the best parameters.

    Returns
    -------
    tuple
        (best_score_, best_params_) of the fitted search.
    """
    training_frame = utils.normalize_category(
        pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    )

    features = training_frame.drop(columns=['loan_success'])
    target = training_frame['loan_success']

    # 2-fold cross-validation, all cores, verbose progress output.
    searcher = GridSearchCV(
        estimator=algorithm(),
        param_grid=grid,
        cv=2,
        n_jobs=-1,
        verbose=2,
    )
    fitted_search = searcher.fit(features, target)

    best_score = fitted_search.best_score_
    best_params = fitted_search.best_params_

    if debug:
        print('Best Score: ', best_score)
        print('Best Params: ', best_params)

    return best_score, best_params


In [97]:
# Candidate hyperparameters for the random forest grid search
# (2 * 4 * 2 * 3 * 3 * 4 = 576 combinations).
param_grid = dict(
    bootstrap=[True, False],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)
### Uncomment to run (WARNING: Takes like 5 minutes)
# getBestSearch(get_random_forest,param_grid)

In [99]:
### Sampling (requires the imblearn Pipeline and SMOTE imports from the first cell)
def build_pipeline(algorithm,oversample=True,undersample=True):
    """Wrap a classifier in a pipeline, optionally preceded by SMOTE oversampling.

    Parameters
    ----------
    algorithm
        Estimator used as the final 'classification' step.
    oversample : bool
        When True, a SMOTE(random_state=20) 'sampling' step is placed before
        the classifier.
    undersample : bool
        Currently unused: no undersampling step is built. Kept so existing
        callers keep working.

    Returns
    -------
    Pipeline
        The unfitted pipeline.
    """
    steps = []

    if oversample:
        steps.append(('sampling', SMOTE(random_state = 20)))

    steps.append(('classification', algorithm))

    return Pipeline(steps)

In [100]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold cross-validation of the random forest on the training CSV,
# reporting the AUC of every fold and their average.
train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
df = utils.normalize_category(train)
# df = utils.normalization(df,'loan_success')

X = df.drop(columns=['loan_success'])
y = df['loan_success']

# Same hyperparameters as everywhere else in the notebook, taken from the
# shared factory instead of a copy of its arguments.
algorithm = get_random_forest()

# A bool is an int in Python, so the previous random_state=True acted as the
# seed 1; spell the seed out explicitly.
skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

auc_list = []

for train_index, test_index in skf.split(X, y):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    ### Train the model (fit restarts from scratch on every fold)
    algorithm.fit(X_train, y_train)

    ### Predict the outcome with the test data
    # Column 0 is the probability of the first class; get_auc ranks scores
    # against label -1 -- assumes -1 is that first class, TODO confirm.
    y_pred = algorithm.predict_proba(X_test)
    y_final = y_pred[:, 0]

    auc = get_auc(y_test, y_final)
    auc_list.append(auc)
    print(f"AUC={auc}")

### TODO: display statistics?
avg = sum(auc_list)/len(auc_list)
print(f"Average AUC = {avg}")

    



AUC=0.842857142857143
AUC=0.8557504873294347
AUC=0.6920077972709552
AUC=0.7023809523809523
AUC=0.6984126984126984
Average AUC = 0.7582818156502368
