In [103]:
### imported Libraries
import pandas as pd
import numpy as np
import importlib
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE 

### Imported Scripts
import utils

%load_ext autoreload
%autoreload 2

importlib.reload(utils)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'utils' from 'c:\\Users\\utilizador\\Desktop\\ac-feup\\jupyters\\hugo\\utils.py'>

In [104]:
### Split the data
def split_dataset(df,ratio=0.7,debug=False):

    ### Seperate the precition columns from output
    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    ### Apply splitting
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=ratio,test_size=1-ratio)

    return X_train,X_test,y_train,y_test

In [105]:
def get_random_forest():
    return RandomForestClassifier(bootstrap = False,
                                    max_depth = 80,
                                    max_features = 3,
                                    min_samples_leaf = 3,
                                    min_samples_split = 12,
                                    n_estimators = 100)

In [115]:
def get_logistic_regression():
    return LogisticRegression(random_state=10,solver='lbfgs',max_iter=200)

In [106]:
def get_auc(y_test,y_predicted):
    fpr, tpr, _ = metrics.roc_curve(y_test, y_predicted,pos_label=-1)    
    return metrics.auc(fpr, tpr)


In [107]:
def run_random_forest(debug = False,write=False):

    print("Running Random Forest Algorithm")
    model = training_forest(debug)
    testing_forest(model,debug=debug,write=write)
    

In [108]:
def training_forest(debug=False):
    ### Getting the dataset
    # train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    train = utils.normalize_category(train)

    ### Getting a Model from training
    X_train,X_test,y_train,y_test = split_dataset(train)

    # rf = get_random_forest()

    pipe = build_pipeline(get_random_forest())

    model = pipe.fit(X_train,y_train)

    y_predicted = model.predict_proba(X_test)

    y_final = y_predicted.transpose()[0]

    if debug:
        score = model.score(X_test,y_test)
        auc = get_auc(y_test,y_final)
        print(f"Score: {score}")
        print(f"Auc: {auc}")
    
    return model

In [109]:
def testing_forest(model,debug=False,write=False):
    test = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    # test = pd.read_csv('../../project/banking_data/loanUnitedTest.csv', sep=',')
    test = utils.normalize_category(test)

    X = test.drop(columns=['loan_success'])

    y_predicted = model.predict_proba(X)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final

    if debug:
        print(f"Predictions:\n {final_df}")
    
    if write:
        final_df.to_csv('out.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'out.csv'")

In [110]:
### Uses a grid search to generate random parameters to find the best grid model
def getBestSearch(algorithm,grid,debug=True):

    train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = utils.normalize_category(train)

    X = train.drop(columns=['loan_success'])
    y = train['loan_success']

    alg = algorithm()

    grid_search = GridSearchCV(estimator = alg, param_grid = grid, cv = 2, n_jobs = -1, verbose = 2)

    model = grid_search.fit(X,y)

    if debug:
        print('Best Score: ', model.best_score_)
        print('Best Params: ', model.best_params_)
    
    return model.best_score_, model.best_params_


In [111]:
### Sampling (This should work if the ASHDASJDSAKD LIBRARY IS IMPORTED)
def build_pipeline(algorithm,oversample=True,undersample=True):


    if(oversample):
        return Pipeline([
            ('sampling',SMOTE(random_state = 20)),
            ('classification',algorithm)
        ])
    else:
        return  Pipeline([
            ('classification',algorithm)
        ])

    return pipeline

In [112]:
run_random_forest(debug=True,write=False)

Running Random Forest Algorithm
Score: 0.8282828282828283
Auc: 0.5722222222222222
Predictions:
        Id  Predicted
0    5895   0.055521
1    7122   0.403191
2    6173   0.070627
3    6142   0.214411
4    5358   0.474723
5    6095   0.213376
6    6878   0.084318
7    6554   0.186612
8    6793   0.175687
9    7286   0.219207
10   6076   0.110528
11   5134   0.228408
12   5419   0.484578
13   6255   0.432756
14   5656   0.303512
15   6934   0.533170
16   6028   0.202129
17   6490   0.046153
18   6415   0.359737
19   7087   0.018088
20   5420   0.413797
21   5977   0.224644
22   6824   0.398350
23   5207   0.216303
24   7115   0.164609
25   7250   0.201414
26   6010   0.186315
27   6088   0.295866
28   5682   0.026065
29   7201   0.015636
..    ...        ...
324  5698   0.431060
325  5169   0.041986
326  7294   0.105233
327  5318   0.140478
328  5368   0.458552
329  6923   0.272208
330  5463   0.115818
331  5265   0.059690
332  6321   0.186250
333  5226   0.550065
334  6868   0.176669
3

In [113]:
param_grid = {
    'bootstrap': [True,False],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
### Uncomment to run (WARNING: Takes like 5 minutes)
# getBestSearch(get_random_forest,param_grid)

In [114]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
df = utils.normalize_category(train)
# df = utils.normalization(df,'loan_success')

X = df.drop(columns=['loan_success'])
y = df['loan_success']

algorithm = get_random_forest()

skf = StratifiedKFold(n_splits=5, random_state=True, shuffle=True)
skf.get_n_splits(X, y)

auc_list = []

for train_index, test_index in skf.split(X, y):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    ### Train the model
    algorithm.fit(X_train, y_train)

    ### Predict the outcome with the test data
    y_pred = algorithm.predict_proba(X_test)
    y_final = y_pred.transpose()[0]

    auc = get_auc(y_test, y_final)
    auc_list.append(auc)
    print(f"AUC={auc}")

### TODO: display statistics?
avg = sum(auc_list)/len(auc_list)
print(f"Average AUC = {avg}")


AUC=0.8571428571428571
AUC=0.8382066276803118
AUC=0.7426900584795322
AUC=0.7341269841269842
AUC=0.6825396825396826
Average AUC = 0.7709412419938737


In [154]:
def training_algorithm(algorithm,debug=False):
    ### Getting the dataset
    # train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    train = utils.normalize_category(train)

    ### Getting a Model from training
    X_train,X_test,y_train,y_test = split_dataset(train)

    # rf = get_random_forest()

    pipe = build_pipeline(algorithm())

    model = pipe.fit(X_train,y_train)

    y_predicted = model.predict_proba(X_test)

    y_final = y_predicted.transpose()[0]

    if debug:
        score = model.score(X_test,y_test)
        auc = get_auc(y_test,y_final)
        print(f"Score: {score}")
        print(f"Auc: {auc}")
    
    return model

In [121]:
def testing_model(model,debug=False,write=False):
    test = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    # test = pd.read_csv('../../project/banking_data/loanUnitedTest.csv', sep=',')
    test = utils.normalize_category(test)

    X = test.drop(columns=['loan_success'])

    y_predicted = model.predict_proba(X)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final

    if debug:
        print(f"Predictions:\n {final_df}")
    
    if write:
        final_df.to_csv('out.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'out.csv'")

In [153]:
def run_algorithm(algorithm,debug = True,write=False):
    
    print("Running")
    
    model = training_algorithm(algorithm,debug=debug)
    testing_model(model,debug=debug,write=write)

In [152]:
# algorithm = get_logistic_regression
algorithm = get_random_forest
run_algorithm(algorithm)

Running
Score: 0.8080808080808081
Auc: 0.6746031746031745
Predictions:
        Id  Predicted
0    5895   0.178099
1    7122   0.541726
2    6173   0.065231
3    6142   0.290558
4    5358   0.253939
5    6095   0.111472
6    6878   0.095690
7    6554   0.094371
8    6793   0.213632
9    7286   0.125070
10   6076   0.183109
11   5134   0.196169
12   5419   0.440479
13   6255   0.365760
14   5656   0.311892
15   6934   0.611261
16   6028   0.176084
17   6490   0.078018
18   6415   0.245951
19   7087   0.085592
20   5420   0.038398
21   5977   0.136276
22   6824   0.425258
23   5207   0.152192
24   7115   0.119998
25   7250   0.056496
26   6010   0.176610
27   6088   0.421277
28   5682   0.067143
29   7201   0.060632
..    ...        ...
324  5698   0.417301
325  5169   0.130800
326  7294   0.080859
327  5318   0.235460
328  5368   0.491662
329  6923   0.350722
330  5463   0.121912
331  5265   0.154906
332  6321   0.078194
333  5226   0.380001
334  6868   0.289440
335  4967   0.552459
336 