In [41]:
### imported Libraries
import pandas as pd
import numpy as np
import importlib
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

### Imported Scripts
import utils

%load_ext autoreload
%autoreload 2

importlib.reload(utils)


ModuleNotFoundError: No module named 'imblearn'

In [29]:
### Split the data
def split_dataset(df,ratio=0.7,debug=False):

    ### Seperate the precition columns from output
    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    ### Apply splitting
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=ratio,test_size=1-ratio)

    return X_train,X_test,y_train,y_test

In [30]:
def get_random_forest(X_train,y_train):
    return RandomForestClassifier(n_estimators=15)

In [31]:
def get_auc(y_test,y_predicted):
    fpr, tpr, _ = metrics.roc_curve(y_test, y_predicted,pos_label=-1)    
    return metrics.auc(fpr, tpr)


In [32]:
def run_random_forest(debug = False,write=False):

    print("Running Random Forest Algorithm")
    model = training_forest(debug)
    testing_forest(model,debug=debug,write=write)
    

In [33]:
def training_forest(debug=False):
    ### Getting the dataset
    train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = utils.normalize_category(train)

    ### Getting a Model from training
    X_train,X_test,y_train,y_test = split_dataset(train)

    rf = get_random_forest(X_train,y_train)
    model = rf.fit(X_train,y_train)

    y_predicted = model.predict_proba(X_test)

    y_final = y_predicted.transpose()[0]

    if debug:
        score = model.score(X_test,y_test)
        auc = get_auc(y_test,y_final)
        print(f"Score: {score}")
        print(f"Auc: {auc}")
    
    return model

In [34]:
def testing_forest(model,debug=False,write=False):
    test = pd.read_csv('../../project/banking_data/loanUnitedTest.csv', sep=',')
    test = utils.normalize_category(test)

    X = test.drop(columns=['loan_success'])

    y_predicted = model.predict_proba(X)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final

    if debug:
        print(f"Predictions:\n {final_df}")
    
    if write:
        final_df.to_csv('out.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'out.csv'")

In [35]:
run_random_forest(debug=True,write=False)

Running Random Forest Algorithm
Score: 0.8787878787878788
Auc: 0.7491596638655463
Predictions:
        Id  Predicted
0    5895   0.000000
1    7122   0.733333
2    6173   0.000000
3    6142   0.066667
4    5358   0.400000
5    6095   0.000000
6    6878   0.066667
7    6554   0.333333
8    6793   0.200000
9    7286   0.000000
10   6076   0.000000
11   5134   0.066667
12   5419   0.400000
13   6255   0.266667
14   5656   0.000000
15   6934   0.333333
16   6028   0.000000
17   6490   0.066667
18   6415   0.266667
19   7087   0.000000
20   5420   0.133333
21   5977   0.066667
22   6824   0.533333
23   5207   0.333333
24   7115   0.266667
25   7250   0.000000
26   6010   0.066667
27   6088   0.066667
28   5682   0.333333
29   7201   0.266667
..    ...        ...
324  5698   0.000000
325  5169   0.133333
326  7294   0.066667
327  5318   0.066667
328  5368   0.066667
329  6923   0.066667
330  5463   0.000000
331  5265   0.000000
332  6321   0.133333
333  5226   0.200000
334  6868   0.000000
3

In [36]:
### Uses a grid search to generate random parameters to find the best grid model
def getBestSearch(algorithm,grid,debug=True):

    train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = utils.normalize_category(train)

    X = train.drop(columns=['loan_success'])
    y = train['loan_success']

    alg = algorithm(X,y)

    grid_search = GridSearchCV(estimator = alg, param_grid = grid, cv = 2, n_jobs = -1, verbose = 2)

    model = grid_search.fit(X,y)

    if debug:
        print('Best Score: ', model.best_score_)
        print('Best Params: ', model.best_params_)
    
    return model.best_score_, model.best_params_


In [37]:
param_grid = {
    'bootstrap': [True,False],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
### Uncomment to run (WARNING: Takes like 5 minutes)
# getBestSearch(get_random_forest,param_grid)

Fitting 2 folds for each of 576 candidates, totalling 1152 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 1152 out of 1152 | elapsed:  8.0min finished


Best Score:  0.875
Best Params:  {'bootstrap': False, 'max_depth': 80, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 300}


(0.875,
 {'bootstrap': False,
  'max_depth': 80,
  'max_features': 3,
  'min_samples_leaf': 3,
  'min_samples_split': 10,
  'n_estimators': 300})

In [38]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

X, y = make_classification(random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
# The pipeline can be used as any other estimator
# and avoids leaking the test set into the train set
pipe.fit(X_train, y_train)
Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])
pipe.score(X_test, y_test)

0.88

In [39]:
### Sampling (This should work if the ASHDASJDSAKD LIBRARY IS IMPORTED)
def build_pipeline(algorithm,oversample=True):


    if(oversample):
        return Pipeline(['classification',algorithm,
                ('sampling',SMOTE(random_state = 20))
        ])
    else:
        return  Pipeline(['classification',algorithm
        ])

    return pipeline

In [40]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
df = utils.normalize_category(train)
# df = utils.normalization(df,'loan_success')

X = df.drop(columns=['loan_success'])
y = df['loan_success']

algorithm = RandomForestClassifier(bootstrap = False,
                                    max_depth = 80,
                                    max_features = 3,
                                    min_samples_leaf = 3,
                                    min_samples_split = 12,
                                    n_estimators = 100)

skf = StratifiedKFold(n_splits=5, random_state=True, shuffle=True)
skf.get_n_splits(X, y)

auc_list = []

for train_index, test_index in skf.split(X, y):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    ### Train the model
    algorithm.fit(X_train, y_train)

    ### Predict the outcome with the test data
    y_pred = algorithm.predict_proba(X_test)
    y_final = y_pred.transpose()[0]

    auc = get_auc(y_test, y_final)
    auc_list.append(auc)
    print(f"AUC={auc}")

### TODO: display statistics?
avg = sum(auc_list)/len(auc_list)
print(f"Average AUC = {avg}")

    



AUC=0.7298245614035088
AUC=0.7134502923976608
AUC=0.9047619047619049
AUC=0.7956349206349207
AUC=0.8313492063492063
Average AUC = 0.7950041771094403
