In [66]:
### imported Libraries
import pandas as pd
import numpy as np
import importlib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

### Imported Scripts
import utils

# Enable IPython's autoreload extension in mode 2 so edited modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Explicitly reload the local `utils` module as well.
importlib.reload(utils)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'utils' from 'c:\\Users\\utilizador\\Desktop\\ac-feup\\jupyters\\hugo\\utils.py'>

In [56]:
### Split the data
def split_dataset(df,ratio=0.7,debug=False,random_state=None):
    """Split a DataFrame into train/test feature and label sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a 'loan_success' column, which is used as
        the target. Every other column is kept as a feature.
    ratio : float, default 0.7
        Fraction of rows assigned to the training split. The remaining rows
        form the test split.
    debug : bool, default False
        Currently unused; kept so existing callers keep working.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        unseeded (non-reproducible) shuffling.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    ### Separate the prediction columns from the output
    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    ### Apply splitting
    # Only train_size is passed; scikit-learn uses the complement as the test
    # set. Passing test_size=1-ratio as well is fragile: the train count is
    # floored while the test count is ceiled, and 1-0.7 evaluates to
    # 0.30000000000000004, so e.g. 10 rows give 7 + 4 = 11 > 10 and a
    # ValueError.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=ratio, random_state=random_state
    )

    return X_train,X_test,y_train,y_test

In [57]:
def get_random_forest(X_train,y_train,n_estimators=15,random_state=None):
    """Fit a RandomForestClassifier on the given training data.

    Parameters
    ----------
    X_train : array-like
        Training features, passed straight to ``fit``.
    y_train : array-like
        Training labels, passed straight to ``fit``.
    n_estimators : int, default 15
        Number of trees in the forest. The default matches the value that was
        previously hard-coded.
    random_state : int or None, default None
        Seed for the forest. ``None`` keeps the original unseeded behaviour;
        pass an integer for reproducible models.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
    )
    model.fit(X_train,y_train)

    return model

In [58]:
def get_auc(y_test,y_predicted):
    """Compute the area under the ROC curve from per-class scores.

    The first row of the transposed ``y_predicted`` (the first column of a
    2-D score matrix) is used as the score, and label ``-1`` is treated as
    the positive class.

    NOTE(review): this assumes the first score column belongs to label -1;
    confirm against the model's ``classes_`` ordering.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_predicted : array
        Score matrix, e.g. the output of ``predict_proba``.

    Returns
    -------
    float
        Area under the ROC curve.
    """
    first_column_scores = y_predicted.T[0]

    false_pos_rate, true_pos_rate, _thresholds = metrics.roc_curve(
        y_test,
        first_column_scores,
        pos_label=-1,
    )
    return metrics.auc(false_pos_rate, true_pos_rate)


In [59]:
def run_random_forest(debug = False,write=False):
    """Train the random forest, then run it on the test data.

    Parameters
    ----------
    debug : bool, default False
        Forwarded to both the training and the testing step.
    write : bool, default False
        Forwarded to the testing step.
    """
    print("Running Random Forest Algorithm")

    # Train first, then hand the fitted model straight to the testing step.
    fitted_model = training_forest(debug)
    testing_forest(fitted_model, debug=debug, write=write)
    

In [60]:
def training_forest(debug=False,train_path='../../project/banking_data/loanUnitedTrain.csv'):
    """Load the training CSV, fit a random forest and optionally report metrics.

    Parameters
    ----------
    debug : bool, default False
        When True, prints the hold-out accuracy (``model.score``) and the AUC
        computed by ``get_auc``.
    train_path : str, default '../../project/banking_data/loanUnitedTrain.csv'
        Location of the training CSV. The default is the path that was
        previously hard-coded.

    Returns
    -------
    The model returned by ``get_random_forest``, fitted on the training
    split only.
    """
    ### Getting the dataset
    train = pd.read_csv(train_path, sep=',')
    train = utils.normalize_category(train)

    ### Getting a Model from training
    X_train,X_test,y_train,y_test = split_dataset(train)

    model = get_random_forest(X_train,y_train)

    if debug:
        # Hold-out predictions are only consumed by these diagnostics, so
        # they are computed here rather than on every call.
        y_predicted = model.predict_proba(X_test)
        score = model.score(X_test,y_test)
        auc = get_auc(y_test,y_predicted)
        print(f"Score: {score}")
        print(f"Auc: {auc}")

    return model

In [61]:
def testing_forest(model,debug=False,write=False):
    """Score the test CSV with a fitted model and optionally export the result.

    Reads the test file, normalises it with ``utils.normalize_category``,
    drops the 'loan_success' column and takes the first column of
    ``model.predict_proba`` as the prediction for each row.

    NOTE(review): which class the first probability column refers to depends
    on the model's ``classes_`` ordering; confirm it is the intended one.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    debug : bool, default False
        When True, prints the resulting Id/Predicted table.
    write : bool, default False
        When True, writes the table to 'out.csv' (without the index) and
        prints a confirmation message.
    """
    loans_test = utils.normalize_category(
        pd.read_csv('../../project/banking_data/loanUnitedTest.csv', sep=',')
    )

    features = loans_test.drop(columns=['loan_success'])

    # First column of the probability matrix, one value per test row.
    first_column_proba = model.predict_proba(features).T[0]

    submission = pd.DataFrame({
        'Id': loans_test["loan_id"],
        'Predicted': first_column_proba,
    })

    if debug:
        print(f"Predictions:\n {submission}")

    if write:
        submission.to_csv('out.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'out.csv'")

In [67]:
# Train and evaluate with diagnostics printed; write=False skips the 'out.csv' export.
run_random_forest(debug=True,write=False)

Score: 0.8888888888888888
Auc: 0.6891771019677997
Predictions:
        Id  Predicted
0    5895   0.066667
1    7122   0.866667
2    6173   0.066667
3    6142   0.133333
4    5358   0.333333
5    6095   0.066667
6    6878   0.133333
7    6554   0.066667
8    6793   0.066667
9    7286   0.200000
10   6076   0.066667
11   5134   0.066667
12   5419   0.533333
13   6255   0.133333
14   5656   0.000000
15   6934   0.266667
16   6028   0.066667
17   6490   0.200000
18   6415   0.266667
19   7087   0.066667
20   5420   0.066667
21   5977   0.066667
22   6824   0.400000
23   5207   0.133333
24   7115   0.466667
25   7250   0.000000
26   6010   0.266667
27   6088   0.266667
28   5682   0.400000
29   7201   0.400000
..    ...        ...
324  5698   0.133333
325  5169   0.133333
326  7294   0.200000
327  5318   0.200000
328  5368   0.066667
329  6923   0.066667
330  5463   0.000000
331  5265   0.000000
332  6321   0.133333
333  5226   0.466667
334  6868   0.133333
335  4967   0.800000
336  5293   