### Imported libraries and Scripts

In [263]:
### imported Libraries
import pandas as pd
import numpy as np
import importlib
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE 

### Imported Scripts
import utils

%load_ext autoreload
%autoreload 2

importlib.reload(utils)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'utils' from 'c:\\Users\\utilizador\\Desktop\\ac-feup\\jupyters\\hugo\\utils.py'>

### Data splitting

In [264]:
### Split the data
def split_dataset(df,ratio=0.7,debug=False,random_state=None):
    """Split a dataframe into train/test features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset that contains a 'loan_success' column used as the target.
    ratio : float, default 0.7
        Fraction of the rows assigned to the training partition; the
        remaining rows form the test partition.
    debug : bool, default False
        Currently unused; kept so existing callers keep working.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        non-reproducible shuffling.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    """

    ### Separate the predictor columns from the output column
    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    ### Apply splitting
    # Only train_size is given: the test partition is then the exact
    # complement. Passing test_size=1-ratio as well exposes float rounding
    # (e.g. 1-0.7 == 0.30000000000000004), which can make the two rounded
    # partition sizes add up to more than the number of rows and raise.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=ratio, random_state=random_state
    )

    return X_train,X_test,y_train,y_test

### Algorithms

In [265]:
def get_random_forest():
    """Return a new, unfitted RandomForestClassifier with fixed hyperparameters."""
    hyperparameters = {
        'bootstrap': False,
        'max_depth': 80,
        'max_features': 3,
        'min_samples_leaf': 3,
        'min_samples_split': 12,
        'n_estimators': 100,
    }
    return RandomForestClassifier(**hyperparameters)

In [266]:
def get_logistic_regression():
    """Return a new, unfitted LogisticRegression (lbfgs solver, seed 10, 200 iterations max)."""
    settings = dict(random_state=10, solver='lbfgs', max_iter=200)
    return LogisticRegression(**settings)

### AUC Curve (move this to utils later)

In [267]:
def get_auc(y_test,y_predicted):
    """Compute the area under the ROC curve, treating label -1 as the positive class.

    y_test holds the true labels and y_predicted the scores for the
    positive (-1) class.
    """
    roc_points = metrics.roc_curve(y_test, y_predicted, pos_label=-1)
    false_positive_rate, true_positive_rate = roc_points[0], roc_points[1]
    return metrics.auc(false_positive_rate, true_positive_rate)


### Use *Grid Search Cross Validation* to find the best grid for an algorithm

In [268]:
### Runs an exhaustive grid search over the given parameter grid to find the best-scoring model
def getBestSearch(algorithm,grid,debug=True):
    """Grid-search the hyperparameters of an algorithm on the training CSV.

    Parameters
    ----------
    algorithm : callable
        Zero-argument factory returning an unfitted estimator.
    grid : dict
        Parameter grid handed to GridSearchCV.
    debug : bool, default True
        When True, print the best score and the best parameters.

    Returns
    -------
    tuple
        ``(best_score, best_params)`` of the fitted search.
    """
    dataset = utils.normalize_category(
        pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    )

    features = dataset.drop(columns=['loan_success'])
    target = dataset['loan_success']

    # 2-fold cross-validation over every grid combination, using all cores
    searcher = GridSearchCV(
        estimator=algorithm(),
        param_grid=grid,
        cv=2,
        n_jobs=-1,
        verbose=2,
    )
    fitted_search = searcher.fit(features, target)

    best_score = fitted_search.best_score_
    best_params = fitted_search.best_params_

    if debug:
        print('Best Score: ', best_score)
        print('Best Params: ', best_params)

    return best_score, best_params


### Build a Pipeline to apply a sampling and a classification algorithm

In [269]:
### Sampling (this only works if the imbalanced-learn (imblearn) library is imported)
### Don't do this
def build_pipeline(algorithm,oversample=True,undersample=True):
    """Wrap an estimator in a pipeline, optionally preceded by SMOTE oversampling.

    Parameters
    ----------
    algorithm : estimator
        Estimator instance used as the final 'classification' step.
    oversample : bool, default True
        When True, a 'sampling' step with ``SMOTE(random_state=20)`` is placed
        before the classifier.
    undersample : bool, default True
        Accepted for interface compatibility; no undersampling step is built.

    Returns
    -------
    Pipeline
        The assembled pipeline.
    """
    steps = []

    if oversample:
        steps.append(('sampling', SMOTE(random_state = 20)))

    steps.append(('classification', algorithm))

    # Single exit point: the original trailing `return pipeline` was
    # unreachable and referred to a name that was never defined.
    return Pipeline(steps)

### Final Stratified Cross Validation

In [270]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

def final_CV(algorithm):
    """Evaluate an algorithm with 5-fold stratified cross-validation.

    Loads the training CSV, normalises it with ``utils.normalize_category``,
    then fits the estimator on each training fold and prints the AUC obtained
    on the held-out fold, followed by the average over all folds.

    Parameters
    ----------
    algorithm : callable
        Zero-argument factory returning an unfitted estimator that supports
        ``fit`` and ``predict_proba``.

    Returns
    -------
    None
        Results are only printed.
    """
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    df = utils.normalize_category(train)

    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    # Keep the factory and the estimator under distinct names
    model = algorithm()

    # Explicit integer seed: the previous random_state=True was a boolean
    # that seeded the shuffle with 1.
    skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

    auc_list = []

    for train_index, test_index in skf.split(X, y):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        ### Train the model
        model.fit(X_train, y_train)

        ### Predict the outcome with the test data
        # Column 0 of predict_proba belongs to the first class of the model.
        # Assumes that class is -1, the pos_label used by get_auc; TODO confirm.
        y_final = model.predict_proba(X_test)[:, 0]

        auc = get_auc(y_test, y_final)
        auc_list.append(auc)
        print(f"AUC={auc}")

    ### TODO: display statistics?
    avg = sum(auc_list)/len(auc_list)
    print(f"Average AUC = {avg}")


### Running an algorithm

In [280]:
def training_algorithm(algorithm,debug=False):
    """Fit an algorithm on a random train split of the training CSV.

    Parameters
    ----------
    algorithm : callable
        Zero-argument factory returning an unfitted estimator.
    debug : bool, default False
        When True, print the score and the AUC measured on the held-out split.

    Returns
    -------
    The fitted model.
    """
    ### Load and normalise the dataset
    dataset = utils.normalize_category(
        pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    )

    ### Hold out part of the data for evaluation
    X_train, X_test, y_train, y_test = split_dataset(dataset)

    ### Fit the estimator directly (no sampling pipeline)
    model = algorithm().fit(X_train, y_train)

    ### Probabilities of the first class for the held-out rows
    held_out_proba = model.predict_proba(X_test)[:, 0]

    if debug:
        score = model.score(X_test, y_test)
        auc = get_auc(y_test, held_out_proba)
        print(f"Score: {score}")
        print(f"Auc: {auc}")

    return model

In [272]:
def testing_model(model,debug=False,write=False):
    """Predict the test CSV with a fitted model and optionally store the result.

    Parameters
    ----------
    model : fitted estimator
        Must support ``predict_proba``.
    debug : bool, default False
        When True, print the prediction table.
    write : bool, default False
        When True, write the prediction table to 'out.csv'.
    """
    test = utils.normalize_category(
        pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    )

    features = test.drop(columns=['loan_success'])

    ### Probability of the first class for every test row
    first_class_proba = model.predict_proba(features)[:, 0]

    ### One row per loan: its id and the predicted probability
    final_df = pd.DataFrame({
        'Id': test["loan_id"],
        'Predicted': first_class_proba,
    })

    if debug:
        print(f"Predictions:\n {final_df}")

    if write:
        final_df.to_csv('out.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'out.csv'")

In [273]:
def run_algorithm(algorithm,debug = True,write=False):
    """Train the given algorithm and then predict the test set with it.

    `algorithm` is a zero-argument estimator factory. `debug` and `write`
    are forwarded to the training and testing steps.
    """
    if debug:
        print("Running the provided algorithm")

    fitted_model = training_algorithm(algorithm, debug=debug)
    testing_model(fitted_model, debug=debug, write=write)

### Choose your algorithm:

In [281]:
### Select the estimator factory to run: leave exactly one assignment uncommented
# algorithm = get_random_forest
algorithm = get_logistic_regression

### Run the Chosen Algorithm

In [291]:
# Train the selected algorithm, print the debug metrics and write the test predictions to 'out.csv'
run_algorithm(algorithm,debug=True,write=True)

Running the provided algorithm
Score: 0.6363636363636364
Auc: 0.7638157894736841
Predictions:
        Id  Predicted
0    5895   0.019973
1    7122   0.951866
2    6173   0.257820
3    6142   0.035110
4    5358   0.510330
5    6095   0.273158
6    6878   0.084366
7    6554   0.418084
8    6793   0.177762
9    7286   0.576698
10   6076   0.189568
11   5134   0.602987
12   5419   0.853081
13   6255   0.440811
14   5656   0.168548
15   6934   0.953107
16   6028   0.013963
17   6490   0.166590
18   6415   0.685115
19   7087   0.080655
20   5420   0.382917
21   5977   0.001174
22   6824   0.999950
23   5207   0.226175
24   7115   0.526158
25   7250   0.112956
26   6010   0.365572
27   6088   0.598365
28   5682   0.675752
29   7201   0.567592
..    ...        ...
324  5698   0.197630
325  5169   0.411656
326  7294   0.051403
327  5318   0.037189
328  5368   0.101929
329  6923   0.090402
330  5463   0.429037
331  5265   0.356556
332  6321   0.404063
333  5226   0.881123
334  6868   0.498185
33

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Execute Final Cross Validation

In [276]:
# Stratified 5-fold cross-validation of the selected algorithm; prints per-fold and average AUC
final_CV(algorithm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


AUC=0.8392857142857142
AUC=0.7056530214424952
AUC=0.7894736842105263
AUC=0.6547619047619048
AUC=0.6785714285714286
Average AUC = 0.7335491506544137


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

### Finding the best grid

In [277]:
### Candidate hyperparameter values explored by the grid search
param_grid = dict(
    bootstrap=[True, False],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)
### Uncomment to run (WARNING: takes about 5 minutes)
# getBestSearch(get_random_forest,param_grid)