### Imported libraries and Scripts

In [38]:
### imported Libraries
import pandas as pd
import numpy as np
import importlib
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# from sklearn.pipeline import Pipeline

### Imported Scripts
import utils

%load_ext autoreload
%autoreload 2

importlib.reload(utils)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'utils' from 'c:\\Users\\Hugo\\Desktop\\ac-feup\\jupyters\\hugo\\utils.py'>

### Data splitting

In [39]:
### Split the data
def split_dataset(df,ratio=0.7,debug=False):

    ### Seperate the precition columns from output
    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    ### Apply splitting
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=ratio,test_size=1-ratio)

    return X_train,X_test,y_train,y_test

### Algorithms

In [40]:
def get_random_forest():
    return RandomForestClassifier(bootstrap = False,
                                    max_depth = 80,
                                    max_features = 3,
                                    min_samples_leaf = 3,
                                    min_samples_split = 12,
                                    n_estimators = 100)

In [41]:
def get_logistic_regression():
    return LogisticRegression(random_state=10,solver='lbfgs',max_iter=200)

### Auc Curve (Pass this to utils later)

In [42]:
def get_auc(y_test,y_predicted):
    fpr, tpr, _ = metrics.roc_curve(y_test, y_predicted,pos_label=-1)    
    return metrics.auc(fpr, tpr)


### Use *Grid Search Cross Validation* to find the best grid for an algorithm

In [43]:
### Uses a grid search to generate random parameters to find the best grid model
def getBestSearch(algorithm,grid,debug=True):

    # train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    train = utils.normalize_category(train)

    X = train.drop(columns=['loan_success'])
    y = train['loan_success']

    alg = algorithm()

    grid_search = GridSearchCV(estimator = alg, param_grid = grid, cv = 2, n_jobs = -1, verbose = 2)

    model = grid_search.fit(X,y)

    if debug:
        print('Best Score: ', model.best_score_)
        print('Best Params: ', model.best_params_)
    
    return model.best_score_, model.best_params_


### Build a Pipeline to apply a sampling and a classification algorithm

In [79]:
### Sampling (This should work if the ASHDASJDSAKD LIBRARY IS IMPORTED)
### Don't do this
def build_pipeline(algorithm,oversample=True,undersample=True):

    if(oversample):
        return Pipeline([
            ('sampling',SMOTE(random_state = 20)),
            ('classification',algorithm)
        ])
    else:
        return  Pipeline([
            ('classification',algorithm)
        ])

    return pipeline

### Final Stratified Cross Validation

In [45]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

def final_CV(algorithm):
    # train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    df = utils.normalize_category(train)
    # df = utils.normalization(df,'loan_success')

    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    algorithm = algorithm()

    skf = StratifiedKFold(n_splits=3, random_state=True, shuffle=True)
    skf.get_n_splits(X, y)

    model_list = []
    auc_list = []

    for train_index, test_index in skf.split(X, y):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        ### Train the model
        model = algorithm.fit(X_train, y_train)

        ### Predict the outcome with the test data
        y_pred = algorithm.predict_proba(X_test)
        y_final = y_pred.transpose()[0]

        auc = get_auc(y_test, y_final)
        auc_list.append(auc)
        model_list.append(model)
        print(f"AUC={auc}")
        
    ### Get the best model
    best_score = max(auc_list)
    best_model = model_list[auc_list.index(best_score)]
    
    
    ### Use the best model to get a prediction
    test = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    test = utils.normalize_category(test)
    
    X = test.drop(columns=['loan_success'])
    y_predicted = best_model.predict_proba(X)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final
    
    
    if True:
        print(f"Predictions:\n {final_df}")
    
    if True:
        final_df.to_csv('CV.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'CV.csv'")

    ### TODO: display statistics?
    avg = sum(auc_list)/len(auc_list)
    print(f"Average AUC = {avg}")


### Running an algorithm

In [72]:
def training_algorithm(algorithm,debug=False):
    ### Getting the dataset
    # train = pd.read_csv('../../project/banking_data/loanUnitedTrain.csv', sep=',')
    
    train = reverse_dates()
    
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    train = utils.normalize_category(train)
    
    

    ### Getting a Model from training
    X_train,X_test,y_train,y_test = split_dataset(train)

    pipe = build_pipeline(algorithm())
    model = pipe.fit(X_train,y_train)
    

    y_predicted = model.predict_proba(X_test)

    y_final = y_predicted.transpose()[0]

    if debug:
        score = model.score(X_test,y_test)
        auc = get_auc(y_test,y_final)
        print(f"Score: {score}")
        print(f"Auc: {auc}")
    
    return model

In [47]:
def testing_model(model,debug=False,write=False):
    test = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    # test = pd.read_csv('../../project/banking_data/loanUnitedTest.csv', sep=',')
    test = utils.normalize_category(test)

    X = test.drop(columns=['loan_success'])

    y_predicted = model.predict_proba(X)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final

    if debug:
        print(f"Predictions:\n {final_df}")
    
    if write:
        final_df.to_csv('out.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'out.csv'")

In [48]:
def run_algorithm(algorithm,debug = True,write=False):
    
    if(debug):
        print("Running the provided algorithm")
    
    model = training_algorithm(algorithm,debug=debug)
    testing_model(model,debug=debug,write=write)

### Choose your algorithm:

In [61]:
### Leave the one you want to run uncommented
algorithm = get_random_forest
# algorithm = get_logistic_regression

### Run the Chosen Algorithm

In [78]:
run_algorithm(algorithm,debug=True,write=True)

Running the provided algorithm


TypeError: All intermediate steps of the chain should be estimators that implement fit and transform or fit_resample (but not both) or be a string 'passthrough' 'RandomForestClassifier(bootstrap=False, max_depth=80, max_features=3,
                       min_samples_leaf=3, min_samples_split=12)' (type <class 'sklearn.ensemble._forest.RandomForestClassifier'>) doesn't)

### Execute Final Cross Validation

In [None]:
final_CV(algorithm)

AUC=0.711436170212766
AUC=0.7595744680851064
AUC=0.7546099290780143
Predictions:
        Id  Predicted
0    5895   0.039504
1    7122   0.452351
2    6173   0.045842
3    6142   0.030997
4    5358   0.502908
..    ...        ...
349  4989   0.276445
350  5221   0.028243
351  6402   0.081868
352  5346   0.037687
353  6748   0.073684

[354 rows x 2 columns]
Sucessfully stored the predictions in a file named 'CV.csv'
Average AUC = 0.7418735224586289


### Finding the best grid

In [None]:
param_grid = {
    'bootstrap': [True,False],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
### Uncomment to run (WARNING: Takes like 5 minutes)
# getBestSearch(get_random_forest,param_grid)

In [None]:
def convert_date(df,column):
    copy = df.copy()
    date = copy[column]
    
    copy[column] =  date.apply(lambda x: datetime.datetime.strptime(x, '%d-%m-%Y').strftime('%Y%m%d'))

    return copy

import datetime

def reverse_dates():
    
    train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    columns = ['loan_date','account_creation']
    df = train.copy()

    for column in columns:
        df = convert_date(df,column)
        
    return df

### Clustering

In [77]:
import numpy as np
from sklearn.cluster import KMeans
from pprint import pprint

train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
df = utils.normalize_category(train)
# df = utils.normalization(df,'loan_success')

X = df.drop(columns=['loan_success'])

kmeans_model = KMeans(n_clusters=3).fit(X)
# labels = kmeans_model.labels_
# metrics.silhouette_score(X, labels, metric='euclidean')

pprint(kmeans_model)


KMeans(n_clusters=3)
