### Imported libraries and Scripts

In [281]:
### imported Libraries
import pandas as pd
import numpy as np
import importlib
from pprint import pprint
import datetime

### Sklearn imported libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, f_classif


# Pipeline for Oversampling
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

### Imported Scripts
import utils

%load_ext autoreload
%autoreload 2

importlib.reload(utils)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'utils' from 'c:\\Users\\utilizador\\Desktop\\ac-feup\\jupyters\\hugo\\utils.py'>

### Choose Macros

In [282]:
TEST = True
OVERSMAPLE = True
DEBUG = True
WRITE = True
DUMMIES = True
CATEGORY_ENCONDING = False
SPLIT_RATIO = 0.8
N_COLUMNS = 15
N_SPLITS = 3

## Defined Methods

### Choose Dataset Options

In [283]:
def add_dummy(df,columns):
    copy= df.copy()

    for column in columns:
        dummies = pd.get_dummies(copy[column])
        copy = copy.drop(column,axis=1)
        copy = copy.join(dummies)
    
    return copy

def convert_dates(df):
    copy = df.copy()
    columns = ["loan_date","account_creation","birth_number"]

    for column in columns:
        copy[column] = copy[column].apply(lambda x: datetime.datetime.strptime(x, '%d-%m-%Y').strftime('%Y')).astype(int)

    copy["age_on_loan"] = copy["loan_date"] - copy["birth_number"]
    copy = copy.drop(columns = ["loan_date","account_creation","birth_number"])

    copy['card_issued'] = pd.to_numeric(copy["card_issued"].astype(str), errors='coerce').fillna(1).astype(int)

    return copy

def get_df(test=False):
    if test:
        df = pd.read_csv('../../csvs/loan_united_test.csv', sep=',')
    else:
        df = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
    
    df = convert_dates(df)

    if DUMMIES:
        columns = ["account_frequency","gender","card_type"]
        df = add_dummy(df, columns)
    
    if CATEGORY_ENCONDING:
        df = utils.normalize_category(df)
    
    return df


### Data splitting

In [284]:
### Split the data
def split_dataset(df):

    ### Seperate the precition columns from output

    X = df.drop(columns=['loan_success'])
    y = df['loan_success']

    select = SelectKBest(f_classif, k= N_COLUMNS)
    X_new = select.fit_transform(X, y)

    print(X_new.shape)

    split_filter = select.get_support()
    features = X.columns

    ### Apply splitting
    X_train, X_test, y_train, y_test = train_test_split(X_new,y,train_size=SPLIT_RATIO,test_size=1-SPLIT_RATIO)

    return X_train,X_test,y_train,y_test, features[split_filter]

### Algorithms

In [285]:
def get_random_forest():
    return RandomForestClassifier(bootstrap = False,
                                    max_depth = 80,
                                    max_features = 3,
                                    min_samples_leaf = 3,
                                    min_samples_split = 12,
                                    n_estimators = 100)

In [286]:
def get_logistic_regression():
    return LogisticRegression(random_state=10,solver='lbfgs',max_iter=200)

In [287]:
def get_decision_tree():
    return DecisionTreeClassifier(random_state=0)

In [288]:
def get_knn():
    return KNeighborsClassifier(n_neighbors=3)

### Use *Grid Search Cross Validation* to find the best grid for an algorithm

In [289]:
### Uses a grid search to generate random parameters to find the best grid model
def getBestSearch(algorithm,grid):
    train = get_df()

    X = train.drop(columns=['loan_success'])
    y = train['loan_success']

    alg = algorithm()

    grid_search = GridSearchCV(estimator = alg, param_grid = grid, cv = 2, n_jobs = -1, verbose = 2)

    model = grid_search.fit(X,y)

    if DEBUG:
        print('Best Score: ', model.best_score_)
        print('Best Params: ', model.best_params_)
    
    return model.best_score_, model.best_params_


### Build a Pipeline to apply a sampling and a classification algorithm

In [290]:
### TODO: Add undersample before final delivery
def build_pipeline(algorithm,oversample=True,undersample=True):

    if(oversample):
        return Pipeline([
            ('sampling',SMOTE(random_state = 20)),
            ('classification',algorithm)
        ])
    else:
        return  Pipeline([
            ('classification',algorithm)
        ])

    return pipeline

### Final Stratified Cross Validation

In [291]:
def final_CV(algorithm):
    train = get_df()
    # train = utils.normalization(train,'loan_success')

    X = train.drop(columns=['loan_success'])
    y = train['loan_success']

    select = SelectKBest(f_classif, k=N_COLUMNS)
    X_new = select.fit_transform(X, y)

    split_filter = select.get_support()
    features = X.columns[split_filter]

    algorithm = algorithm()
    skf = StratifiedKFold(n_splits=N_SPLITS, random_state=True, shuffle=True)

    model_list = []
    auc_list = []
    


    for train_index, test_index in skf.split(X_new, y):

        X_train, X_test = X_new[train_index], X_new[test_index]
        y_train, y_test = y[train_index], y[test_index]

        ### Train the model
        model = algorithm.fit(X_train, y_train)

        ### Predict the outcome with the test data
        y_pred = algorithm.predict_proba(X_test)
        y_final = y_pred.transpose()[0]
        
        auc = utils.get_auc(y_test, y_final)
        auc_list.append(auc)
        model_list.append(model)
        print(f"AUC={auc}")
        
    ### Get the best model

    best_score = max(auc_list)
    best_model = model_list[auc_list.index(best_score)]
    
    
    ### Use the best model to get a prediction
    test = get_df(test=True)
    
    X2 = test.drop(columns=['loan_success'])
    X2 = X2[features]

    # print(f"colums={X2.columns}")

    y_predicted = best_model.predict_proba(X2)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final
    
    
    if DEBUG:
        print(f"Predictions:\n {final_df}")
    
    if WRITE:
        final_df.to_csv('CV.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'CV.csv'")

    ### TODO: display statistics?
    avg = sum(auc_list)/len(auc_list)
    print(f"Average AUC = {avg}")


### Running an algorithm

In [292]:
def training_algorithm(algorithm):
    ### Getting the dataset
    train = get_df(test=False)
    
    ### Getting a Model from training
    X_train,X_test,y_train,y_test, features = split_dataset(train)

    
    pipe = build_pipeline(algorithm())
    model = pipe.fit(X_train,y_train)

    y_predicted = model.predict_proba(X_test)

    y_final = y_predicted.transpose()[0]

    if DEBUG:
        score = model.score(X_test,y_test)
        auc = utils.get_auc(y_test,y_final)
        print(f"Score: {score}")
        print(f"Auc: {auc}")
    
    return model, features

In [293]:
def testing_model(model,features):

    test = get_df(test=True)

    X = test.drop(columns=['loan_success'])
    X = X[features]

    print(X.columns)

    y_predicted = model.predict_proba(X)
    y_final = y_predicted.transpose()[0]
    
    final_df = pd.DataFrame()
    final_df['Id'] = test["loan_id"]
    final_df['Predicted'] = y_final

    if DEBUG:
        print(f"Predictions:\n {final_df}")
        
    
    if WRITE:
        final_df.to_csv('out.csv', index=False)
        print("Sucessfully stored the predictions in a file named 'out.csv'")

In [294]:
def run_algorithm(algorithm):
    
    if(DEBUG):
        print("Running the provided algorithm")
    
    model, features = training_algorithm(algorithm)
    testing_model(model,features)

## Running the algorithms

### Choose your algorithm:

In [295]:
### Leave the one you want to run uncommented
algorithm = get_random_forest
# algorithm = get_logistic_regression
# algorithm = get_decision_tree
# algorithm = get_knn

### Run with train_test_split

In [296]:
run_algorithm(algorithm)

Running the provided algorithm
(328, 15)
Score: 0.9090909090909091
Auc: 0.7738791423001949
Index(['loan_id', 'loan_amount', 'payments',
       'account_district no. of municipalities with inhabitants 500-1999',
       'account_district unemploymant_growth',
       'client_district no. of municipalities with inhabitants < 499 ',
       'client_district no. of cities ', 'card_issued', 'no. movements',
       'min no. trans', 'min balance', 'avg balance', 'monthly issuance',
       'None', 'classic'],
      dtype='object')
Predictions:
        Id  Predicted
0    5895   0.118896
1    7122   0.806682
2    6173   0.319842
3    6142   0.539724
4    5358   0.357225
5    6095   0.153733
6    6878   0.058568
7    6554   0.252338
8    6793   0.240786
9    7286   0.110147
10   6076   0.119406
11   5134   0.145341
12   5419   0.477573
13   6255   0.617353
14   5656   0.113688
15   6934   0.184442
16   6028   0.385847
17   6490   0.179084
18   6415   0.286771
19   7087   0.051308
20   5420   0.16818

  f"X has feature names, but {self.__class__.__name__} was fitted without"


### Run with StratifiedKFold

In [303]:
final_CV(algorithm)

AUC=0.7652925531914893
AUC=0.7730496453900709
AUC=0.7063829787234043
Predictions:
        Id  Predicted
0    5895   0.003333
1    7122   0.744810
2    6173   0.064565
3    6142   0.076018
4    5358   0.622271
5    6095   0.035544
6    6878   0.008925
7    6554   0.237785
8    6793   0.070668
9    7286   0.076869
10   6076   0.019357
11   5134   0.028753
12   5419   0.330236
13   6255   0.289925
14   5656   0.086880
15   6934   0.266382
16   6028   0.109866
17   6490   0.120810
18   6415   0.376159
19   7087   0.024484
20   5420   0.031886
21   5977   0.053520
22   6824   0.610617
23   5207   0.065428
24   7115   0.342963
25   7250   0.022567
26   6010   0.130277
27   6088   0.334813
28   5682   0.024394
29   7201   0.040282
..    ...        ...
324  5698   0.085911
325  5169   0.023873
326  7294   0.034175
327  5318   0.199883
328  5368   0.013524
329  6923   0.009968
330  5463   0.068272
331  5265   0.063535
332  6321   0.037148
333  5226   0.102993
334  6868   0.004750
335  4967   0.

  f"X has feature names, but {self.__class__.__name__} was fitted without"


### Finding the best grid

In [298]:
param_grid = {
    'bootstrap': [True,False],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
### Uncomment to run (WARNING: Takes like 5 minutes)
# getBestSearch(get_random_forest,param_grid)

### Clustering

In [299]:
import numpy as np
from sklearn.cluster import KMeans
from pprint import pprint

train = pd.read_csv('../../csvs/loan_united_train.csv', sep=',')
df = utils.normalize_category(train)
# df = utils.normalization(df,'loan_success')

X = df.drop(columns=['loan_success'])

kmeans_model = KMeans(n_clusters=3).fit(X)
# labels = kmeans_model.labels_
# metrics.silhouette_score(X, labels, metric='euclidean')

pprint(kmeans_model)    


KMeans(n_clusters=3)
