### Modelling Notebook

In [1]:
# importing external libraries
from pathlib import Path
import os
import pandas as pd
import pickle
import json
import matplotlib.pyplot as plt
import numpy as np
import logging
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
set_config(transform_output = "pandas")

# Importing function to load data

# Making sure any changes are instantly added
%load_ext autoreload
%autoreload 2

from Modules.load_data import load_data
from Modules.preprocessing import missing_summary, merge_dfs, dollar_to_int, find_unique_values
from Modules.plotting import Plotter
from Modules.transforming import *
from Modules.modelling import MLearner

# Importing Pipelines
from Modules.Pipelines import Pipeline1, Pipeline2, Pipeline3


/home/as3620/FODS_coursework/LoanPrediction/data
/home/as3620/FODS_coursework/LoanPrediction


In [2]:
# Obtaining Root dir

root = str(Path.cwd())


# Obtaining seed from config.yaml

# Load the config file
with open(root + "/config.yaml", "r") as file:
    config = yaml.safe_load(file)

seed = config["global"]["seed"]

#print(f"seed: {seed}")

# Set global seeds for reproducibility
random.seed(seed)        
np.random.seed(seed)     

# Use the seed in scikit-learn
random_state = check_random_state(seed)

In [3]:
# Obtaining absolute path to data folder

data_folder = str(Path(os.getcwd()) / "data")

In [4]:
# Loading the data from pickle

merged_df = pd.read_pickle(data_folder + "/merged_data.pkl")

In [5]:
# Defining params for CV grid search

params_simple = {"penalty": [None], "solver":["saga"], "class_weight": ["balanced"], "max_iter":[1000]}

params = [
    {"penalty": [None], "solver":["saga"], "class_weight": [None, "balanced"]},
    {"penalty": ["elasticnet"], "l1_ratio" : np.linspace(0,1,10).tolist(), 
     "C": np.linspace(0.01,1,10).tolist(), "solver":["saga"], "class_weight": ["balanced"], "max_iter":[1000]}    
]


We will first use the simple parametrs to save computational time and answer the following questions:
1. Is Pipeline2 better than Pipeline1. Is better granularity for date-like columns more predictive?
2. Does using a dataset with a larger amount of "Non-fraudulent transactions" improve performance? As dataset is very imbalanced.

Then we will carry out hyperparameter tuning on GLM and LGBM estimators

### Pipeline1

Transforming date into a time series

In [7]:
ML_pipe1 = MLearner(dataset=merged_df, transformation_pipeline=Pipeline1, params=params_simple, estimator=LogisticRegression())
ML_pipe1.fit()
ML_pipe1.predict()

% of fraudulent transactions in y_train: 0.1304200586128297
% of fraudulent transactions in y_test: 0.12978532045516755

Best parameters found: {'penalty': None, 'solver': 'saga'}
score on training set: 0.9043293975174084
score on testing set: 0.8587041373926619


### Pipeline2

Decomposing date-like features into hours, weeks, etc.

In [9]:
ML_pipe2 = MLearner(dataset=merged_df, transformation_pipeline=Pipeline2, params=params_simple, estimator=LogisticRegression())
ML_pipe2.fit()
ML_pipe2.predict()

% of fraudulent transactions in y_train: 0.12889799326739906
% of fraudulent transactions in y_test: 0.13433184205407508

Best parameters found: {'penalty': None, 'solver': 'saga'}
score on training set: 0.924456217207478
score on testing set: 0.8782399035563593


### Pipeline 3

As we can see the date transformations improve performance. We will now see what an effect increasing the size of the dataset has.

In [None]:
# Set up logging configuration
logging.basicConfig(level=logging.DEBUG)

reduction_p = [0.00125, 0.0025, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1]

for p in reduction_p:
    print(f"\nMachine learning with {p} of dataset...\n")
    merged_df3_copy = Target0_Reducer(percentage=p).fit_transform(merged_df)

    ML_pipe3 = MLearner(dataset=merged_df3_copy, transformation_pipeline=Pipeline3, params=params_simple, estimator=LogisticRegression(), scoring="f1")
    ML_pipe3.fit()
    ML_pipe3.predict()
    
    


Machine learning with 0.00125 of dataset...



DEBUG:root:X_train indices: Index([11761,  1991,  2536, 12063, 11352, 22487, 15299, 14979,  5170, 18912,
       ...
       16850,  6265, 22118, 11284, 11964, 21575,  5390,   860, 15795, 23654],
      dtype='int64', length=18344)
DEBUG:root:X_test indices: Index([ 3265, 23387, 14115, 12223, 22099, 20716,  2434, 19006,  2442,  2905,
       ...
        8144,  1335, 17473,  2872,  9575, 11123,  7698, 13075,  9932, 12724],
      dtype='int64', length=6115)


% of fraudulent transactions in y_train: 0.5446467509812473
% of fraudulent transactions in y_test: 0.5463614063777597

Best parameters found: {'class_weight': 'balanced', 'max_iter': 1000, 'penalty': None, 'solver': 'saga'}
score on training set: 0.9878182725911133
score on testing set: 0.9453357100415924

Machine learning with 0.0025 of dataset...



DEBUG:root:X_train indices: Index([23585, 22829, 32624, 27566, 20170, 27627, 20325,  7063, 34987,  2458,
       ...
       29865, 27994, 33583, 23009, 10932, 31852, 29493, 33428, 22597, 24838],
      dtype='int64', length=26689)
DEBUG:root:X_test indices: Index([ 5253, 17429,  6944,  3870, 10100, 28878, 15630, 10133, 18265,  5028,
       ...
       22938,  7436, 21257, 11650, 23223, 14411, 33642, 17683, 12196, 17109],
      dtype='int64', length=8897)


% of fraudulent transactions in y_train: 0.3739368279066282
% of fraudulent transactions in y_test: 0.37675620995841297

Best parameters found: {'class_weight': 'balanced', 'max_iter': 1000, 'penalty': None, 'solver': 'saga'}
score on training set: 0.9759310618066561
score on testing set: 0.9317340441285354

Machine learning with 0.005 of dataset...



DEBUG:root:X_train indices: Index([  503, 47059,  8990, 53169, 16400, 48696, 30795, 23479, 55592, 14185,
       ...
        9674, 50397, 10050,  1627, 33913, 42556, 47692, 42936, 47211, 52668],
      dtype='int64', length=43380)
DEBUG:root:X_test indices: Index([ 8707, 14489, 57448, 38066, 22032, 34948,   659, 24369,   886, 29494,
       ...
       31217, 20001, 32699, 38196, 24806, 21759,  2441, 38485, 13246, 42258],
      dtype='int64', length=14460)


% of fraudulent transactions in y_train: 0.2306362378976487
% of fraudulent transactions in y_test: 0.2300829875518672

Best parameters found: {'class_weight': 'balanced', 'max_iter': 1000, 'penalty': None, 'solver': 'saga'}
score on training set: 0.953308146856534
score on testing set: 0.9078055964653903

Machine learning with 0.01 of dataset...



DEBUG:root:X_train indices: Index([16443,  1678, 18324, 79979, 37172, 61655, 82023, 53126, 69883, 45334,
       ...
       57293,  7347, 56232, 10299, 67456, 11703, 62602, 91463, 44717, 90861],
      dtype='int64', length=76761)
DEBUG:root:X_test indices: Index([50788,  7319, 74273, 43181, 96467, 27354,  3997, 83166, 56857, 85894,
       ...
       14655, 74813, 22928,  5918, 18097, 97402, 35131, 52754, 33421, 29208],
      dtype='int64', length=25587)


% of fraudulent transactions in y_train: 0.1292974296843449
% of fraudulent transactions in y_test: 0.13315355453941455

Best parameters found: {'class_weight': 'balanced', 'max_iter': 1000, 'penalty': None, 'solver': 'saga'}
score on training set: 0.9064547609089642
score on testing set: 0.866592859119845

Machine learning with 0.05 of dataset...



DEBUG:root:X_train indices: Index([174341, 101615,  95635, 381220,  53429, 274944, 322834, 158869, 182536,
       164351,
       ...
       143295, 400401,   1217, 231794, 132926, 143790, 244046,   3420, 176597,
       215685],
      dtype='int64', length=343809)
DEBUG:root:X_test indices: Index([ 94757,  93696,  56560,  85753, 397929, 287170, 452909, 317140, 451090,
       297544,
       ...
       104799, 391592,  72085,  48897, 232323, 426606, 328865, 256348,  55426,
       362302],
      dtype='int64', length=114604)


% of fraudulent transactions in y_train: 0.02942040493413494
% of fraudulent transactions in y_test: 0.02807057345293358





Interestingly we see that as we increase the dataframe size performance decreases. Perhaps the model cannot generalise well to so few fraudulent transactions

In [11]:
# Obtaining score for balanced class

# Set up logging configuration
logging.basicConfig(level=logging.CRITICAL)


merged_df3_copy = Target0_Reducer(balanced=True).fit_transform(merged_df)

ML_pipe3 = MLearner(dataset=merged_df3_copy, transformation_pipeline=Pipeline3, params=params_simple, estimator=LogisticRegression(), scoring="f1")
ML_pipe3.fit()
ML_pipe3.predict()

DEBUG:root:X_train indices: Index([19973, 18804, 26056, 23547, 23256, 11170, 20521,  3638,  6547,  7827,
       ...
       16850,  6265, 22118, 11284, 11964, 21575,  5390,   860, 15795, 23654],
      dtype='int64', length=19998)
DEBUG:root:X_test indices: Index([20966, 26365, 20166,  8763,  7335, 25135, 18791, 12228, 11812, 19142,
       ...
        8390, 17202,  6161,  7801, 26575,  3388,  3310, 23973, 18548, 24660],
      dtype='int64', length=6666)


% of fraudulent transactions in y_train: 0.5011501150115012
% of fraudulent transactions in y_test: 0.4965496549654965

Best parameters found: {'class_weight': 'balanced', 'max_iter': 1000, 'penalty': None, 'solver': 'saga'}
score on training set: 0.9849840891010342
score on testing set: 0.9477623110312827


In [13]:
# Now hyperparameter tuning

# Set up logging configuration
logging.basicConfig(level=logging.CRITICAL)


merged_df3_copy = Target0_Reducer(balanced=True).fit_transform(merged_df)

ML_pipe3 = MLearner(dataset=merged_df3_copy, transformation_pipeline=Pipeline3, params=params, estimator=LogisticRegression(), scoring="accuracy")
ML_pipe3.fit()
ML_pipe3.predict()

DEBUG:root:X_train indices: Index([18137, 19765, 18101, 26603,  3625, 16291,  5454, 12365, 20777, 15780,
       ...
        5566,   284, 23527,  5093, 14301, 15058, 25230, 12134,  2082,  7590],
      dtype='int64', length=19998)
DEBUG:root:X_test indices: Index([18988, 24283,  4969, 12047, 23953,   812, 23493, 10565, 20631, 22202,
       ...
        5582, 12400,  4382, 16181, 26619,  6945,  2818,  5969, 21674, 21480],
      dtype='int64', length=6666)


% of fraudulent transactions in y_train: 0.502050205020502
% of fraudulent transactions in y_test: 0.49384938493849384





Best parameters found: {'class_weight': None, 'penalty': None, 'solver': 'saga'}
score on training set: 0.9857485748574858
score on testing set: 0.9426942694269427




### LGBM set up with Pipeline3



In [None]:
# LGBM params

lgbm_params =     {"learning_rate": [0.01, 0.02],
                   "n_estimators": [50],
                   #"reg_alpha": np.linspace(0,1,10).tolist(),
                   #"reg_lambda": np.linspace(0,1,10).tolist(),
                   }


merged_lgbm = Target0_Reducer(balanced=True).fit_transform(merged_df)

ML_pipe3 = MLearner(dataset=merged_lgbm, transformation_pipeline=Pipeline3, params=lgbm_params, estimator=LGBMClassifier(), scoring="accuracy",cv=3)
ML_pipe3.fit()
ML_pipe3.predict()

% of fraudulent transactions in y_train: 0.5025502550255025
% of fraudulent transactions in y_test: 0.49234923492349236

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[LightGBM] [Info] Number of positive: 6700, number of negative: 6632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030179 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1703
[LightGBM] [Info] Number of data points in the train set: 13332, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502550 -> initscore=0.010201
[LightGBM] [Info] Start training from score 0.010201
[CV 1/3] END learning_rate=0.01, n_estimators=50;, score=0.988 total time=  45.4s
[LightGBM] [Info] Number of positive: 6700, number of negative: 6632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info]

: 

In [None]:
lgbm_params =     {"learning_rate": [0.02, 0.1],
                   "n_estimators": [50],
                   #"reg_alpha": np.linspace(0,1,10).tolist(),
                   #"reg_lambda": np.linspace(0,1,10).tolist(),
                   }


merged_lgbm = Target0_Reducer(balanced=True).fit_transform(merged_df)

ML_pipe3 = MLearner(dataset=merged_lgbm, transformation_pipeline=Pipeline3, params=lgbm_params, estimator=LGBMClassifier(), scoring="accuracy",cv=3)
ML_pipe3.fit()
ML_pipe3.predict()

% of fraudulent transactions in y_train: 0.5011501150115012
% of fraudulent transactions in y_test: 0.4965496549654965

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[LightGBM] [Info] Number of positive: 6682, number of negative: 6650
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029896 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1718
[LightGBM] [Info] Number of data points in the train set: 13332, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501200 -> initscore=0.004800
[LightGBM] [Info] Start training from score 0.004800
[CV 1/3] END learning_rate=0.02, n_estimators=50;, score=0.987 total time=  43.3s
[LightGBM] [Info] Number of positive: 6681, number of negative: 6651
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028516 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] 

: 

In [5]:
lgbm_params =     {"learning_rate": [0.1, 0.2],
                   "n_estimators": [50],
                   #"reg_alpha": np.linspace(0,1,10).tolist(),
                   #"reg_lambda": np.linspace(0,1,10).tolist(),
                   }


merged_lgbm = Target0_Reducer(balanced=True).fit_transform(merged_df)

ML_pipe3 = MLearner(dataset=merged_lgbm, transformation_pipeline=Pipeline3, params=lgbm_params, estimator=LGBMClassifier(), scoring="accuracy",cv=3)
ML_pipe3.fit()
ML_pipe3.predict()

% of fraudulent transactions in y_train: 0.5011501150115012
% of fraudulent transactions in y_test: 0.4965496549654965

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[LightGBM] [Info] Number of positive: 6682, number of negative: 6650
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027913 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1718
[LightGBM] [Info] Number of data points in the train set: 13332, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501200 -> initscore=0.004800
[LightGBM] [Info] Start training from score 0.004800
[CV 1/3] END learning_rate=0.1, n_estimators=50;, score=0.989 total time=  47.3s
[LightGBM] [Info] Number of positive: 6681, number of negative: 6651
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032950 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] T

In [1]:
lgbm_params =     {"learning_rate": [0.2],
                   "n_estimators": [100],
                   #"reg_alpha": np.linspace(0,1,10).tolist(),
                   #"reg_lambda": np.linspace(0,1,10).tolist(),
                   }


merged_lgbm = Target0_Reducer(balanced=True).fit_transform(merged_df)

ML_pipe3 = MLearner(dataset=merged_lgbm, transformation_pipeline=Pipeline3, params=lgbm_params, estimator=LGBMClassifier(), scoring="accuracy",cv=3)
ML_pipe3.fit()
ML_pipe3.predict()

NameError: name 'Target0_Reducer' is not defined