### Modelling Notebook

In [34]:
# importing external libraries
from pathlib import Path
import os
import pandas as pd
import pickle
import json
import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
set_config(transform_output = "pandas")

# Importing function to load data

# Making sure any changes are instantly added
%load_ext autoreload
%autoreload 2

from Modules.load_data import load_data
from Modules.preprocessing import missing_summary, merge_dfs, dollar_to_int, find_unique_values
from Modules.plotting import Plotter
from Modules.transforming import *

# Importing Pipelines
from Modules.Pipelines import general_transformation_pipeline1, general_transformation_pipeline2, general_transformation_pipeline3, Pipeline1, Pipeline2, Pipeline3
#from Pipelines.Pipeline1 import Pipeline_for_exploration, Pipeline1, general_transformation_pipeline
#from Pipelines.Pipeline2 import Pipeline_for_exploration2, Pipeline2, general_transformation_pipeline2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
# Obtaining Root dir

root = str(Path.cwd())


# Obtaining seed from config.yaml

# Load the config file
with open(root + "/config.yaml", "r") as file:
    config = yaml.safe_load(file)

seed = config["global"]["seed"]

#print(f"seed: {seed}")

# Set global seeds for reproducibility
random.seed(seed)        
np.random.seed(seed)     

# Use the seed in scikit-learn
random_state = check_random_state(seed)

In [30]:
# Obtaining absolute path to data folder

data_folder = str(Path(os.getcwd()) / "data")

In [35]:
# Loading the data from pickle

merged_df = pd.read_pickle(data_folder + "/merged_data.pkl")

In [36]:
# Applying the general transformation pipelines

merged_df1 = general_transformation_pipeline1.fit_transform(merged_df)

merged_df2 = general_transformation_pipeline2.fit_transform(merged_df)

merged_df3 = general_transformation_pipeline3.fit_transform(merged_df)

  
  
  
  
  
  


In [8]:
# Following the transformation from pipeline1

# Setting the target column

#y = merged_df["target"]

# Test Train Split

X_train1, X_test1 = train_test_split(merged_df, random_state=42)

# Applying Pipeline1

X_train1 = Pipeline1.fit_transform(X_train1)
X_test1 = Pipeline1.transform(X_test1)

# Obtaining target column

y_train1 = X_train1["is_fraud"]
y_test1 = X_test1["is_fraud"]

# Dropping is fraud column

X_train1 = X_train1.drop(columns=["is_fraud"])
X_test1 = X_test1.drop(columns=["is_fraud"])

# Checkiing the proportion of positive values

print(f"% of fraudulent transactions in y_train: {y_train1.mean()}")
print(f"% of fraudulent transactions in y_test: {y_test1.mean()}")

# Setting params to do grid search over
numerical_parameter_grid = np.linspace(0,1,10)

params = [
    {"penalty": [None], "class_weight": [None, "balanced"], "solver":["saga"]},
    {"penalty": ["elasticnet"], "l1_ratio" : np.linspace(0,1,10).tolist(), 
     "C": np.linspace(0.01,1,10).tolist(), "solver":["saga"], "class_weight": [None, "balanced"]}    
]

gs_logit = GridSearchCV(
    LogisticRegression(),
    param_grid=params,
    scoring = "f1",
    cv=5)

gs_logit.fit(X_train1, y_train1)

print(f"score on training set: {gs_logit.score(X_train1, y_train1)}")
print(f"score on testing set: {gs_logit.score(X_test1, y_test1)}")


  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()


% of fraudulent transactions in y_train: 0.1302501302761855
% of fraudulent transactions in y_test: 0.1302954509926528
score on training set: 0.9023049377112019
score on testing set: 0.8466833541927409


In [10]:
print(f"score on training set: {gs_logit.score(X_train1, y_train1)}")
print(f"score on testing set: {gs_logit.score(X_test1, y_test1)}")

score on training set: 0.9023049377112019
score on testing set: 0.8466833541927409


In [None]:
# Following the transformation from pipeline2

# Setting the target column

#y = merged_df["target"]

# Test Train Split

X_train2, X_test2 = train_test_split(merged_df, test_size=0.25)

# Applying Pipeline1

X_train2 = Pipeline2.fit_transform(X_train2)
X_test2 = Pipeline2.transform(X_test2)

# Obtaining target column

y_train2 = X_train2["is_fraud"]
y_test2 = X_test2["is_fraud"]

# Dropping is fraud column

X_train2 = X_train2.drop(columns=["is_fraud"])
X_test2 = X_test2.drop(columns=["is_fraud"])

# Checkiing the proportion of positive values

print(f"% of fraudulent transactions in y_train: {y_train2.mean()}")
print(f"% of fraudulent transactions in y_test: {y_test2.mean()}")

# Setting params to do grid search over
numerical_parameter_grid = np.linspace(0,1,10)

params = [
    {"penalty": [None], "class_weight": [None, "balanced"], "solver":["saga"]},
    {"penalty": ["elasticnet"], "l1_ratio" : np.linspace(0,1,10).tolist(), 
     "C": np.linspace(0.01,1,10).tolist(), "solver":["saga"], "class_weight": [None, "balanced"]}    
]

gs_logit = GridSearchCV(
    LogisticRegression(),
    param_grid=params,
    scoring = "f1",
    cv=5)

gs_logit.fit(X_train2, y_train2)

print(f"score on training set: {gs_logit.score(X_train2, y_train2)}")
print(f"score on testing set: {gs_logit.score(X_test2, y_test2)}")

In [47]:
print(f"score on training set: {gs_logit.score(X_train2, y_train2)}")
print(f"score on testing set: {gs_logit.score(X_test2, y_test2)}")

score on training set: 0.923634898247028
score on testing set: 0.875253075844884


As we can see the date transformations improve performance. We will now see what an effect increasing the size of the dataset has.

In [39]:
reduction_p = [0.01, 0.05, 0.1, 0.25, 0.5, 1]

for p in reduction_p:
    merged_df3_copy = Target0_Reducer(percentage=p).fit_transform(merged_df)

    # Test Train Split

    X_train3, X_test3 = train_test_split(merged_df3_copy, test_size=0.25)

    # Applying Pipeline1

    X_train3 = Pipeline3.fit_transform(X_train3)
    X_test3 = Pipeline3.transform(X_test3)

    # Obtaining target column

    y_train3 = X_train3["is_fraud"]
    y_test3 = X_test3["is_fraud"]

    # Dropping is fraud column

    X_train3 = X_train3.drop(columns=["is_fraud"])
    X_test3 = X_test3.drop(columns=["is_fraud"])

    # Checkiing the proportion of positive values
    print(f"when p={p}...")
    print(f"% of fraudulent transactions in y_train: {y_train3.mean()}")
    print(f"% of fraudulent transactions in y_test: {y_test3.mean()}\n")
    

  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()


when p=0.01...
% of fraudulent transactions in y_train: 0.1307955863003348
% of fraudulent transactions in y_test: 0.12865908469144488



  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()


when p=0.05...
% of fraudulent transactions in y_train: 0.029193534782393712
% of fraudulent transactions in y_test: 0.028751177969355345



  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()


when p=0.1...
% of fraudulent transactions in y_train: 0.014732424172214261
% of fraudulent transactions in y_test: 0.01482685036790423



  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()


when p=0.25...
% of fraudulent transactions in y_train: 0.00594977886357437
% of fraudulent transactions in y_test: 0.005971215951830047



  return pd.to_datetime(X).to_frame()
  return pd.to_datetime(X).to_frame()


KeyboardInterrupt: 