In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

# change current working directory to the root of the project
import os
os.chdir(os.path.dirname(os.getcwd()))

# Purpose
- Purpose of this notebook is to build baseline models

In [15]:
import warnings
from IPython.display import display

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import brier_score_loss

In [3]:
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 4))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)

  plt.style.use("seaborn-whitegrid")


# Load Data

In [4]:
df_data = pd.read_csv("data/transformed/df_transformed.csv")
df_data.head()

  df_data = pd.read_csv("data/transformed/df_transformed.csv")


Unnamed: 0,customer_ID,target,B_30_count,B_30_last,B_30_first,B_30_nunique,B_38_count,B_38_last,B_38_first,B_38_nunique,...,D_141_sub,D_141_frac,D_142_sub,D_142_frac,D_143_sub,D_143_frac,D_144_sub,D_144_frac,D_145_sub,D_145_frac
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0,13.0,0.0,0.0,1.0,13.0,2.0,2.0,1.0,...,0.001268,1.332093,,,0.005241,10.206754,0.00236,4.870063,0.005858,3.190701
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0,13.0,0.0,0.0,1.0,13.0,2.0,2.0,1.0,...,-0.000329,0.959816,,,0.002223,3.094873,0.003142,117.99158,0.005886,3.239017
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0,13.0,0.0,0.0,1.0,13.0,1.0,1.0,1.0,...,-0.001214,0.442683,,,0.001006,1.840784,-0.001904,0.304443,0.003001,7.780396
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0,13.0,0.0,0.0,1.0,13.0,2.0,2.0,1.0,...,0.001657,7.835286,,,0.001917,1.305865,0.003687,2.968755,-0.004605,0.393142
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0,13.0,0.0,0.0,1.0,13.0,1.0,2.0,2.0,...,0.000343,1.063353,,,0.002606,1.469782,0.003077,1.795758,-0.00047,0.658051


# Train - Test Split

In [6]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df_data.drop("target", axis=1), df_data["target"], test_size=0.2, random_state=42)

In [7]:
print(f"X_train shape: {X_train.shape}")
display(X_train.head())
print(f"y_train shape: {y_train.shape}")
display(y_train.head())
print(f"X_test shape: {X_test.shape}")
display(X_test.head())
print(f"y_test shape: {y_test.shape}")
display(y_test.head())

X_train shape: (8000, 1461)


Unnamed: 0,customer_ID,B_30_count,B_30_last,B_30_first,B_30_nunique,B_38_count,B_38_last,B_38_first,B_38_nunique,D_114_count,...,D_141_sub,D_141_frac,D_142_sub,D_142_frac,D_143_sub,D_143_frac,D_144_sub,D_144_frac,D_145_sub,D_145_frac
9254,052b87720d09600d6d83df8e02630d8f97b8b1a5f85811...,,,,,,,,,,...,,,,,,,,,,
1561,00e22dcf5f54d257ba27bd0f09fe744c22868551ef4360...,,,,,,,,,,...,,,,,,,,,,
1670,00f264139f21ee025eed8e8bd28c52bb0b20eb9406ff65...,,,,,,,,,,...,,,,,,,,,,
6087,0369f4640689a6b7c2a03d296cf903005a06dec306b99f...,,,,,,,,,,...,,,,,,,,,,
6669,03b95fc029c235d8fc244258abf88a02a0250592e26225...,,,,,,,,,,...,,,,,,,,,,


y_train shape: (8000,)


9254    0
1561    1
1670    0
6087    0
6669    1
Name: target, dtype: int64

X_test shape: (2000, 1461)


Unnamed: 0,customer_ID,B_30_count,B_30_last,B_30_first,B_30_nunique,B_38_count,B_38_last,B_38_first,B_38_nunique,D_114_count,...,D_141_sub,D_141_frac,D_142_sub,D_142_frac,D_143_sub,D_143_frac,D_144_sub,D_144_frac,D_145_sub,D_145_frac
6252,037fff225a6699be40aba0bcc53b2bbf85dfbff9a633aa...,,,,,,,,,,...,,,,,,,,,,
4684,029fad127830daf9076405dd331058e17b6756f7c22f94...,,,,,,,,,,...,,,,,,,,,,
1731,00f9bb44f56384dfafec837ba21d264d3ed8a92e1318a2...,,,,,,,,,,...,,,,,,,,,,
4742,02a7264a3d590aa35bf1fad8d1ff4758b2d77f94dc50db...,,,,,,,,,,...,,,,,,,,,,
4521,0287a53e8d56d7ba436e6abb9bee67b578b66a7ac42ad3...,,,,,,,,,,...,,,,,,,,,,


y_test shape: (2000,)


6252    0
4684    0
1731    0
4742    0
4521    0
Name: target, dtype: int64

# Baseline Model 1
- Zero-Rate Model
- Predcit the most frequent class

In [29]:
class BaselineModelZeroR:
    '''ZeroR baseline model that always predicts the most frequent class'''
    
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass
        
    def predict(self, X: pd.DataFrame, y: pd.DataFrame) -> pd.DataFrame:

        # merge X and y on index
        df = X.merge(y, left_index=True, right_index=True)

        # get the most frequent class
        most_frequent_class = df["target"].value_counts().index[0]

        # create predictions
        predictions = [most_frequent_class] * len(df)

        return np.array(predictions)

In [30]:
model = BaselineModelZeroR()
model.fit(X_train, y_train)
predictions = model.predict(X_test, y_test)

print(f"Brier: {brier_score_loss(y_test, predictions)}")

Brier: 0.2595


# Baseline Model 2
- Random-Rate Model
- Predicts a random class with the same probability as the most frequent class

In [31]:
class BaselineModelRandom:
    '''Random baseline model that randomly predicts 0 or 1 based on the distribution of the training set'''

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass
        
    def predict(self, X: pd.DataFrame, y: pd.DataFrame) -> pd.DataFrame:

        # merge X and y on index
        df = X.merge(y, left_index=True, right_index=True)

        # predict 0 or 1 based on the distribution of the training set
        predictions = np.random.choice([0, 1], size=len(df), p=df["target"].value_counts(normalize=True))

        return np.array(predictions)

In [32]:
model = BaselineModelRandom()
model.fit(X_train, y_train)
predictions = model.predict(X_test, y_test)

print(f"Brier: {brier_score_loss(y_test, predictions)}")

Brier: 0.379


# Summary

In [33]:
print(f"ZeroR baseline model: {accuracy_score(y_test, predictions)}")
print(f"Random baseline model: {accuracy_score(y_test, predictions)}")

ZeroR baseline model: 0.621
Random baseline model: 0.621
