In [3]:
# autoreload
%load_ext autoreload
%autoreload 2

# change current working directory to the root of the project
import os
os.chdir(os.path.dirname(os.getcwd()))

# Purpose
- Purpose of this notebook is to create train, validation and test sets
    - 64% Train Set
    - 8% Validation Set for Calibration
    - 8% Validation Set for Optimal Threshold
    - 20% Test Set

In [4]:
from IPython.display import display

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Load Data

In [5]:
nrows = None
df_data = pd.read_csv("data/transformed/df_transformed.csv", nrows=nrows)
df_data.head()

Unnamed: 0,customer_ID,target,B_30_count,B_30_last,B_30_first,B_30_nunique,B_38_count,B_38_last,B_38_first,B_38_nunique,...,D_141_sub,D_141_frac,D_142_sub,D_142_frac,D_143_sub,D_143_frac,D_144_sub,D_144_frac,D_145_sub,D_145_frac
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0,13,0.0,0.0,1,13,2.0,2.0,1,...,0.001268,1.332093,,,0.005241,10.206754,0.00236,4.870063,0.005858,3.190701
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0,13,0.0,0.0,1,13,2.0,2.0,1,...,-0.000329,0.959816,,,0.002223,3.094873,0.003142,117.99158,0.005886,3.239017
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0,13,0.0,0.0,1,13,1.0,1.0,1,...,-0.001214,0.442683,,,0.001006,1.840784,-0.001904,0.304443,0.003001,7.780396
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0,13,0.0,0.0,1,13,2.0,2.0,1,...,0.001657,7.835286,,,0.001917,1.305865,0.003687,2.968755,-0.004605,0.393142
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0,13,0.0,0.0,1,13,1.0,2.0,2,...,0.000343,1.063353,,,0.002606,1.469782,0.003077,1.795758,-0.00047,0.658051


# Split the data into train, validation and test sets

In [6]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df_data.drop("target", axis=1), df_data["target"], test_size=0.2, random_state=42)

In [7]:
# Split train set into train and calibration validation sets
X_train, X_val_calibration, y_train, y_val_calibration = train_test_split(X_train, y_train, test_size=0.1, random_state=42)


In [8]:
# Split test set into test and threshold validation sets
X_train, X_val_threshold, y_train, y_val_threshold = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [9]:
print('Training Set')
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
display(X_train.head())
display(y_train.head())

Training Set
X_train shape: (297375, 1461)
y_train shape: (297375,)


Unnamed: 0,customer_ID,B_30_count,B_30_last,B_30_first,B_30_nunique,B_38_count,B_38_last,B_38_first,B_38_nunique,D_114_count,...,D_141_sub,D_141_frac,D_142_sub,D_142_frac,D_143_sub,D_143_frac,D_144_sub,D_144_frac,D_145_sub,D_145_frac
362838,ca1c9f22fd2252949ac2f5564f26769780a48c3637433b...,13,0.0,0.0,1,13,2.0,2.0,1,13,...,0.012524,1.014238,0.048853,1.302738,0.000235,1.000235,0.091412,12.990935,0.004313,1.023706
186060,67b7eee3776f5884bcbf0c7bcfe613d844fd841746a9d2...,13,0.0,0.0,1,13,1.0,1.0,1,13,...,-0.002621,0.632003,,,-0.004055,0.520429,0.006335,4.164307,-0.004911,0.203894
335405,badd7d76b370a42fcd756e03658597892782f2a5ae6891...,13,1.0,1.0,1,13,6.0,3.0,5,13,...,0.000953,1.106869,,,0.007564,41.863259,-0.002124,0.745708,0.002719,1.941992
265053,938f90757d568a637be67271356bfe6ce6373b6912d9e4...,13,0.0,1.0,2,13,3.0,3.0,1,13,...,-0.005658,0.342687,,,-0.002021,0.52104,-0.008233,0.133476,0.003698,2.768127
184287,66b797c6750aabfe5ed8da57889a0b8905b2d47cc98df5...,4,0.0,0.0,1,4,1.0,1.0,1,1,...,0.002203,1.509618,,,-0.004063,0.504192,-0.003378,0.270064,0.001852,1.2729


362838    0
186060    0
335405    1
265053    1
184287    0
Name: target, dtype: int64

In [10]:
print('Calibration Validation Set')
print(f"X_val_calibration shape: {X_val_calibration.shape}")
print(f"y_val_calibration shape: {y_val_calibration.shape}")
display(X_val_calibration.head())
display(y_val_calibration.head())

Calibration Validation Set
X_val_calibration shape: (36713, 1461)
y_val_calibration shape: (36713,)


Unnamed: 0,customer_ID,B_30_count,B_30_last,B_30_first,B_30_nunique,B_38_count,B_38_last,B_38_first,B_38_nunique,D_114_count,...,D_141_sub,D_141_frac,D_142_sub,D_142_frac,D_143_sub,D_143_frac,D_144_sub,D_144_frac,D_145_sub,D_145_frac
68626,2662c770d10b106197fa729dc6416ef88c23e5c5525ba9...,13,0.0,0.0,1,13,2.0,2.0,1,13,...,0.000104,1.012796,,,0.005976,2.724419,0.004832,1.955449,-0.001436,0.788652
321531,b30cd33df77d39279a55f6ca641dc513cfe4c5679d02b5...,13,0.0,0.0,1,13,2.0,2.0,1,13,...,0.010573,1.010926,0.048838,1.097987,-0.003414,0.996598,0.00291,1.795558,-0.006638,0.933419
56360,1f90f6b626bce52fd637dee12ce96520054170ce63a75a...,13,0.0,0.0,1,13,2.0,3.0,3,13,...,0.005003,3.074819,,,-0.001901,0.40411,-0.004927,0.218985,0.001979,2.421412
117268,412888f2d5f1697e6e9e8bae69becf0c92c623310f5cdc...,13,0.0,0.0,1,13,1.0,3.0,4,13,...,0.007655,44.26008,,,0.006475,9.608882,0.004884,2.357053,-0.002959,0.584876
211738,75d32564db40f5de3f5607dfde7ed2f8fb7ed52eca536b...,12,0.0,0.0,1,12,2.0,1.0,3,8,...,-0.006652,0.313799,,,-1.7e-05,0.995637,0.001604,1.335535,0.005148,3.155907


68626     0
321531    0
56360     0
117268    0
211738    0
Name: target, dtype: int64

In [11]:
print('Threshold Validation Set')
print(f"X_val_threshold shape: {X_val_threshold.shape}")
print(f"y_val_threshold shape: {y_val_threshold.shape}")
display(X_val_threshold.head())
display(y_val_threshold.head())

Threshold Validation Set
X_val_threshold shape: (33042, 1461)
y_val_threshold shape: (33042,)


Unnamed: 0,customer_ID,B_30_count,B_30_last,B_30_first,B_30_nunique,B_38_count,B_38_last,B_38_first,B_38_nunique,D_114_count,...,D_141_sub,D_141_frac,D_142_sub,D_142_frac,D_143_sub,D_143_frac,D_144_sub,D_144_frac,D_145_sub,D_145_frac
145006,50b0f152094b1ea5bc1f53a64ba77757593cc20031fce6...,13,1.0,1.0,1,13,6.0,5.0,3,13,...,0.857841,143.142502,0.021808,6.500007,0.997483,273.244907,-0.002779,0.68638,0.263449,27.862875
413017,e647da873fef16d598a8840fc450b32f2e28c6296146e7...,12,0.0,0.0,1,12,3.0,1.0,5,11,...,0.004329,2.819182,,,0.004228,2.201138,-0.000633,0.677856,-0.002182,0.515654
243548,8794d2ba75b3e804027531b48e9b9cc9c048ff32312fa1...,13,1.0,0.0,3,13,3.0,7.0,3,13,...,0.009898,1.010463,0.288832,1.599306,-0.002224,0.997798,0.144799,1.373219,0.094288,1.343375
297160,a574e245e769318307ababb5eeab66114421207a311bfe...,13,0.0,0.0,1,13,1.0,1.0,1,13,...,0.004405,2.03344,,,0.003519,2.147678,-0.003416,0.634543,-0.001631,0.786354
271821,975ff1130b652712363e32a2d742a3560e339c014ce961...,13,0.0,0.0,2,13,3.0,3.0,1,13,...,0.005814,1.00635,0.049682,1.196684,-0.000881,0.999125,-0.002988,0.994485,-0.001426,0.985217


145006    1
413017    0
243548    0
297160    0
271821    0
Name: target, dtype: int64

In [12]:
print('Test Set')
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")
display(X_test.head())
display(y_test.head())

Test Set
X_test shape: (91783, 1461)
y_test shape: (91783,)


Unnamed: 0,customer_ID,B_30_count,B_30_last,B_30_first,B_30_nunique,B_38_count,B_38_last,B_38_first,B_38_nunique,D_114_count,...,D_141_sub,D_141_frac,D_142_sub,D_142_frac,D_143_sub,D_143_frac,D_144_sub,D_144_frac,D_145_sub,D_145_frac
33412,12c761e40348fa242de0356426fa9547f0feea34d1f57b...,13,0.0,0.0,1,13,5.0,1.0,3,11,...,0.001539,1.26503,,,-2.8e-05,0.995717,-0.007654,0.02791,0.001819,1.832629
350485,c34b0cf3175108892e42e9382e8177847ac15636870604...,3,0.0,0.0,1,3,4.0,1.0,2,1,...,-0.002279,0.751526,,,-0.003474,0.475973,0.002824,1.566516,-0.007112,0.070716
36829,14aff3c0e0ed2aaaef6deb127f2541b67fdd0002615bfd...,13,1.0,2.0,3,13,3.0,3.0,2,13,...,0.020072,1.022241,0.052345,1.21339,0.003577,1.003564,-0.213398,0.641255,-0.091631,0.505844
444951,f82be644eb90ab65d737b6dfc5de670559fa2ca23c98ea...,13,0.0,0.0,1,13,2.0,1.0,2,13,...,-0.000378,0.919788,,,0.004012,2.043321,0.00146,2.007422,0.002279,1.546141
17537,09dec6d53f0f12db6edcaecbb4d2bddf41f220ba9569d1...,13,0.0,0.0,1,13,1.0,2.0,2,13,...,-0.004239,0.356514,,,0.008933,78.804175,0.004662,30.032338,-0.003442,0.083617


33412     0
350485    1
36829     0
444951    0
17537     0
Name: target, dtype: int64

In [13]:
# Save data
X_train.to_csv("data/transformed/X_train.csv", index=False)
X_val_calibration.to_csv("data/transformed/X_val_calibration.csv", index=False)
X_val_threshold.to_csv("data/transformed/X_val_threshold.csv", index=False)
X_test.to_csv("data/transformed/X_test.csv", index=False)

y_train.to_csv("data/transformed/y_train.csv", index=False)
y_val_calibration.to_csv("data/transformed/y_val_calibration.csv", index=False)
y_val_threshold.to_csv("data/transformed/y_val_threshold.csv", index=False)
y_test.to_csv("data/transformed/y_test.csv", index=False)