In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

# change current working directory to the root of the project
import os
os.chdir(os.path.dirname(os.getcwd()))

# Purpose
- Purpose of this notebook is to finalize features to be used for training

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression

import warnings
from IPython.display import display

In [3]:
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 4))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)

  plt.style.use("seaborn-whitegrid")


# Load Data

In [4]:
nrows = 10000
# import csv data
df_data = pd.read_csv('data/raw/train_data.csv', nrows=nrows)
df_labels = pd.read_csv('data/raw/train_labels.csv', nrows=nrows)

display(df_data.head())
display(df_labels.head())

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,...,,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,...,,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,...,,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,...,,,,0.006117,0.004516,0.0032,,0.008419,0.006527,0.0096
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,...,,,,0.003671,0.004946,0.008889,,0.00167,0.008126,0.009827


Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0


# Feature Engineering

In [None]:
# 1. Combine Customer_IDs, with lag features of selected features
# 2. Combine Customer_IDs, with average of selected features
# 3. Combine Customer_IDs, with max and min of selected features
# 4. Combine Customer_IDs, with last and first of selected features
# 5. Combine Customer_IDs, difference between last and first of selected features
# 6. Combine Customer_IDs, fraction of last and first of selected features
# 7. Combine Customer_IDs, difference between last and average of selected features
# 8. Combine Customer_IDs, difference between last and lag1 of selected features

In [46]:
def feature_engineering(df: pd.DataFrame, df_labels: pd.DataFrame):
    warnings.filterwarnings("ignore")
    '''Feature Engineering'''

    features = df.drop(["customer_ID", "S_2"], axis = 1).columns.to_list()

    cat_cols = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"]
    
    #select all columns except cat_cols
    num_cols = [col for col in features if col not in cat_cols]

    # feature engineering on categorical columns
    print('Feature Engineering on categorical columns...')
    df_cat_aggregated = df.groupby('customer_ID')[cat_cols].agg(['count', 'last', 'first', 'nunique'])
    df_cat_aggregated.columns = ["_".join(x) for x in df_cat_aggregated.columns]
    df_cat_aggregated.reset_index(inplace = True)

    # feature engineering on numerical columns
    print('Feature Engineering on numerical columns...')
    df_num_aggregated = df.groupby('customer_ID')[num_cols].agg(['last', 'first', 'mean', 'std', 'min', 'max'])
    df_num_aggregated.columns = ["_".join(x) for x in df_num_aggregated.columns]
    df_num_aggregated.reset_index(inplace = True)

    # feature engineering on lag features
    print('Feature Engineering on lag features...')
    for col in df_num_aggregated.columns:
        # subtract columns that have last and first in their name
        if 'last' in col and col.replace('last', 'first') in df_num_aggregated.columns:
            df_num_aggregated[col.replace('last', 'sub')] = df_num_aggregated[col] - df_num_aggregated[col.replace('last', 'first')]
        # divide columns that have last and first in their name
        if 'last' in col and col.replace('last', 'first') in df_num_aggregated.columns:
            df_num_aggregated[col.replace('last', 'frac')] = df_num_aggregated[col] / df_num_aggregated[col.replace('last', 'first')]

    # join the two dataframes with df_labels on customer_ID
    df = df_labels.merge(df_cat_aggregated, on = 'customer_ID', how = 'left')
    df = df.merge(df_num_aggregated, on = 'customer_ID', how = 'left')

    print("Dimensions after feature engineering", df.shape )

    return df   

In [47]:
df = feature_engineering(df_data, df_labels)

Feature Engineering on categorical columns...
Feature Engineering on numerical columns...
Feature Engineering on lag features...
Dimensions after feature engineering (10000, 1462)


In [48]:
# check length of df == length of df_labels
assert len(df) == len(df_labels)

# Save df

In [66]:
# save df to data/transformed
df.to_csv('data/transformed/df_transformed.csv', index = False)