# Feature Engineering

## 1. Setup

In [26]:
import pandas as pd
import numpy as np
from summarytools import dfSummary
from sklearn.preprocessing import MinMaxScaler

from env_setup import *
from functions.t_test import *

# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [27]:
# Load data
# NOTE(review): `dataout` and `dataset` are not defined in this notebook;
# presumably they come from `from env_setup import *` - confirm.
df_raw = pd.read_csv(fr"{dataout}//{dataset}_EDA.csv")

# Working copy, so df_raw stays exactly as loaded
df_ori = df_raw.copy()

# The preview has to be the cell's last expression, otherwise it is evaluated and discarded
df_raw.head()

## 2. Feature Engineering

In [28]:
# Split categorical and numeric
# object-dtype columns are treated as categorical; everything else is numeric,
# except the target 'Churned', which is kept out of both lists.
l_cols_cat = []
l_cols_num = []
for col_name in df_ori:
    if df_ori[col_name].dtype == object:
        l_cols_cat.append(col_name)
    elif col_name != 'Churned':
        l_cols_num.append(col_name)

print(f"Categorical columns: {', '.join(l_cols_cat)}")
print(f"Numeric columns: {', '.join(l_cols_num)}")

Categorical columns: Gender, Location
Numeric columns: Age, Income, Tenure, NumSupportCalls, NumComplaints, Purchase, Refund, Subscription Renewal, Support Fee, Upgrade, txn_mean, txn_count


### 2.1. Categorical One Hot Encoding
Apply one-hot encoding to categorical variables for modelling

In [29]:
# Summary Statistics for Categorical
## Encode categorical
# dtype=int makes get_dummies emit 0/1 integers directly, so no per-column
# bool -> int conversion loop is needed afterwards.
df_ori_encoded = pd.get_dummies(df_ori, columns=l_cols_cat, dtype=int)

# The dummy columns are exactly the columns get_dummies added. Comparing against
# the original column set avoids substring matching, which would also select any
# unrelated column whose name merely contains a categorical column's name.
l_cols_encoded = [col for col in df_ori_encoded.columns if col not in df_ori.columns]

# Count of each category level per churn class
stats_cat = df_ori_encoded.groupby('Churned')[l_cols_encoded].sum().reset_index()

## Scale
# Share of each level within its churn class = level count / class size, which is
# the group mean of the 0/1 dummies. This also holds if 'Churned' ever takes more
# than two values, and avoids recomputing the class sizes for every column.
stats_cat_scaled = df_ori_encoded.groupby('Churned')[l_cols_encoded].mean().reset_index()
stats_cat_scaled

Unnamed: 0,Churned,Gender_Female,Gender_Male,Location_Rural,Location_Suburban,Location_Urban
0,0,0.503254,0.496746,0.199313,0.300423,0.500264
1,1,0.498825,0.501175,0.194814,0.306575,0.498611


### 2.2. Numerical Binning
Apply binning to numerical variables.

This is mostly unnecessary, as the bands are not distinct enough, but the bins are created in case they are needed for modelling.

In [31]:
# Create Bins
df_ori_binned = df_ori.copy()

# Bin edges per variable. pd.cut turns consecutive edges into right-closed
# intervals (lo, hi]; values outside the edges become NaN.
d_bins = {
    'Age': list(range(0, 80, 10)), 
    'Income': list(range(0, 200_000, 10_000)), 
    'Tenure': [0, 5], # Hypothesis test cut off = 5
    #'TransactionDate': list(range(0, 390, 30)),
    #'TransactionAmount': list(range(0, 600, 100)), 
    # 'NumSupportCalls': [], # Not required - too small and no major difference in hypothesis test
    # 'NumComplaints': [] # Not required - too small and no major difference in hypothesis test
}

for col_name, edges in d_bins.items():
    df_ori_binned[f'{col_name}_bin'] = pd.cut(df_ori_binned[col_name], bins=edges)

# Fix nans
# Tenure has a single (0, cut-off] bin, so every other row comes out of pd.cut as NaN.
# Only rows whose Tenure really exceeds the cut-off are labelled '>cut-off'; missing
# or non-positive Tenure values stay NaN instead of being silently counted as '>5'.
tenure_cutoff = d_bins['Tenure'][-1]
tenure_over_label = f'>{tenure_cutoff}'
df_ori_binned['Tenure_bin'] = df_ori_binned['Tenure_bin'].cat.add_categories(tenure_over_label)
df_ori_binned.loc[df_ori_binned['Tenure'] > tenure_cutoff, 'Tenure_bin'] = tenure_over_label

In [32]:
# View Statistics
# Summarise only the derived *_bin columns (one per key of d_bins)
l_cols_bin = [f'{name}_bin' for name in d_bins]
dfSummary(df_ori_binned[l_cols_bin])

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,Age_bin [category],"1. (30, 40] 2. (40, 50] 3. (50, 60] 4. (20, 30] 5. (60, 70] 6. (10, 20]","19,493 (19.5%) 19,292 (19.3%) 19,138 (19.1%) 19,065 (19.1%) 17,279 (17.3%) 5,733 (5.7%)",,0 (0.0%)
2,Income_bin [category],"1. (70000, 80000] 2. (50000, 60000] 3. (90000, 100000] 4. (60000, 70000] 5. (80000, 90000] 6. (20000, 30000] 7. (40000, 50000] 8. (30000, 40000] 9. (10000, 20000]","12,712 (12.7%) 12,611 (12.6%) 12,520 (12.5%) 12,507 (12.5%) 12,501 (12.5%) 12,474 (12.5%) 12,432 (12.4%) 12,241 (12.2%) 2 (0.0%)",,0 (0.0%)
3,Tenure_bin [category],"1. (0, 5] 2. >5","55,688 (55.7%) 44,312 (44.3%)",,0 (0.0%)


In [33]:
# One Hot Encoding
# Dummy-encode every *_bin column, emitting integer 0/1 indicators directly
l_cols_bin = [f'{name}_bin' for name in d_bins]
df_ori_encoded_num = pd.get_dummies(df_ori_binned[l_cols_bin], columns=l_cols_bin, dtype=int)

df_ori_encoded_num.head()

Unnamed: 0,"Age_bin_(0, 10]","Age_bin_(10, 20]","Age_bin_(20, 30]","Age_bin_(30, 40]","Age_bin_(40, 50]","Age_bin_(50, 60]","Age_bin_(60, 70]","Income_bin_(0, 10000]","Income_bin_(10000, 20000]","Income_bin_(20000, 30000]","Income_bin_(30000, 40000]","Income_bin_(40000, 50000]","Income_bin_(50000, 60000]","Income_bin_(60000, 70000]","Income_bin_(70000, 80000]","Income_bin_(80000, 90000]","Income_bin_(90000, 100000]","Income_bin_(100000, 110000]","Income_bin_(110000, 120000]","Income_bin_(120000, 130000]","Income_bin_(130000, 140000]","Income_bin_(140000, 150000]","Income_bin_(150000, 160000]","Income_bin_(160000, 170000]","Income_bin_(170000, 180000]","Income_bin_(180000, 190000]","Tenure_bin_(0, 5]",Tenure_bin_>5
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [34]:
# Attach the target so the bin dummies can be summarised per churn class.
# df_ori_encoded_num already holds integer 0/1 indicators from the previous cell,
# so the conversion loop that used to be repeated here was a no-op and is removed.
df_ori_encoded_label = pd.concat([df_ori_encoded_num, df_ori['Churned']], axis=1)
l_cols_bin_dummies = list(df_ori_encoded_num.columns)
bins_by_churn = df_ori_encoded_label.groupby('Churned')[l_cols_bin_dummies]

# Count of customers per bin within each churn class
stats_bin = bins_by_churn.sum().reset_index()

## Scale
# Share of each bin within its churn class = bin count / class size, which is the
# group mean of the 0/1 dummies. Computing it on the frame being summarised removes
# the dependency on the categorical frame (df_ori_encoded) for the class sizes.
stats_bin_scaled = bins_by_churn.mean().reset_index()
stats_bin_scaled

Unnamed: 0,Churned,"Age_bin_(0, 10]","Age_bin_(10, 20]","Age_bin_(20, 30]","Age_bin_(30, 40]","Age_bin_(40, 50]","Age_bin_(50, 60]","Age_bin_(60, 70]","Income_bin_(0, 10000]","Income_bin_(10000, 20000]","Income_bin_(20000, 30000]","Income_bin_(30000, 40000]","Income_bin_(40000, 50000]","Income_bin_(50000, 60000]","Income_bin_(60000, 70000]","Income_bin_(70000, 80000]","Income_bin_(80000, 90000]","Income_bin_(90000, 100000]","Income_bin_(100000, 110000]","Income_bin_(110000, 120000]","Income_bin_(120000, 130000]","Income_bin_(130000, 140000]","Income_bin_(140000, 150000]","Income_bin_(150000, 160000]","Income_bin_(160000, 170000]","Income_bin_(170000, 180000]","Income_bin_(180000, 190000]","Tenure_bin_(0, 5]",Tenure_bin_>5
0,0,0.0,0.057882,0.191583,0.194461,0.192598,0.191708,0.171769,0.0,2.8e-05,0.125594,0.122786,0.123954,0.126331,0.125372,0.126609,0.124371,0.124955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.479339,0.520661
1,1,0.0,0.055916,0.188261,0.196132,0.193746,0.190541,0.175404,0.0,0.0,0.122551,0.121447,0.125258,0.125543,0.124297,0.128428,0.126647,0.125828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.755503,0.244497


## 3. Normalisation
Min-max normalisation is used (rather than standardisation), since most variables are not normally distributed.

In [35]:
# Min-max scale every numeric feature (MinMaxScaler's default output range is [0, 1])
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_raw[l_cols_num])

# fit_transform returns a bare array, so rebuild the frame with df_raw's index.
# Without it the frame gets a fresh RangeIndex and a later column-wise concat
# would misalign rows whenever df_raw's index is not the default 0..n-1.
df_ori_norm = pd.DataFrame(
    scaled_values,
    columns=[f'{i}_norm' for i in l_cols_num],
    index=df_raw.index,
)

## 4. Recombine

In [41]:
# Inspect the encoded frame's columns: the non-categorical originals plus the Gender_*/Location_* dummies
df_ori_encoded.columns

Index(['Age', 'Income', 'Tenure', 'NumSupportCalls', 'NumComplaints',
       'Churned', 'Purchase', 'Refund', 'Subscription Renewal', 'Support Fee',
       'Upgrade', 'txn_mean', 'txn_count', 'Gender_Female', 'Gender_Male',
       'Location_Rural', 'Location_Suburban', 'Location_Urban'],
      dtype='object')

In [42]:
# Column-wise recombination of the original, normalised and dummy-encoded features.
# The dummy columns are selected through l_cols_encoded instead of dropping a
# hard-coded list of every other column, so the cell keeps working when the
# input schema gains or loses columns.
df_fe = pd.concat(
    [
        df_raw, # Original
        df_ori_norm, # Normalised
        df_ori_encoded[l_cols_encoded], # Categorical dummy
    ],
    axis = 1)

# concat(axis=1) aligns on the index; a changed row count means the parts were misaligned
assert len(df_fe) == len(df_raw), "row count changed during recombination - check index alignment"

df_fe.head()

Unnamed: 0,Age,Gender,Income,Tenure,Location,NumSupportCalls,NumComplaints,Churned,Purchase,Refund,Subscription Renewal,Support Fee,Upgrade,txn_mean,txn_count,Age_norm,Income_norm,Tenure_norm,NumSupportCalls_norm,NumComplaints_norm,Purchase_norm,Refund_norm,Subscription Renewal_norm,Support Fee_norm,Upgrade_norm,txn_mean_norm,txn_count_norm,Gender_Female,Gender_Male,Location_Rural,Location_Suburban,Location_Urban
0,56.0,Female,91512.0,4.0,Urban,2.0,3.0,0,705.85,446.6,304.47,278.91,440.46,11.75,16.0,0.745098,0.893911,0.375,0.222222,0.75,0.581425,0.367875,0.249975,0.228054,0.360147,0.040262,0.833333,1,0,0,0,1
1,69.0,Female,87313.0,2.0,Urban,3.0,2.0,1,0.0,0.0,420.28,0.0,96.89,106.0,2.0,1.0,0.841423,0.125,0.333333,0.5,0.0,0.0,0.345057,0.0,0.079223,0.393258,0.055556,1,0,0,0,1
2,46.0,Male,95615.0,2.0,Suburban,4.0,3.0,1,241.31,314.55,206.98,95.22,407.96,48.0,12.0,0.54902,0.945199,0.125,0.444444,0.75,0.198773,0.259102,0.169934,0.077858,0.333573,0.17603,0.611111,0,1,0,1,0
3,32.0,Male,89271.0,8.0,Rural,1.0,0.0,1,291.22,1011.06,93.01,383.95,331.68,101.3,10.0,0.27451,0.865898,0.875,0.111111,0.0,0.239885,0.832834,0.076363,0.313941,0.271202,0.375655,0.5,0,1,1,0,0
4,60.0,Male,68751.0,8.0,Suburban,8.0,0.0,0,0.0,0.0,0.0,160.62,109.94,23.333333,3.0,0.823529,0.609395,0.875,0.888889,0.0,0.0,0.0,0.0,0.131333,0.089894,0.083645,0.111111,0,1,0,1,0


## 5. Export

In [43]:
# Export datasets
# Write the engineered feature table to CSV, leaving the row index out of the file
fe_path = fr"{dataout}//{dataset}_FE.csv"
df_fe.to_csv(fe_path, index=False)