# Data Preprocessing

**Prerequisite : EDA.ipynb**

In [44]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.covariance import EllipticEnvelope

# For custom preprocessing
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer

In [47]:
df = pd.read_excel("../artifacts/raw/E Commerce Dataset.xlsx", sheet_name="E Comm", index_col=0)

### One Hot Encoding

In [50]:
# One hot encoding to prepare for iterative imputation
categorical_columns = ["PreferredLoginDevice", "CityTier", "PreferredPaymentMode", "Gender", "PreferedOrderCat", "MaritalStatus"] # for one hot encoding, exclude columns that are already binary
df_one_hot = pd.get_dummies(df, columns=categorical_columns, dtype=int)

df_one_hot.head()

Unnamed: 0_level_0,Churn,Tenure,WarehouseToHome,HourSpendOnApp,NumberOfDeviceRegistered,SatisfactionScore,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,...,Gender_Male,PreferedOrderCat_Fashion,PreferedOrderCat_Grocery,PreferedOrderCat_Laptop & Accessory,PreferedOrderCat_Mobile,PreferedOrderCat_Mobile Phone,PreferedOrderCat_Others,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50001,1,4.0,6.0,3.0,3,2,9,1,11.0,1.0,...,0,0,0,1,0,0,0,0,0,1
50002,1,,8.0,3.0,4,3,7,1,15.0,0.0,...,1,0,0,0,1,0,0,0,0,1
50003,1,,30.0,2.0,4,3,6,1,14.0,0.0,...,1,0,0,0,1,0,0,0,0,1
50004,1,0.0,15.0,2.0,4,5,8,0,23.0,0.0,...,1,0,0,1,0,0,0,0,0,1
50005,1,0.0,12.0,,3,5,3,0,11.0,1.0,...,1,0,0,0,1,0,0,0,0,1


### Handle Null values

In [53]:
# Copy of dataset with dropped null values
df_dropped = df_one_hot.dropna()

print("Before: ", df_one_hot.shape)
print("After: ", df_dropped.shape)
print("Dropped: ", df_one_hot.shape[0] - df_dropped.shape[0])

Before:  (5630, 37)
After:  (3774, 37)
Dropped:  1856


In [54]:
# Imputing with Iterative Imputer
imputer = IterativeImputer(random_state=0)
df_imputed = pd.DataFrame(imputer.fit_transform(df_one_hot), columns=df_one_hot.columns)

df_imputed.head()

Unnamed: 0,Churn,Tenure,WarehouseToHome,HourSpendOnApp,NumberOfDeviceRegistered,SatisfactionScore,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,...,Gender_Male,PreferedOrderCat_Fashion,PreferedOrderCat_Grocery,PreferedOrderCat_Laptop & Accessory,PreferedOrderCat_Mobile,PreferedOrderCat_Mobile Phone,PreferedOrderCat_Others,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,1.0,4.0,6.0,3.0,3.0,2.0,9.0,1.0,11.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,2.451829,8.0,3.0,4.0,3.0,7.0,1.0,15.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,3.649009,30.0,2.0,4.0,3.0,6.0,1.0,14.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,15.0,2.0,4.0,5.0,8.0,0.0,23.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,12.0,2.371956,3.0,5.0,3.0,0.0,11.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Handle outliers

### Features Selection using Variational Inflation Factor

### Standardizing

### Creating pipeline

In [46]:
categorical_columns = ["PreferredLoginDevice", "CityTier", "PreferredPaymentMode", "Gender", "PreferedOrderCat", "MaritalStatus"] # for one hot encoding, exclude columns that are already binary
numerical_columns = [col for col in df.columns if col not in categorical_columns]

# Preprocessing pipeline 1
preprocessing_pipeline_inference= ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', IterativeImputer()),
            ('scaler', StandardScaler())
        ]), numerical_columns),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder(drop='first'))
        ]), categorical_columns)
    ])

# Apply preprocessing pipeline to your data
data_imputed_with_outliers = preprocessing_pipeline_inference.fit_transform(df)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 5067 and the array at index 1 has size 5630

# Model Training

### 