In [15]:
import pandas as pd
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
data = pd.read_excel('../data/E Commerce Dataset.xlsx', sheet_name='E Comm')

# Data Preprocessing

## Feature Selection¶


In this section we will drop all features tha does not provide any value in our target variable. At first we will drop the ```CustomerID```
as it is irrelevant to the target value. Secondly we will perform chi-square  test on categorical features in order to see, which of them may confirm
the initial hypothesis of being redundant.

In [3]:
categorical = ['Tenure', 'HourSpendOnApp','PreferredLoginDevice', 'CityTier', 'NumberOfDeviceRegistered','Complain','PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'SatisfactionScore', 'MaritalStatus']

In [4]:
chi2_array, p_array = [], []
for column in categorical:

    crosstab = pd.crosstab(data[column],data['Churn'])
    # Defining cross tabulation 
    '''Cross Tabulation. also known as contingency tables or cross tabs,
    cross tabulation groups variables to understand the correlation between different variables. 
    It also shows how correlations change from one variable grouping to another.
    It is usually used in statistical analysis to find patterns, trends,
    and probabilities within raw data.'''
    
    chi2, p, dof, expected = chi2_contingency(crosstab)
    chi2_array.append(chi2)
    p_array.append(p)

df_chi = pd.DataFrame({
    'Variable': categorical,
    'Chi-square': chi2_array,
    'p-value': p_array
})
df_chi.sort_values(by='Chi-square', ascending=False)

Unnamed: 0,Variable,Chi-square,p-value
0,Tenure,1485.718304,6.720735e-290
5,Complain,350.925455,2.664461e-78
8,PreferedOrderCat,288.639394,2.770833e-60
10,MaritalStatus,188.67104,1.073011e-41
4,NumberOfDeviceRegistered,81.108815,4.918443e-16
6,PreferredPaymentMode,77.89682,9.708709e-15
2,PreferredLoginDevice,73.536794,1.075692e-16
9,SatisfactionScore,69.865388,2.423335e-14
3,CityTier,40.982404,1.2612e-09
1,HourSpendOnApp,9.473528,0.09160322


According to the table, the feature ```HourSpendOnApp``` has a small chi-square a p-value greater than 0.05 which is the standard cut-off value
.Therefore our initial hypothesis is confirmed and ```HourSpendOnApp``` does not convey any useful information. In the next step, i will drop all the unnecesary columns and null values of the dataset

In [5]:
data_model = data.dropna() # dropping null values

In [6]:
data_model = data_model.drop(columns = ['CustomerID','HourSpendOnApp' ]) # dropping unnecesary columns

In [7]:
data_model

Unnamed: 0,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,1,4.0,Mobile Phone,3,6.0,Debit Card,Female,3,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,159.93
3,1,0.0,Phone,3,15.0,Debit Card,Male,4,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134.07
5,1,0.0,Computer,1,22.0,Debit Card,Female,5,Mobile Phone,5,Single,2,1,22.0,4.0,6.0,7.0,139.19
11,1,11.0,Mobile Phone,1,6.0,Debit Card,Male,4,Fashion,3,Single,10,1,13.0,0.0,1.0,0.0,153.81
12,1,0.0,Phone,1,11.0,COD,Male,3,Mobile,3,Single,2,1,13.0,2.0,2.0,2.0,134.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5624,0,1.0,Mobile Phone,3,12.0,UPI,Female,5,Mobile Phone,3,Single,2,0,19.0,2.0,2.0,1.0,154.66
5625,0,10.0,Computer,1,30.0,Credit Card,Male,2,Laptop & Accessory,1,Married,6,0,18.0,1.0,2.0,4.0,150.71
5627,0,1.0,Mobile Phone,1,11.0,Debit Card,Male,2,Laptop & Accessory,4,Married,3,1,21.0,1.0,2.0,4.0,186.42
5628,0,23.0,Computer,3,9.0,Credit Card,Male,5,Laptop & Accessory,4,Married,4,0,15.0,2.0,2.0,9.0,178.90


## Encoding Categorical Features¶

In order to implement machine learning algorithms, we have to convert(encode) all categorical features to numbers.

On our dataset, five categorical features require encoding.

1. for ```PreferredLoginDevice``` , ```Gender``` , ```MaritalStatus``` we will use scikit-learn's ```LabelEncoder()``` which maps each unique label to an integer


2. for ```PreferredPaymentMode```  i will map the values as:
Debit card = 1,
E-wallet = 2,
credit card = 3,
Other methods = 4
in order to to make all values equally important to the feature

3. for ```PreferedOrderCat``` i will map the values as:
Laptop & Accessory = 1,
Mobile = 2,
Mobile Phone = 3,
Other = 4
in order to to make all values equally important to the feature



In [8]:
data_model['PreferredLoginDevice'] = LabelEncoder().fit_transform(data_model['PreferredLoginDevice'])

In [9]:
data_model['Gender'] = LabelEncoder().fit_transform(data_model['Gender'])

In [10]:
data_model['MaritalStatus'] = LabelEncoder().fit_transform(data_model['MaritalStatus'])

In [11]:
data_model['PreferredPaymentMode'] = data_model['PreferredPaymentMode'].map({'Debit Card': 1,'E-wallet': 2,'Credit Card': 3, 'CC':4,
                                                                             'COD':4, 'UPI':4, 'Cash on Delivery':4 })

In [12]:
data_model['PreferedOrderCat'] = data_model['PreferedOrderCat'].map({'Laptop & Accessory': 1,'Mobile': 2,'Mobile Phone': 3, 'Others':4,
                                                                             'Fashion':4, 'Grocery':4 })

In [13]:
data_model

Unnamed: 0,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,1,4.0,1,3,6.0,1.0,0,3,1,2,2,9,1,11.0,1.0,1.0,5.0,159.93
3,1,0.0,2,3,15.0,1.0,1,4,1,5,2,8,0,23.0,0.0,1.0,3.0,134.07
5,1,0.0,0,1,22.0,1.0,0,5,3,5,2,2,1,22.0,4.0,6.0,7.0,139.19
11,1,11.0,1,1,6.0,1.0,1,4,4,3,2,10,1,13.0,0.0,1.0,0.0,153.81
12,1,0.0,2,1,11.0,4.0,1,3,2,3,2,2,1,13.0,2.0,2.0,2.0,134.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5624,0,1.0,1,3,12.0,4.0,0,5,3,3,2,2,0,19.0,2.0,2.0,1.0,154.66
5625,0,10.0,0,1,30.0,3.0,1,2,1,1,1,6,0,18.0,1.0,2.0,4.0,150.71
5627,0,1.0,1,1,11.0,1.0,1,2,1,4,1,3,1,21.0,1.0,2.0,4.0,186.42
5628,0,23.0,0,3,9.0,3.0,1,5,1,4,1,4,0,15.0,2.0,2.0,9.0,178.90


## Scaling

Feature scaling is a method used to normalize the range of independent variables or features of data. In data processing, it is also known as data normalization and is generally performed during the data preprocessing step.

the scaling procedure will be performed with MinMax scaler.

In [16]:
scaler = MinMaxScaler()

In [None]:
data_df['tenure'] = scaler.fit_transform(data_df[['tenure']])


In [20]:
for column in data_model.drop(columns = 'Churn').columns:
    data_model[column] = scaler.fit_transform(data_model[[column]])

In [21]:
data_model

Unnamed: 0,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,1,0.078431,0.5,1.0,0.008197,0.000000,0.0,0.4,0.000000,0.25,1.0,0.380952,1.0,0.000000,0.0625,0.000000,0.108696,0.492107
3,1,0.000000,1.0,1.0,0.081967,0.000000,1.0,0.6,0.000000,1.00,1.0,0.333333,0.0,0.800000,0.0000,0.000000,0.065217,0.412536
5,1,0.000000,0.0,0.0,0.139344,0.000000,0.0,0.8,0.666667,1.00,1.0,0.047619,1.0,0.733333,0.2500,0.333333,0.152174,0.428290
11,1,0.215686,0.5,0.0,0.008197,0.000000,1.0,0.6,1.000000,0.50,1.0,0.428571,1.0,0.133333,0.0000,0.000000,0.000000,0.473276
12,1,0.000000,1.0,0.0,0.049180,1.000000,1.0,0.4,0.333333,0.50,1.0,0.047619,1.0,0.133333,0.1250,0.066667,0.043478,0.413582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5624,0,0.019608,0.5,1.0,0.057377,1.000000,0.0,0.8,0.666667,0.50,1.0,0.047619,0.0,0.533333,0.1250,0.066667,0.021739,0.475892
5625,0,0.196078,0.0,0.0,0.204918,0.666667,1.0,0.2,0.000000,0.00,0.5,0.238095,0.0,0.466667,0.0625,0.066667,0.086957,0.463737
5627,0,0.019608,0.5,0.0,0.049180,0.000000,1.0,0.2,0.000000,0.75,0.5,0.095238,1.0,0.666667,0.0625,0.066667,0.086957,0.573618
5628,0,0.450980,0.0,1.0,0.032787,0.666667,1.0,0.8,0.000000,0.75,0.5,0.142857,0.0,0.266667,0.1250,0.066667,0.195652,0.550478
