In [46]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import pickle
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv('challenge_dataset.csv')

### Data Pre-Processing & Transformations

List of Findings and Required Transformations for Univariate Analysis
(These transformations will be recorded as a JSON dump for mapping with real-time predictions.)

1. Tenure: Handle missing values by filling them with the mean value.
   
3. PreferredLoginDevice: Combine Mobile Phone and Phone into Phone, then apply label encoding.
   
5. WarehouseToHome: Fill missing values with the mean value.
6. PreferredPaymentMode:
       Combine:
           * Cash on Delivery and COD as Cash on Delivery
           * CC and Credit Card as Credit Card
           * E-Wallet and UPI as E-Wallet
           * Apply label encoding.

7. HourSpendOnApp: Fill missing values with the mean (average hours spent).
   
9. PreferredOrderCat:
        * Combine Mobile Phone and Phone as Mobile Phone.
        * Fill missing values with the mode.
   
10. SatisfactionScore: Fill missing values with the mean (average satisfaction score).
    
12. Complain: Fill missing values with the mode.
    
14. OrderAmountHikeFromlastYear: Fill missing values with the mean.
    
16. CouponUsed: Fill missing values with the mean (average coupons used).
    
18. OrderCount: Fill missing values with the mean.
    
20. DaySinceLastOrder: Fill missing values with the mean.
    
22. CashbackAmount: Fill missing values with the mean.

In [3]:
missing_maps = {

    "Tenure": "mean",
    "PreferredLoginDevice": "mode",
    "CityTier": "mode",
    "WarehouseToHome": "mean",
    "PreferredPaymentMode": "mode",
    "HourSpendOnApp":"mean",
    "NumberOfDeviceRegistered": "mean",
    "PreferedOrderCat": "mode",
    "SatisfactionScore":"mean",
    "NumberOfAddress":"mean",
    "Complain":"mode",
    "OrderAmountHikeFromlastYear":"mean",
    "CouponUsed": "mean",
    "OrderCount":"mean",
    "DaySinceLastOrder":"mean",
    "CashbackAmount":"mean"
}


value_maps = {
    "PreferredLoginDevice": {
        "Mobile Phone": "Mobile Phone",
        "Phone": "Mobile Phone",
        "Computer": "Computer"
    },
    "PreferredPaymentMode": {
        "Cash on Delivery": "Cash on Delivery",
        "COD": "Cash on Delivery",
        "CC": "Credit Card",
        "Credit Card": "Credit Card",
        "E-Wallet": "E-Wallet",
        "UPI": "E-Wallet",
        "Debit Card": "Debit Card"
    },
    "PreferedOrderCat": {
    "Mobile Phone": "Mobile Phone",
    "Phone": "Mobile Phone",
    "Laptop & Accessory": "Laptop & Accessory",
    "Fashion": "Fashion",
    "Grocery": "Grocery",
    "Others" : "Others"
    }
}

def maps_labels(df, value_maps):

    df['PreferredLoginDevice'] = df['PreferredLoginDevice'].map(value_maps['PreferredLoginDevice'])
    df['PreferredPaymentMode'] = df['PreferredPaymentMode'].map(value_maps['PreferredPaymentMode'])
    df['PreferedOrderCat'] = df['PreferedOrderCat'].map(value_maps['PreferedOrderCat'])

    return df


def encode_features(df):
    
    encoders = {}
    encoder_login_device = LabelEncoder()
    
    df["PreferredLoginDevice"] = encoder_login_device.fit_transform(df["PreferredLoginDevice"])
    encoders["PreferredLoginDevice"] = encoder_login_device
    print('PreferredLoginDevice encoding done')
    encoder_payment_mode = LabelEncoder()
    df["PreferredPaymentMode"] = encoder_payment_mode.fit_transform(df["PreferredPaymentMode"])
    encoders["PreferredPaymentMode"] = encoder_payment_mode
    print('PreferredPaymentMode encoding done')
    
    encoder_order_cat = LabelEncoder()
    df["PreferedOrderCat"] = encoder_order_cat.fit_transform(df["PreferedOrderCat"])
    encoders["PreferedOrderCat"] = encoder_order_cat
    print('PreferedOrderCat encoding done')

    # Save encoders to file
    encoder_file = "encoders.pkl"
    with open(encoder_file, "wb") as file:
        pickle.dump(encoders, file)

    print(f"Encoders saved to {encoder_file}.")
    return df

def fill_missing_values(df, missing_maps):
    for col, method in missing_maps.items():
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame.")

        if df[col].isnull().any():
            if method == "mean":
                df[col].fillna(df[col].mean(), inplace=True)
            elif method == "mode":
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:
                raise ValueError("Method must be either 'mean' or 'mode'.")
    return df

In [4]:
test_data = maps_labels(data, value_maps)

In [5]:
encoding_data = encode_features(test_data)

PreferredLoginDevice encoding done
PreferredPaymentMode encoding done
PreferedOrderCat encoding done
Encoders saved to encoders.pkl.


In [7]:
no_missing_data = fill_missing_values(encoding_data,missing_maps)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


Exporting complete and preprocessed data that is free of missing values and has been encoded for use in machine learning applications

In [11]:
no_missing_data.to_csv('model_data.csv', index=False)

In [12]:
fe_data = pd.read_csv('model_data.csv')

In [17]:
fe_data.dropna(subset=['Churn'],inplace=True)

### Feature Engineering

This feature engineering process involves selecting the appropriate features for machine learning. It is important to note that we do not preserve the feature engineering work for live predictions. This stage is solely for identifying which features are necessary for the ML model, and only those selected features will be utilized for live predictions as well

#### SMOTE

In [18]:
fe_data.isna().sum()

CustomerID                     0
Tenure                         0
PreferredLoginDevice           0
CityTier                       0
WarehouseToHome                0
PreferredPaymentMode           0
HourSpendOnApp                 0
NumberOfDeviceRegistered       0
PreferedOrderCat               0
SatisfactionScore              0
NumberOfAddress                0
Complain                       0
OrderAmountHikeFromlastYear    0
CouponUsed                     0
OrderCount                     0
DaySinceLastOrder              0
CashbackAmount                 0
Churn                          0
dtype: int64

In [19]:
fe_data['Churn'].value_counts()

Churn
0.0    3860
1.0     774
Name: count, dtype: int64

In [22]:
X = fe_data.drop(['CustomerID','Churn'], axis=1) 
y = fe_data['Churn']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [25]:
smote = SMOTE(random_state=42)

In [26]:
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [27]:
new_distribution = pd.Series(y_train_smote).value_counts()

In [28]:
new_distribution

Churn
0.0    2719
1.0    2719
Name: count, dtype: int64

In [29]:
df_smote = pd.DataFrame(X_train_smote, columns=X_train.columns)
df_smote['Churn'] = y_train_smote

In [31]:
df_smote.to_csv('model_data_train.csv', index=False)

In [32]:
df_smote_test = pd.DataFrame(X_test, columns=X_test.columns)
df_smote_test['Churn'] = y_test

In [33]:
df_smote_test

Unnamed: 0,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount,Churn
808,8.000000,1,1,7.0,2,3.0,4,2,1.000000,11,0.0,18.000000,2.0,3.000000,2.0,179.84000,0.0
1829,1.000000,1,1,18.0,0,3.0,5,1,4.000000,5,0.0,17.000000,1.0,2.960674,4.0,256.42000,0.0
3641,7.000000,1,1,11.0,1,3.0,5,3,4.000000,10,0.0,12.000000,2.0,2.000000,3.0,163.38000,0.0
2699,1.000000,1,3,16.0,2,4.0,5,3,3.066309,5,0.0,13.000000,3.0,3.000000,3.0,169.00000,1.0
33,10.239955,1,3,16.0,2,2.0,3,5,3.000000,2,1.0,19.000000,0.0,1.000000,0.0,122.20000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1440,13.000000,1,1,20.0,1,4.0,5,1,4.000000,8,1.0,15.737864,14.0,16.000000,3.0,296.23000,0.0
1135,16.000000,1,3,30.0,2,3.0,3,5,3.066309,7,0.0,13.000000,0.0,2.000000,2.0,177.36139,1.0
2660,5.000000,1,1,15.0,2,3.0,3,2,1.000000,2,1.0,12.000000,1.0,1.000000,5.0,163.22000,0.0
3637,1.000000,0,1,14.0,2,3.0,2,3,3.000000,3,0.0,12.000000,1.0,2.000000,4.0,151.10000,0.0


In [34]:
df_smote_test.to_csv('model_data_test.csv', index=False)

#### Features Selection

In [36]:
fe_train_data = pd.read_csv('model_data_train.csv')

In [37]:
fe_train_data.head()

Unnamed: 0,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount,Churn
0,13.0,0,1,16.0,2,3.0,4,4,1.0,4,0.0,15.737864,1.0,2.0,4.0,319.31,0.0
1,18.0,1,1,14.0,3,3.0,3,2,1.0,10,0.0,14.0,8.0,9.0,8.0,136.97,0.0
2,13.0,1,1,10.0,1,4.0,5,2,3.0,2,0.0,16.0,1.0,2.0,8.0,155.69,0.0
3,21.0,0,1,12.0,1,3.0,5,2,1.0,8,1.0,12.0,0.0,1.0,9.0,153.29,0.0
4,11.0,1,2,15.0,3,3.0,4,2,4.0,4,0.0,14.0,5.0,5.0,8.0,165.71,0.0


In [38]:
fe_train_data.columns

Index(['Tenure', 'PreferredLoginDevice', 'CityTier', 'WarehouseToHome',
       'PreferredPaymentMode', 'HourSpendOnApp', 'NumberOfDeviceRegistered',
       'PreferedOrderCat', 'SatisfactionScore', 'NumberOfAddress', 'Complain',
       'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
       'DaySinceLastOrder', 'CashbackAmount', 'Churn'],
      dtype='object')

##### Chi_Square_Test

In [47]:
def chi_squared_feature_selection(df, target_column, k='all'):
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    chi2_selector = SelectKBest(chi2, k=k)
    chi2_selector.fit(X, y)
    chi_scores = pd.Series(chi2_selector.scores_, index=X.columns)
    return chi_scores.sort_values(ascending=False)

In [48]:
chi_squared_feature_selection(fe_train_data,'Churn')

Tenure                         8494.862634
CashbackAmount                 3038.139022
DaySinceLastOrder               977.374034
Complain                        372.005410
WarehouseToHome                 203.856891
SatisfactionScore               120.875453
PreferredLoginDevice            115.218826
PreferedOrderCat                 84.212631
OrderCount                       36.669345
PreferredPaymentMode             26.910523
NumberOfAddress                  23.252054
NumberOfDeviceRegistered          3.255638
HourSpendOnApp                    2.196126
CityTier                          1.710458
OrderAmountHikeFromlastYear       1.187014
CouponUsed                        0.007354
dtype: float64

In [49]:
def random_forest_feature_importance(df, target_column):
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    importances = pd.Series(model.feature_importances_, index=X.columns)
    return importances.sort_values(ascending=False)

In [50]:
random_forest_feature_importance(fe_train_data,'Churn')

Tenure                         0.210763
Complain                       0.208772
CashbackAmount                 0.084400
SatisfactionScore              0.079648
DaySinceLastOrder              0.062235
WarehouseToHome                0.050472
OrderAmountHikeFromlastYear    0.042623
NumberOfAddress                0.041799
CouponUsed                     0.041310
HourSpendOnApp                 0.034599
OrderCount                     0.032683
PreferedOrderCat               0.026428
PreferredPaymentMode           0.026305
PreferredLoginDevice           0.021219
NumberOfDeviceRegistered       0.020322
CityTier                       0.016422
dtype: float64

#### Top 6 Features:
* Tenure
* Complain
* CashbackAmount
* SatisfactionScore
* DaySinceLastOrder
* WarehouseToHome
#### Top 8 Features (including the above):
* OrderAmountHikeFromlastYear
* NumberOfAddress
#### Top 10 Features (including the above):
* CouponUsed
* HourSpendOnApp