### 2. Data Preprocessing and Feature Engineering

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
import warnings
# Suppress every warning category for the rest of the session to keep cell output tidy.
# NOTE(review): a blanket 'ignore' also hides pandas' SettingWithCopyWarning and
# FutureWarning messages, which flag real assignment problems in the cells below --
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Load the raw churn dataset; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer="Reward-Customer-Churn.csv")

In [4]:
# Preview the loaded frame (pandas renders only the first and last rows of a long frame).
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,MobileOnRecord,AddressOnRecord,LinkedAccount,TwoFactorAuth,...,DeviceProtection,TechSupport,NewsletterSubscribe,PaperlessBilling,Contract,LastLogInOneMonth,Generation,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No,Target,No,...,No,No,No,No,Month-to-month,Yes,Gen Z,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,Target,Yes,...,Yes,No,No,No,One year,No,Gen X,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,Target,Yes,...,No,No,No,No,Month-to-month,Yes,Gen X,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No,Target,Yes,...,Yes,Yes,No,No,One year,No,Millennials,42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Starbucks,No,...,No,No,No,No,Month-to-month,Yes,Gen Z,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,Target,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Gen X,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Starbucks,No,...,Yes,No,Yes,Yes,One year,Yes,Boomers,103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No,Target,Yes,...,No,No,No,No,Month-to-month,Yes,Gen Z,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Starbucks,No,...,No,No,No,No,Month-to-month,Yes,Gen X,74.40,306.6,Yes


#### Data cleaning, data engineering, data preprocessing

In [5]:
# Drop customerID: it is not used as a feature.
# Rebinding with errors='ignore' (instead of an inplace drop) keeps this cell safe to
# re-run; the in-place version raises KeyError on a second execution because the
# column is already gone.
df = df.drop(columns='customerID', errors='ignore')

In [6]:
# Inspect the column dtypes to spot numeric columns that were parsed as strings (object).
df.dtypes

gender                  object
SeniorCitizen            int64
Partner                 object
Dependents              object
tenure                   int64
MobileOnRecord          object
AddressOnRecord         object
LinkedAccount           object
TwoFactorAuth           object
OnlineBackup            object
DeviceProtection        object
TechSupport             object
NewsletterSubscribe     object
PaperlessBilling        object
Contract                object
LastLogInOneMonth       object
Generation              object
MonthlyCharges         float64
TotalCharges            object
Churn                   object
dtype: object

In [7]:
# Look at the raw TotalCharges values: they are stored as strings, not numbers.
df['TotalCharges'].to_numpy()

array(['29.85', '1889.5', '108.15', ..., '346.45', '306.6', '6844.5'],
      dtype=object)

In [8]:
# TotalCharges was read as object (strings). Before converting it, count the rows
# whose value cannot be parsed as a number (errors='coerce' turns those into NaN).
unparseable_mask = pd.to_numeric(df['TotalCharges'], errors='coerce').isna()
df.loc[unparseable_mask].shape

(11, 20)

In [9]:
# Keep only the rows whose TotalCharges parses as a number. Filtering on parseability
# (rather than on the literal " ") also removes empty or multi-space entries that
# would otherwise make the numeric conversion in the next step fail.
# .copy() makes df1 an independent frame, so the later column assignments modify df1
# itself instead of a view/slice of df (the source of SettingWithCopyWarning).
df1 = df[pd.to_numeric(df['TotalCharges'], errors='coerce').notna()].copy()
df1.shape

(7032, 20)

In [10]:
# Convert TotalCharges from strings to numbers. assign() returns a new frame, which
# avoids attribute-style column assignment on a frame derived from a filter of df.
df1 = df1.assign(TotalCharges=pd.to_numeric(df1['TotalCharges']))
df1['TotalCharges'].dtype

dtype('float64')

In [11]:
# List the distinct values of every object-typed column to decide how to encode it.
object_columns = [name for name in df1.columns if df1[name].dtype == object]
for name in object_columns:
    print(name + " :", df1[name].unique())

gender : ['Female' 'Male']
Partner : ['Yes' 'No']
Dependents : ['No' 'Yes']
MobileOnRecord : ['No' 'Yes']
AddressOnRecord : ['No' 'Yes']
LinkedAccount : ['Target' 'Starbucks' 'Amazon']
TwoFactorAuth : ['No' 'Yes']
OnlineBackup : ['Yes' 'No']
DeviceProtection : ['No' 'Yes']
TechSupport : ['No' 'Yes']
NewsletterSubscribe : ['No' 'Yes']
PaperlessBilling : ['No' 'Yes']
Contract : ['Month-to-month' 'One year' 'Two year']
LastLogInOneMonth : ['Yes' 'No']
Generation : ['Gen Z' 'Gen X' 'Millennials' 'Boomers']
Churn : ['No' 'Yes']


In [12]:
# Collapse the "no service" placeholders into a plain 'No' so the affected columns
# become strictly Yes/No. Done in one pass and rebound (no inplace=True) so df1 is
# never mutated through a possible view of df.
# NOTE(review): the unique-value listing printed earlier shows neither placeholder
# occurs in this dataset, so this step is currently a no-op kept as a safeguard.
df1 = df1.replace(['No internet service', 'No phone service'], 'No')

In [13]:
# Encode every binary Yes/No column as 1/0.
yes_no_columns = ['Partner', 'Dependents', 'MobileOnRecord', 'AddressOnRecord', 'TwoFactorAuth',
                  'OnlineBackup', 'DeviceProtection', 'TechSupport', 'NewsletterSubscribe',
                  'PaperlessBilling', 'LastLogInOneMonth', 'Churn']
binary_map = {'Yes': 1, 'No': 0}

# `df1[col].replace(..., inplace=True)` mutates a temporary Series (chained assignment);
# with pandas Copy-on-Write it leaves df1 unchanged. Build the encoded columns
# explicitly and rebind df1 instead.
# astype('int64') pins the dtype regardless of pandas' replace() downcasting rules and
# fails loudly if a column still holds a value other than Yes/No/1/0.
df1 = df1.assign(**{
    col: df1[col].replace(binary_map).astype('int64')
    for col in yes_no_columns
})

In [14]:
# Encode gender as 1 (Female) / 0 (Male). The result is assigned back through
# assign() because an inplace replace on `df1['gender']` is chained assignment and
# does not update df1 under pandas Copy-on-Write.
df1 = df1.assign(gender=df1['gender'].replace({'Female': 1, 'Male': 0}).astype('int64'))

In [15]:
# One-hot encode the multi-category columns.
# dtype=int keeps the indicator columns as 0/1 integers: pandas >= 2.0 returns bool
# dummies by default, which would be exported as True/False alongside the 0/1 columns.
df2 = pd.get_dummies(data=df1, columns=['LinkedAccount', 'Contract', 'Generation'], dtype=int)

In [16]:
# Rescale the continuous columns with MinMaxScaler (default feature range) so they
# are comparable to the 0/1 encoded columns.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test split happens
# downstream, consider fitting on the training portion only -- confirm against the
# modelling notebook.
cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = MinMaxScaler().fit(df2[cols])
df2[cols] = scaler.transform(df2[cols])

#### Export data – ready for model training

In [17]:
# Separate the target (Churn) from the feature matrix.
y = df2['Churn']
X = df2.drop(columns=['Churn'])

In [18]:
# Show the final feature names (Churn has been removed from X).
X.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'MobileOnRecord', 'AddressOnRecord', 'TwoFactorAuth', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'NewsletterSubscribe',
       'PaperlessBilling', 'LastLogInOneMonth', 'MonthlyCharges',
       'TotalCharges', 'LinkedAccount_Amazon', 'LinkedAccount_Starbucks',
       'LinkedAccount_Target', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'Generation_Boomers', 'Generation_Gen X',
       'Generation_Gen Z', 'Generation_Millennials'],
      dtype='object')

In [19]:
# Write the feature matrix and the target to CSV (without the index) for model training.
for frame, path in ((X, 'X.csv'), (y, 'y.csv')):
    frame.to_csv(path, index=False)