# 1 Getting Ready

## 1.1 Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil import relativedelta

## 1.2 Setting up Environment

In [2]:
plt.style.use('fivethirtyeight')
%matplotlib inline
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# 2 Load EDA-DPP Data

## 2.1 Load from pickle

In [3]:
# Load the pickled EDA output frame.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source -- presumably an earlier step of this project; confirm.
df_eda_churn = pd.read_pickle('./data/eda/churn_eda.pickle')

## 2.2 Have a Look into the EDA-DPP Output Data

In [4]:
# Preview the first five rows of the loaded frame.
df_eda_churn.head(n=5)

Unnamed: 0,CUSTOMER_ID,SENIOR_CITIZEN,GENDER,TENURE,MONTHLY_CHARGES,CHURN,CHURN_STATUS
0,7590-VHVEG,0,Female,1,29.85,No,0
1,5575-GNVDE,0,Male,34,56.95,No,0
2,3668-QPYBK,0,Male,2,53.85,Yes,1
3,7795-CFOCW,0,Male,45,42.3,No,0
4,9237-HQITU,0,Female,2,70.7,Yes,1


## 2.3 Make a copy of EDA-DPP Data to work on

In [5]:
# Work on an independent (deep) copy so df_eda_churn stays exactly as loaded.
df = df_eda_churn.copy(deep=True)

# 3 Learn more about the Dataset - Meta Info

In [6]:
# (rows, columns) of the working copy.
df.shape

(7043, 7)

In [7]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CUSTOMER_ID      7043 non-null   object 
 1   SENIOR_CITIZEN   7043 non-null   int64  
 2   GENDER           7043 non-null   object 
 3   TENURE           7043 non-null   int64  
 4   MONTHLY_CHARGES  7043 non-null   float64
 5   CHURN            7043 non-null   object 
 6   CHURN_STATUS     7043 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 440.2+ KB


In [8]:
# Column labels of the working copy.
df.columns

Index(['CUSTOMER_ID', 'SENIOR_CITIZEN', 'GENDER', 'TENURE', 'MONTHLY_CHARGES',
       'CHURN', 'CHURN_STATUS'],
      dtype='object')

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,SENIOR_CITIZEN,TENURE,MONTHLY_CHARGES,CHURN_STATUS
count,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,0.26537
std,0.368612,24.559481,30.090047,0.441561
min,0.0,0.0,18.25,0.0
25%,0.0,9.0,35.5,0.0
50%,0.0,29.0,70.35,0.0
75%,0.0,55.0,89.85,1.0
max,1.0,72.0,118.75,1.0


# 4 Dummification

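This step is only needed when the data contains nominal (categorical) columns. Here `GENDER` is the only one, and it is one-hot encoded.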

In [10]:
# Abbreviate the gender labels so the indicator columns are named
# GENDER_F / GENDER_M.
gender_map = {'Male': 'M', 'Female': 'F'}

# replace() leaves values that are not keys of gender_map untouched, whereas
# map() would turn them into NaN. That makes this cell safe to re-run: once
# the column already holds 'M'/'F', a second run is a no-op instead of wiping
# the whole column.
df['GENDER'] = df['GENDER'].replace(gender_map)

# Nominal columns to one-hot encode.
nominal_cols = ["GENDER"]

# Pin the indicator dtype: pandas >= 2.0 defaults to bool, earlier versions
# to uint8. Fixing it keeps the 0/1 columns identical across versions.
df_dum_gender = pd.get_dummies(df[nominal_cols], dtype='uint8')

df.head(n=5)

Unnamed: 0,CUSTOMER_ID,SENIOR_CITIZEN,GENDER,TENURE,MONTHLY_CHARGES,CHURN,CHURN_STATUS
0,7590-VHVEG,0,F,1,29.85,No,0
1,5575-GNVDE,0,M,34,56.95,No,0
2,3668-QPYBK,0,M,2,53.85,Yes,1
3,7795-CFOCW,0,M,45,42.3,No,0
4,9237-HQITU,0,F,2,70.7,Yes,1


In [11]:
# Preview the first five rows of the one-hot encoded gender indicators.
df_dum_gender.head(n=5)

Unnamed: 0,GENDER_F,GENDER_M
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0


In [12]:
# Append the indicator columns to the working frame (column-wise, aligned on
# the index). Columns with the same labels left over from a previous run of
# this cell are dropped first, so re-running does not produce duplicate
# indicator columns.
df = pd.concat(
    [df.drop(columns=df_dum_gender.columns, errors='ignore'), df_dum_gender],
    axis=1,
)

In [13]:
# Preview the working frame with the indicator columns appended.
df.head(n=5)

Unnamed: 0,CUSTOMER_ID,SENIOR_CITIZEN,GENDER,TENURE,MONTHLY_CHARGES,CHURN,CHURN_STATUS,GENDER_F,GENDER_M
0,7590-VHVEG,0,F,1,29.85,No,0,1,0
1,5575-GNVDE,0,M,34,56.95,No,0,0,1
2,3668-QPYBK,0,M,2,53.85,Yes,1,0,1
3,7795-CFOCW,0,M,45,42.3,No,0,0,1
4,9237-HQITU,0,F,2,70.7,Yes,1,1,0


# 5 Drop Columns

## 5.1 Drop ID Columns

In [14]:
# Remove the identifier column. `columns=` already selects the axis, so the
# redundant `axis=1` is gone. Reassigning instead of mutating in place, with
# errors='ignore', lets this cell be re-run after the column is already
# removed without raising KeyError.
df = df.drop(columns=['CUSTOMER_ID'], errors='ignore')

## 5.2 Drop CHURN and GENDER Columns

In [15]:
# Remove the textual CHURN column. `columns=` already selects the axis, so
# the redundant `axis=1` is gone. Reassigning instead of mutating in place,
# with errors='ignore', lets this cell be re-run after the column is already
# removed without raising KeyError.
df = df.drop(columns=['CHURN'], errors='ignore')

In [16]:
# Remove the original GENDER column. `columns=` already selects the axis, so
# the redundant `axis=1` is gone. Reassigning instead of mutating in place,
# with errors='ignore', lets this cell be re-run after the column is already
# removed without raising KeyError.
df = df.drop(columns=['GENDER'], errors='ignore')

# 6 Save the DataFrame for ML Modeling

In [17]:
# Persist the prepared frame as a pickle file for the modeling step.
df.to_pickle(path='./data/training/churn.pickle')

In [18]:
# Preview the first five rows of the frame that was saved.
df.head(n=5)

Unnamed: 0,SENIOR_CITIZEN,TENURE,MONTHLY_CHARGES,CHURN_STATUS,GENDER_F,GENDER_M
0,0,1,29.85,0,1,0
1,0,34,56.95,0,0,1
2,0,2,53.85,1,0,1
3,0,45,42.3,0,0,1
4,0,2,70.7,1,1,0
