In [2]:
import numpy as np
import pandas as pd
import sklearn
import imblearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [3]:
# Load the raw churn dataset (path is relative to the working directory).
Churn_data = pd.read_csv("churn_final.csv")

In [4]:
# Preview the first rows to sanity-check the load.
Churn_data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn,Age,City,Satisfaction.Score
0,Female,0,Yes,Yes,9,Yes,No,DSL,No,Yes,...,Yes,No,One year,Yes,Mailed check,65.6,No,37,Frazier Park,3
1,Male,0,No,No,9,Yes,Yes,DSL,No,No,...,No,Yes,Month-to-month,No,Mailed check,59.9,No,46,Glendale,5
2,Male,0,No,No,4,Yes,No,Fiber optic,No,No,...,No,No,Month-to-month,Yes,Electronic check,73.9,Yes,50,Costa Mesa,1
3,Male,1,Yes,No,13,Yes,No,Fiber optic,No,Yes,...,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,Yes,78,Martinez,1
4,Female,1,Yes,No,3,Yes,No,Fiber optic,No,No,...,Yes,No,Month-to-month,Yes,Mailed check,83.9,Yes,75,Camarillo,1


In [5]:
# Summarize column dtypes and non-null counts to check for missing values.
Churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   gender              7032 non-null   object 
 1   SeniorCitizen       7032 non-null   int64  
 2   Partner             7032 non-null   object 
 3   Dependents          7032 non-null   object 
 4   tenure              7032 non-null   int64  
 5   PhoneService        7032 non-null   object 
 6   MultipleLines       7032 non-null   object 
 7   InternetService     7032 non-null   object 
 8   OnlineSecurity      7032 non-null   object 
 9   OnlineBackup        7032 non-null   object 
 10  DeviceProtection    7032 non-null   object 
 11  TechSupport         7032 non-null   object 
 12  StreamingTV         7032 non-null   object 
 13  StreamingMovies     7032 non-null   object 
 14  Contract            7032 non-null   object 
 15  PaperlessBilling    7032 non-null   object 
 16  Paymen

Dummy encoding converts categorical variables into numerical data that machine learning algorithms can use. For each categorical column, we create a new 0/1 indicator column per category, where 1 means the category is present in that row and 0 means it is absent. Because we encode with `drop_first=True`, the first category of each variable is dropped and serves as the reference level, so a variable with k categories yields k−1 indicator columns.

We drop the City variable because it has too many distinct categories to dummy-encode usefully.

In [6]:
# Drop the City column before dummy encoding (it has too many categories).
# errors='ignore' keeps this cell idempotent: because Churn_data is rebound
# from itself, re-running the cell after City is already gone would otherwise
# raise KeyError.
Churn_data = Churn_data.drop(columns='City', errors='ignore')

In [7]:
# Dummy-encode the categorical columns (the Churn target included).
categorical_features = [
    'Churn', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling',
    'PaymentMethod',
]

# - astype(str): SeniorCitizen is stored as an integer flag; casting makes
#   get_dummies treat it as a category like the other columns.
# - drop_first=True: the first level of each variable is dropped and acts as
#   the reference category (k levels -> k-1 indicator columns).
# - dtype="uint8": pins the indicators to 0/1 integers. pandas >= 2.0 defaults
#   to bool (True/False), which would change the saved CSVs and no longer match
#   the 0/1 values this notebook was written against.
Churn_data_encode = pd.get_dummies(
    Churn_data[categorical_features].astype(str),
    prefix=categorical_features,
    drop_first=True,
    dtype="uint8",
)

In [8]:
# Preview the dummy-encoded frame.
Churn_data_encode.head()

Unnamed: 0,Churn_Yes,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_Yes,...,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,0,1,1,1,0,0,0,0,...,0,1,1,0,1,0,1,0,0,1
1,0,1,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,1,1,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
3,1,1,1,1,0,1,0,1,0,0,...,1,0,1,1,0,0,1,0,1,0
4,1,0,1,1,0,1,0,1,0,0,...,0,1,1,0,0,0,1,0,0,1


In [9]:
# Give the encoded target indicator its plain name back, then preview the frame.
Churn_data_encode = Churn_data_encode.rename({'Churn_Yes': 'Churn'}, axis='columns')
Churn_data_encode.head()

Unnamed: 0,Churn,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_Yes,...,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,0,1,1,1,0,0,0,0,...,0,1,1,0,1,0,1,0,0,1
1,0,1,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,1,1,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
3,1,1,1,1,0,1,0,1,0,0,...,1,0,1,1,0,0,1,0,1,0
4,1,0,1,1,0,1,0,1,0,0,...,0,1,1,0,0,0,1,0,0,1


In [10]:
# Carry the numeric columns over from the source frame unchanged;
# they were not part of the dummy encoding.
for numeric_col in ('tenure', 'MonthlyCharges', 'Age'):
    Churn_data_encode[numeric_col] = Churn_data[numeric_col]

In [11]:
# Persist the encoded frame. The row index is written as well (index=False is
# not passed), so the CSV starts with an unnamed index column.
# NOTE(review): readers of this file need index_col=0 to avoid an extra
# "Unnamed: 0" column — confirm how downstream notebooks load it.
Churn_data_encode.to_csv("Churn_data_dummy.csv")

We randomly split our data set into a training set (80%) and a testing set (20%). Because the churn target is highly imbalanced, we use SMOTE (Synthetic Minority Over-sampling Technique) to handle the imbalance. SMOTE is an oversampling technique in which synthetic samples are generated for the minority class. It helps to overcome the overfitting problem posed by random oversampling, which merely duplicates existing rows: SMOTE works in feature space and generates new instances by interpolating between minority-class (positive) instances that lie close together. SMOTE is applied to the training set only, so the testing set keeps the original class distribution.

In [12]:
# Separate the predictors from the binary Churn target.
X = Churn_data_encode.drop(columns='Churn')
y = Churn_data_encode['Churn']

# Hold out 20% of the rows for testing. stratify=y keeps the churn/non-churn
# ratio the same in the training and testing partitions, which matters because
# the target is imbalanced; random_state fixes the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=503, stratify=y
)

# NOTE: features are not standardized at this point; the StandardScaler step
# that used to sit here was left disabled. If it is reinstated, fit the scaler
# on X_train only and reuse it to transform X_test.

# Oversample the minority class with SMOTE on the training partition only,
# so the test partition keeps the original class distribution.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [14]:
# Recombine predictors and target column-wise for each partition.
train_df = pd.concat((X_resampled, y_resampled), axis='columns')

# Renumber the test rows 0..n-1 so the saved file has a clean index.
test_df = pd.concat((X_test, y_test), axis='columns').reset_index(drop=True)

# Write both partitions to disk (the row index is included in each file).
train_df.to_csv('Churn_data_train.csv')
test_df.to_csv('Churn_data_test.csv')