# Churn Prediction Modelling

#### Imports


In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Load Dataset


In [23]:
# Forward slashes are accepted on Windows, macOS and Linux, whereas a
# hard-coded backslash only resolves to a sub-directory on Windows.
file_path = 'Dataset/Telco-Customer-Churn.csv'
churn_data = pd.read_csv(file_path)

## Data PreProcessing

### Clean the data using what we learned from EDA


#### Drop customerID

In [24]:
# Remove the customerID column, then preview the first rows of the result.
churn_data = churn_data.drop(columns=['customerID'])
churn_data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


#### Convert total charges from object into float

In [25]:
# Blank entries (empty or whitespace-only strings) stand for a missing
# TotalCharges; turn every such entry into NaN so the column can be parsed.
# Matching on a regex covers "", " " and longer runs of whitespace alike.
churn_data['TotalCharges'] = churn_data['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)

# Converts from object to float data type
churn_data['TotalCharges'] = churn_data['TotalCharges'].astype(float)

##### Deal with missing Data

- From our EDA, 11 rows are missing a value for `TotalCharges`.

In [26]:
# Show only the rows whose TotalCharges is NaN.
missing_total_mask = churn_data['TotalCharges'].isna()
churn_data[missing_total_mask]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,Male,0,No,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,Female,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


- We fill in the missing values by computing `TotalCharges = tenure * MonthlyCharges`.

In [27]:
def compute_total_charges(row):
    """Return the TotalCharges value to use for one customer row.

    Parameters
    ----------
    row : mapping-like
        Must provide 'TotalCharges'; 'MonthlyCharges' and 'tenure' are read
        only when 'TotalCharges' is missing.

    Returns
    -------
    The existing 'TotalCharges' when it is present. When it is missing and
    'MonthlyCharges' is greater than zero, ``tenure * MonthlyCharges``.
    Otherwise the missing value is returned unchanged.
    """
    current_total = row['TotalCharges']
    # `and` short-circuits, so MonthlyCharges is only inspected for missing totals.
    needs_imputation = pd.isnull(current_total) and row['MonthlyCharges'] > 0
    return row['tenure'] * row['MonthlyCharges'] if needs_imputation else current_total

# Run the helper over every row (axis=1), then write the result back.
filled_total_charges = churn_data.apply(compute_total_charges, axis=1)
churn_data['TotalCharges'] = filled_total_charges


In [28]:
# Re-run the missing-value filter; an empty result means no NaN remains.
still_missing_mask = churn_data['TotalCharges'].isna()
churn_data[still_missing_mask]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


We have now dealt with the NaN rows and can proceed.

### Encoding Categorical Variables.

In [29]:
# Columns stored with the object dtype are treated as the categorical ones.
object_typed = churn_data.select_dtypes(include='object')
categorical_cols_all = object_typed.columns
print("All categorical columns:", categorical_cols_all)


All categorical columns: Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'Churn'],
      dtype='object')


- Split the Categorical Variables into Binary and non-Binary

In [30]:
# Categorical columns that take exactly two possible values.
binary_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
    'Churn',
]

# The remaining categorical columns, handled as non-binary.
non_binary_cols = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]


Encode Binary and Non-Binary Categorical Variables

In [31]:
# Lookup tables describing how each categorical column was encoded:
#   binary_mappings[col]     -> {original value: 0 or 1}
#   one_hot_categories[col]  -> list of categories that became indicator columns
binary_mappings = {}
one_hot_categories = {}

for col in binary_cols:
    # Distinct values in order of first appearance: the first becomes 0, the second 1.
    unique_values = churn_data[col].unique()
    if len(unique_values) != 2:
        # A two-entry mapping cannot describe this column; fail loudly instead
        # of silently coding every additional value as 1.
        raise ValueError(
            f"Column {col!r} is listed as binary but has "
            f"{len(unique_values)} distinct values: {list(unique_values)}"
        )
    # Store the mapping so the 0/1 codes can be interpreted later.
    binary_mappings[col] = {unique_values[0]: 0, unique_values[1]: 1}
    # Vectorised form of "0 if x == first value else 1".
    churn_data[col] = (churn_data[col] != unique_values[0]).astype('int64')


for col in non_binary_cols:
    # Categories of this column, in order of first appearance.
    categories = churn_data[col].unique().tolist()
    one_hot_categories[col] = categories
    for value in categories:
        # One 0/1 indicator column per category, named "<column>_<category>".
        churn_data[f"{col}_{value}"] = (churn_data[col] == value).astype(int)
    # Drop the original non-binary categorical column
    churn_data = churn_data.drop(columns=col)

# Check the transformed dataset
print(churn_data.head())
print("Binary mappings:", binary_mappings)
print("One-hot categories:", one_hot_categories)

   gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
0       0              0        0           0       1             0   
1       1              0        1           0      34             1   
2       1              0        1           0       2             1   
3       1              0        1           0      45             0   
4       0              0        1           0       2             1   

   PaperlessBilling  MonthlyCharges  TotalCharges  Churn  ...  \
0                 0           29.85         29.85      0  ...   
1                 1           56.95       1889.50      0  ...   
2                 0           53.85        108.15      1  ...   
3                 1           42.30       1840.75      0  ...   
4                 0           70.70        151.65      1  ...   

   StreamingMovies_No  StreamingMovies_Yes  \
0                   1                    0   
1                   1                    0   
2                   1                    0  