In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [44]:
# Load the dataset from 'customer_churn.csv' (path is relative to the
# notebook's working directory) and preview the first rows
attrition_df = pd.read_csv('customer_churn.csv')
attrition_df.head()

Unnamed: 0,customer_ID,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,...,device_protection,tech_support,streaming_TV,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [45]:
# Check the shape of the dataset as a (rows, columns) tuple
attrition_df.shape

(7043, 21)

In [46]:
# List the distinct values of every column, one column per printed line
for column, values in attrition_df.items():
    print(column, values.unique())

customer_ID ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
gender ['Female' 'Male']
senior_citizen [0 1]
partner ['Yes' 'No']
dependents ['No' 'Yes']
tenure [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
phone_service ['No' 'Yes']
multiple_lines ['No phone service' 'No' 'Yes']
internet_service ['DSL' 'Fiber optic' 'No']
online_security ['No' 'Yes' 'No internet service']
online_backup ['Yes' 'No' 'No internet service']
device_protection ['No' 'Yes' 'No internet service']
tech_support ['No' 'Yes' 'No internet service']
streaming_TV ['No' 'Yes' 'No internet service']
streaming_movies ['No' 'Yes' 'No internet service']
contract ['Month-to-month' 'One year' 'Two year']
paperless_billing ['Yes' 'No']
payment_method ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'C

In [47]:
# Check for missing values.
# isnull() only flags NaN/None. Blank or whitespace-only strings in text
# columns are not counted by it, yet they turn into NaN as soon as a column
# is parsed as numeric, so count them as missing here as well.
blank_mask = attrition_df.apply(lambda col: col.astype(str).str.strip().eq(''))
(attrition_df.isnull() | blank_mask).sum()

customer_ID          0
gender               0
senior_citizen       0
partner              0
dependents           0
tenure               0
phone_service        0
multiple_lines       0
internet_service     0
online_security      0
online_backup        0
device_protection    0
tech_support         0
streaming_TV         0
streaming_movies     0
contract             0
paperless_billing    0
payment_method       0
monthly_charges      0
total_charges        0
churn                0
dtype: int64

In [48]:
# Check for duplicate records: count rows that repeat an earlier row
# across all columns
attrition_df.duplicated().sum()

0

In [49]:
# Convert 'total_charges' to numeric.
# errors='coerce' replaces every entry that cannot be parsed with NaN instead
# of raising, so compare the NaN count before and after the conversion and
# report the difference rather than letting the new NaNs appear silently.
total_charges_numeric = pd.to_numeric(attrition_df['total_charges'], errors='coerce')
n_coerced = int(total_charges_numeric.isna().sum() - attrition_df['total_charges'].isna().sum())
print(f"total_charges: {n_coerced} value(s) could not be parsed and are now NaN")
attrition_df['total_charges'] = total_charges_numeric

# NOTE(review): any NaNs reported above still need to be dropped or imputed
# before modelling.
attrition_df.dtypes

customer_ID           object
gender                object
senior_citizen         int64
partner               object
dependents            object
tenure                 int64
phone_service         object
multiple_lines        object
internet_service      object
online_security       object
online_backup         object
device_protection     object
tech_support          object
streaming_TV          object
streaming_movies      object
contract              object
paperless_billing     object
payment_method        object
monthly_charges      float64
total_charges        float64
churn                 object
dtype: object

In [54]:
# Collapse the "service not available" labels into a plain 'No' so that the
# add-on service columns only hold 'Yes' / 'No'
headers_to_transform = [
    'multiple_lines',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_TV',
    'streaming_movies',
]
service_label_map = {
    'No internet service': 'No',
    'No internet access': 'No',
    'No phone service': 'No',
    'Yes': 'Yes',
    'No': 'No',
}
# One frame-level replace over the selected columns instead of a per-column loop
attrition_df[headers_to_transform] = attrition_df[headers_to_transform].replace(service_label_map)

In [55]:
# Convert the Yes/No categorical columns to boolean.
yes_no_columns = [
    'partner',
    'dependents',
    'phone_service',
    'multiple_lines',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_TV',
    'streaming_movies',
    'paperless_billing',
    'churn',
]
yes_no_to_bool = {'Yes': True, 'No': False}

for column in yes_no_columns:
    # Already boolean (e.g. the cell is being re-run): nothing to do.
    if pd.api.types.is_bool_dtype(attrition_df[column]):
        continue

    as_bool = attrition_df[column].map(yes_no_to_bool)

    # .map() yields NaN for every value outside the mapping. Casting straight
    # to bool would silently turn those (and any unexpected label) into True,
    # so stop with an explicit error instead.
    unmapped = as_bool.isna()
    if unmapped.any():
        unexpected = attrition_df.loc[unmapped, column].unique().tolist()
        raise ValueError(
            f"Column {column!r} contains values other than 'Yes'/'No': {unexpected}"
        )

    attrition_df[column] = as_bool.astype(bool)

attrition_df.head()

Unnamed: 0,customer_ID,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,...,device_protection,tech_support,streaming_TV,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,7590-VHVEG,Female,0,True,False,1,False,False,DSL,False,...,False,False,False,False,Month-to-month,True,Electronic check,29.85,29.85,False
1,5575-GNVDE,Male,0,False,False,34,True,False,DSL,True,...,True,False,False,False,One year,False,Mailed check,56.95,1889.5,False
2,3668-QPYBK,Male,0,False,False,2,True,False,DSL,True,...,False,False,False,False,Month-to-month,True,Mailed check,53.85,108.15,True
3,7795-CFOCW,Male,0,False,False,45,False,False,DSL,True,...,True,True,False,False,One year,False,Bank transfer (automatic),42.3,1840.75,False
4,9237-HQITU,Female,0,False,False,2,True,False,Fiber optic,False,...,False,False,False,False,Month-to-month,True,Electronic check,70.7,151.65,True


In [56]:
# Check the data type of every column
attrition_df.dtypes

customer_ID           object
gender                object
senior_citizen         int64
partner                 bool
dependents              bool
tenure                 int64
phone_service           bool
multiple_lines          bool
internet_service      object
online_security         bool
online_backup           bool
device_protection       bool
tech_support            bool
streaming_TV            bool
streaming_movies        bool
contract              object
paperless_billing       bool
payment_method        object
monthly_charges      float64
total_charges        float64
churn                   bool
dtype: object