In [61]:
import numpy as np
import pandas as pd

# Initial exploration

In [62]:
# Load the Telco customer-churn dataset into `df`.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="../data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [63]:
# Per-column count of missing values; as the cell's last expression the
# resulting Series is displayed.
# NOTE(review): this only counts values pandas already parsed as NaN at load
# time -- text that merely fails a later numeric conversion is not counted here.
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [64]:
# Remove the customerID column from the frame.
# Rebinding `df` (rather than inplace=True) together with errors="ignore"
# keeps the cell re-runnable: executing it a second time is a no-op instead of
# raising KeyError because the column is already gone.
df = df.drop(columns=["customerID"], errors="ignore")

In [65]:
# Show the distinct labels that occur in the gender column.
gender_labels = df["gender"].unique()
print(gender_labels)

['Female' 'Male']


In [66]:
# For every column, print up to the first 10 distinct values followed by the
# total number of distinct values in that column.
for name, values in df.items():
    preview = values.unique()[:10]
    print(f"{name}: {preview} ... ({values.nunique()} unique values)")

gender: ['Female' 'Male'] ... (2 unique values)
SeniorCitizen: [0 1] ... (2 unique values)
Partner: ['Yes' 'No'] ... (2 unique values)
Dependents: ['No' 'Yes'] ... (2 unique values)
tenure: [ 1 34  2 45  8 22 10 28 62 13] ... (73 unique values)
PhoneService: ['No' 'Yes'] ... (2 unique values)
MultipleLines: ['No phone service' 'No' 'Yes'] ... (3 unique values)
InternetService: ['DSL' 'Fiber optic' 'No'] ... (3 unique values)
OnlineSecurity: ['No' 'Yes' 'No internet service'] ... (3 unique values)
OnlineBackup: ['Yes' 'No' 'No internet service'] ... (3 unique values)
DeviceProtection: ['No' 'Yes' 'No internet service'] ... (3 unique values)
TechSupport: ['No' 'Yes' 'No internet service'] ... (3 unique values)
StreamingTV: ['No' 'Yes' 'No internet service'] ... (3 unique values)
StreamingMovies: ['No' 'Yes' 'No internet service'] ... (3 unique values)
Contract: ['Month-to-month' 'One year' 'Two year'] ... (3 unique values)
PaperlessBilling: ['Yes' 'No'] ... (2 unique values)
PaymentMetho

# Encoding features

In [67]:
# Look up the dtype pandas assigned to MonthlyCharges; displayed as the cell's
# last expression.
df.dtypes["MonthlyCharges"]

dtype('float64')

In [68]:
# Convert TotalCharges to a numeric dtype. errors="coerce" turns every value
# that cannot be parsed into NaN instead of raising.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# The coercion above can introduce NaN after the notebook's null check has
# already run. Rows with tenure == 0 have no completed billing period, so
# their missing total is set to 0.0; any other NaN is left in place so it
# remains visible to later inspection.
# NOTE(review): assumes the unparseable entries are the tenure-0 customers --
# confirm with df["TotalCharges"].isna().sum() after running this cell.
unbilled = df["TotalCharges"].isna() & df["tenure"].eq(0)
df.loc[unbilled, "TotalCharges"] = 0.0

# Encode the Yes/No columns as 1/0. A column that is already numeric is
# skipped: mapping it again would find no matching keys and overwrite every
# value with NaN, which is what happened when this cell was re-run.
yes_no_codes = {"Yes": 1, "No": 0}
for flag_col in ("Churn", "Dependents"):
    if not pd.api.types.is_numeric_dtype(df[flag_col]):
        df[flag_col] = df[flag_col].map(yes_no_codes)

In [69]:
# Integer codes for the service add-on columns: 1 = has the add-on,
# 0 = does not, -1 = the underlying phone/internet service is absent.
binary_service_map = {
    'Yes': 1,
    'No': 0,
    'No phone service': -1,
    'No internet service': -1
}

# Columns whose labels are all keys of binary_service_map.
mapped_columns = [
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies'
]

# Apply the custom mapping. Columns that are already numeric are skipped:
# re-mapping encoded integers would match no key and turn the whole column
# into NaN when the cell is executed a second time.
for col in mapped_columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(binary_service_map)

# One-hot encode 'InternetService' and 'Contract', dropping the first level of
# each (drop_first=True). Only columns still present are encoded, so a re-run
# after they have been expanded is a no-op rather than a KeyError.
one_hot_columns = [c for c in ('InternetService', 'Contract') if c in df.columns]
if one_hot_columns:
    df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)


In [70]:
# One-hot encode the PaymentMethod feature, keeping an indicator column for
# every category (drop_first=False).
# The membership check makes the cell re-runnable: once PaymentMethod has been
# expanded it no longer exists, and an unguarded call would raise KeyError.
if 'PaymentMethod' in df.columns:
    df = pd.get_dummies(df, columns=['PaymentMethod'], drop_first=False)

# Final check

In [71]:
# Re-inspect every column after encoding: print up to the first 10 distinct
# values and the number of distinct values.
for column_name in df.columns:
    column = df[column_name]
    preview, n_unique = column.unique()[:10], column.nunique()
    print(f"{column_name}: {preview} ... ({n_unique} unique values)")

gender: ['Female' 'Male'] ... (2 unique values)
SeniorCitizen: [0 1] ... (2 unique values)
Partner: ['Yes' 'No'] ... (2 unique values)
Dependents: [0 1] ... (2 unique values)
tenure: [ 1 34  2 45  8 22 10 28 62 13] ... (73 unique values)
PhoneService: ['No' 'Yes'] ... (2 unique values)
MultipleLines: [-1  0  1] ... (3 unique values)
OnlineSecurity: [ 0  1 -1] ... (3 unique values)
OnlineBackup: [ 1  0 -1] ... (3 unique values)
DeviceProtection: [ 0  1 -1] ... (3 unique values)
TechSupport: [ 0  1 -1] ... (3 unique values)
StreamingTV: [ 0  1 -1] ... (3 unique values)
StreamingMovies: [ 0  1 -1] ... (3 unique values)
PaperlessBilling: ['Yes' 'No'] ... (2 unique values)
MonthlyCharges: [ 29.85  56.95  53.85  42.3   70.7   99.65  89.1   29.75 104.8   56.15] ... (1585 unique values)
TotalCharges: [  29.85 1889.5   108.15 1840.75  151.65  820.5  1949.4   301.9  3046.05
 3487.95] ... (6530 unique values)
Churn: [0 1] ... (2 unique values)
InternetService_Fiber optic: [0 1] ... (2 unique valu