In [1]:
import pandas as pd
import numpy as np
from missforest import MissForest
from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_csv("CustomerChurn.csv")
data = data.drop(columns=['Customer ID', 'LoyaltyID'], axis=1)
data["Total Charges"] = pd.to_numeric(data["Total Charges"], errors="coerce")
# data["Total Charges"] = data["Total Charges"].fillna(data["Total Charges"].mean())
data["Churn"] = data["Churn"].map({"No": 0, "Yes": 1})
prop_churn = len(data[data["Churn"] == 0]) / len(data['Churn'])
prop_churn

0.7346301292063041

In [3]:
# One-hot encode the other columns (excluding Total Charges for now)
data_encoded = pd.get_dummies(data.drop(columns=["Total Charges"]))
data_encoded.head()

Unnamed: 0,Tenure,Monthly Charges,Churn,Senior Citizen_No,Senior Citizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,Phone Service_No,...,Streaming Movies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,1,29.85,0,True,False,False,True,True,False,True,...,False,True,False,False,False,True,False,False,True,False
1,34,56.95,0,True,False,True,False,True,False,False,...,False,False,True,False,True,False,False,False,False,True
2,2,53.85,1,True,False,True,False,True,False,False,...,False,True,False,False,False,True,False,False,False,True
3,45,42.3,0,True,False,True,False,True,False,True,...,False,False,True,False,True,False,True,False,False,False
4,2,70.7,1,True,False,True,False,True,False,False,...,False,True,False,False,False,True,False,False,True,False


In [4]:
# Add Total Charges back in
data_encoded["Total Charges"] = data["Total Charges"]
data_encoded.head()

Unnamed: 0,Tenure,Monthly Charges,Churn,Senior Citizen_No,Senior Citizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,Phone Service_No,...,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check,Total Charges
0,1,29.85,0,True,False,False,True,True,False,True,...,True,False,False,False,True,False,False,True,False,29.85
1,34,56.95,0,True,False,True,False,True,False,False,...,False,True,False,True,False,False,False,False,True,1889.5
2,2,53.85,1,True,False,True,False,True,False,False,...,True,False,False,False,True,False,False,False,True,108.15
3,45,42.3,0,True,False,True,False,True,False,True,...,False,True,False,True,False,True,False,False,False,1840.75
4,2,70.7,1,True,False,True,False,True,False,False,...,True,False,False,False,True,False,False,True,False,151.65


In [6]:
# Ensure all data is float type (important for MissForest)
data_encoded = data_encoded.astype(np.float64)
data_encoded.head()

Unnamed: 0,Tenure,Monthly Charges,Churn,Senior Citizen_No,Senior Citizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,Phone Service_No,...,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check,Total Charges
0,1.0,29.85,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,29.85
1,34.0,56.95,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1889.5
2,2.0,53.85,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,108.15
3,45.0,42.3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1840.75
4,2.0,70.7,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,151.65


In [None]:
# Use MissForest to impute Total Charges only
imputer = MissForest()
data_imputed_array = imputer.fit_transform(data_encoded)

 80%|████████  | 4/5 [00:12<00:03,  3.06s/it]
100%|██████████| 4/4 [00:00<00:00, 100.60it/s]


In [8]:
data_imputed_array.head()

Unnamed: 0,Tenure,Device Protection_No internet service,Device Protection_Yes,Tech Support_No,Tech Support_No internet service,Tech Support_Yes,Streaming TV_No,Streaming TV_No internet service,Streaming TV_Yes,Streaming Movies_No,...,Multiple Lines_No phone service,Multiple Lines_Yes,Internet Service_DSL,Internet Service_Fiber optic,Internet Service_No,Online Security_No,Online Security_No internet service,Online Security_Yes,Online Backup_No internet service,Total Charges
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,29.85
1,34.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1889.5
2,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,108.15
3,45.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1840.75
4,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,151.65


In [9]:
# Convert back to DataFrame
data_imputed = pd.DataFrame(data_imputed_array, columns=data_encoded.columns)

In [10]:
# Replace original Total Charges column with the imputed one
data["Total Charges"] = data_imputed["Total Charges"]

In [11]:
feature_names = ["intercept"] + list(data.columns)
feature_names.remove("Churn")

In [12]:
data = pd.get_dummies(data)

In [13]:
Dmat = data.to_numpy(dtype=np.float64)

In [14]:
# standardize first 3 variables corresponding to continuous variables
Dcont = Dmat[:, 0:4]
Dcont = (Dcont - Dcont.mean(axis=0)) / Dcont.std(axis=0)

In [15]:
# add a column of 1s for the intercept term
Xmat = np.column_stack((np.ones(len(Dmat)), Dcont[:, 0:3], Dmat[:, 4:]))

In [16]:
# extract outcome vector
Y = Dcont[:, 3]
print(Y)

[-0.60102348 -0.60102348  1.66382851 ... -0.60102348  1.66382851
 -0.60102348]


In [18]:
Xmat

array([[ 1.        , -1.27744458, -1.16032292, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  0.06632742, -0.25962894, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , -1.23672422, -0.36266036, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.        , -0.87024095, -1.1686319 , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        , -1.15528349,  0.32033821, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        ,  1.36937906,  1.35896134, ...,  0.        ,
         0.        ,  0.        ]])

In [36]:
# produce a train-test split
n = len(Xmat)
Xmat_train = Xmat[0:int(0.8*n), :]
Xmat_test = Xmat[int(0.8*n):, :]
Y_train = Y[0:int(0.8*n)]
Y_test = Y[int(0.8*n):]