In [12]:
import kagglehub

# Fetch the latest version of the Telco customer churn dataset through
# kagglehub and keep the returned location in `path` for the cells below.
dataset_handle = "blastchar/telco-customer-churn"
path = kagglehub.dataset_download(dataset_handle)

# Show where the dataset files ended up.
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/datasets/blastchar/telco-customer-churn


In [13]:
import os

# List the entries of the downloaded dataset directory; as the cell's last
# expression, the resulting list is displayed as the cell output.
os.listdir(path)

['WA_Fn-UseC_-Telco-Customer-Churn.csv']

In [14]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Build the full path to the CSV inside the downloaded dataset directory
# and load it into a DataFrame.
csv_name = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
file_path = os.path.join(path, csv_name)
df = pd.read_csv(file_path)

# Preview the first rows.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [15]:
# Remove the customerID column from the working frame.
df = df.drop(columns=['customerID'])

In [16]:
# Parse TotalCharges as numbers: entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced with 0.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges.fillna(0)

In [17]:
# Encode the two-valued text columns as integers:
# 'Yes'/'Male' -> 1 and 'No'/'Female' -> 0.
binary_map = {'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0}
binary_columns = ['gender', 'Partner', 'Dependents',
                  'PhoneService', 'PaperlessBilling']
for col in binary_columns:
    # Series.map turns every value that is missing from the mapping into NaN,
    # so running this cell a second time would replace the already-encoded
    # 0/1 values with NaN. Skip columns that are already numeric so the cell
    # can be re-run safely.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    encoded = df[col].map(binary_map)
    # A non-null label that is absent from binary_map would silently become
    # NaN; report it instead of corrupting the column.
    unexpected = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unexpected values in '{col}': {list(unexpected)}")
    df[col] = encoded

In [18]:
# Service columns carry a third label ('No phone service' /
# 'No internet service') in addition to 'Yes'/'No'. Both extra labels are
# treated as 'No', so each column is encoded as Yes -> 1, everything else -> 0.
tricky_cols = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup',
               'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
service_flag_map = {'Yes': 1, 'No': 0,
                    'No phone service': 0,
                    'No internet service': 0}
for col in tricky_cols:
    # Series.map yields NaN for values missing from the mapping, so a second
    # run over the already-encoded 0/1 values would turn the whole column
    # into NaN. Skip columns that are already numeric so the cell can be
    # re-run safely.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    encoded = df[col].map(service_flag_map)
    # A non-null label that is absent from the mapping would silently become
    # NaN; report it instead of corrupting the column.
    unexpected = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unexpected values in '{col}': {list(unexpected)}")
    df[col] = encoded


In [19]:
# One-hot encode the unordered categorical columns. drop='first' omits the
# first category of each column and sparse_output=False returns a dense array.
nominal_cols = ['InternetService', 'PaymentMethod']
ohe = OneHotEncoder(sparse_output=False, drop='first')
encoded_array = ohe.fit_transform(df[nominal_cols])
encoded_cols = ohe.get_feature_names_out(nominal_cols)

# Wrap the encoded array in a frame that shares df's index, then swap the
# original nominal columns for the encoded ones.
encoded_df = pd.DataFrame(data=encoded_array, index=df.index, columns=encoded_cols)
remaining_df = df.drop(columns=nominal_cols)
df = pd.concat([remaining_df, encoded_df], axis=1)

In [20]:
# Encode Contract with an explicit category order, from shortest to longest
# commitment, so the encoder does not fall back to its default ordering.
contract_order = ['Month-to-month', 'One year', 'Two year']
oe = OrdinalEncoder(categories=[contract_order])
df['Contract'] = oe.fit_transform(df[['Contract']])

In [21]:
# Encode the target: 1 where Churn is 'Yes', 0 otherwise.
# Comparing an already-encoded 0/1 column with 'Yes' is False everywhere, so
# running this cell a second time would silently reset every label to 0.
# Only encode while the column is still non-numeric.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = (df['Churn'] == 'Yes').astype(int)

In [23]:
# Sanity checks on the preprocessed frame: dimensions, total count of missing
# values, share of churned customers, and the final column list.
frame_shape = df.shape
nan_total = df.isna().sum().sum()
churn_rate = df['Churn'].mean()
column_names = list(df.columns)

print(f"Shape      : {frame_shape}")
print(f"NaNs       : {nan_total}")
print(f"Churn rate : {churn_rate:.2%}")
print(f"Columns    : {column_names}")

Shape      : (7043, 23)
NaNs       : 0
Churn rate : 26.54%
Columns    : ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn', 'InternetService_Fiber optic', 'InternetService_No', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']


In [24]:
# Write the preprocessed frame to CSV in the working directory, without the
# index column.
output_csv = 'telco_preprocessed_full.csv'
df.to_csv(output_csv, index=False)