In [4]:
# RELOAD THE DATASET FRESH
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('/Users/babliparab/Downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv')

print(f"Starting with {df.shape[0]} customers")

# 1. Fix TotalCharges
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# 2. Simplify categorical values
internet_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                 'TechSupport', 'StreamingTV', 'StreamingMovies']
for col in internet_cols:
    df[col] = df[col].replace('No internet service', 'No')
    
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')

# 3. Feature engineering
df['total_services'] = df[internet_cols].apply(lambda x: (x == 'Yes').sum(), axis=1)
df['avg_monthly_spend'] = df['TotalCharges'] / (df['tenure'] + 1)

# 4. Encode binary columns
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling'] + internet_cols
for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0})

# 5. Encode multi-category columns
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['gender', 'InternetService', 'Contract', 'PaymentMethod']
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# 6. Encode target (Churn)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# 7. Drop customerID only
df_clean = df.drop('customerID', axis=1)

print(f"After cleaning: {df_clean.shape[0]} customers")
print(f"Columns: {df_clean.shape[1]}")

# 8. Separate X and y
X = df_clean.drop('Churn', axis=1)
y = df_clean['Churn']

print(f"\nFinal check:")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Missing in X: {X.isnull().sum().sum()}")
print(f"Missing in y: {y.isnull().sum()}")


Starting with 7043 customers
After cleaning: 7043 customers
Columns: 22

Final check:
X shape: (7043, 21)
y shape: (7043,)
Missing in X: 0
Missing in y: 0


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.3, 
    random_state=42, 
    stratify=y
)

print(f"✓ Training set: {X_train.shape[0]} customers")
print(f"✓ Test set: {X_test.shape[0]} customers")
print(f"✓ Churn rate in train: {y_train.mean():.2%}")
print(f"✓ Churn rate in test: {y_test.mean():.2%}")


✓ Training set: 4930 customers
✓ Test set: 2113 customers
✓ Churn rate in train: 26.53%
✓ Churn rate in test: 26.55%


In [8]:
from sklearn.preprocessing import StandardScaler

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data scaling complete!")
print(f"Scaled training data shape: {X_train_scaled.shape}")


ValueError: could not convert string to float: 'No'

In [9]:
from sklearn.preprocessing import StandardScaler

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # FIX: Changed X_train_scaled to X_train
X_test_scaled = scaler.transform(X_test)

print("Data scaling complete!")
print(f"Scaled training data shape: {X_train_scaled.shape}")
print(f"Scaled test data shape: {X_test_scaled.shape}")


ValueError: could not convert string to float: 'No'