In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
import warnings
# Suppress every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Read the raw Telco churn CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Report the frame's size: rows are counted as customers, columns as features.
print(f"Dataset Shape: {df.shape}")
print(f"Total Customers: {len(df)}")
print(f"Total Features: {len(df.columns)}")



Dataset Shape: (7043, 21)
Total Customers: 7043
Total Features: 21


In [3]:

# Preview the first ten rows, then list each column's dtype.
# NOTE: a cell renders only its final expression, so the head(10) result is
# computed and discarded here; df.dtypes is the only value displayed.
df.head(10)
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
# Count nulls per column and express each count as a percentage of all rows.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Put count and percentage side by side, indexed by column name.
missing_df = pd.concat(
    [missing, missing_pct],
    axis=1,
    keys=['Missing Count', 'Missing %'],
)

# Show only the columns that actually contain nulls.
has_missing = missing_df['Missing Count'].gt(0)
print("Missing Values:")
print(missing_df.loc[has_missing])

Missing Values:
Empty DataFrame
Columns: [Missing Count, Missing %]
Index: []


In [6]:


# Inspect TotalCharges Column
# Print the column's dtype and the distinct raw values that pd.to_numeric
# cannot parse (errors='coerce' turns those entries into NaN).
total_charges_raw = df['TotalCharges']
unparseable = pd.to_numeric(total_charges_raw, errors='coerce').isnull()

print(f"TotalCharges dtype: {total_charges_raw.dtype}")
print(f"\nUnique non-numeric values:")
print(total_charges_raw[unparseable].unique())

TotalCharges dtype: object

Unique non-numeric values:
[' ']


In [7]:
# Fix TotalCharges
# Convert the column to numbers; entries that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
print(f"Missing TotalCharges: {df['TotalCharges'].isnull().sum()}")

# Fill the NaNs with 0 and assign the result back to the frame.
# Calling fillna(..., inplace=True) on df['TotalCharges'] is chained assignment:
# it operates on an intermediate Series, is deprecated in pandas 2.x, and leaves
# df unchanged once Copy-on-Write is active.
df['TotalCharges'] = df['TotalCharges'].fillna(0)
print(f"After fix - Missing TotalCharges: {df['TotalCharges'].isnull().sum()}")

Missing TotalCharges: 11
After fix - Missing TotalCharges: 0


In [8]:
# Check Target Variable
# Class counts for the Churn column, plus the share of rows labelled 'Yes'.
churn_counts = df['Churn'].value_counts()
churn_rate_pct = df['Churn'].eq('Yes').mean() * 100

print("Churn Distribution:")
print(churn_counts)
print(f"\nChurn Rate: {churn_rate_pct:.2f}%")

Churn Distribution:
Churn
No     5174
Yes    1869
Name: count, dtype: int64

Churn Rate: 26.54%


In [9]:
# Identify Column Types
# The identifier and the target are never model inputs, so keep them out of
# BOTH lists. Filtering (instead of list.remove) keeps this cell re-runnable:
# once Churn has been encoded to integers it is no longer an object column, so
# remove('Churn') would raise ValueError and Churn would land in numerical_cols.
non_feature_cols = {'customerID', 'Churn'}

categorical_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col not in non_feature_cols
]
numerical_cols = [
    col for col in df.select_dtypes(include=['int64', 'float64']).columns
    if col not in non_feature_cols
]

print(f"Categorical columns ({len(categorical_cols)}):")
print(categorical_cols)
print(f"\nNumerical columns ({len(numerical_cols)}):")
print(numerical_cols)

Categorical columns (15):
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

Numerical columns (4):
['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']


In [10]:
# Encode Target Variable
# Map 'Yes'/'No' to 1/0. The guard makes the cell idempotent: mapping a column
# that is already 0/1 through {'Yes': 1, 'No': 0} would turn every value into
# NaN, so an already-encoded column is left as it is.
if df['Churn'].isin(['Yes', 'No']).all():
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
elif not df['Churn'].isin([0, 1]).all():
    # Fail loudly rather than letting unmapped labels become NaN in the target.
    raise ValueError(
        f"Unexpected Churn labels: {df['Churn'].unique().tolist()}"
    )

print("Churn encoded:")
print(df['Churn'].value_counts())

Churn encoded:
Churn
0    5174
1    1869
Name: count, dtype: int64


In [11]:
# Encode Binary Categorical Columns
# A column counts as binary when it holds exactly two distinct values.
binary_cols = [col for col in categorical_cols if df[col].nunique() == 2]

print(f"Binary columns: {binary_cols}")

# Fit a fresh LabelEncoder per column, overwrite the column with its integer
# codes, and print the fitted classes_ so the code-to-label order is visible.
for col in binary_cols:
    le = LabelEncoder()
    codes = le.fit_transform(df[col])
    df[col] = codes
    print(f"{col}: {le.classes_}")

Binary columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
gender: ['Female' 'Male']
Partner: ['No' 'Yes']
Dependents: ['No' 'Yes']
PhoneService: ['No' 'Yes']
PaperlessBilling: ['No' 'Yes']


In [12]:
# One-Hot Encode Multi-Category Columns
# Every categorical column that was not treated as binary gets dummy columns.
already_encoded = set(binary_cols)
multi_cat_cols = [name for name in categorical_cols if name not in already_encoded]
print(f"Multi-category columns: {multi_cat_cols}")

# drop_first=True omits the first level of each column from the dummies.
df_encoded = pd.get_dummies(data=df, columns=multi_cat_cols, drop_first=True)
print(f"\nShape after encoding: {df_encoded.shape}")


Multi-category columns: ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

Shape after encoding: (7043, 32)


In [13]:
#  Prepare Features and Target
# Remove the identifier column. Rebinding the result (rather than dropping in
# place) together with errors='ignore' keeps this cell re-runnable: an in-place
# drop raises KeyError the second time because customerID is already gone.
df_encoded = df_encoded.drop(columns='customerID', errors='ignore')

# Everything except the target is a feature.
X = df_encoded.drop(columns='Churn')
y = df_encoded['Churn']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature names:")
print(X.columns.tolist())

# Split Data
# 80/20 split with a fixed seed; stratify=y keeps the churn ratio the same in
# both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"\nTraining churn rate: {y_train.mean()*100:.2f}%")
print(f"Test churn rate: {y_test.mean()*100:.2f}%")

Features shape: (7043, 30)
Target shape: (7043,)

Feature names:
['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']
Training set: 5634 samples
Test set: 1409 samples

Training churn rate: 26.54%
Test churn rate: 26.54%


In [14]:
# Scale Numerical Features
# The scaler is fitted on the training split only; the test split is
# transformed with those same fitted statistics.
scaler = StandardScaler()
numeric_train = X_train[numerical_cols]
numeric_test = X_test[numerical_cols]

# Work on copies so the unscaled splits stay available.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[numerical_cols] = scaler.fit_transform(numeric_train)
X_test_scaled[numerical_cols] = scaler.transform(numeric_test)

print("Scaling complete!")
print(f"\nScaled training data sample:")
print(X_train_scaled[numerical_cols].head())

#  Save Processed Data
# Write each table without its index, in a fixed order.
for frame, csv_path in (
    (X_train_scaled, '../data/X_train.csv'),
    (X_test_scaled, '../data/X_test.csv'),
    (y_train, '../data/y_train.csv'),
    (y_test, '../data/y_test.csv'),
    (df_encoded, '../data/telco_processed.csv'),
):
    frame.to_csv(csv_path, index=False)

# Persist the fitted scaler; as the last expression, its return value is the cell output.
joblib.dump(scaler, '../outputs/models/scaler.pkl')


Scaling complete!

Scaled training data sample:
      SeniorCitizen    tenure  MonthlyCharges  TotalCharges
3738      -0.441773  0.102371       -0.521976     -0.262257
3151      -0.441773 -0.711743        0.337478     -0.503635
4860      -0.441773 -0.793155       -0.809013     -0.749883
3867      -0.441773 -0.263980        0.284384     -0.172722
3810      -0.441773 -1.281624       -0.676279     -0.989374


['../outputs/models/scaler.pkl']

In [15]:
# Statistics summary
# The "original" figures are read from df instead of being typed in by hand.
# The cleaning steps applied to df only overwrite existing columns, so its
# shape still matches the file as loaded.
n_customers, n_raw_columns = df.shape
print(f"\nOriginal dataset: {n_customers:,} customers, {n_raw_columns} features")
# df_encoded still contains the Churn target, so this column count includes it.
print(f"Processed dataset: {df_encoded.shape[0]} customers, {df_encoded.shape[1]} features")

print(f"\nTarget variable: Churn")
stayed = (y == 0)
churned = (y == 1)
print(f"  - No (0): {stayed.sum()} customers ({stayed.mean()*100:.1f}%)")
print(f"  - Yes (1): {churned.sum()} customers ({churned.mean()*100:.1f}%)")

print(f"\nTrain/Test split: 80/20")
print(f"  - Training: {len(X_train)} samples")
print(f"  - Testing: {len(X_test)} samples")
print(f"\nNumerical features scaled: {numerical_cols}")



Original dataset: 7,043 customers, 21 features
Processed dataset: 7043 customers, 31 features

Target variable: Churn
  - No (0): 5174 customers (73.5%)
  - Yes (1): 1869 customers (26.5%)

Train/Test split: 80/20
  - Training: 5634 samples
  - Testing: 1409 samples

Numerical features scaled: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
