In [3]:
# Transform the cleaned data into a purely numerical format suitable for machine learning algorithms,
# and create new features to capture patterns identified during EDA.

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

In [4]:
# --- 1. DATA LOADING ---
print("--- 1. DATA LOADING --- \n ")
try:
    df = pd.read_csv('cleaned_telco_churn_data.csv')
    print(f"Loaded cleaned data with {df.shape[0]} rows and {df.shape[1]} features.")
except FileNotFoundError:
    print("Error: 'cleaned_telco_churn_data.csv' not found. Please ensure the file is in the directory.")
    exit()

# Separate features (X) and target (y)
X = df.drop('Churn', axis=1)
y = df['Churn']
print(f"Features (X) shape: {X.shape}, Target (y) shape: {y.shape}")

--- 1. DATA LOADING --- 
 
Loaded cleaned data with 7032 rows and 20 features.
Features (X) shape: (7032, 19), Target (y) shape: (7032,)


In [5]:
# Define feature types
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_cols = X.drop(columns=numerical_cols).columns.tolist()
print(f"Categorical features to encode: {categorical_cols}")

Categorical features to encode: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [6]:
# --- 2. FEATURE CREATION (Binning Tenure) ---
print("\n--- 2. FEATURE CREATION: Tenure Grouping ---\n")
# Based on EDA, tenure is highly bimodal and influential. We create bins for better interpretability
# and to potentially improve non-linear model performance.

bins = [0, 12, 36, 60, 100] # 0-1 year, 1-3 years, 3-5 years, 5+ years
labels = ['New (0-1yr)', 'Mid (1-3yrs)', 'Senior (3-5yrs)', 'Loyal (5+yrs)']
X['Tenure_Group'] = pd.cut(X['tenure'], bins=bins, labels=labels, right=False)

# Drop the original tenure column after creating the binned feature.
# We keep it for now but note that it might be dropped later if multicollinearity is an issue.
print("Created 'Tenure_Group' categorical feature based on 'tenure'\n")
print(X['Tenure_Group'].value_counts())


--- 2. FEATURE CREATION: Tenure Grouping ---

Created 'Tenure_Group' categorical feature based on 'tenure'

New (0-1yr)        2058
Mid (1-3yrs)       1923
Senior (3-5yrs)    1568
Loyal (5+yrs)      1483
Name: Tenure_Group, dtype: int64


In [7]:

# --- 3. CATEGORICAL ENCODING (One-Hot Encoding) ---
print("\n--- 3. CATEGORICAL ENCODING (One-Hot Encoding) ---")
# Use pd.get_dummies to convert all categorical features into a set of binary (0 or 1) columns.
# We use 'drop_first=True' to avoid multicollinearity within the dummy variables (the "Dummy Variable Trap").

# Including the new Tenure_Group in the list of columns to encode
cols_to_encode = categorical_cols + ['Tenure_Group']

# Perform one-hot encoding
X_encoded = pd.get_dummies(X, columns=cols_to_encode, drop_first=True)

# Drop original 'tenure' if we proceed with 'Tenure_Group' as a primary feature
# We will keep the original 'tenure' for now, as it is a strong predictor.
X_encoded.drop(columns=['tenure'], inplace=True)
print("Removed original 'tenure' to rely on binned feature and mitigate correlation with TotalCharges.")



--- 3. CATEGORICAL ENCODING (One-Hot Encoding) ---
Removed original 'tenure' to rely on binned feature and mitigate correlation with TotalCharges.


In [8]:
# --- 4. NUMERICAL SCALING ---
print("\n--- 4. NUMERICAL SCALING ---\n")
# Scaling is crucial for distance-based and regularization-based models (Logistic Regression, SVM, kNN).
# We only scale the truly continuous variables: MonthlyCharges and TotalCharges.

# Define columns to scale
cols_to_scale = ['MonthlyCharges', 'TotalCharges']

scaler = StandardScaler()

# Fit and transform the data
X_encoded[cols_to_scale] = scaler.fit_transform(X_encoded[cols_to_scale])

print("Applied StandardScaler to 'MonthlyCharges' and 'TotalCharges'\n")
print("First 5 rows of scaled numerical features:\n")
print(X_encoded[cols_to_scale].head())


--- 4. NUMERICAL SCALING ---

Applied StandardScaler to 'MonthlyCharges' and 'TotalCharges'

First 5 rows of scaled numerical features:

   MonthlyCharges  TotalCharges
0       -1.161694     -0.994194
1       -0.260878     -0.173740
2       -0.363923     -0.959649
3       -0.747850     -0.195248
4        0.196178     -0.940457


In [9]:

# --- 5. FINAL PREPARATION AND DATA SPLIT ---
print("\n--- 5. FINAL PREPARATION AND DATA SPLIT ---\n")

# Align X and y
X_final = X_encoded
print(f"Final feature set (X_final) shape: {X_final.shape}")
print(f"Final column list count: {len(X_final.columns)}")

# Split the data into training and testing sets.
# We use 'stratify=y' because the target variable ('Churn') is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.25, random_state=42, stratify=y
)

print("\n--- Train/Test Split Complete ---\n")
print(f"X_train shape: {X_train.shape} | y_train Churn rate: {y_train.mean():.2%}")
print(f"X_test shape: {X_test.shape}   | y_test Churn rate: {y_test.mean():.2%}")

# Save processed data for the next step (Modeling)
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False, header=True)
y_test.to_csv('y_test.csv', index=False, header=True)

print("\nProcessed and split data saved to X_train.csv, X_test.csv, y_train.csv, and y_test.csv")


--- 5. FINAL PREPARATION AND DATA SPLIT ---

Final feature set (X_final) shape: (7032, 32)
Final column list count: 32

--- Train/Test Split Complete ---

X_train shape: (5274, 32) | y_train Churn rate: 26.58%
X_test shape: (1758, 32)   | y_test Churn rate: 26.56%

Processed and split data saved to X_train.csv, X_test.csv, y_train.csv, and y_test.csv
