# 1.Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load processed data
data_path = "../data/processed/churn_cleaned.csv"
df = pd.read_csv(data_path)

df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# 2.Check Data Types

In [3]:
# Separate numerical and categorical features
num_features = df.select_dtypes(include=['int64', 'float64']).columns
cat_features = df.select_dtypes(include=['object']).columns

print("Numerical Features:", list(num_features))
print("Categorical Features:", list(cat_features))

Numerical Features: ['SeniorCitizen', 'tenure', 'MonthlyCharges']
Categorical Features: ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']


# 3.Handle Missing Values 

In [4]:
# Check again for missing values
print("Missing values before cleaning:\n", df.isnull().sum())

# Fill or drop missing values
df.fillna(df.median(numeric_only=True), inplace=True)
df.fillna("Unknown", inplace=True)

print("Missing values after cleaning:\n", df.isnull().sum())

Missing values before cleaning:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
Missing values after cleaning:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               

# 4.Convert Target Variable

In [5]:
# Encode target 'Churn' as binary (Yes=1, No=0)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# 5.Encode Categorical Variables

In [6]:
# Identify binary and multi-class categorical columns
binary_cols = [col for col in cat_features if df[col].nunique() == 2]
multi_cols = [col for col in cat_features if df[col].nunique() > 2]

# Label encode binary columns
le = LabelEncoder()
for col in binary_cols:
    df[col] = le.fit_transform(df[col])

# One-hot encode multi-class columns
df = pd.get_dummies(df, columns=multi_cols, drop_first=True)

df.head()


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,Churn,customerID_0003-MKNFE,...,TotalCharges_995.35,TotalCharges_996.45,TotalCharges_996.85,TotalCharges_996.95,TotalCharges_997.65,TotalCharges_997.75,TotalCharges_998.1,TotalCharges_999.45,TotalCharges_999.8,TotalCharges_999.9
0,0,0,1,0,1,0,1,29.85,0,False,...,False,False,False,False,False,False,False,False,False,False
1,1,0,0,0,34,1,0,56.95,0,False,...,False,False,False,False,False,False,False,False,False,False
2,1,0,0,0,2,1,1,53.85,1,False,...,False,False,False,False,False,False,False,False,False,False
3,1,0,0,0,45,0,0,42.3,0,False,...,False,False,False,False,False,False,False,False,False,False
4,0,0,0,0,2,1,1,70.7,1,False,...,False,False,False,False,False,False,False,False,False,False


# 6.Feature Scaling

In [7]:
scaler = StandardScaler()
df[num_features] = scaler.fit_transform(df[num_features])

df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,Churn,customerID_0003-MKNFE,...,TotalCharges_995.35,TotalCharges_996.45,TotalCharges_996.85,TotalCharges_996.95,TotalCharges_997.65,TotalCharges_997.75,TotalCharges_998.1,TotalCharges_999.45,TotalCharges_999.8,TotalCharges_999.9
0,0,-0.439916,1,0,-1.277445,0,1,-1.160323,0,False,...,False,False,False,False,False,False,False,False,False,False
1,1,-0.439916,0,0,0.066327,1,0,-0.259629,0,False,...,False,False,False,False,False,False,False,False,False,False
2,1,-0.439916,0,0,-1.236724,1,1,-0.36266,1,False,...,False,False,False,False,False,False,False,False,False,False
3,1,-0.439916,0,0,0.514251,0,0,-0.746535,0,False,...,False,False,False,False,False,False,False,False,False,False
4,0,-0.439916,0,0,-1.236724,1,1,0.197365,1,False,...,False,False,False,False,False,False,False,False,False,False


# 7.Train-Test Split

In [8]:
# Separate features (X) and target (y)
X = df.drop('Churn', axis=1)
y = df['Churn']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Training set shape: (5634, 13601)
Test set shape: (1409, 13601)


# 8.Save Preprocessed Data

In [9]:
X_train.to_csv("../data/processed/X_train.csv", index=False)
X_test.to_csv("../data/processed/X_test.csv", index=False)
y_train.to_csv("../data/processed/y_train.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)

print("✅ Preprocessed data saved successfully.")


✅ Preprocessed data saved successfully.


# 9.Feature Importance Exploration 

In [10]:
# Compute correlations only with the target column
corr_with_target = df.drop('Churn', axis=1).corrwith(df['Churn'], numeric_only=True)
corr_with_target = corr_with_target.sort_values(ascending=False)

print("Top correlated features with Churn:\n", corr_with_target.head(10))


Top correlated features with Churn:
 InternetService_Fiber optic       0.308020
PaymentMethod_Electronic check    0.301919
MonthlyCharges                    0.193356
PaperlessBilling                  0.191825
SeniorCitizen                     0.150889
StreamingTV_Yes                   0.063228
StreamingMovies_Yes               0.061382
MultipleLines_Yes                 0.040102
TotalCharges_69.65                0.039663
TotalCharges_69.6                 0.039663
dtype: float64
