In [1]:
# ================================
# 1. Import Libraries
# ================================
import pandas as pd
import numpy as np

from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    LabelEncoder,
    StandardScaler,
    MinMaxScaler
)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ================================
# 2. Create Dataset
# ================================
# Seed the global NumPy RNG so every run draws the same synthetic sample.
np.random.seed(42)

n_rows = 200
yes_no = ["Yes", "No"]

# Columns are filled one at a time, in a fixed order: each draw advances the
# shared RNG state, so reordering these lines would change the generated values.
columns = {}
columns["customer_id"] = range(1001, 1001 + n_rows)
columns["gender"] = np.random.choice(["Male", "Female"], n_rows)
columns["SeniorCitizen"] = np.random.choice([0, 1], n_rows)
columns["Partner"] = np.random.choice(yes_no, n_rows)
columns["Dependents"] = np.random.choice(yes_no, n_rows)
columns["tenure"] = np.random.randint(1, 72, n_rows)  # upper bound is exclusive
columns["MonthlyCharges"] = np.random.uniform(20, 120, n_rows).round(2)
columns["TotalCharges"] = np.random.uniform(100, 8000, n_rows).round(2)
columns["Contract"] = np.random.choice(
    ["Month-to-month", "One year", "Two year"], n_rows
)
columns["PaymentMethod"] = np.random.choice(
    ["Electronic check", "Mailed check", "Bank transfer", "Credit card"], n_rows
)
columns["InternetService"] = np.random.choice(["DSL", "Fiber optic", "No"], n_rows)
columns["Churn"] = np.random.choice(yes_no, n_rows)

data = pd.DataFrame(columns)

# Persist the raw (pre-feature-engineering) table, without the index column.
data.to_csv("churn_data.csv", index=False)

# ================================
# 3. Feature Engineering
# ================================
# Total charges spread over the tenure. The denominator is tenure + 1, so it
# cannot be zero for any non-negative tenure.
data["avg_monthly_spend"] = data["TotalCharges"] / (data["tenure"] + 1)

# Bucket tenure into four labelled bands: [0, 12], (12, 24], (24, 48], (48, 72].
# include_lowest=True closes the first band on the left, so a tenure of 0 falls
# into "New" instead of becoming NaN -- consistent with the zero-tenure guard
# used by the ratio features in this section. Values above 72 still map to NaN.
data["tenure_group"] = pd.cut(
    data["tenure"],
    bins=[0, 12, 24, 48, 72],
    labels=["New", "Medium", "Long", "Very Long"],
    include_lowest=True,
)

# 1 when tenure exceeds 24 (i.e. the "Long" and "Very Long" bands), else 0.
data["is_long_term_customer"] = np.where(data["tenure"] > 24, 1, 0)

# Monthly charge relative to tenure, with the same tenure + 1 denominator.
data["charges_per_tenure"] = data["MonthlyCharges"] / (data["tenure"] + 1)

# 1 when the monthly charge is strictly above the sample median, else 0.
data["high_value_customer"] = np.where(
    data["MonthlyCharges"] > data["MonthlyCharges"].median(), 1, 0
)

# Monthly charge as a fraction of the largest monthly charge in the sample.
data["monthly_charge_ratio"] = (
    data["MonthlyCharges"] / data["MonthlyCharges"].max()
)

# Interaction flag: 1 only for senior citizens who are also long-term customers.
data["senior_long_term"] = data["SeniorCitizen"] * data["is_long_term_customer"]

# ================================
# 4. Encoding & Scaling Setup
# ================================
# Column groups, one per encoding/scaling strategy used by the preprocessing
# step that follows.
binary_features = ["Partner", "Dependents"]
ordinal_features = ["Contract"]
nominal_features = ["gender", "PaymentMethod", "InternetService", "tenure_group"]
numeric_standard = ["tenure", "MonthlyCharges", "avg_monthly_spend"]
numeric_minmax = ["TotalCharges", "charges_per_tenure", "monthly_charge_ratio"]

# Label Encoding
# The Yes/No columns (and the Churn target) are replaced in place by integer
# codes. LabelEncoder numbers classes in sorted order, so "No" -> 0, "Yes" -> 1.
# Each fitted encoder is kept, keyed by column name, so the codes can later be
# mapped back to the original labels with inverse_transform.
label_encoders = {}
for col in binary_features + ["Churn"]:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# ================================
# 5. Preprocessing Pipeline
# ================================
# Explicit ordering for the contract categories, so the ordinal codes are
# 0, 1, 2 in this order.
contract_levels = ["Month-to-month", "One year", "Two year"]

# One (name, transformer, columns) triple per column group.
onehot_step = ("onehot", OneHotEncoder(drop="first"), nominal_features)
ordinal_step = (
    "ordinal",
    OrdinalEncoder(categories=[contract_levels]),
    ordinal_features,
)
standard_step = ("standard", StandardScaler(), numeric_standard)
minmax_step = ("minmax", MinMaxScaler(), numeric_minmax)

# Columns not claimed by any step are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[onehot_step, ordinal_step, standard_step, minmax_step],
    remainder="passthrough",
)

# Separate the target from the features; the customer identifier is dropped
# from the feature set.
y = data["Churn"]
X = data.drop(columns=["Churn", "customer_id"])

X_processed = preprocessor.fit_transform(X)

print("Preprocessing Completed Successfully")
print("Final feature matrix shape:", X_processed.shape)


Preprocessing Completed Successfully
Final feature matrix shape: (200, 22)
