In [5]:
# scripts/02_preprocessing.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import os

# -----------------------------
# Step 0: Ensure folders exist
# -----------------------------
# Create the directories this script reads from / writes to.
# exist_ok=True turns the call into a no-op when a directory is already
# there, so the script can be rerun safely.
for folder in ("../data", "../models"):
    os.makedirs(folder, exist_ok=True)

# -----------------------------
# Step 1: Load dataset
# -----------------------------
df = pd.read_csv("../data/customer_churn.csv")

print("✅ Dataset loaded with shape:", df.shape)

# -----------------------------
# Step 2: Drop useless columns
# -----------------------------
# errors="ignore" makes the drop a no-op when the ID column is absent,
# so no explicit membership check is needed.
df = df.drop(columns=["customer_id"], errors="ignore")

# -----------------------------
# Step 3: Define target variable
# -----------------------------
target_col = "churned"

# Text labels are converted to 1/0. The test is "not numeric" rather than
# dtype == "object" so that pandas' dedicated string dtype and categorical
# columns are mapped as well, instead of skipping the mapping and failing
# inside astype(int).
if not pd.api.types.is_numeric_dtype(df[target_col]):
    raw_labels = df[target_col]
    # Ignore surrounding whitespace and letter case ("yes", " No ", "YES").
    label_codes = (
        raw_labels.astype(str).str.strip().str.lower().map({"yes": 1, "no": 0})
    )
    unmapped = label_codes.isna()
    if unmapped.any():
        # Report the offending labels here; otherwise they become NaN and
        # astype(int) fails with a message that does not name them.
        examples = raw_labels[unmapped].drop_duplicates().tolist()[:10]
        raise ValueError(
            f"Column '{target_col}' contains labels other than Yes/No: {examples}"
        )
    df[target_col] = label_codes

# Missing values cannot be cast to int; fail with an explicit message.
if df[target_col].isna().any():
    raise ValueError(f"Column '{target_col}' contains missing values")
df[target_col] = df[target_col].astype(int)

print(f"🎯 Target column: {target_col}")

# -----------------------------
# Step 4: Split features & target
# -----------------------------
# y is the label column; X is every remaining column of df.
y = df[target_col]
X = df.drop(columns=[target_col])

print("✅ Features & target separated")
print("   Features shape:", X.shape)
print("   Target shape:", y.shape)

# -----------------------------
# Step 5: Handle categorical variables
# -----------------------------
categorical_cols = ["gender", "subscription_type", "region", "device", "payment_method", "favorite_genre"]

# Check the schema up front so a renamed or missing column is reported by
# name before any encoding work happens.
missing_cols = [col for col in categorical_cols if col not in X.columns]
if missing_cols:
    raise KeyError(f"Categorical columns missing from dataset: {missing_cols}")

# One-hot encode the listed columns; drop_first=True omits the first level
# of each column so the dummies are not perfectly collinear.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Any column that is still non-numeric was not in categorical_cols and will
# most likely break the numeric scaling step later; name it now so the
# later failure is easy to trace.
leftover_cols = X.select_dtypes(exclude=["number", "bool"]).columns.tolist()
if leftover_cols:
    print("⚠️ Non-numeric columns left unencoded:", leftover_cols)

print("✅ One-hot encoding applied")
print("   Features after encoding:", X.shape)

# -----------------------------
# Step 6: Train-Test Split
# -----------------------------
# 20% of the rows are held out for testing; random_state fixes the shuffle
# so reruns give the same split, and stratify=y asks train_test_split to
# keep the class proportions of y in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("✅ Train-test split done")
print("   Training set:", X_train.shape)
print("   Testing set:", X_test.shape)

# -----------------------------
# Step 7: Feature Scaling
# -----------------------------
# The scaler is fitted on the training rows only; the test rows are then
# transformed with those same fitted statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Feature scaling complete")

# -----------------------------
# Step 8a: Save as Pickle (for training)
# -----------------------------
# The data is stored as one tuple in the order
# (X_train_scaled, X_test_scaled, y_train, y_test); whoever loads the file
# must unpack it in that same order.
processed_bundle = (X_train_scaled, X_test_scaled, y_train, y_test)
joblib.dump(processed_bundle, "../data/processed_data.pkl")

# The fitted scaler is saved separately from the data.
joblib.dump(scaler, "../models/scaler.pkl")

print("💾 Preprocessed data saved at data/processed_data.pkl")
print("💾 Scaler saved at models/scaler.pkl")

# -----------------------------
# Step 8b: Save as CSV (for inspection/debugging)
# -----------------------------
# Wrap the scaler output in DataFrames so the CSVs carry the feature names.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Destination path -> table; every file is written without the index column.
csv_exports = {
    "../data/X_train_scaled.csv": X_train_scaled_df,
    "../data/X_test_scaled.csv": X_test_scaled_df,
    "../data/y_train.csv": y_train,
    "../data/y_test.csv": y_test,
}
for csv_path, table in csv_exports.items():
    table.to_csv(csv_path, index=False)

print("💾 Also saved processed datasets as CSVs in data/")


✅ Dataset loaded with shape: (5000, 14)
🎯 Target column: churned
✅ Features & target separated
   Features shape: (5000, 12)
   Target shape: (5000,)
✅ One-hot encoding applied
   Features after encoding: (5000, 29)
✅ Train-test split done
   Training set: (4000, 29)
   Testing set: (1000, 29)
✅ Feature scaling complete
💾 Preprocessed data saved at data/processed_data.pkl
💾 Scaler saved at models/scaler.pkl
💾 Also saved processed datasets as CSVs in data/
