In [14]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [15]:
# Read the raw churn dataset and report its size before any cleaning.
df = pd.read_csv("../data/customer_churn.csv")
print("Dataset shape:", df.shape)

# Remove the identifier column when the file has one; errors="ignore"
# makes this a no-op for files that do not carry it.
df = df.drop(columns=["customer_id"], errors="ignore")

Dataset shape: (5000, 14)


In [16]:
# Cell 3: Define target
target_col = "churned"

# Convert the target to 0/1 integers.
# Text labels are detected through pandas' dtype helpers rather than a
# comparison with "object", so dedicated string dtypes are mapped as well.
_target_is_text = (
    pd.api.types.is_object_dtype(df[target_col])
    or pd.api.types.is_string_dtype(df[target_col])
)
if _target_is_text:
    _encoded = df[target_col].map({"Yes": 1, "No": 0})
    # Anything outside the mapping (missing values, other spellings) comes
    # back as NaN. Fail here with the offending values instead of letting
    # astype(int) raise an opaque cast error further down.
    _unmapped = _encoded.isna()
    if _unmapped.any():
        _bad_values = df.loc[_unmapped, target_col].unique().tolist()
        raise ValueError(
            f"Unexpected values in '{target_col}' (expected 'Yes'/'No'): {_bad_values}"
        )
    df[target_col] = _encoded
df[target_col] = df[target_col].astype(int)

In [17]:
# Separate the label column (y) from the remaining predictor columns (X).
y = df[target_col]
X = df.drop(columns=[target_col])
print("Features shape:", X.shape, "Target shape:", y.shape)

Features shape: (5000, 12) Target shape: (5000,)


In [18]:
# Feature columns handled as numeric quantities.
numeric_cols = [
    "age",
    "watch_hours",
    "last_login_days",
    "monthly_fee",
    "number_of_profiles",
    "avg_watch_time_per_day",
]

# Feature columns handled as categorical labels.
categorical_cols = [
    "gender",
    "subscription_type",
    "region",
    "device",
    "payment_method",
    "favorite_genre",
]

In [19]:
# One-hot encode the categorical features. drop_first=True leaves out the
# first level of each category, so k levels produce k-1 indicator columns.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Save feature columns for later use (names in their post-encoding order).
feature_cols = X.columns.tolist()
# joblib.dump does not create missing parent folders; make sure the output
# directory exists so a fresh checkout does not fail with FileNotFoundError.
Path("../models").mkdir(parents=True, exist_ok=True)
joblib.dump(feature_cols, "../models/feature_columns.pkl")
print("Feature columns saved.")

Feature columns saved.


In [20]:
# Cell 7: Train-test split
# 20% of the rows are held out for testing. stratify=y requests the same
# class proportions in both partitions, and the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [21]:
# Standardise the numeric feature columns.
# Work on explicit copies: the frames returned by train_test_split are
# derived from X, and assigning columns into them can emit
# SettingWithCopyWarning on some pandas / scikit-learn versions.
X_train = X_train.copy()
X_test = X_test.copy()

# The scaler is fitted on the training partition only; the test partition
# is transformed with those same fitted statistics.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Save scaler
# joblib.dump does not create missing parent folders, so ensure the output
# directory exists first.
Path("../models").mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, "../models/scaler.pkl")
print("Scaler saved.")

Scaler saved.


In [22]:
# Write each partition to its own CSV file, leaving the row index out.
_outputs = [
    (X_train, "../data/X_train.csv"),
    (X_test, "../data/X_test.csv"),
    (y_train, "../data/y_train.csv"),
    (y_test, "../data/y_test.csv"),
]
for _frame, _path in _outputs:
    _frame.to_csv(_path, index=False)
print("Preprocessed data saved.")

Preprocessed data saved.
