In [4]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier


In [5]:

# ✅ Step 1: Load Data
# Read the four pre-split CSV files (features and labels for train and test)
# into DataFrames in a single pass.
csv_paths = (
    "../data/X_train.csv",
    "../data/X_test.csv",
    "../data/y_train.csv",
    "../data/y_test.csv",
)
X_train, X_test, y_train, y_test = map(pd.read_csv, csv_paths)


In [6]:
# ✅ Step 2: Convert y_train and y_test to 1D arrays
# Replace each label DataFrame with its underlying values flattened to a
# 1-D NumPy array.
y_train, y_test = y_train.values.ravel(), y_test.values.ravel()


In [7]:

# ✅ Step 3: Handle Missing Values (if any)
# Coerce every feature column to numeric *before* imputing. Values that cannot
# be parsed become NaN here, so they are filled by the imputer below; coercing
# only after imputation would leave those NaNs in the data handed to the models.
X_train_numeric = X_train.apply(pd.to_numeric, errors="coerce")
X_test_numeric = X_test.apply(pd.to_numeric, errors="coerce")

# Column medians are learned from the training split only and then reused for
# the test split, so no test-set statistics influence the fill values.
imputer = SimpleImputer(strategy="median")
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_numeric),
    columns=X_train_numeric.columns,
    index=X_train_numeric.index,  # keep row labels instead of silently resetting them
)
X_test = pd.DataFrame(
    imputer.transform(X_test_numeric),
    columns=X_test_numeric.columns,
    index=X_test_numeric.index,
)


In [8]:

# ✅ Step 4: Ensure Data Types Are Numeric
# Pass every column of both feature frames through pd.to_numeric; entries that
# cannot be parsed as numbers are turned into NaN rather than raising.
X_train, X_test = (
    frame.apply(pd.to_numeric, errors='coerce')
    for frame in (X_train, X_test)
)


In [9]:

# ✅ Step 5: Train Models
# Fit each candidate classifier on the training split and print its accuracy
# on the test split.
RANDOM_STATE = 42  # fixed seed so the tree-based models give the same result on every run

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(
        n_estimators=200, max_depth=10, random_state=RANDOM_STATE
    ),
}

# NOTE(review): the recorded run shows 1.00 test accuracy for both tree models.
# A perfect score on held-out data is unusual — confirm that no feature column
# leaks the target before trusting these numbers.
for name, model in models.items():
    model.fit(X_train, y_train)  # Training
    y_pred = model.predict(X_test)  # Prediction
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")



Logistic Regression Accuracy: 0.83
Decision Tree Accuracy: 1.00
Random Forest Accuracy: 1.00


In [10]:
# ✅ Step 6: Save Best Model (Random Forest)
# The model to persist is picked by hand here; it is not selected
# programmatically from the accuracies printed during training.
best_model = models["Random Forest"]

# Create the output directory if it is missing (e.g. on a fresh checkout) so
# that joblib.dump does not fail with FileNotFoundError.
model_path = Path("../models/churn_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, model_path)
print("✅ Model saved successfully!")


✅ Model saved successfully!
