In [None]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, log_loss
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

# Load the merged churn dataset.
df = pd.read_csv('merged_dataset_VF03.csv')

TARGET_COL = 'Churn Value'

# Columns that must never reach the model: the row identifier, the target
# itself, and fields that are only known once a customer has churned.
# The next cell auto-selects every two-valued column as a feature, so any
# outcome-derived flag left in X would leak the target into training.
# NOTE(review): 'Churn Label', 'Churn Score' and 'Customer Status' are assumed
# to be post-outcome fields if they exist in this CSV - confirm against the
# dataset schema. errors='ignore' makes absent names a no-op.
NON_FEATURE_COLS = [
    'Customer ID', 'Churn Value', 'Churn Category', 'Churn Reason',
    'Churn Label', 'Churn Score', 'Customer Status',
]

X = df.drop(columns=NON_FEATURE_COLS, errors='ignore')
y = df[TARGET_COL]

# 80/20 split, stratified on the target so both splits keep the same churn rate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# Group features by the preprocessing each group needs.
categorical_cols = ['Contract', 'Internet Type', 'Payment Method', 'Offer']
numeric_cols = ['Tenure in Months', 'Monthly Charge', 'Total Charges']

# Two-valued columns are detected on the training split only, so the test rows
# play no part in choosing features. Columns already assigned to another group
# are skipped so that no column is encoded twice.
already_assigned = set(categorical_cols) | set(numeric_cols)
binary_cols = [
    col for col in X_train.columns
    if col not in already_assigned and X_train[col].nunique() == 2
]

preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode multi-valued categories; values unseen during fit are
        # encoded as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
        # Collapse each two-valued flag into a single 0/1 column. An unseen
        # value at predict time must not crash the exported pipeline either,
        # so unknowns are ignored here as well.
        ('bin', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False), binary_cols),
        # Standardize numeric features: the gradient-based solvers used in this
        # notebook (SAGA in particular) converge much faster on features that
        # share roughly the same scale.
        ('num', StandardScaler(), numeric_cols)
    ])
# Columns not listed in any group are dropped (ColumnTransformer's default remainder).

# Full pipeline: preprocessing + solver.
# SAGA supports both L1 and L2 penalties and scales to large datasets.
# It is a stochastic solver, so random_state is fixed to make the fitted
# coefficients reproducible across runs.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='saga', penalty='l2', max_iter=2000, tol=1e-4, random_state=42))
])

In [None]:
# Compare how quickly each solver drives down the training log loss.
solvers = [
    ('L-BFGS (Quasi-Newton)', 'lbfgs'),
    ('SAGA (Stochastic)', 'saga'),
    ('Liblinear (Coordinate Descent)', 'liblinear')
]

# Largest iteration budget to evaluate; every budget from 1 up to this is fitted.
MAX_ITER_BUDGET = 30


def convergence_path(solver, X_enc, y, budgets):
    """Return the training log loss reached by `solver` for each iteration budget.

    A fresh LogisticRegression is fitted from scratch with max_iter=budget for
    every entry in `budgets`. Repeated warm-started fits are deliberately not
    used: liblinear ignores warm_start, and restarting L-BFGS on each call
    throws away its curvature history, so the curves would not be comparable.

    Parameters:
        solver: solver name accepted by LogisticRegression.
        X_enc: already-preprocessed feature matrix.
        y: target labels aligned with X_enc.
        budgets: iterable of positive ints used as max_iter.

    Returns:
        List of log-loss values, one per budget, in the same order.
    """
    losses = []
    for budget in budgets:
        # Fixed seed so the stochastic solvers give a reproducible curve.
        clf = LogisticRegression(solver=solver, max_iter=budget, random_state=42)
        with warnings.catch_warnings():
            # Stopping before convergence is the point of the experiment.
            warnings.simplefilter('ignore', category=ConvergenceWarning)
            clf.fit(X_enc, y)
        losses.append(log_loss(y, clf.predict_proba(X_enc)))
    return losses


# Encode the training data once. A clone is fitted so this experiment does not
# fit the preprocessor instance that is shared with the production pipeline.
X_train_enc = clone(preprocessor).fit_transform(X_train)
budgets = range(1, MAX_ITER_BUDGET + 1)

fig, ax = plt.subplots(figsize=(10, 6))

for name, s in solvers:
    ax.plot(budgets, convergence_path(s, X_train_enc, y_train, budgets), label=name)

ax.set_title("Empirical Investigation: Solver Convergence Path")
ax.set_xlabel("Epochs/Iterations")
ax.set_ylabel("Log Loss")
ax.legend()
plt.show()

In [None]:
# Train the production-ready model
pipeline.fit(X_train, y_train)

# Print hold-out performance metrics to verify no degradation
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# EXPORT: This replaces your 8 joblib files
# The path is defined once so the dump and the success message cannot drift apart.
model_path = 'models/churn_pipeline.joblib'
# exist_ok=True creates the directory if needed, with no separate existence check.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# NOTE: joblib files are pickles; only load this artifact from a trusted location.
joblib.dump(pipeline, model_path)

print(f"SUCCESS: Pipeline exported as {model_path}")