# Combining Models - Customer Churn

In [None]:
!pip install -r requirements.txt --quiet

In [None]:
import pandas as pd
# Load the synthetic customer-churn dataset from its CSV file into a DataFrame.
cust_df = pd.read_csv(filepath_or_buffer="data/synth_customer_churn.csv")

In [None]:
# Integer encoding of the churn-risk labels used as the classification target.
churn_label_codes = {'Low Risk': 0, 'Medium Risk': 1, 'High Risk': 2}
y = cust_df['ChurnCategory'].map(churn_label_codes)

# Series.map() yields NaN for any value missing from the mapping; fail fast
# here instead of letting NaN targets surface later as an obscure error.
unmapped_labels = cust_df.loc[y.isna(), 'ChurnCategory'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected ChurnCategory values: {list(unmapped_labels)}"
    )

# Features: every column except the target and CustomerID
# (presumably a row identifier -- confirm against the dataset).
X = cust_df.drop(columns=['ChurnCategory', 'CustomerID'])

## Read in preprocessor

In [None]:
import joblib
from pipeline.binner import TenureBinner #Needed as our joblib contains a custom transformer
# Restore the saved preprocessing object from disk.
# NOTE: joblib.load unpickles arbitrary objects -- only load trusted artifacts.
preprocessor = joblib.load(filename="models/preprocessing/preprocessor.joblib")

## Split the data (avoid data leakage)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
# NOTE(review): the split is not stratified on y -- confirm this is intended.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Combine multiple models

### Option 1 - Stacking Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


In [None]:
# First-level (base) estimators for the stack, as (name, estimator) pairs.
# The names are the prefixes used to address each estimator's hyperparameters.
# Every stochastic learner is seeded so repeated runs are reproducible,
# matching the random_state=42 used for the train/test split and the resampler.
base_learners = [
    ('rf', RandomForestClassifier(random_state=42)),
    # probability=True makes the SVC expose class probabilities.
    ('svc', SVC(probability=True, random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
]

# Meta-learner that combines the base learners' outputs.
meta_model = LogisticRegression()

In [None]:
from sklearn.ensemble import StackingClassifier
# Stack the base learners beneath the meta-model; every other
# StackingClassifier option is left at its library default.
stacking_clf = StackingClassifier(final_estimator=meta_model, estimators=base_learners)

### Add SMOTEENN to the pipeline to resample (SMOTE oversampling + ENN cleaning)

In [None]:
from imblearn.pipeline import Pipeline
from collections import Counter
from imblearn.combine import SMOTEENN
import os

# Tally how many training rows fall into each encoded churn class.
original_counts = Counter()
original_counts.update(y_train)

In [None]:
# Per-class sample targets for classes 1 and 2 (Medium and High Risk in the
# label encoding): each is a multiple of that class's current training count,
# truncated to an integer. Class 0 gets no explicit target.
growth_factors = ((1, 1.4), (2, 2.5))
target_counts = {
    label: int(original_counts[label] * factor)
    for label, factor in growth_factors
}

In [None]:
# Show the computed per-class targets as the cell's output.
target_counts

In [None]:
# Combined SMOTE + Edited Nearest Neighbours resampler, driven by the
# per-class targets in target_counts and seeded for reproducibility.
smotenn = SMOTEENN(random_state=42, sampling_strategy=target_counts)

### Build pipeline

In [None]:
# Full modelling pipeline: preprocess -> resample -> stacked classifier.
# Pipeline here is the imblearn one, which accepts the resampling step.
# NOTE(review): `preprocessor` and `smotenn` are passed in as-is (not cloned),
# so this pipeline shares those instances with any other code that uses them.
pipeline_steps = [
    ("preprocess", preprocessor),
    ("resample", smotenn),
    ("stacking_clf", stacking_clf),
]
stacking_pipeline = Pipeline(pipeline_steps)

In [None]:
# Render the assembled pipeline as the cell's output for a visual check.
stacking_pipeline

In [None]:
# Persist the stacking pipeline so it can be reloaded later
# (no fit() is called on it in the cells visible here).
SAVE_DIR = "models/classifiers"
stack_pipe_path = os.path.join(SAVE_DIR, "stack_class_pipe.joblib")
os.makedirs(SAVE_DIR, exist_ok=True)  # no error if the folder already exists
joblib.dump(stacking_pipeline, stack_pipe_path)

In [None]:
# Hyperparameter search space for the stacking pipeline. Keys follow the
# "<pipeline step>__<estimator name>__<parameter>" addressing convention.
param_grid = {}
param_grid['stacking_clf__rf__n_estimators'] = [100, 200]
param_grid['stacking_clf__svc__C'] = [0.1, 1.0]
param_grid['stacking_clf__gb__learning_rate'] = [0.05, 0.1]
param_grid['stacking_clf__final_estimator__C'] = [0.1, 1.0]

In [None]:
import json 
# Write the search grid to a JSON config file (2-space indented),
# creating the config folder first if it does not exist yet.
config_dir = "models/configs"
os.makedirs(config_dir, exist_ok=True)
with open("models/configs/stack_param_grid.json", "w") as config_file:
    config_file.write(json.dumps(param_grid, indent=2))

### Option 2 - Create a reusable pipeline for each base model

In [None]:
# Stand-alone base models keyed by short name; each is wrapped in its own
# pipeline further down. Every stochastic learner is seeded so repeated runs
# are reproducible, matching the random_state=42 used for the train/test
# split and the resampler.
base_models = {
    'rf': RandomForestClassifier(random_state=42),
    # probability=True makes the SVC expose class probabilities.
    'svc': SVC(probability=True, random_state=42),
    'gb': GradientBoostingClassifier(random_state=42),
}

In [None]:
from imblearn.pipeline import Pipeline
# Resampler for the per-model pipelines: SMOTEENN with the per-class targets
# from target_counts and a fixed seed.
sampler = SMOTEENN(random_state=42, sampling_strategy=target_counts)

In [None]:
# Registry of per-model pipelines, keyed by the model's short name.
pipelines = dict()

In [None]:
# Wrap every base model in its own preprocess -> resample -> classify pipeline.
# NOTE(review): the same `preprocessor` and `sampler` instances are placed in
# every pipeline (no cloning) -- confirm that sharing is acceptable.
for name, model in base_models.items():
    steps = [
        ("preprocess", preprocessor),
        ("resample", sampler),
        ("classifier", model),
    ]
    pipelines[name] = Pipeline(steps)

In [None]:
import joblib
import os
# Folder that receives the serialized per-model pipelines; create it up
# front and tolerate it already existing.
CLASS_PATH = "models/classifiers/"
os.makedirs(name=CLASS_PATH, exist_ok=True)


In [None]:
# Serialize every per-model pipeline as "<name>_pipeline.joblib" in CLASS_PATH.
# os.path.join is used because CLASS_PATH already ends with a separator;
# gluing the parts together with an extra "/" would double it.
for name, pipe in pipelines.items():
    joblib.dump(pipe, os.path.join(CLASS_PATH, f"{name}_pipeline.joblib"))

### List files in path

In [None]:
# Collect the regular files in CLASS_PATH, skipping any entry whose name
# starts with "best_model", then print the resulting list.
files = []
for entry in os.listdir(CLASS_PATH):
    if entry.startswith("best_model"):
        continue
    if os.path.isfile(os.path.join(CLASS_PATH, entry)):
        files.append(entry)
print(files)