In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
# # Import libraries
import pandas as pd
from pathlib import Path
from churn.preprocessing import load_data
from churn.paths import DATA_DIR, MODELS_DIR


import churn.config as cfg
from functools import partial
import joblib
from IPython.display import display, HTML
import plotly.offline as pyo
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

# Import necessary functions from modelling.py
from churn.modelling import train_tune_evaluate, calculate_classification_metrics, display_classification_results, draw_roc_curve
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold
import pickle

In [3]:
# Resolve the parquet locations of the binned train/test feature sets inside DATA_DIR
train_path, test_path = (
    Path(DATA_DIR / parquet_name)
    for parquet_name in ('train_features_binned.parquet', 'test_features_binned.parquet')
)

# Read both binned feature tables (train first, then test)
train_features_binned, test_features_binned = map(load_data, (train_path, test_path))

# Preview the first rows of the binned training features
train_features_binned.head()

2024-09-01 11:10:39,635 - INFO - Data loaded from /Users/borja/Documents/Somniumrema/projects/ml/churn/data/train_features_binned.parquet
2024-09-01 11:10:39,637 - INFO - Data loaded from /Users/borja/Documents/Somniumrema/projects/ml/churn/data/test_features_binned.parquet


Unnamed: 0,churn,total_day_minutes,n_sms,total_eve_minutes,total_eve_calls,customer_service_rating,customer_happiness,customer_service_calls
0,0,15,12,22,3,3,3,0
1,0,17,9,23,23,1,0,0
2,0,18,12,18,35,3,14,0
3,1,18,32,19,29,3,0,5
4,0,17,13,9,9,3,5,3


Con las variables más importantes, entrenar un modelo Naive-Bayes y un modelo SVM (kernel lineal).
- Usar 5 splits y 2 repeticiones en la validación cruzada.
- Acometer el entrenamiento para diferentes hiperparámetros:
  - SVM: regularización (C, cost)
  - Naive-Bayes (Bernoulli): suavizado de Laplace (alpha)

In [4]:
# Cross-validation splitter shared by every model search below;
# fold count, repeat count and seed all come from the project config.
cv = RepeatedStratifiedKFold(
    n_splits=cfg.N_SPLITS,
    n_repeats=cfg.N_REPEATS,
    random_state=cfg.SEED,
)

In [5]:
# Search-space samplers: each one receives a trial object and returns one suggested value.
def _suggest_alpha(trial):
    """Suggest BernoulliNB's `alpha` between the bounds configured in `cfg`."""
    return trial.suggest_float('alpha', cfg.BERNOULLI_LOWER_BOUND, cfg.BERNOULLI_UPPER_BOUND)


def _suggest_c(trial):
    """Suggest SVC's `C` (requested with log=True) between the bounds configured in `cfg`."""
    # NOTE(review): `SCV_C_UPPER_BOUND` looks like a transposition of `SVC_C_UPPER_BOUND`;
    # it is kept verbatim because it must match the attribute name in churn.config - confirm there.
    return trial.suggest_float('C', cfg.SVC_C_LOWER_BOUND, cfg.SCV_C_UPPER_BOUND, log=True)


# Display name -> (estimator factory, {hyperparameter name: sampler}).
# BernoulliNB is passed as the bare class; SVC is wrapped in `partial` so that
# probability=True and kernel='linear' are fixed for every trial.
models = {
    "Naive Bayes": (BernoulliNB, {"alpha": _suggest_alpha}),
    "SVM": (partial(SVC, probability=True, kernel='linear'), {"C": _suggest_c}),
}


In [6]:
# Tune, fit and evaluate every entry of `models` on the binned train/test sets.
# The recorded run shows the SVC search dominating the runtime (about 23 minutes for 50 trials).
results = train_tune_evaluate(
    train_features_binned,
    test_features_binned,
    models,
    cv=cv,
    n_trials=cfg.N_TRIALS,
)

2024-09-01 11:10:39,748 - INFO - Starting hyperparameter optimization for BernoulliNB...
Optimizing BernoulliNB: 100%|██████████| 50/50 [00:01<00:00, 35.60it/s]
2024-09-01 11:10:41,207 - INFO - Hyperparameter optimization for BernoulliNB completed.
2024-09-01 11:10:41,243 - INFO - Starting hyperparameter optimization for SVC...
Optimizing SVC: 100%|██████████| 50/50 [23:35<00:00, 28.31s/it]   
2024-09-01 11:34:16,604 - INFO - Hyperparameter optimization for SVC completed.


In [7]:
# Print a compact per-model summary: every entry of the result dict except the
# fitted estimator and the prediction arrays listed below.
skipped_keys = {
    'model',
    'predictions_train',
    'predictions_test',
    'predictions_test_adjusted',
    'predictions_train_proba',
    'predictions_test_proba',
}

for model_name, result in results.items():
    print(f"Model: {model_name}")
    for key, value in result.items():
        if key in skipped_keys:
            continue
        # Floats are shown with four decimals; anything else is printed as-is
        shown = f"{value:.4f}" if isinstance(value, float) else value
        label = key.replace('_', ' ').title()
        print(f"{label}: {shown}")
    print()

Model: Naive Bayes
Best Params: {'alpha': 0.8905369188141217}
Roc Auc Cv: 0.8313
Roc Auc Train: 0.8335
Roc Auc Test: 0.8549
Threshold: 0.2311

Model: SVM
Best Params: {'C': 0.18218945844562445}
Roc Auc Cv: 0.9613
Roc Auc Train: 0.9621
Roc Auc Test: 0.9532
Threshold: 0.3182



In [8]:
# Show the classification metrics and the train/test ROC curves for each model
for model_name, result in results.items():
    metrics = calculate_classification_metrics(
        train_features_binned,
        test_features_binned,
        result['predictions_train'],
        result['predictions_test'],
    )
    display_classification_results(metrics, model_name)

    # One ROC figure per split, each built from the true labels and the stored probabilities
    roc_panels = (
        (
            "<h3>ROC AUC (Train):</h3>",
            train_features_binned['churn'],
            result['predictions_train_proba'],
            f'Receiver Operating Characteristic - {model_name} (Train)',
        ),
        (
            "<h3>ROC AUC (Test):</h3>",
            test_features_binned['churn'],
            result['predictions_test_proba'],
            f'Receiver Operating Characteristic - {model_name} (Test)',
        ),
    )
    for heading, y_true, y_score, title in roc_panels:
        fig = draw_roc_curve(y_true, y_score, title)
        display(HTML(heading))
        pyo.iplot(fig)

    # Visual spacer between models
    display(HTML("<br>"))

Unnamed: 0,precision,recall,f1-score,support
0,0.93,1.0,0.96,5135.0
1,0.87,0.21,0.34,490.0
accuracy,0.93,0.93,0.93,0.93
macro avg,0.9,0.6,0.65,5625.0
weighted avg,0.92,0.93,0.91,5625.0

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,5119,16
Actual Positive,386,104


Unnamed: 0,precision,recall,f1-score,support
0,0.94,1.0,0.96,1712.0
1,0.85,0.28,0.42,163.0
accuracy,0.93,0.93,0.93,0.93
macro avg,0.89,0.64,0.69,1875.0
weighted avg,0.93,0.93,0.92,1875.0

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,1704,8
Actual Positive,117,46


Unnamed: 0,precision,recall,f1-score,support
0,0.96,0.98,0.97,5135.0
1,0.77,0.53,0.63,490.0
accuracy,0.95,0.95,0.95,0.95
macro avg,0.86,0.76,0.8,5625.0
weighted avg,0.94,0.95,0.94,5625.0

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,5057,78
Actual Positive,229,261


Unnamed: 0,precision,recall,f1-score,support
0,0.96,0.99,0.97,1712.0
1,0.78,0.52,0.62,163.0
accuracy,0.95,0.95,0.95,0.95
macro avg,0.87,0.75,0.8,1875.0
weighted avg,0.94,0.95,0.94,1875.0

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,1688,24
Actual Positive,78,85


In [None]:
# Persist every fitted estimator from `results` as '<model name>_model_1.pkl' under MODELS_DIR.
# NOTE: this cell must run before the SMOTEENN cell below rebinds `results`; otherwise the
# resampled models would be written under the `_model_1` names.

# Create the target directory first so a fresh checkout does not fail with FileNotFoundError
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

for model_name, result in results.items():
    model = result['model']
    model_filename = Path(MODELS_DIR / f'{model_name}_model_1.pkl')
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)

In [9]:
# Split the features and label within the training and testing datasets
X_train = train_features_binned.drop('churn', axis=1)
y_train = train_features_binned['churn']
X_test = test_features_binned.drop('churn', axis=1)
y_test = test_features_binned['churn']

# Upsample the minority class using SMOTE and then apply EditedNearestNeighbours.
# SMOTE is stochastic, so it is seeded with the same project seed as the CV splitter;
# without it the resampled training set (and every metric below) changes between runs.
# The seed is set on the explicit SMOTE instance as well as on the SMOTEENN wrapper.
smote_enn = SMOTEENN(
    smote=SMOTE(sampling_strategy='minority', random_state=cfg.SEED),
    enn=EditedNearestNeighbours(),
    random_state=cfg.SEED,
)
X_train_res, y_train_res = smote_enn.fit_resample(X_train, y_train)

# Combine the resampled features and labels into a single DataFrame ('churn' as last column)
train_res = pd.concat(
    [
        pd.DataFrame(X_train_res, columns=X_train.columns),
        pd.DataFrame(y_train_res, columns=['churn']),
    ],
    axis=1,
)

# NOTE(review): the resampling happens once, before cross-validation, so the CV folds used
# inside train_tune_evaluate presumably contain synthetic rows derived from rows in other folds;
# the CV ROC AUC on `train_res` is therefore likely optimistic. Resampling inside each fold
# would avoid this but needs support in churn.modelling - confirm.

# Train, tune, and evaluate the models on the resampled training set.
# This rebinds `results`: the results of the un-resampled run are no longer reachable afterwards.
# The recorded run took about 9 hours for the SVC search.
results = train_tune_evaluate(train_res, test_features_binned, models, cv=cv, n_trials=cfg.N_TRIALS)

2024-09-01 11:34:22,889 - INFO - Starting hyperparameter optimization for BernoulliNB...
Optimizing BernoulliNB: 100%|██████████| 50/50 [00:01<00:00, 32.66it/s]
2024-09-01 11:34:24,421 - INFO - Hyperparameter optimization for BernoulliNB completed.
2024-09-01 11:34:24,459 - INFO - Starting hyperparameter optimization for SVC...
Optimizing SVC: 100%|██████████| 50/50 [9:01:47<00:00, 650.16s/it]    
2024-09-01 20:36:12,442 - INFO - Hyperparameter optimization for SVC completed.


In [10]:
# Print a compact per-model summary for the resampled run: every entry of the result
# dict except the fitted estimator and the prediction arrays listed below.
skipped_keys = {
    'model',
    'predictions_train',
    'predictions_test',
    'predictions_test_adjusted',
    'predictions_train_proba',
    'predictions_test_proba',
}

for model_name, result in results.items():
    print(f"Model: {model_name}")
    for key, value in result.items():
        if key in skipped_keys:
            continue
        # Floats are shown with four decimals; anything else is printed as-is
        shown = f"{value:.4f}" if isinstance(value, float) else value
        label = key.replace('_', ' ').title()
        print(f"{label}: {shown}")
    print()  # Blank line between models

Model: Naive Bayes
Best Params: {'alpha': 9.685258460963704}
Roc Auc Cv: 0.8382
Roc Auc Train: 0.8384
Roc Auc Test: 0.8566
Threshold: 0.5778

Model: SVM
Best Params: {'C': 10.642227588416182}
Roc Auc Cv: 0.9707
Roc Auc Train: 0.9709
Roc Auc Test: 0.9538
Threshold: 0.6723



In [11]:
# Show the classification metrics and the train/test ROC curves for the models
# trained on the SMOTEENN-resampled training set.
for model_name, result in results.items():
    # NOTE(review): this cell scores the test set with 'predictions_test_adjusted', whereas the
    # un-resampled evaluation cell uses 'predictions_test'. Presumably the adjusted predictions
    # apply the tuned threshold - confirm in churn.modelling that the difference is intended.
    metrics = calculate_classification_metrics(
        train_res,
        test_features_binned,
        result['predictions_train'],
        result['predictions_test_adjusted'],
    )
    display_classification_results(metrics, model_name)

    # Draw and display ROC curves.
    # The train curve is drawn from the predicted probabilities, like the test curve below;
    # hard class labels would collapse the curve to a single operating point.
    fig_train = draw_roc_curve(
        train_res['churn'],
        result['predictions_train_proba'],
        f'Receiver Operating Characteristic - {model_name} (Train)',
    )
    display(HTML("<h3>ROC AUC (Train):</h3>"))
    pyo.iplot(fig_train)

    fig_test = draw_roc_curve(
        test_features_binned['churn'],
        result['predictions_test_proba'],
        f'Receiver Operating Characteristic - {model_name} (Test)',
    )
    display(HTML("<h3>ROC AUC (Test):</h3>"))
    pyo.iplot(fig_test)

    # Visual spacer between models
    display(HTML("<br>"))

Unnamed: 0,precision,recall,f1-score,support
0,0.99,0.58,0.73,5135.0
1,0.7,0.99,0.82,4984.0
accuracy,0.78,0.78,0.78,0.78
macro avg,0.84,0.79,0.78,10119.0
weighted avg,0.84,0.78,0.77,10119.0

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,2988,2147
Actual Positive,44,4940


Unnamed: 0,precision,recall,f1-score,support
0,0.99,0.61,0.75,1712.0
1,0.19,0.96,0.32,163.0
accuracy,0.64,0.64,0.64,0.64
macro avg,0.59,0.78,0.54,1875.0
weighted avg,0.92,0.64,0.72,1875.0

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,1042,670
Actual Positive,7,156


Unnamed: 0,precision,recall,f1-score,support
0,0.96,0.88,0.92,5135.0
1,0.89,0.96,0.92,4984.0
accuracy,0.92,0.92,0.92,0.92
macro avg,0.92,0.92,0.92,10119.0
weighted avg,0.92,0.92,0.92,10119.0

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,4522,613
Actual Positive,208,4776


Unnamed: 0,precision,recall,f1-score,support
0,0.98,0.93,0.95,1712.0
1,0.52,0.79,0.62,163.0
accuracy,0.92,0.92,0.92,0.92
macro avg,0.75,0.86,0.79,1875.0
weighted avg,0.94,0.92,0.93,1875.0

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,1593,119
Actual Positive,35,128


In [13]:
# Persist every fitted estimator from the current `results` as '<model name>_model_2.pkl'
# under MODELS_DIR.

# Create the target directory first so a fresh checkout does not fail with FileNotFoundError
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

for model_name, result in results.items():
    model = result['model']
    model_filename = Path(MODELS_DIR / f'{model_name}_model_2.pkl')
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)