In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from scipy.stats import randint, uniform
import math
import warnings
from imblearn.combine import SMOTEENN
from scipy.stats import chi2_contingency
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler,OrdinalEncoder, PowerTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Read the processed dataset from <parent of the working directory>/data/processed.
processed_csv = f'{Path.cwd().parent}/data/processed/processed_data.csv'
data_df = pd.read_csv(processed_csv)

In [None]:
# Preview the first ten rows, transposed so each column reads as a row.
data_df.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Unnamed: 0,0,1,2,3,4,5,6,7,8,9
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU,9305-CDSKC,1452-KIOVK,6713-OKOMC,7892-POOKP,6388-TABGU
gender,1,0,0,0,1,1,0,1,1,0
SeniorCitizen,1,1,1,1,1,1,1,1,1,1
Partner,1,0,0,0,0,0,0,0,1,0
Dependents,1,1,1,1,1,1,0,1,1,0
tenure,1,34,2,45,2,8,22,10,28,62
PhoneService,1,0,0,1,0,0,0,1,0,0
MultipleLines,No phone service,No,No,No phone service,No,Yes,Yes,No phone service,Yes,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic,Fiber optic,Fiber optic,DSL,Fiber optic,DSL


In [None]:
# Target vector, then the feature matrix: everything except the identifier,
# the target itself and the 'Unnamed: 0' column present in the CSV.
y = data_df['Churn']
X = data_df.drop(columns=['customerID', 'Churn', 'Unnamed: 0'])

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Columns handed to the one-hot encoder in the preprocessing step.
multi_class_var = [
    # phone / internet subscription
    'MultipleLines', 'InternetService',
    # internet add-ons
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    # account
    'Contract', 'PaymentMethod',
]

In [None]:
def validation_monitor(pred,test):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    pred : array-like
        Labels predicted by the model.
    test : array-like
        Ground-truth labels for the same samples.

    Returns
    -------
    tuple
        ``(accuracy, precision, f1, recall, roc_auc)`` in that order.
    """
    # scikit-learn metrics are declared as metric(y_true, y_pred). Passing the
    # arguments the other way round silently swaps precision with recall and
    # makes roc_auc_score treat the predictions as the ground truth.
    model_test_accuracy = accuracy_score(test, pred)
    model_test_precision = precision_score(test, pred)
    model_test_f1 = f1_score(test, pred)
    model_test_recall = recall_score(test, pred)
    # NOTE(review): this is ROC AUC of whatever `pred` holds; if callers pass
    # hard labels rather than scores/probabilities the value is a single
    # operating point, not a ranking metric — confirm against callers.
    model_test_rocauc_score = roc_auc_score(test, pred)

    return model_test_accuracy,model_test_precision,model_test_f1,model_test_recall,model_test_rocauc_score


In [None]:
# One-hot encode the multi-class categorical columns and keep every other
# feature. ColumnTransformer defaults to remainder='drop', which would discard
# all columns not listed in multi_class_var before they reach the model.
# NOTE(review): passthrough assumes the remaining columns of X are already
# numeric (the preview shows 0/1 flags and integer tenure) — confirm for the
# columns not visible in the preview.
preprocessor = ColumnTransformer(
    [
        # handle_unknown='ignore' encodes a category missing from the fitted
        # fold as all zeros instead of raising during cross-validation.
        ('onehotencoding', OneHotEncoder(handle_unknown='ignore'), multi_class_var)
    ],
    remainder='passthrough',
    # Always emit a dense array: a downstream StandardScaler that centres the
    # data cannot operate on sparse input.
    sparse_threshold=0,
)

In [None]:
# Empty container for model results; nothing in the cells shown here fills it.
model_output = dict()

In [None]:
# Candidate estimators for the search. Their class names are used as keys into
# param_distributions, so the set of classes must stay in sync with that dict.
# The split, the resampler and the search are all seeded with 42; seed the
# stochastic estimators as well so a re-run reproduces the same results.
# KNeighborsClassifier has no random_state parameter.
models = [
    LogisticRegression(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=42),
]

In [None]:
# Standardising transformer (zero mean, unit variance), defaults spelled out.
scalar = StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Hyper-parameter search spaces, keyed by estimator class name. Every key is
# prefixed with 'model__' so it targets the pipeline step named 'model'.
#
# Distribution conventions (scipy.stats):
#   uniform(loc, scale) samples from [loc, loc + scale]
#   randint(low, high)  samples integers from [low, high)
#
# A value may be a single dict or a list of dicts; RandomizedSearchCV accepts
# both and, for a list, first picks one dict at random and then samples from it.

# Settings shared by every LogisticRegression sub-space.
_logreg_common = {
    'model__C': uniform(0.001, 100),
    'model__class_weight': ['balanced', None],
    'model__max_iter': [100, 200, 500],
}

param_distributions = {

    # Split into two sub-spaces so only valid penalty/solver pairs are drawn:
    # 'liblinear' does not support 'elasticnet', and 'elasticnet' requires an
    # l1_ratio. Sampling them from one flat dict produces candidates whose
    # fits fail and waste search iterations.
    'LogisticRegression': [
        {
            **_logreg_common,
            'model__penalty': ['l1', 'l2'],
            'model__solver': ['liblinear', 'saga'],
        },
        {
            **_logreg_common,
            'model__penalty': ['elasticnet'],
            'model__solver': ['saga'],
            'model__l1_ratio': uniform(0, 1),
        },
    ],

    'RandomForestClassifier': {
        'model__n_estimators': randint(100, 500),
        'model__max_depth': [10, 20, 30, None],
        'model__min_samples_split': randint(2, 20),
        'model__min_samples_leaf': randint(1, 10),
        'model__max_features': ['sqrt', 'log2', None],
        'model__class_weight': ['balanced', 'balanced_subsample', None]
    },

    'AdaBoostClassifier': {
        'model__n_estimators': randint(50, 200),
        'model__learning_rate': uniform(0.01, 1.0)
    },

    'GradientBoostingClassifier': {
        'model__n_estimators': randint(100, 500),
        'model__learning_rate': uniform(0.01, 0.3),
        'model__max_depth': randint(3, 8),
        'model__min_samples_split': randint(2, 20),
        'model__min_samples_leaf': randint(1, 10),
        # uniform(0.6, 0.4) -> [0.6, 1.0], the upper bound subsample allows.
        'model__subsample': uniform(0.6, 0.4),
        'model__max_features': ['sqrt', 'log2', None]
    },

    'KNeighborsClassifier': {
        'model__n_neighbors': randint(3, 21),
        'model__weights': ['uniform', 'distance'],
        'model__metric': ['euclidean', 'manhattan', 'minkowski'],
        # p only has an effect when metric is 'minkowski'.
        'model__p': [1, 2],
        'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },

    'DecisionTreeClassifier': {
        'model__max_depth': [5, 10, 15, 20, None],
        'model__min_samples_split': randint(2, 50),
        'model__min_samples_leaf': randint(1, 20),
        'model__max_features': ['sqrt', 'log2', None],
        'model__criterion': ['gini', 'entropy'],
        'model__splitter': ['best', 'random'],
        'model__class_weight': ['balanced', None]
    }
}

In [None]:
# Randomised hyper-parameter search over every candidate estimator, keeping
# the pipeline with the highest mean cross-validated F1.
#
# All result names are initialised up front so the summary below cannot hit a
# NameError when no search produces a usable score.
best_overall_f1 = -1
best_overall_pipeline = None
best_overall_name = ""
best_model_overall = None
best_model_parameter = None

for model in models:

    # Class name doubles as the key into param_distributions.
    model_name = model.__class__.__name__

    # Scale before resampling: SMOTEENN relies on nearest-neighbour distances,
    # which are dominated by large-valued columns when features are unscaled.
    # Step names are unchanged, so the 'model__*' search keys still resolve.
    pipeline = ImbPipeline([
        ('pre', preprocessor),
        ('scalar', scalar),
        ('smote', SMOTEENN(random_state=42)),
        ('model', model)
    ])

    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions[model_name],
        n_iter=50,
        cv=5,
        scoring='f1',
        n_jobs=-1,
        random_state=42
    )

    search.fit(X_train, y_train)

    # A NaN best score (every candidate failed) compares False here and is
    # therefore skipped rather than recorded.
    if search.best_score_ > best_overall_f1:
        best_overall_f1 = search.best_score_
        best_overall_name = model_name
        best_overall_pipeline = search.best_estimator_
        best_model_overall = best_overall_pipeline
        best_model_parameter = search.best_params_

print('---------------Best Model Attributes----------------')
if best_model_overall is None:
    print('No model produced a valid cross-validated f1 score.')
else:
    print(f'Best Model Name : {best_overall_name}')
    print(f'Best Model : {best_model_overall}')
    print(f'Best Params : {best_model_parameter}')
    print(f'CV f1 : {best_overall_f1}')





ValueError: Invalid parameter 'algorithm' for estimator AdaBoostClassifier(). Valid parameters are: ['estimator', 'learning_rate', 'n_estimators', 'random_state'].