In [10]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Import Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
import joblib

In [11]:
# Load the cleaned churn dataset from disk and preview its first five rows
data = pd.read_csv(filepath_or_buffer="cleaned_data.csv")
data.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [12]:
# Work on an independent (deep) copy so the loaded `data` frame stays untouched
df = data.copy(deep=True)
# Summarise the column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   object 
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [13]:
# Encode the target column as integers: "Yes" -> 1, "No" -> 0.
# The mapping reads from the untouched `data` frame rather than from `df`
# itself, so re-running this cell yields the same result instead of mapping
# the already-encoded 0/1 values to NaN.
df["Churn"] = data["Churn"].map({"Yes": 1, "No": 0})
# Any label other than "Yes"/"No" becomes NaN after .map(); fail here with a
# clear message rather than later inside model fitting.
assert df["Churn"].notna().all(), "Unexpected label found in the Churn column"
df["Churn"].value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [14]:
# Separate the feature matrix (every column except the target) from the target
X = df.drop(columns='Churn')
y = df['Churn']

In [15]:
# Partition the feature names by dtype: numeric columns vs. object columns
num_columns = list(X.select_dtypes(include='number').columns)
cat_columns = list(X.select_dtypes(include='object').columns)

print('The numeric columns are: \n', num_columns)
print('The categorical columns are: \n', cat_columns)

The numeric columns are: 
 ['tenure', 'MonthlyCharges', 'TotalCharges']
The categorical columns are: 
 ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [16]:
# Hold out 20% of the rows as a test set.
# The target is imbalanced (5174 zeros vs 1869 ones), so stratify on `y` to
# keep the same churn ratio in the train and test partitions; random_state
# pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [17]:
# Preprocessing
# Numeric features: fill missing values with the column median, then standardise
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode. sparse_output=False makes the encoder return a dense array;
# handle_unknown='ignore' keeps categories unseen during fit from raising.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_columns),
    ('cat', cat_transformer, cat_columns),
])

In [18]:
# Candidate models, keyed by the name reported in the results table.
# random_state is pinned on the estimators that use randomness so repeated
# runs of the notebook produce the same scores (matches the seed used for the
# train/test split).
model_dict = {
    'Logistic_Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True, random_state=42),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder = False, eval_metric = 'logloss', random_state=42)
}
# Hyperparameters
# Generic search space shared by all models; each model only picks up the keys
# that match one of its own constructor parameters. Keys must therefore be
# spelled exactly like the estimator parameter, otherwise they are silently
# dropped: the SVC parameter is `kernel` and the RBF kernel is named 'rbf'.
search_space ={
    'C': [0.1,1,10],
    'kernel': ['linear', 'rbf'],
    'n_estimators': [50,100,200],
    'max_depth': [None,5,10],
    'learning_rate': [0.5,1]
}

# Helper that narrows a shared search space down to one model
def filter_hyperparameter(model, space):
    """Return the entries of `space` whose key is a parameter of `model`.

    Parameters
    ----------
    model : object exposing ``get_params()`` that returns a mapping of
        parameter names.
    space : dict mapping hyperparameter names to their candidate values.

    Returns
    -------
    dict
        A new dict holding only the `space` items whose key appears in
        ``model.get_params()``; the order of `space` is preserved.
    """
    accepted = model.get_params()
    selected = {}
    for param_name, candidates in space.items():
        if param_name in accepted:
            selected[param_name] = candidates
    return selected

In [None]:
# Grid search for each model
# `result` collects one summary row per model; `best_pipeline` keeps the
# refitted best pipeline of each search, keyed by model name.
result = []
best_pipeline = {}

for name, model in model_dict.items():
    print(f'Tuning {model}...')
    # Preprocessing lives inside the pipeline, so it is re-fitted on each CV
    # training fold together with the model.
    pipe = Pipeline(steps = [
        ('processor', preprocessor),
        ('model', model)
    ])
    # Keep only the search-space entries this estimator accepts
    hyperparameter = filter_hyperparameter(model, search_space)
    # Prefix with the pipeline step name so GridSearchCV routes each
    # parameter to the 'model' step
    param_grid = {f'model__{k}':v for k,v in hyperparameter.items()}
    grid = GridSearchCV(estimator= pipe, param_grid=param_grid, cv =5, scoring= 'accuracy', n_jobs=-1)
    grid.fit(X_train,y_train)

    # Score the best configuration found by the search on the held-out test set
    y_pred = grid.predict(X_test)
    report = metrics.classification_report(y_test,y_pred, output_dict= True) # output_dict=True returns the report as a nested dict

    result.append({
        'model_name': name,
        # Record the winning parameter combination, not the whole grid searched
        'best_parameters': grid.best_params_,
        'accuracy': round(metrics.accuracy_score(y_test,y_pred), 4),
        'f1-score': round(report['weighted avg']['f1-score'], 4)
    })

    best_pipeline[name] = grid.best_estimator_

Tuning LogisticRegression(max_iter=1000)...
Tuning SVC(probability=True)...


In [None]:
# Build the comparison table, best test accuracy first
result_df = pd.DataFrame(data=result)
sorted_result_df = result_df.sort_values('accuracy', ascending=False)
print('Model Comparisons: \n', sorted_result_df)

# The top row of the sorted table is the winning model
best_row = sorted_result_df.iloc[0]
best_model = best_row.loc['model_name']
print('\nBest Model:', best_model)
print('\nBest Hyperparameter:', best_row.loc['best_parameters'])

In [None]:
# Refit the winning pipeline on the complete dataset (all rows of X and y)
final_pipeline = best_pipeline[best_model]
final_pipeline.fit(X, y)

# Persist the fitted pipeline to disk
joblib.dump(final_pipeline, filename='churnpipeline.pkl')
print('The best model is saved as: churnpipeline.pkl')