In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../dataset_comp/dataset_task1.csv')

In [3]:
df.head(10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [4]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
df.isnull().sum()

# as we see that the total number of null data is zero so we can say that we will not need to delete or handle missing values

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
df.info()
# understanding the features and their datatypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
for i in df.columns:
    print(f"{i}: {df[i].nunique()}")

customerID: 7043
gender: 2
SeniorCitizen: 2
Partner: 2
Dependents: 2
tenure: 73
PhoneService: 2
MultipleLines: 3
InternetService: 3
OnlineSecurity: 3
OnlineBackup: 3
DeviceProtection: 3
TechSupport: 3
StreamingTV: 3
StreamingMovies: 3
Contract: 3
PaperlessBilling: 2
PaymentMethod: 4
MonthlyCharges: 1585
TotalCharges: 6531
Churn: 2


In [35]:
df.drop(columns=['customerID'], inplace=True)

In [39]:
# above we saw that total charges is somehow an object, so we will convert it to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [43]:
# as there are some missing values we will fill them using an observation of monthly_charges * tenure = total_charges
df['TotalCharges'].fillna(df['tenure'] * df['MonthlyCharges'], inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [46]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# we will first divide it into categorical and numerical features
categorical_cols = ['gender','Partner','Dependents','PhoneService',
                    'MultipleLines','InternetService','OnlineSecurity','OnlineBackup',
                    'DeviceProtection','TechSupport','StreamingTV','StreamingMovies',
                    'Contract','PaperlessBilling','PaymentMethod']

numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']




In [47]:
# we will use chi square test on each categorical column and find if there is a relation between it and our target churn
from scipy.stats import chi2_contingency
for col in categorical_cols:
    contingency = pd.crosstab(df[col], df['Churn'])
    chi2, p, dof, ex = chi2_contingency(contingency)
    print(f"{col}: p-value = {p}")

gender: p-value = 0.48657873605618596
Partner: p-value = 2.1399113440759935e-36
Dependents: p-value = 4.9249216612154196e-43
PhoneService: p-value = 0.3387825358066928
MultipleLines: p-value = 0.0034643829548773
InternetService: p-value = 9.571788222840544e-160
OnlineSecurity: p-value = 2.661149635176552e-185
OnlineBackup: p-value = 2.0797592160864276e-131
DeviceProtection: p-value = 5.505219496457244e-122
TechSupport: p-value = 1.4430840279998987e-180
StreamingTV: p-value = 5.528994485739183e-82
StreamingMovies: p-value = 2.667756755723681e-82
Contract: p-value = 5.863038300673391e-258
PaperlessBilling: p-value = 4.073354668665985e-58
PaymentMethod: p-value = 3.6823546520097993e-140


In [None]:
#converting the yes and no into 1's and 0's
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [None]:

x = df.drop(columns=['Churn'])
y = df['Churn']

In [70]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# models = [
#     ('LogisticRegression', LogisticRegression(max_iter=500)),
#     ('RandomForest', RandomForestClassifier(n_estimators=100)),
#     ('GradientBoosting', GradientBoostingClassifier(n_estimators=100)),
#     ('LDA', LinearDiscriminantAnalysis()),
#     ('QDA', QuadraticDiscriminantAnalysis()),
#     ('KNN', KNeighborsClassifier()),
#     ('SVM', SVC())
# ]

# Define transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
    )

# # Full pipeline with SMOTE
# for name, model in models:
#     clf = Pipeline(steps=[
#         ('preprocessor', preprocessor),  # your ColumnTransformer
#         ('smote', SMOTE(random_state=42)),
#         ('model', model)
#     ])
#     clf.fit(x_train, y_train)
#     y_pred = clf.predict(x_test)
#     print(f"\n{name} Results:")
#     print(classification_report(y_test, y_pred))




LogisticRegression Results:
              precision    recall  f1-score   support

           0       0.93      0.73      0.82      1036
           1       0.53      0.84      0.65       373

    accuracy                           0.76      1409
   macro avg       0.73      0.78      0.73      1409
weighted avg       0.82      0.76      0.77      1409


RandomForest Results:
              precision    recall  f1-score   support

           0       0.85      0.86      0.85      1036
           1       0.60      0.56      0.58       373

    accuracy                           0.78      1409
   macro avg       0.72      0.71      0.72      1409
weighted avg       0.78      0.78      0.78      1409


GradientBoosting Results:
              precision    recall  f1-score   support

           0       0.89      0.83      0.86      1036
           1       0.60      0.72      0.66       373

    accuracy                           0.80      1409
   macro avg       0.75      0.77      0.76      


The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.


The covariance matrix of class 1 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.




QDA Results:
              precision    recall  f1-score   support

           0       0.91      0.52      0.66      1036
           1       0.39      0.85      0.54       373

    accuracy                           0.61      1409
   macro avg       0.65      0.69      0.60      1409
weighted avg       0.77      0.61      0.63      1409


KNN Results:
              precision    recall  f1-score   support

           0       0.89      0.69      0.78      1036
           1       0.47      0.76      0.58       373

    accuracy                           0.71      1409
   macro avg       0.68      0.73      0.68      1409
weighted avg       0.78      0.71      0.72      1409


SVM Results:
              precision    recall  f1-score   support

           0       0.89      0.77      0.83      1036
           1       0.54      0.75      0.63       373

    accuracy                           0.77      1409
   macro avg       0.72      0.76      0.73      1409
weighted avg       0.80      0.7

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
# Pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42))
])

# Parameter grid
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [5, 10],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2, 4]
}

# GridSearch
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Test and eval
print("Best params:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)
print("\nClassification Report on Test Set:")
y_pred = grid_search.predict(x_test)
print(classification_report(y_test, y_pred))
y_proba = grid_search.predict_proba(x_test)[:, 1]
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba))


Best params: {'model__max_depth': 5, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 100}
Best ROC-AUC: 0.837690615631443

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.91      0.75      0.82      1036
           1       0.53      0.80      0.64       373

    accuracy                           0.76      1409
   macro avg       0.72      0.77      0.73      1409
weighted avg       0.81      0.76      0.77      1409

Test ROC-AUC: 0.8533361971699773


In [76]:
from sklearn.linear_model import LogisticRegression

# Pipeline
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', LogisticRegression(max_iter=500, random_state=42))
])

# Parameter grid
param_grid_lr = {
    'model__C': [0.01, 0.1, 1, 10],
    'model__penalty': ['l2'],
    'model__solver': ['lbfgs']
}

# GridSearch
grid_search_lr = GridSearchCV(lr_pipeline, param_grid_lr, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search_lr.fit(x_train, y_train)

print("Best params (LR):", grid_search_lr.best_params_)
print("Best ROC-AUC (CV):", grid_search_lr.best_score_)

# Test evaluation
y_pred = grid_search_lr.predict(x_test)
y_proba = grid_search_lr.predict_proba(x_test)[:, 1]

print("\nClassification Report (LR):")
print(classification_report(y_test, y_pred))
print("Test ROC-AUC (LR):", roc_auc_score(y_test, y_proba))


Best params (LR): {'model__C': 10, 'model__penalty': 'l2', 'model__solver': 'lbfgs'}
Best ROC-AUC (CV): 0.8399446607654264

Classification Report (LR):
              precision    recall  f1-score   support

           0       0.92      0.72      0.81      1036
           1       0.52      0.83      0.64       373

    accuracy                           0.75      1409
   macro avg       0.72      0.78      0.73      1409
weighted avg       0.82      0.75      0.77      1409

Test ROC-AUC (LR): 0.8609171695632821


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# model_pipeline
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', KNeighborsClassifier())
])

#param grid for the model
param_grid_knn = {
    'model__n_neighbors': [3, 5, 7, 9],
    'model__weights': ['uniform', 'distance'],
    'model__p': [1, 2]  # Manhattan (1) or Euclidean (2)
}

#implementing grid search cv
grid_search_knn = GridSearchCV(knn_pipeline, param_grid_knn, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search_knn.fit(x_train, y_train)


# Tests and evals
print("Best params (KNN):", grid_search_knn.best_params_)
print("Best ROC-AUC (CV):", grid_search_knn.best_score_)

y_pred = grid_search_knn.predict(x_test)
y_proba = grid_search_knn.predict_proba(x_test)[:, 1]

print("\nClassification Report (KNN):")
print(classification_report(y_test, y_pred))
print("Test ROC-AUC (KNN):", roc_auc_score(y_test, y_proba))


Best params (KNN): {'model__n_neighbors': 9, 'model__p': 1, 'model__weights': 'uniform'}
Best ROC-AUC (CV): 0.7913289385935798

Classification Report (KNN):
              precision    recall  f1-score   support

           0       0.89      0.73      0.80      1036
           1       0.50      0.76      0.61       373

    accuracy                           0.74      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.79      0.74      0.75      1409

Test ROC-AUC (KNN): 0.816267195958885


In [78]:
from sklearn.svm import SVC

svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', SVC(probability=True, random_state=42))
])

param_grid_svm = {
    'model__C': [0.1, 1, 10],
    'model__kernel': ['linear', 'rbf'],
    'model__gamma': ['scale', 'auto']
}

grid_search_svm = GridSearchCV(svm_pipeline, param_grid_svm, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search_svm.fit(x_train, y_train)

print("Best params (SVM):", grid_search_svm.best_params_)
print("Best ROC-AUC (CV):", grid_search_svm.best_score_)

y_pred = grid_search_svm.predict(x_test)
y_proba = grid_search_svm.predict_proba(x_test)[:, 1]

print("\nClassification Report (SVM):")
print(classification_report(y_test, y_pred))
print("Test ROC-AUC (SVM):", roc_auc_score(y_test, y_proba))


Best params (SVM): {'model__C': 10, 'model__gamma': 'scale', 'model__kernel': 'linear'}
Best ROC-AUC (CV): 0.8378233846488315

Classification Report (SVM):
              precision    recall  f1-score   support

           0       0.93      0.70      0.80      1036
           1       0.51      0.84      0.63       373

    accuracy                           0.74      1409
   macro avg       0.72      0.77      0.72      1409
weighted avg       0.82      0.74      0.76      1409

Test ROC-AUC (SVM): 0.8601576490316437


In [79]:
from xgboost import XGBClassifier

xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

param_grid_xgb = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__subsample': [0.7, 1]
}

grid_search_xgb = GridSearchCV(xgb_pipeline, param_grid_xgb, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search_xgb.fit(x_train, y_train)

print("Best params (XGBoost):", grid_search_xgb.best_params_)
print("Best ROC-AUC (CV):", grid_search_xgb.best_score_)

y_pred = grid_search_xgb.predict(x_test)
y_proba = grid_search_xgb.predict_proba(x_test)[:, 1]

print("\nClassification Report (XGBoost):")
print(classification_report(y_test, y_pred))
print("Test ROC-AUC (XGBoost):", roc_auc_score(y_test, y_proba))


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 942, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 308, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 400, in _score
    

Best params (XGBoost): {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__subsample': 0.7}
Best ROC-AUC (CV): nan

Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.92      0.71      0.80      1036
           1       0.50      0.83      0.63       373

    accuracy                           0.74      1409
   macro avg       0.71      0.77      0.71      1409
weighted avg       0.81      0.74      0.75      1409

Test ROC-AUC (XGBoost): 0.8465173331125073
