In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Import the data

The data is the Netflix Customer Churn Dataset. I loaded it into R, cleaned it, and created some visualizations. I then encoded the categorical variables and exported the cleaned CSV file. The corresponding files are available at the following link:

https://github.com/asadov-vasif/R-Projects/tree/main/Data%20Cleaning%201

In [2]:
# Load the cleaned, already-encoded dataset from the Kaggle input directory.
csv_path = '/kaggle/input/netflix-data/cleaned_netflix_data.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the raw (unscaled) frame.
df.head(n=5)

Unnamed: 0,age,watch_hours,last_login_days,monthly_fee,churned,number_of_profiles,avg_watch_time_per_day,genderFemale,genderMale,genderOther,...,payment_methodCrypto,payment_methodDebit Card,payment_methodGift Card,payment_methodPayPal,favorite_genreComedy,favorite_genreDocumentary,favorite_genreDrama,favorite_genreHorror,favorite_genreRomance,favorite_genreSci-Fi
0,51,14.73,29,8.99,1,1,0.49,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,47,0.7,19,13.99,1,5,0.03,0,0,1,...,0,0,1,0,0,0,0,0,0,1
2,27,16.32,10,13.99,0,2,1.48,1,0,0,...,1,0,0,0,0,0,1,0,0,0
3,53,4.51,12,17.99,1,2,0.35,0,0,1,...,1,0,0,0,0,0,0,1,0,0
4,56,1.89,13,13.99,1,2,0.13,0,0,1,...,1,0,0,0,0,0,0,0,0,0


## 2. Scale the numerical variables

In [4]:
# Continuous features that get standardized below; the remaining columns
# are left as they are.
numeric_cols = [
    'age',
    'watch_hours',
    'last_login_days',
    'monthly_fee',
    'number_of_profiles',
    'avg_watch_time_per_day',
]

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
# Standardizer (per-column centering and unit-variance scaling); the default
# options are spelled out for readability.
scaler = StandardScaler(with_mean=True, with_std=True)

In [7]:
# Standardize the numeric columns and write them back into the same frame.
# The scaler is fitted on the full dataset, i.e. before the train/test split
# that model_results performs later.
# NOTE(review): this lets test-row statistics influence the scaling (mild
# leakage); consider fitting the scaler on the training rows only.
scaled_numeric = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_numeric

In [8]:
# Preview the first five rows again to confirm the numeric columns are now scaled.
df.head(n=5)

Unnamed: 0,age,watch_hours,last_login_days,monthly_fee,churned,number_of_profiles,avg_watch_time_per_day,genderFemale,genderMale,genderOther,...,payment_methodCrypto,payment_methodDebit Card,payment_methodGift Card,payment_methodPayPal,favorite_genreComedy,favorite_genreDocumentary,favorite_genreDrama,favorite_genreHorror,favorite_genreRomance,favorite_genreSci-Fi
0,0.461471,0.256425,-0.062152,-1.271341,1,-1.429965,-0.146895,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,0.203399,-0.911432,-0.632462,0.083051,1,1.395494,-0.322497,0,0,1,...,0,0,1,0,0,0,0,0,0,1
2,-1.086959,0.388777,-1.145741,0.083051,0,-0.7236,0.231031,1,0,0,...,1,0,0,0,0,0,1,0,0,0
3,0.590506,-0.594288,-1.031679,1.166565,1,-0.7236,-0.200339,0,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0.78406,-0.812377,-0.974648,0.083051,1,-0.7236,-0.284322,0,0,1,...,1,0,0,0,0,0,0,0,0,0


## 3. Prepare data 

In [12]:
# Features: every column except the target.
X = df.drop(columns=['churned'])
# Target: the 'churned' column.
y = df['churned']

In [14]:
# Preview the feature matrix; 'churned' should no longer be among the columns.
X.head(n=5)

Unnamed: 0,age,watch_hours,last_login_days,monthly_fee,number_of_profiles,avg_watch_time_per_day,genderFemale,genderMale,genderOther,subscription_typePremium,...,payment_methodCrypto,payment_methodDebit Card,payment_methodGift Card,payment_methodPayPal,favorite_genreComedy,favorite_genreDocumentary,favorite_genreDrama,favorite_genreHorror,favorite_genreRomance,favorite_genreSci-Fi
0,0.461471,0.256425,-0.062152,-1.271341,-1.429965,-0.146895,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,0.203399,-0.911432,-0.632462,0.083051,1.395494,-0.322497,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
2,-1.086959,0.388777,-1.145741,0.083051,-0.7236,0.231031,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,0.590506,-0.594288,-1.031679,1.166565,-0.7236,-0.200339,0,0,1,1,...,1,0,0,0,0,0,0,1,0,0
4,0.78406,-0.812377,-0.974648,0.083051,-0.7236,-0.284322,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0


In [15]:
# Display the target series (pandas truncates the middle rows in the output).
y

0       1
1       1
2       0
3       1
4       1
       ..
4995    0
4996    0
4997    1
4998    0
4999    1
Name: churned, Length: 5000, dtype: int64

## 4. Classification Models

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                    f1_score, roc_auc_score, confusion_matrix)


In [17]:
def model_results(X, y, models, modelnames, test_size=0.3, random_state=42,
                  stratify=None):
    """Fit several classifiers on one shared hold-out split and tabulate test metrics.

    Every model is trained on the same training partition and scored on the
    same test partition, so the resulting rows are directly comparable.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` and by the estimators.
    y : target aligned with ``X`` (binary; the metrics use their default
        binary averaging).
    models : sequence of unfitted estimators exposing ``fit`` and ``predict``
        and either ``predict_proba`` or ``decision_function``.
    modelnames : sequence of unique labels, one per entry of ``models``, in
        the same order.
    test_size : passed to ``train_test_split``; default 0.3.
    random_state : passed to ``train_test_split``; default 42.
    stratify : passed to ``train_test_split``; default ``None`` (no
        stratification). Pass ``y`` to keep class proportions equal in both
        partitions.

    Returns
    -------
    tuple
        ``(metrics_df, conf_matrices)`` where ``metrics_df`` has one row per
        model and the columns Accuracy, Precision, Recall, F1-Score, AUC ROC
        and Gini, and ``conf_matrices`` maps each model name to its
        test-set confusion matrix.

    Raises
    ------
    ValueError
        If ``models`` and ``modelnames`` differ in length, or if
        ``modelnames`` contains duplicates.
    """
    models = list(models)
    modelnames = list(modelnames)

    # zip() would silently drop the surplus entries of the longer list.
    if len(models) != len(modelnames):
        raise ValueError(
            f"models and modelnames must have the same length, "
            f"got {len(models)} and {len(modelnames)}"
        )
    # Results are stored in dicts keyed by name, so a repeated name would
    # silently overwrite an earlier model's results.
    if len(set(modelnames)) != len(modelnames):
        raise ValueError("modelnames must be unique")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state,
        shuffle=True, stratify=stratify)

    metrics = {}
    conf_matrices = {}

    for model, modelname in zip(models, modelnames):

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # ROC AUC needs a continuous score: use the second column of
        # predict_proba when available, otherwise the decision function.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        else:
            y_score = model.decision_function(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        # zero_division=0 on all three ratio metrics: an undefined ratio is
        # reported as 0 without emitting a warning for some metrics only.
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        roc_auc = roc_auc_score(y_test, y_score)
        gini = 2 * roc_auc - 1  # Gini coefficient derived from the AUC
        conf_mat = confusion_matrix(y_test, y_pred)

        metrics[modelname] = [
            accuracy, precision, recall, f1, roc_auc, gini
        ]
        conf_matrices[modelname] = conf_mat

    metrics_df = pd.DataFrame(
        metrics,
        index=["Accuracy", "Precision", "Recall", "F1-Score", "AUC ROC", "Gini"]
    )

    # Transpose so that each model is a row and each metric a column.
    return metrics_df.T, conf_matrices

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [20]:
# Candidate classifiers, each kept next to its display name so the two
# parallel lists consumed by model_results cannot drift out of order.
named_models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000, solver='lbfgs')),
    ("Naive Bayes", GaussianNB()),
    ("KNN", KNeighborsClassifier()),
    # probability=True makes SVC produce probability estimates through an
    # internal randomized procedure; seed it like the other stochastic models
    # so the probability-based metrics are reproducible between runs.
    ("SVM", SVC(probability=True, kernel='rbf', random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Extra Trees", ExtraTreesClassifier(random_state=42)),
    # NOTE(review): use_label_encoder is reported as deprecated in recent
    # XGBoost releases - confirm against the installed version.
    ("XGBoost", XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                              random_state=42)),
    ("LightGBM", LGBMClassifier(random_state=42)),
]

models = [estimator for _, estimator in named_models]
modelnames = [label for label, _ in named_models]

In [21]:
# Train and evaluate every candidate on the shared hold-out split.
metrics_df, conf_matrices = model_results(
    X=X,
    y=y,
    models=models,
    modelnames=modelnames,
)

[LightGBM] [Info] Number of positive: 1768, number of negative: 1732
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 684
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505143 -> initscore=0.020572
[LightGBM] [Info] Start training from score 0.020572


In [27]:
# Display the per-model metrics table (one row per model, one column per metric).
metrics_df

Unnamed: 0,Accuracy,Precision,Recall,F1-Score,AUC ROC,Gini
Logistic Regression,0.902667,0.889754,0.91834,0.903821,0.968195,0.93639
Naive Bayes,0.816667,0.73506,0.987952,0.842947,0.946036,0.892073
KNN,0.833333,0.81982,0.852744,0.835958,0.915634,0.831268
SVM,0.916,0.903771,0.930388,0.916887,0.975201,0.950403
Decision Tree,0.974,0.975806,0.971888,0.973843,0.973992,0.947983
Random Forest,0.974667,0.979702,0.96921,0.974428,0.997676,0.995353
Extra Trees,0.937333,0.93245,0.942436,0.937417,0.985183,0.970366
XGBoost,0.994667,0.994645,0.994645,0.994645,0.999879,0.999758
LightGBM,0.994,0.994638,0.993307,0.993972,0.999831,0.999662


In [22]:
# Write the metrics table to disk; the index holds the model names, so it is kept.
metrics_df.to_csv(path_or_buf="classification_metrics.csv", index=True)

Collect the confusion matrices so they can be exported as a CSV file

In [24]:
# Flatten each confusion matrix into one record per model. The four-way
# unpacking only succeeds for a binary (2x2) matrix.
conf_matrix_list = [
    {
        "Model": label,
        "True Negative": tn,
        "False Positive": fp,
        "False Negative": fn,
        "True Positive": tp,
    }
    for label, (tn, fp, fn, tp) in (
        (key, matrix.ravel()) for key, matrix in conf_matrices.items()
    )
]

In [25]:
# One row per model, one column per confusion-matrix cell.
conf_matrix_df = pd.DataFrame(data=conf_matrix_list)

In [28]:
# Display the flattened confusion-matrix counts for every model.
conf_matrix_df

Unnamed: 0,Model,True Negative,False Positive,False Negative,True Positive
0,Logistic Regression,668,85,61,686
1,Naive Bayes,487,266,9,738
2,KNN,613,140,110,637
3,SVM,679,74,52,695
4,Decision Tree,735,18,21,726
5,Random Forest,738,15,23,724
6,Extra Trees,702,51,43,704
7,XGBoost,749,4,4,743
8,LightGBM,749,4,5,742


In [26]:
# Write the confusion-matrix table to disk; the model name is a regular
# column, so the positional index is omitted.
conf_matrix_df.to_csv(path_or_buf="confusion_matrices.csv", index=False)
