In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Import the data

The source data is the Netflix Customer Churn dataset. I loaded it into R for data cleaning and visualization, encoded the categorical variables, and exported the cleaned CSV file. The corresponding files are available at the following link:

https://github.com/asadov-vasif/R-Projects/tree/main/Data%20Cleaning%201

In [None]:
# Load the cleaned, already-encoded churn data exported from the R cleaning step.
DATA_PATH = '/kaggle/input/netflix-data/cleaned_netflix_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

## 2. Scale the numerical variables

In [None]:
# Numeric feature columns that get standardized; all other columns are left as-is.
numeric_cols = [
    'age',
    'watch_hours',
    'last_login_days',
    'monthly_fee',
    'number_of_profiles',
    'avg_watch_time_per_day',
]

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler instance used below to standardize the columns listed in numeric_cols.
scaler = StandardScaler()

In [None]:
# Standardize the numeric columns and write the result back into the frame.
# NOTE(review): the scaler is fit on the full dataset, i.e. before the train/test
# split that model_results performs, so test-set statistics influence the scaling.
scaled_numeric = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_numeric

In [None]:
# Preview the frame again to confirm the numeric columns were rescaled.
df.head()

## 3. Prepare data 

In [None]:
# Separate the feature matrix from the target column.
TARGET_COL = 'churned'
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

In [None]:
# Preview the feature matrix (the 'churned' column has been dropped).
X.head()

In [None]:
# Display the target series (the 'churned' column).
y

## 4. Classification Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                    f1_score, roc_auc_score, confusion_matrix)


In [None]:
def _positive_class_scores(model, X_test):
    """Return continuous scores for the positive class, or None if unavailable.

    Prefers ``predict_proba`` (second column); falls back to
    ``decision_function`` for estimators that only expose a margin.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_test)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X_test)
    return None


def model_results(X, y, models, modelnames, test_size=0.3, random_state=42):
    """Fit each model on a shared train/test split and collect test-set metrics.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` and the estimators.
    y : binary target aligned with ``X``.
    models : iterable of unfitted estimators; each one is fitted in place.
    modelnames : iterable of unique display names, one per model, same order.
    test_size : fraction (or count) of samples held out for evaluation.
    random_state : seed for the shuffled split, so every model sees the same split.

    Returns
    -------
    (metrics_df, conf_matrices)
        ``metrics_df`` has one row per model and the columns Accuracy,
        Precision, Recall, F1-Score, AUC ROC and Gini.
        ``conf_matrices`` maps each model name to its confusion matrix.

    Raises
    ------
    ValueError
        If ``models`` and ``modelnames`` differ in length, or a name repeats.

    Notes
    -----
    Any preprocessing fitted on the full ``X`` before calling this function
    (for example scaling) has already seen the held-out rows.
    """
    models = list(models)
    modelnames = list(modelnames)

    # zip() would silently drop the surplus entries, hiding a mislabeled model.
    if len(models) != len(modelnames):
        raise ValueError(
            f"models and modelnames must have the same length "
            f"(got {len(models)} and {len(modelnames)})"
        )
    # Results are keyed by name, so a repeated name would overwrite earlier results.
    if len(set(modelnames)) != len(modelnames):
        raise ValueError("modelnames must be unique")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=True)

    metric_names = ["Accuracy", "Precision", "Recall", "F1-Score", "AUC ROC", "Gini"]
    metrics = {}
    conf_matrices = {}

    for model, modelname in zip(models, modelnames):
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_score = _positive_class_scores(model, X_test)

        # AUC needs a continuous score; report NaN when the model offers none.
        roc_auc = roc_auc_score(y_test, y_score) if y_score is not None else float("nan")

        # zero_division=0 on all three keeps degenerate predictions (e.g. a model
        # that never predicts the positive class) from emitting warnings.
        metrics[modelname] = [
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, zero_division=0),
            recall_score(y_test, y_pred, zero_division=0),
            f1_score(y_test, y_pred, zero_division=0),
            roc_auc,
            2 * roc_auc - 1,  # Gini coefficient derived from AUC
        ]
        conf_matrices[modelname] = confusion_matrix(y_test, y_pred)

    metrics_df = pd.DataFrame(metrics, index=metric_names)

    # Transpose so each model is a row and each metric a column.
    return metrics_df.T, conf_matrices

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
# Candidate classifiers keyed by the display name used in the results tables.
# Keeping each name next to its estimator prevents the two lists from drifting apart.
classifier_registry = {
    "Logistic Regression": LogisticRegression(max_iter=1000, solver='lbfgs'),
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    # probability=True relies on an internal randomized cross-validation, so the
    # estimator needs its own seed for the probabilities (and AUC) to be reproducible.
    "SVM": SVC(probability=True, kernel='rbf', random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
    # use_label_encoder is deprecated in XGBoost and only triggers a warning in
    # recent releases, so it is no longer passed.
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
}

# Parallel lists in matching order, as expected by model_results.
models = list(classifier_registry.values())
modelnames = list(classifier_registry.keys())

In [None]:
# Fit and evaluate every candidate model on the same hold-out split.
metrics_df, conf_matrices = model_results(
    X=X,
    y=y,
    models=models,
    modelnames=modelnames,
)

In [None]:
# Show the per-model metrics table (one row per model).
metrics_df

In [None]:
# Persist the metrics table; the index holds the model names, so it is written too.
metrics_path = "classification_metrics.csv"
metrics_df.to_csv(metrics_path, index=True)

Collect the confusion matrices into a single table so they can be exported as a CSV file.

In [None]:
# Flatten each binary confusion matrix into one record per model.
# The four-way unpack still fails loudly if a matrix is not 2x2.
conf_matrix_list = [
    {
        "Model": name,
        "True Negative": tn,
        "False Positive": fp,
        "False Negative": fn,
        "True Positive": tp,
    }
    for name, (tn, fp, fn, tp) in (
        (model_name, matrix.ravel()) for model_name, matrix in conf_matrices.items()
    )
]

In [None]:
# Build a table with one row per model from the flattened records.
conf_matrix_df = pd.DataFrame(conf_matrix_list)

In [None]:
# Display the confusion-matrix counts for every model.
conf_matrix_df

In [None]:
# Write the confusion-matrix table to disk without the positional index.
confusion_path = "confusion_matrices.csv"
conf_matrix_df.to_csv(confusion_path, index=False)
