In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1. Load the dataset from URL
# The primary address uses the "refs/heads/<branch>" form of the raw GitHub path.
# If reading it fails for any reason, retry once with the shorter "<branch>" form.
url = "https://raw.githubusercontent.com/akay6483/mmml-ecommerce/refs/heads/main/dataset/data_ecommerce_customer_churn.csv"

try:
    df = pd.read_csv(url)
except Exception as e:
    print(f"Could not read from provided URL ({e}). Trying standard raw format...")
    # Same repository path without the "refs/heads/" segment. A failure of this
    # second read is not caught and propagates to the caller.
    url_backup = "https://raw.githubusercontent.com/akay6483/mmml-ecommerce/main/dataset/data_ecommerce_customer_churn.csv"
    df = pd.read_csv(url_backup)
    print("Successfully read from backup URL.")
else:
    # Reached only when the primary read succeeded.
    print("Successfully read from provided URL.")

# 2. Preprocessing Setup
# Separate the predictors from the 'Churn' target column.
X = df.drop('Churn', axis=1)
y = df['Churn']

# Identify numeric and categorical columns.
# 'number' matches every numeric width (int32, float32, ...), not just the
# 64-bit ones, and 'category' is selected alongside plain object columns.
# Columns matched by neither selector are not passed to any transformer and are
# therefore dropped by the ColumnTransformer (its default remainder is 'drop').
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Transformers
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are encoded as all zeros, not an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# OneHotEncoder produces a sparse matrix by default, and ColumnTransformer keeps
# the stacked result sparse whenever its density falls below sparse_threshold.
# Setting the threshold to 0 always yields a dense array, so estimators that
# reject sparse input (e.g. GaussianNB) work regardless of how many dummy
# columns the categorical features expand into.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# 3. Split the data
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# Stratifying on the target keeps the class ratio the same in both partitions,
# so precision/recall are not skewed by an unlucky draw of either class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Define Models
# Maps a display name to an unfitted estimator; insertion order is the order in
# which the models are later trained.
models = {}

# Estimators left entirely on their default settings.
models["K-Nearest Neighbour"] = KNeighborsClassifier()

# Iteration limit set explicitly to 1000.
models["Logistic Regression"] = LogisticRegression(max_iter=1000)

# Seeded estimators.
models["Decision Trees"] = DecisionTreeClassifier(random_state=42)
models["Support Vector Machines"] = SVC(random_state=42)

models["Naive Bayes"] = GaussianNB()

# 5. Train and Evaluate
# Collects one row of test-set metrics per model.
results = []
print("\nModel Performance:")

# Metrics that share the same call signature, listed in output-column order.
binary_metrics = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)

for name, model in models.items():
    # Preprocessing is a pipeline step, so it is fitted on the training split
    # only and then applied unchanged to the test split.
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])

    # fit() returns the pipeline itself, so prediction can be chained onto it.
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0 reports 0 instead of warning when a metric is undefined.
        **{
            label: scorer(y_test, y_pred, zero_division=0)
            for label, scorer in binary_metrics
        },
    })

# 6. Display Results
# Tabulate the per-model metrics, then rank from most to least accurate.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False)
# `display` is the rich-output helper provided by the notebook environment.
display(results_df)