In [1]:

# === Common Imports ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

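# Set the default figure size in inches (width, height).
# Note: matplotlib's stock default is 6.4x4.8, so this is slightly smaller, not larger.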
plt.rcParams["figure.figsize"] = (6,4)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import (r2_score, mean_squared_error, mean_absolute_error,
                             accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, roc_curve, auc, jaccard_score)
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA

# Optional: SciPy for dendrogram (only used if available).
# SCIPY_OK records whether the import succeeded; presumably later cells check
# it before calling dendrogram/linkage -- verify against the clustering cells.
try:
    from scipy.cluster.hierarchy import dendrogram, linkage
    SCIPY_OK = True
# NOTE(review): this catches every Exception, not only ImportError -- confirm
# that treating any failure as "SciPy unavailable" is intended.
except Exception:
    SCIPY_OK = False

import os

# Shared seed: passed to np.random.seed and train_test_split in the cell below.
RANDOM_STATE = 42

# === Small helpers ===
def ensure_csv_or_make(df, fname):
    """Load ``fname`` as a CSV, creating it from ``df`` first if it is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Fallback data written to ``fname`` (without its index) when the file
        does not exist. Ignored when the file is already present.
    fname : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Always the result of ``pd.read_csv(fname)``, so the caller receives
        the same frame (fresh RangeIndex, CSV-inferred dtypes) whether the
        file was just created or already existed.
    """
    if not os.path.exists(fname):
        df.to_csv(fname, index=False)
    # Read back even right after writing: returning ``df`` itself would give
    # the caller a different index/dtypes on the first run than on later runs.
    return pd.read_csv(fname)

def plot_actual_vs_pred(y_true, y_pred, title="Actual vs Predicted"):
    """Scatter predicted values against actual values with a y = x reference line.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions, same number of elements as ``y_true``.
    title : str, optional
        Figure title.

    Returns
    -------
    None
        The figure is created and shown via ``plt.show()``.
    """
    # Coerce to flat arrays so plain lists, pandas objects and (n, 1) columns
    # all expose .min()/.max() the same way ndarray inputs do.
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    # Common range of both axes, used for the diagonal "perfect prediction" line.
    lo = min(actual.min(), predicted.min())
    hi = max(actual.max(), predicted.max())

    plt.figure()
    plt.scatter(actual, predicted, alpha=0.7)
    plt.plot([lo, hi], [lo, hi])
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title(title)
    plt.show()

def print_regression_metrics(y_true, y_pred):
    """Print R2, MAE and RMSE for a set of regression predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions, aligned with ``y_true``.

    Returns
    -------
    None
        The three metrics are written to stdout, one per line.
    """
    print("R2:", r2_score(y_true, y_pred))
    print("MAE:", mean_absolute_error(y_true, y_pred))
    # Take the root explicitly: the ``squared=False`` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, where passing it raises TypeError.
    # For single-output targets this is the same value on every version.
    print("RMSE:", mean_squared_error(y_true, y_pred) ** 0.5)

def plot_confusion_matrix_basic(cm, class_names=None, title="Confusion Matrix"):
    """Draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are labelled as true classes and columns as
        predicted classes. Integer counts and float (e.g. normalized)
        matrices are both accepted.
    class_names : sequence of str, optional
        Tick labels for both axes. Defaults to "0", "1", ... for each row.
    title : str, optional
        Figure title.

    Returns
    -------
    None
        The figure is created and shown via ``plt.show()``.
    """
    # Accept nested lists as well as ndarrays.
    cm = np.asarray(cm)
    n_rows, n_cols = cm.shape
    if class_names is None:
        class_names = [str(i) for i in range(n_rows)]

    # The 'd' format code only accepts integers; fall back to two decimals so
    # float matrices do not raise ValueError when annotated.
    cell_fmt = 'd' if cm.dtype.kind in 'iu' else '.2f'
    # Cells above half the maximum get white text for contrast.
    thresh = cm.max() / 2.

    plt.figure()
    plt.imshow(cm, interpolation='nearest')
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(n_rows)
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)
    for i in range(n_rows):
        for j in range(n_cols):
            plt.text(j, i, format(cm[i, j], cell_fmt),
                     ha="center", va="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()


In [2]:

# --- Load data (or build a stand-in) ---
fname = "pima-indians-diabetes.data.csv"
if not os.path.exists(fname):
    # No local CSV: fabricate a reproducible synthetic table (768 rows, 8
    # non-negative features, binary "Outcome") so the cell still runs.
    # Say so explicitly -- the metrics below then describe synthetic data.
    print(f"'{fname}' not found - generating synthetic stand-in data.")
    np.random.seed(RANDOM_STATE)
    n = 768
    Xsyn = np.abs(np.random.randn(n, 8))*5
    w = np.array([0.3,-0.7,1.1,0.2,-0.5,0.8,-1.0,0.6])
    logits = Xsyn @ w + np.random.normal(0, 1, size=n)
    # Thresholding at the median yields two equally sized classes.
    ysyn = (logits > np.median(logits)).astype(int)
    df = pd.DataFrame(Xsyn, columns=[f"f{i}" for i in range(8)])
    df["Outcome"] = ysyn
    df = ensure_csv_or_make(df, fname)

# Always load from disk so both code paths see CSV-parsed data.
# NOTE(review): read_csv treats the first row as a header; if a header-less
# copy of this dataset is present, its first record becomes the column names
# and the last column is used as the target -- confirm the file has a header.
data = pd.read_csv(fname)
ycol = "Outcome" if "Outcome" in data.columns else data.columns[-1]
X = data.drop(columns=[ycol]).values
y = data[ycol].values.astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y)

# --- Gaussian Naive Bayes on the raw continuous features ---
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred_g = gnb.predict(X_test)
print("GaussianNB – Accuracy:", accuracy_score(y_test, pred_g))
print("GaussianNB – F1:", f1_score(y_test, pred_g))

# --- Bernoulli Naive Bayes on binarized features ---
# Binarize each feature at its training-set median; the same thresholds are
# applied to the test set so no test information leaks into preprocessing.
train_median = np.median(X_train, axis=0)
Xb_train = (X_train > train_median).astype(int)
Xb_test = (X_test > train_median).astype(int)
bnb = BernoulliNB()
bnb.fit(Xb_train, y_train)
pred_b = bnb.predict(Xb_test)
print("BernoulliNB – Accuracy:", accuracy_score(y_test, pred_b))
print("BernoulliNB – F1:", f1_score(y_test, pred_b))


GaussianNB – Accuracy: 0.9010416666666666
GaussianNB – F1: 0.9025641025641026
BernoulliNB – Accuracy: 0.7708333333333334
BernoulliNB – F1: 0.7684210526315789
