In [3]:
# Imports
import numpy as np
import pandas as pd

from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from pathlib import Path

# Seeds for repeated runs.
# NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
RANDOM_SEEDS = [0, 1, 2]
# NOTE(review): presumably intended as an `n_jobs` value; the visible code
# passes n_jobs=-1 as a literal instead of using this constant — confirm.
N_JOBS = -1 

## Test


In [None]:
# ============================================================
# COGS 118A Final Project: Interpretable Models Across Datasets
# Datasets: Adult, Breast Cancer, SMS Spam, HAR, Digits
# Models: Logistic Regression, Decision Tree, KNN
# ============================================================

import numpy as np
import pandas as pd

from ucimlrepo import fetch_ucirepo

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

# ------------------------------------------------------------
# Global settings
# ------------------------------------------------------------

# Seed shared by the shuffle splitter, the decision tree and the SVD step.
RANDOM_STATE = 42

# Number of stratified shuffle splits per (dataset, train size).
# You can reduce to 1 or 2 if runtime is an issue.
N_TRIALS = 3

# Proportions of each dataset used for training.
TRAIN_SIZES = [0.2, 0.5, 0.8]

# Dataset keys, in the order the experiment loop visits them.
DATASET_NAMES = [
    "adult_income",
    "breast_cancer",
    "sms_spam",
    "har_smartphone",
    "digits",
]

# ------------------------------------------------------------
# Dataset loaders & binary label definitions
# ------------------------------------------------------------

def load_adult_income():
    """
    Fetch the Adult / Census Income dataset (UCI id=2) as a binary task.

    Returns:
        (X, y, info) where X is a copy of the feature table, y is a 0/1
        integer Series (1 when the label text contains '>50K', else 0) and
        info is a dict with "name" and "is_text".
    """
    dataset = fetch_ucirepo(id=2)
    features = dataset.data.features.copy()

    label_source = dataset.data.targets
    # Targets can come back as a DataFrame; if so, use its first column.
    if isinstance(label_source, pd.DataFrame):
        label_source = label_source.iloc[:, 0]
    label_text = label_source.astype(str)

    # Substring match: any label text containing '>50K' is the positive class.
    income_flag = label_text.str.contains('>50K').astype(int)

    return features, income_flag, {"name": "Adult Income", "is_text": False}


def load_breast_cancer():
    """
    Fetch Breast Cancer Wisconsin (Diagnostic) (UCI id=17) as a binary task.

    Returns:
        (X, y, info) where X is a copy of the feature table, y is a 0/1
        integer Series (1 when the label equals 'M', else 0) and info is a
        dict with "name" and "is_text".
    """
    dataset = fetch_ucirepo(id=17)
    features = dataset.data.features.copy()

    label_source = dataset.data.targets
    # Targets can come back as a DataFrame; if so, use its first column.
    if isinstance(label_source, pd.DataFrame):
        label_source = label_source.iloc[:, 0]
    label_text = label_source.astype(str)

    # Labels are typically 'M' and 'B'; only an exact 'M' maps to 1.
    malignant_flag = (label_text == 'M').astype(int)

    return features, malignant_flag, {
        "name": "Breast Cancer (Diagnostic)",
        "is_text": False,
    }


def load_sms_spam():
    """
    Fetch the SMS Spam Collection (UCI id=228) as a binary text task.

    Only the first feature column is used, cast to str.

    Returns:
        (texts, y, info) where texts is the Series of message strings, y is a
        0/1 integer Series (1 when the lower-cased label equals "spam") and
        info is a dict with "name", "is_text" and "text_series" (the same
        Series object that is returned as texts).
    """
    dataset = fetch_ucirepo(id=228)
    feature_table = dataset.data.features.copy()
    label_source = dataset.data.targets

    # The feature table is treated as holding the message text in column 0.
    messages = feature_table.iloc[:, 0].astype(str)

    # Targets can come back as a DataFrame; if so, use its first column.
    if isinstance(label_source, pd.DataFrame):
        label_source = label_source.iloc[:, 0]
    label_text = label_source.astype(str)

    spam_flag = (label_text.str.lower() == "spam").astype(int)

    metadata = {
        "name": "SMS Spam",
        "is_text": True,
        "text_series": messages,  # kept for convenience
    }
    # The raw text Series is returned in the X slot to keep types simple
    # for the experiment loop.
    return messages, spam_flag, metadata


def load_har():
    """
    Fetch Human Activity Recognition Using Smartphones (UCI id=240), binarised.

    The original task is 6-way activity classification; here it becomes:
        Positive (1): WALKING / WALKING_UPSTAIRS / WALKING_DOWNSTAIRS
        Negative (0): every other label (SITTING / STANDING / LAYING)

    Returns:
        (X, y, info) where X is a copy of the feature table, y is a 0/1
        integer Series and info is a dict with "name" and "is_text".
    """
    dataset = fetch_ucirepo(id=240)
    features = dataset.data.features.copy()

    label_source = dataset.data.targets
    # Targets can come back as a DataFrame; if so, use its first column.
    if isinstance(label_source, pd.DataFrame):
        label_source = label_source.iloc[:, 0]
    label_text = label_source.astype(str)

    # NOTE(review): matching is by exact label text. Confirm the fetched
    # targets really hold these strings (not numeric activity codes);
    # otherwise every row ends up as 0.
    moving_labels = ["WALKING", "WALKING_UPSTAIRS", "WALKING_DOWNSTAIRS"]
    moving_flag = label_text.isin(moving_labels).astype(int)

    return features, moving_flag, {
        "name": "HAR (Dynamic vs Static)",
        "is_text": False,
    }


def load_digits():
    """
    Fetch Optical Recognition of Handwritten Digits (UCI id=80), binarised.

    The original task has 10 classes (digits 0–9); here it becomes:
        Positive class (1): digits 5–9
        Negative class (0): digits 0–4

    Returns:
        (X, y, info) where X is a copy of the feature table, y is a 0/1
        integer Series and info is a dict with "name" and "is_text".
    """
    dataset = fetch_ucirepo(id=80)
    features = dataset.data.features.copy()

    label_source = dataset.data.targets
    # Targets can come back as a DataFrame; if so, use its first column.
    if isinstance(label_source, pd.DataFrame):
        label_source = label_source.iloc[:, 0]
    digit_values = label_source.astype(int)

    high_digit_flag = (digit_values >= 5).astype(int)

    return features, high_digit_flag, {
        "name": "Digits (5–9 vs 0–4)",
        "is_text": False,
    }


def load_dataset_by_name(name):
    """
    Dispatch a dataset key to its loader and return that loader's result.

    Args:
        name: One of "adult_income", "breast_cancer", "sms_spam",
            "har_smartphone" or "digits".

    Returns:
        The (X, y, info) tuple produced by the selected loader.

    Raises:
        ValueError: If `name` is not one of the keys listed above.
    """
    known_keys = (
        "adult_income",
        "breast_cancer",
        "sms_spam",
        "har_smartphone",
        "digits",
    )
    # Reject unknown keys up front so the error path never touches a loader
    # (and never tries to hash an arbitrary `name`).
    if name not in known_keys:
        raise ValueError(f"Unknown dataset name: {name}")

    loaders = {
        "adult_income": load_adult_income,
        "breast_cancer": load_breast_cancer,
        "sms_spam": load_sms_spam,
        "har_smartphone": load_har,
        "digits": load_digits,
    }
    return loaders[name]()


# ------------------------------------------------------------
# Model definitions & hyperparameter grids
# ------------------------------------------------------------

def get_models_and_grids():
    """
    Build the candidate classifiers and their hyperparameter grids.

    Returns:
        models: dict[name -> unfitted estimator instance]
        param_grids: dict[name -> param grid for GridSearchCV]

    Both dicts share the keys "log_reg", "decision_tree" and "knn".
    NOTE: every grid key uses the clf__ prefix, so the grids apply to any
    pipeline whose final step is named "clf", whatever the preprocessing.
    """
    # One entry per model: (key, estimator, grid). The two returned dicts are
    # derived from this list so their keys cannot drift apart.
    catalogue = [
        (
            "log_reg",
            LogisticRegression(max_iter=1000),
            {
                "clf__C": [0.01, 0.1, 1.0, 10.0],
                "clf__penalty": ["l2"],
                # liblinear: good for small/medium datasets
                "clf__solver": ["liblinear"],
            },
        ),
        (
            "decision_tree",
            DecisionTreeClassifier(random_state=RANDOM_STATE),
            {
                "clf__max_depth": [None, 5, 10, 20],
                "clf__min_samples_leaf": [1, 5, 10],
            },
        ),
        (
            "knn",
            KNeighborsClassifier(),
            {
                "clf__n_neighbors": [3, 5, 11],
                "clf__weights": ["uniform", "distance"],
            },
        ),
    ]

    models = {key: estimator for key, estimator, _ in catalogue}
    param_grids = {key: grid for key, _, grid in catalogue}
    return models, param_grids


# ------------------------------------------------------------
# Preprocessing builders
# ------------------------------------------------------------

def build_tabular_preprocessor(X_train):
    """
    Create the ColumnTransformer used for the tabular datasets.

    Columns are split by the dtypes of `X_train`: numeric dtypes go through
    median imputation and standard scaling; every other column goes through
    most-frequent imputation and one-hot encoding (categories unseen during
    fit are ignored). Columns in neither list are dropped.

    Args:
        X_train: DataFrame whose columns and dtypes define the split.

    Returns:
        An unfitted ColumnTransformer with transformers "num" and "cat".
    """
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = [col for col in X_train.columns if col not in numeric_cols]

    impute_then_scale = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    impute_then_encode = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    return ColumnTransformer(
        transformers=[
            ("num", impute_then_scale, numeric_cols),
            ("cat", impute_then_encode, categorical_cols),
        ],
        remainder="drop",
    )


def build_text_preprocessor():
    """
    Create the preprocessing pipeline for the raw-text SMS Spam data.

    Steps: "tfidf" (English stop words removed, at most 5000 features)
    followed by "svd" (TruncatedSVD down to 100 components, seeded with
    RANDOM_STATE) to keep dimensionality manageable.

    Returns:
        An unfitted Pipeline with the two steps above.
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    reducer = TruncatedSVD(n_components=100, random_state=RANDOM_STATE)
    return Pipeline(steps=[("tfidf", vectorizer), ("svd", reducer)])


def build_pipeline(model_name, base_model, is_text, X_train=None):
    """
    Assemble a two-step sklearn Pipeline: "preprocess" followed by "clf".

    Args:
        model_name: Model key; accepted for the caller's convenience but not
            used when building the pipeline.
        base_model: Estimator placed in the final "clf" step.
        is_text: If truthy, preprocessing is the Tfidf + SVD text pipeline;
            otherwise it is the tabular ColumnTransformer.
        X_train: Training DataFrame used to derive the tabular column split.
            Required when `is_text` is falsy.

    Returns:
        The unfitted Pipeline.

    Raises:
        ValueError: If `is_text` is falsy and `X_train` is None.
    """
    if not is_text and X_train is None:
        raise ValueError("X_train must be provided for tabular data.")

    if is_text:
        preprocess_step = build_text_preprocessor()
    else:
        preprocess_step = build_tabular_preprocessor(X_train)

    return Pipeline(
        steps=[
            ("preprocess", preprocess_step),
            ("clf", base_model),
        ]
    )


# ------------------------------------------------------------
# Experiment loop
# ------------------------------------------------------------

def run_experiments():
    """
    Run the full dataset x train-size x trial x model grid and collect scores.

    For every key in DATASET_NAMES and every proportion in TRAIN_SIZES, draws
    N_TRIALS stratified shuffle splits (seeded with RANDOM_STATE). On each
    split, every model from get_models_and_grids() is tuned with a 3-fold
    GridSearchCV scored on accuracy, then evaluated on the train and test
    portions of that split. Progress is printed along the way.

    Returns:
        DataFrame with one row per (dataset, train size, trial, model) holding
        train_acc, val_acc_cv (the search's best CV score), test_acc and one
        "best_<param>" column per tuned hyperparameter.
    """
    models, param_grids = get_models_and_grids()
    rows = []

    for dataset_key in DATASET_NAMES:
        X_raw, y, info = load_dataset_by_name(dataset_key)
        print(f"\n=== Dataset: {info['name']} ({dataset_key}) ===")
        is_text = info["is_text"]

        # Text arrives as a Series of strings and is indexed as a 1-D array;
        # tabular data stays a DataFrame and is indexed with .iloc.
        X_all = X_raw.values if is_text else X_raw

        for train_size in TRAIN_SIZES:
            print(f"\n  Train size = {train_size:.2f}")

            shuffle_split = StratifiedShuffleSplit(
                n_splits=N_TRIALS,
                train_size=train_size,
                random_state=RANDOM_STATE,
            )
            split_pairs = shuffle_split.split(X_all, y)

            for trial_idx, (train_idx, test_idx) in enumerate(split_pairs, start=1):
                print(f"    Trial {trial_idx}/{N_TRIALS}")

                if is_text:
                    X_train, X_test = X_all[train_idx], X_all[test_idx]
                else:
                    X_train = X_all.iloc[train_idx].copy()
                    X_test = X_all.iloc[test_idx].copy()
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                for model_name, base_model in models.items():
                    print(f"      Model: {model_name}")

                    search = GridSearchCV(
                        estimator=build_pipeline(
                            model_name=model_name,
                            base_model=base_model,
                            is_text=is_text,
                            X_train=None if is_text else X_train,
                        ),
                        param_grid=param_grids[model_name],
                        cv=3,
                        scoring="accuracy",
                        n_jobs=-1,
                    )
                    search.fit(X_train, y_train)

                    # Score the fitted search on both portions of this split.
                    train_acc = accuracy_score(y_train, search.predict(X_train))
                    test_acc = accuracy_score(y_test, search.predict(X_test))

                    rows.append(
                        {
                            "dataset_key": dataset_key,
                            "dataset_name": info["name"],
                            "is_text": is_text,
                            "train_size": train_size,
                            "trial": trial_idx,
                            "model": model_name,
                            "train_acc": train_acc,
                            "val_acc_cv": search.best_score_,
                            "test_acc": test_acc,
                            # Flatten the winning hyperparameters into the row.
                            **{
                                f"best_{param}": value
                                for param, value in search.best_params_.items()
                            },
                        }
                    )

    return pd.DataFrame(rows)


# ------------------------------------------------------------
# Main: run and summarize
# ------------------------------------------------------------

if __name__ == "__main__":
    results_df = run_experiments()

    # Raw results: one row per trial, split and model.
    results_df.to_csv("all_results_raw.csv", index=False)

    # Mean accuracies over trials for each dataset / model / train size.
    grouped = results_df.groupby(
        ["dataset_key", "dataset_name", "model", "train_size"]
    )
    summary = grouped[["train_acc", "val_acc_cv", "test_acc"]].mean().reset_index()

    summary.to_csv("summary_results.csv", index=False)

    print("\n=== Summary (first few rows) ===")
    print(summary.head())



=== Dataset: Adult Income (adult_income) ===

  Train size = 0.20
    Trial 1/3
      Model: log_reg


python(70997) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(70998) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(70999) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(71000) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(71001) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(71002) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(71003) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(71004) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(71005) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(71006) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


      Model: decision_tree
      Model: knn
    Trial 2/3
      Model: log_reg
      Model: decision_tree
      Model: knn
    Trial 3/3
      Model: log_reg
      Model: decision_tree
      Model: knn

  Train size = 0.50
    Trial 1/3
      Model: log_reg
      Model: decision_tree
      Model: knn
