In [1]:
# ----------------------
# 0) Environment checks
# ----------------------

def _check_gpu():
    try:
        import cupy as cp  # noqa
        n = cp.cuda.runtime.getDeviceCount()
        assert n > 0, "No CUDA GPU visible. In Colab: Runtime -> Change runtime type -> GPU"
        dev = cp.cuda.runtime.getDevice()
        props = cp.cuda.runtime.getDeviceProperties(dev)
        name = props.get('name', b'').decode('utf-8', errors='ignore')
        print(f"CUDA GPUs available: {n}; using: {name}")
    except Exception as e:
        print("[WARN] GPU check failed:", e)
        print("If you're on Colab/Kaggle, enable a GPU first.")

_check_gpu()

# --------------------------------
# 1) (Optional) Quick RAPIDS install
# --------------------------------
# If cuML isn't installed yet, uncomment ONE of the blocks below.

# A) Colab-friendly helper (conda under-the-hood)
!git clone https://github.com/rapidsai-community/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py  # installs RAPIDS stable incl. cuML

# B) Pip wheels (CUDA 12.x). Works on many environments with CUDA 12.*
# !pip -q install --extra-index-url=https://pypi.nvidia.com cupy-cuda12x cudf-cu12 cuml-cu12

CUDA GPUs available: 1; using: NVIDIA A100-SXM4-40GB
Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 603, done.[K
remote: Counting objects: 100% (169/169), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 603 (delta 131), reused 82 (delta 82), pack-reused 434 (from 3)[K
Receiving objects: 100% (603/603), 199.38 KiB | 2.59 MiB/s, done.
Resolving deltas: 100% (305/305), done.
Installing RAPIDS remaining 25.08 libraries
Using Python 3.12.11 environment at: /usr
Resolved 177 packages in 2.93s
Prepared 54 packages in 30.94s
Uninstalled 32 packages in 947ms
Installed 54 packages in 110ms
 + arrow==1.3.0
 - bokeh==3.7.3
 + bokeh==3.6.3
 + cucim-cu12==25.8.0
 + cuda-bindings==12.9.2
 + cuda-pathfinder==1.2.1
 - cuda-python==12.6.2.post1
 + cuda-python==12.9.2
 - cudf-cu12==25.6.0 (from https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.6.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl)
 + cudf-cu12==25.8.0
 + cugraph-cu12==25.8.0
 - cuml-

In [2]:
import os, sys, time, json, math, pathlib
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple

# GPU + metrics imports
import cupy as cp
import cudf
from cuml.svm import SVC
from cuml.linear_model import LogisticRegression
from cuml.ensemble import RandomForestClassifier
from cuml.neighbors import KNeighborsClassifier
from cuml.decomposition import PCA

from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix
)

In [11]:
# ---------------------------
# 3) Load your saved features
# ---------------------------
# First, you need to upload your 'features_scaled' directory to Colab
# or mount your Google Drive where the files are located.

# Example if you uploaded a zip file:
# !unzip features_scaled.zip

DATA_DIR = Path('./features_raw')  # change to ./features_raw if you prefer
assert (DATA_DIR / 'train.npz').exists(), f"Can't find {DATA_DIR/'train.npz'} — check your paths."

train = np.load(DATA_DIR / 'train.npz', allow_pickle=False)
val   = np.load(DATA_DIR / 'val.npz',   allow_pickle=False)
test  = np.load(DATA_DIR / 'test.npz',  allow_pickle=False)

X_tr = train['X'].astype(np.float32)
Y_tr = train['y'].astype(np.int32)
classes = train['classes'].tolist()

X_va = val['X'].astype(np.float32)
Y_va = val['y'].astype(np.int32)

X_te = test['X'].astype(np.float32)
Y_te = test['y'].astype(np.int32)

# Drop any -1 labels (just in case some class didn't appear in train)
keep_va = Y_va >= 0
keep_te = Y_te >= 0
X_va, Y_va = X_va[keep_va], Y_va[keep_va]
X_te, Y_te = X_te[keep_te], Y_te[keep_te]

print("Shapes (scaled):", X_tr.shape, X_va.shape, X_te.shape)
print("Classes:", classes)

Shapes (scaled): (17714, 1584) (2023, 1584) (2394, 1584)
Classes: ['angry', 'happy', 'neutral', 'sad']


In [12]:
# ---------------------------
# 4) Move to GPU (CuPy arrays)
# ---------------------------
Xtr = cp.asarray(X_tr)
Ytr = cp.asarray(Y_tr)
Xva = cp.asarray(X_va)
Yva = cp.asarray(Y_va)
Xte = cp.asarray(X_te)
Yte = cp.asarray(Y_te)

n_classes = int(cp.asnumpy(Ytr.max()) + 1)

# ---------------------------
# 5) Class imbalance handling
# ---------------------------
# We'll compute inverse-frequency sample weights on the TRAIN set and try to pass them
# to models that support it. If a model doesn't support sample_weight, we'll skip it.

vals, counts = cp.unique(Ytr, return_counts=True)
class_freq = cp.zeros(n_classes, dtype=cp.float32)
class_freq[vals] = counts
class_weights = (len(Ytr) / (n_classes * class_freq)).astype(cp.float32)
train_weights = class_weights[Ytr]

print("Class counts:", dict(zip(cp.asnumpy(vals).tolist(), cp.asnumpy(counts).tolist())))
print("Class weights:", cp.asnumpy(class_weights))

Class counts: {0: 4485, 1: 4485, 2: 4260, 3: 4484}
Class weights: [0.98740244 0.98740244 1.039554   0.9876227 ]


In [13]:
# 6) Utilities: training, evaluation & plotting

OUT = Path("./results")
OUT.mkdir(parents=True, exist_ok=True)


def _to_cpu(x):
    return cp.asnumpy(x) if isinstance(x, cp.ndarray) else x


def evaluate(name: str, model, Xtr, Ytr, Xva, Yva, Xte, Yte, sample_weight=None):
    print(f"\n=== Training {name} ===")
    t0 = time.time()
    try:
        if sample_weight is not None:
            model.fit(Xtr, Ytr, sample_weight=sample_weight)
        else:
            model.fit(Xtr, Ytr)
    except TypeError:
        # model doesn't accept sample_weight
        print(f"   [Info] Model {name} does not support sample_weight. Training without it.")
        model.fit(Xtr, Ytr)
    tr_time = time.time() - t0

    # Predictions
    P_va = model.predict(Xva)
    P_te = model.predict(Xte)

    # Metrics on CPU for convenience
    yv = _to_cpu(Yva); yt = _to_cpu(Yte)
    pv = _to_cpu(P_va); pt = _to_cpu(P_te)

    acc_va = accuracy_score(yv, pv)
    f1m_va = f1_score(yv, pv, average='macro')
    acc_te = accuracy_score(yt, pt)
    f1m_te = f1_score(yt, pt, average='macro')

    print(f"{name}: val acc={acc_va:.4f} f1_macro={f1m_va:.4f} | test acc={acc_te:.4f} f1_macro={f1m_te:.4f} (fit {tr_time:.2f}s)")

    # Save confusion matrix on test
    cm = confusion_matrix(yt, pt, labels=list(range(n_classes)))
    fig = plt.figure(figsize=(6.5, 5.5))
    ax = fig.add_subplot(111)
    im = ax.imshow(cm, interpolation='nearest')
    ax.set_title(f"Confusion Matrix — {name} (test)")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_xticks(range(n_classes)); ax.set_xticklabels(classes, rotation=45, ha='right')
    ax.set_yticks(range(n_classes)); ax.set_yticklabels(classes)
    for i in range(n_classes):
        for j in range(n_classes):
            ax.text(j, i, str(cm[i, j]), ha='center', va='center', fontsize=8)
    fig.tight_layout()
    out_png = OUT / f"cm_{name.replace(' ', '_')}.png"
    fig.savefig(out_png, dpi=140)
    plt.close(fig)

    return {
        'name': name, 'val_acc': acc_va, 'val_f1m': f1m_va,
        'test_acc': acc_te, 'test_f1m': f1m_te, 'fit_sec': tr_time,
        'cm_path': str(out_png)
    }

## OPTIONAL PCA

In [None]:
#
#  Train baselines
results = []
USE_PCA = False

# 8.1) Random Forest (GPU) — modest sweep
rf_grid = [
    dict(n_estimators=1000, max_depth=18, max_features=1.0, n_streams=8, bootstrap=True, random_state=42),
    dict(n_estimators=1500, max_depth=24, max_features=0.8, n_streams=8, bootstrap=True, random_state=42),
]
for i, params in enumerate(rf_grid, 1):
    rf = RandomForestClassifier(**params)
    res = evaluate(f"RF_{i}", rf, Xtr, Ytr, Xva, Yva, Xte, Yte, sample_weight=train_weights)
    results.append(res)

# 8.2) SVM RBF (GPU) — grid over C/gamma and PCA sizes
svm_C = [1.0, 2.0, 4.0, 8.0, 16.0]
svm_gamma = ['scale', 'auto']

# Try PCA at 64/128/256 and keep the best
PCA_SIZES = [64, 128, 256]
svm_results = []
for pca_k in PCA_SIZES:
    if USE_PCA:
        pca_k = int(min(pca_k, Xtr.shape[1]))
        pca_k = max(2, pca_k)
        print(f"Fitting GPU PCA for SVM grid: k={pca_k}")
        pca_tmp = PCA(n_components=pca_k)
        Xtr_pp = pca_tmp.fit_transform(Xtr)
        Xva_pp = pca_tmp.transform(Xva)
        Xte_pp = pca_tmp.transform(Xte)
    else:
        Xtr_pp, Xva_pp, Xte_pp = Xtr, Xva, Xte
    for C in svm_C:
        for g in svm_gamma:
            svc = SVC(C=C, gamma=g, kernel='rbf', max_iter=200000, cache_size=2048)
            tag = f"SVM_RBF_C{C}_G{g}__PCA{pca_k}"
            res = evaluate(tag, svc, Xtr_pp, Ytr, Xva_pp, Yva, Xte_pp, Yte, sample_weight=train_weights)
            svm_results.append(res)
            results.append(res)

# 8.3) Logistic Regression (GPU) — a couple of C values
log_grid = [
    dict(C=1.0, penalty='l2', max_iter=4000, tol=1e-4, fit_intercept=True),
    dict(C=0.5, penalty='l2', max_iter=4000, tol=1e-4, fit_intercept=True),
]
for i, params in enumerate(log_grid, 1):
    lr = LogisticRegression(**params)
    res = evaluate(f"LogReg_{i}", lr, Xtr, Ytr, Xva, Yva, Xte, Yte, sample_weight=train_weights)
    results.append(res)

# 8.4) KNN (GPU) — try a wider k sweep with PCA=64/128
for k in [5, 9, 13, 21]:
    for pca_k in [64, 128]:
        if USE_PCA:
            pca_k = int(min(pca_k, Xtr.shape[1]))
            pca_tmp = PCA(n_components=pca_k)
            Xtr_pp = pca_tmp.fit_transform(Xtr)
            Xva_pp = pca_tmp.transform(Xva)
            Xte_pp = pca_tmp.transform(Xte)
        else:
            Xtr_pp, Xva_pp, Xte_pp = Xtr, Xva, Xte
        knn = KNeighborsClassifier(n_neighbors=k, weights='uniform', metric='euclidean')
        res = evaluate(f"KNN_k{k}__PCA{pca_k}", knn, Xtr_pp, Ytr, Xva_pp, Yva, Xte_pp, Yte)
        results.append(res)


=== Training RF_1 ===
   [Info] Model RF_1 does not support sample_weight. Training without it.
RF_1: val acc=0.4493 f1_macro=0.4433 | test acc=0.5543 f1_macro=0.5423 (fit 40.97s)

=== Training RF_2 ===
   [Info] Model RF_2 does not support sample_weight. Training without it.
RF_2: val acc=0.4548 f1_macro=0.4491 | test acc=0.5610 f1_macro=0.5493 (fit 60.47s)

=== Training SVM_RBF_C1.0_Gscale__PCA64 ===
SVM_RBF_C1.0_Gscale__PCA64: val acc=0.3520 f1_macro=0.3496 | test acc=0.4048 f1_macro=0.3664 (fit 1.01s)

=== Training SVM_RBF_C1.0_Gauto__PCA64 ===


In [7]:
# ---------------------------
# 9) XGBoost (GPU) + tuned sweep
# ---------------------------
TRY_XGB = True
if TRY_XGB:
    try:
        import xgboost as xgb
        dtrain = xgb.DMatrix(_to_cpu(X_tr), label=_to_cpu(Y_tr))
        dval   = xgb.DMatrix(_to_cpu(X_va), label=_to_cpu(Y_va))
        dtest  = xgb.DMatrix(_to_cpu(X_te), label=_to_cpu(Y_te))

        xgb_grid = [
            dict(max_depth=6,  eta=0.10, subsample=0.9, colsample_bytree=0.9),
            dict(max_depth=8,  eta=0.08, subsample=0.9, colsample_bytree=0.9),
            dict(max_depth=10, eta=0.06, subsample=0.9, colsample_bytree=0.9),
        ]
        for i, hp in enumerate(xgb_grid, 1):
            params = dict(
                objective='multi:softprob', num_class=n_classes,
                tree_method='gpu_hist', reg_lambda=1.0, max_bin=256, eval_metric='mlogloss',
                **hp
            )
            bst = xgb.train(
                params, dtrain, num_boost_round=900,
                evals=[(dval, 'val')], verbose_eval=False, early_stopping_rounds=50
            )
            # Use softprob output
            pv_prob = bst.predict(dval)
            pt_prob = bst.predict(dtest)
            pv = np.argmax(pv_prob, axis=1).astype(np.int32)
            pt = np.argmax(pt_prob, axis=1).astype(np.int32)

            acc_va = accuracy_score(_to_cpu(Yva), pv)
            f1m_va = f1_score(_to_cpu(Yva), pv, average='macro')
            acc_te = accuracy_score(_to_cpu(Yte), pt)
            f1m_te = f1_score(_to_cpu(Yte), pt, average='macro')
            name = f"XGB_gpu_hist_{i}"
            print(f"{name}: val acc={acc_va:.4f} f1_macro={f1m_va:.4f} | test acc={acc_te:.4f} f1_macro={f1m_te:.4f}")

            # Save CM for each XGB run
            cm = confusion_matrix(_to_cpu(Yte), pt, labels=list(range(n_classes)))
            fig = plt.figure(figsize=(6.5, 5.5))
            ax = fig.add_subplot(111)
            ax.imshow(cm, interpolation='nearest')
            ax.set_title(f"Confusion Matrix — {name} (test)")
            ax.set_xlabel("Predicted"); ax.set_ylabel("True")
            ax.set_xticks(range(n_classes)); ax.set_xticklabels(classes, rotation=45, ha='right')
            ax.set_yticks(range(n_classes)); ax.set_yticklabels(classes)
            for r in range(n_classes):
                for c in range(n_classes):
                    ax.text(c, r, str(cm[r, c]), ha='center', va='center', fontsize=8)
            out_png = OUT / f"cm_{name}.png"
            fig.tight_layout(); fig.savefig(out_png, dpi=140); plt.close(fig)

            results.append({
                'name': name, 'val_acc': acc_va, 'val_f1m': f1m_va,
                'test_acc': acc_te, 'test_f1m': f1m_te, 'fit_sec': math.nan,
                'cm_path': str(out_png)
            })
    except Exception as e:
        print("[WARN] XGBoost GPU couldn't run:", e)


    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)

    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()


XGB_gpu_hist_1: val acc=0.5344 f1_macro=0.5108 | test acc=0.5957 f1_macro=0.5848



    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)

    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()


XGB_gpu_hist_2: val acc=0.5180 f1_macro=0.4944 | test acc=0.5848 f1_macro=0.5736
XGB_gpu_hist_3: val acc=0.5141 f1_macro=0.4963 | test acc=0.5890 f1_macro=0.5784



    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


In [8]:
# ---------------------------
# 10) Results table
# ---------------------------
import pandas as pd

df = pd.DataFrame(results).sort_values(['val_f1m','test_f1m'], ascending=False)
print("\n=== Summary (sorted by val macro-F1) ===")
print(df[['name','val_acc','val_f1m','test_acc','test_f1m','fit_sec','cm_path']].to_string(index=False))

# Save
CSV_PATH = OUT / 'summary.csv'
df.to_csv(CSV_PATH, index=False)
print(f"\nSaved summary to: {CSV_PATH}")
print(f"Confusion matrices saved under: {OUT.resolve()}")

# You can download the results folder like this in Colab:
# !zip -r results.zip results


=== Summary (sorted by val macro-F1) ===
                        name  val_acc  val_f1m  test_acc  test_f1m   fit_sec                                         cm_path
                    LogReg_2 0.618389 0.601885  0.532581  0.530308  1.082034                     gpu_results/cm_LogReg_2.png
                    LogReg_1 0.613940 0.596495  0.529240  0.527809  1.936371                     gpu_results/cm_LogReg_1.png
 SVM_RBF_C8.0_Gscale__PCA256 0.593178 0.573039  0.525898  0.517650  0.602472  gpu_results/cm_SVM_RBF_C8.0_Gscale__PCA256.png
SVM_RBF_C16.0_Gscale__PCA256 0.590707 0.570107  0.525063  0.516115  0.661877 gpu_results/cm_SVM_RBF_C16.0_Gscale__PCA256.png
 SVM_RBF_C4.0_Gscale__PCA256 0.590707 0.568976  0.533417  0.527276  0.507732  gpu_results/cm_SVM_RBF_C4.0_Gscale__PCA256.png
 SVM_RBF_C2.0_Gscale__PCA256 0.582798 0.562071  0.544277  0.539337  0.400275  gpu_results/cm_SVM_RBF_C2.0_Gscale__PCA256.png
 SVM_RBF_C1.0_Gscale__PCA256 0.572417 0.552231  0.557226  0.553346  0.317666  gpu_r

In [9]:
!zip -r results.zip /content/gpu_results


  adding: content/gpu_results/ (stored 0%)
  adding: content/gpu_results/cm_KNN_k13__PCA128.png (deflated 15%)
  adding: content/gpu_results/cm_RF_2.png (deflated 16%)
  adding: content/gpu_results/cm_SVM_RBF_C8.0_Gauto__PCA256.png (deflated 14%)
  adding: content/gpu_results/cm_SVM_RBF_C2.0_Gscale__PCA128.png (deflated 14%)
  adding: content/gpu_results/cm_SVM_RBF_C8.0_Gscale__PCA128.png (deflated 15%)
  adding: content/gpu_results/cm_SVM_RBF_C4.0_Gscale__PCA64.png (deflated 14%)
  adding: content/gpu_results/cm_SVM_RBF_C1.0_Gauto__PCA128.png (deflated 15%)
  adding: content/gpu_results/cm_SVM_RBF_C4.0_Gscale__PCA128.png (deflated 14%)
  adding: content/gpu_results/cm_SVM_RBF_C1.0_Gauto__PCA64.png (deflated 15%)
  adding: content/gpu_results/cm_LogReg_1.png (deflated 15%)
  adding: content/gpu_results/cm_SVM_RBF_C16.0_Gauto__PCA256.png (deflated 14%)
  adding: content/gpu_results/cm_XGB_gpu_hist_2.png (deflated 15%)
  adding: content/gpu_results/cm_LogReg_2.png (deflated 16%)
  adding

In [10]:
bst.save_model("xgb_gpu_hist_1.json")   # or "xgb_gpu_hist_1.ubj" (smaller/faster)
import json, numpy as np
meta = {
    "classes": classes,                 # label names
    "n_classes": int(len(classes)),
    "feature_dim": int(np.shape(X_tr)[1]),
    "preprocessing": "features_scaled", # reminder on whats been used
}
json.dump(meta, open("xgb_gpu_hist_1.meta.json","w"))


## For later loading

In [None]:
import xgboost as xgb
import numpy as np

# Load the model
bst = xgb.Booster()
bst.load_model("xgb_gpu_hist_1.json")

# Force CPU inference
bst.set_param({"predictor": "cpu_predictor"})

# Example: run on new data (must be preprocessed the same way as training!)
X_new = X_te.astype(np.float32)   # replace with a new example
dnew = xgb.DMatrix(X_new)

# Get predictions
proba = bst.predict(dnew)         # shape (n_samples, n_classes)
preds = proba.argmax(axis=1)      # class indices