# IP2 – MEx (Multimodal Exercise) klasifikacija

Ovaj notebook izvršava kompletan postupak koji se traži u zadatku:

- učitavanje i konstrukcija atributa (feature extraction)
- preprocesiranje (inf→NaN, imputacija medijanom, standardizacija)
- vizuelizacija (PCA 2D i 3D)
- klasifikacija sa **≥5 algoritama**
- poređenje modela sa:
  - **svim atributima** (baseline)
  - **PCA redukcijom** (95% varijanse)
  - **SelectKBest** (mutual information, k=50)
- validacija bez curenja podataka (GroupKFold po `subject`)
- čuvanje izlaza za reprodukciju (`outputs/`)


In [None]:
import os, re, glob
import numpy as np
import pandas as pd

from scipy.stats import skew, kurtosis

from sklearn.model_selection import GroupKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, mutual_info_classif

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as plt

# Let pandas display up to 200 columns before truncating DataFrame output.
pd.set_option("display.max_columns", 200)
# Seed NumPy's global random number generator.
np.random.seed(42)

DATA_DIR = "data"          # the MEx dataset is expected to be available locally here (see the dataset README)
OUT_DIR  = "outputs"
# Create the output directory up front; no error if it already exists.
os.makedirs(OUT_DIR, exist_ok=True)


In [None]:
# -----------------------------
# 1) Učitavanje i parsiranje fajlova
# -----------------------------

# Pattern for file names of the form <subject><exercise_id><exercise><sensor>,
# e.g. "subject01_3_squat_ACT.csv" (matching is case-insensitive).
# NOTE(review): presumably this mirrors the MEx naming scheme - if your files
# are named differently, adjust FNAME_RE.
FNAME_RE = re.compile(
    r"(?P<subject>subject\d+)[^\d]*(?P<exercise_id>\d+)[^A-Za-z]*(?P<exercise>[A-Za-z_]+)[^A-Za-z]*(?P<sensor>ACT|ACW|PM|DC)",
    re.IGNORECASE
)

def parse_file_info(path: str):
    """Extract subject, exercise and sensor metadata from a file name.

    Only the base name of ``path`` is inspected.

    Args:
        path: Path to a data file.

    Returns:
        A dict with keys ``subject`` (lower-cased str), ``exercise_id`` (int),
        ``exercise`` (lower-cased str) and ``sensor`` (lower-cased str), or
        ``None`` when the file name does not match ``FNAME_RE``.
    """
    m = FNAME_RE.search(os.path.basename(path))
    if not m:
        return None
    return {
        "subject": m.group("subject").lower(),
        "exercise_id": int(m.group("exercise_id")),
        # The exercise group accepts "_" and therefore also captures the
        # separator in front of the sensor tag ("squat_" for "squat_ACT");
        # strip it so the label is the bare exercise name.
        "exercise": m.group("exercise").strip("_").lower(),
        "sensor": m.group("sensor").lower()
    }

def read_act_acw_csv(path: str):
    """Read an ACT/ACW CSV file and return its first three numeric columns.

    The file is read without a header row. Every column is coerced to numeric
    and columns with no numeric value at all (for example a textual timestamp)
    are dropped before the first three remaining columns are selected.
    Individual cells that cannot be parsed become NaN.

    NOTE(review): a purely numeric time column placed before the axes cannot
    be told apart from an axis and would still be selected - confirm the
    column layout of your files.

    Args:
        path: Path to the CSV file.

    Returns:
        A float array with three columns, or ``None`` when the file is empty
        or has fewer than three numeric columns.
    """
    try:
        df = pd.read_csv(path, header=None)
    except pd.errors.EmptyDataError:
        # A zero-byte file carries no samples; report it like any other unusable file.
        return None

    # Same coercion as read_pm_dc_csv, so a non-numeric column does not make
    # the float conversion below fail.
    df = df.apply(pd.to_numeric, errors="coerce").dropna(axis=1, how="all")
    if df.shape[1] < 3:
        return None
    return df.iloc[:, :3].to_numpy(dtype=float)

def read_pm_dc_csv(path: str):
    """Read a PM/DC CSV file and return all of its numeric columns.

    The file is read without a header row. Every column is coerced to numeric
    (unparseable cells become NaN) and columns without any numeric value are
    dropped.

    Args:
        path: Path to the CSV file.

    Returns:
        A float array with one column per remaining numeric column, or
        ``None`` when the file is empty or contains no numeric column.
    """
    try:
        df = pd.read_csv(path, header=None)
    except pd.errors.EmptyDataError:
        # A zero-byte file has nothing to parse; use the same "nothing usable"
        # signal as a file without numeric columns.
        return None

    df = df.apply(pd.to_numeric, errors="coerce")
    df = df.dropna(axis=1, how="all")
    if df.shape[1] == 0:
        return None
    return df.values.astype(float)


In [None]:
# -----------------------------
# 2) Feature extraction
# -----------------------------

def feats_act_like(M: np.ndarray, prefix: str):
    """Compute summary statistics for a three-axis signal.

    Columns 0, 1 and 2 of ``M`` are treated as x, y and z; a fourth channel
    ``mag`` is the Euclidean norm of the three. For every channel the mean,
    standard deviation, minimum, maximum, skewness, kurtosis and "energy"
    (mean of squared values) are computed, ignoring NaNs.

    Args:
        M: 2-D array with at least three columns, or ``None``.
        prefix: Prefix for every feature name.

    Returns:
        Dict mapping ``"<prefix>_<channel>_<stat>"`` to its value; empty when
        ``M`` is ``None``, has no elements or has fewer than three columns.
    """
    if M is None or M.size == 0 or M.shape[1] < 3:
        return {}

    channels = {"x": M[:, 0], "y": M[:, 1], "z": M[:, 2]}
    channels["mag"] = np.sqrt(
        channels["x"]**2 + channels["y"]**2 + channels["z"]**2
    )

    # (name suffix, statistic) pairs, listed in output order
    stats = (
        ("mean", np.nanmean),
        ("std", np.nanstd),
        ("min", np.nanmin),
        ("max", np.nanmax),
        ("skew", lambda v: skew(v, nan_policy="omit")),
        ("kurt", lambda v: kurtosis(v, nan_policy="omit")),
        # "energy" = mean squared value
        ("energy", lambda v: np.nanmean(v**2)),
    )

    return {
        f"{prefix}_{channel}_{stat}": fn(values)
        for channel, values in channels.items()
        for stat, fn in stats
    }

def feats_generic(M: np.ndarray, prefix: str):
    """Compute per-column aggregates for a sensor with any number of columns.

    For every column index ``j`` the mean, standard deviation, minimum,
    maximum and "energy" (mean of squared values) are computed, ignoring NaNs.

    Args:
        M: 2-D array of samples, or ``None``.
        prefix: Prefix for every feature name.

    Returns:
        Dict mapping ``"<prefix>_c<j>_<stat>"`` to its value; empty when ``M``
        is ``None`` or has no elements.
    """
    if M is None or M.size == 0:
        return {}

    # (name suffix, statistic) pairs, listed in output order
    aggregates = (
        ("mean", np.nanmean),
        ("std", np.nanstd),
        ("min", np.nanmin),
        ("max", np.nanmax),
        ("energy", lambda v: np.nanmean(v**2)),
    )

    return {
        f"{prefix}_c{idx}_{stat}": fn(M[:, idx])
        for idx in range(M.shape[1])
        for stat, fn in aggregates
    }

def extract_features_for_file(path: str, sensor_key: str):
    """Build one feature row for a single sensor file.

    Args:
        path: Path to the sensor CSV file.
        sensor_key: Sensor identifier; ``"act"`` and ``"acw"`` use the
            three-axis reader and features, anything else uses the generic
            per-column reader and features.

    Returns:
        Dict with ``subject``, ``exercise_id`` and ``exercise`` followed by the
        extracted features, or ``None`` when the file name cannot be parsed.
    """
    info = parse_file_info(path)
    if info is None:
        return None

    # Pick the reader/featurizer pair that belongs to this sensor type.
    if sensor_key in ("act", "acw"):
        reader, featurizer = read_act_acw_csv, feats_act_like
    else:
        reader, featurizer = read_pm_dc_csv, feats_generic

    # Identifying columns first, then the features computed from the file.
    row = {key: info[key] for key in ("subject", "exercise_id", "exercise")}
    row.update(featurizer(reader(path), sensor_key))
    return row

def build_feature_table(sensor_key: str):
    """Collect the feature rows of every file that belongs to one sensor.

    All ``*.csv`` files below ``DATA_DIR`` are searched recursively. Files
    whose name cannot be parsed, or whose sensor tag differs from
    ``sensor_key``, are skipped.

    Args:
        sensor_key: Lower-case sensor identifier to select.

    Returns:
        DataFrame with one row per matching file.

    Raises:
        RuntimeError: If no matching file produced a row.
    """
    pattern = os.path.join(DATA_DIR, "**", "*.csv")
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # keeps the row order of the feature table the same on every run/machine.
    paths = sorted(glob.glob(pattern, recursive=True))

    rows = []
    for p in paths:
        info = parse_file_info(p)
        if info is None or info["sensor"] != sensor_key:
            continue
        row = extract_features_for_file(p, sensor_key)
        if row is not None:
            rows.append(row)

    df = pd.DataFrame(rows)
    if df.empty:
        raise RuntimeError(f"Nisam našla fajlove za senzor '{sensor_key}'. Proveri DATA_DIR i FNAME_RE.")
    return df


In [None]:
# -----------------------------
# 3) Konstrukcija full skupa atributa (ALL sensors)
# -----------------------------

# One feature table per sensor.
act_df, acw_df, pm_df, dc_df = (
    build_feature_table(sensor) for sensor in ("act", "acw", "pm", "dc")
)

keys = ["subject", "exercise_id", "exercise"]

# Left-join the remaining sensors onto ACT (assumption: ACT exists for every sample).
# NOTE(review): if a sensor table holds several rows for the same key, each
# join multiplies the rows for that key - confirm the keys are unique.
df_all = act_df
for other_df, dup_suffix in (
    (acw_df, "_acwdup"),
    (pm_df, "_pmdup"),
    (dc_df, "_dcdup"),
):
    df_all = df_all.merge(other_df, on=keys, how="left", suffixes=("", dup_suffix))

# Drop any column that picked up a duplicate suffix during the joins.
is_dup_col = df_all.columns.str.contains("dup$")
df_all = df_all.loc[:, ~is_dup_col]

df_all.head(), df_all.shape


In [None]:
# Save the "raw features" (before preprocessing) as CSV in OUT_DIR, without the index column.
raw_path = os.path.join(OUT_DIR, "mex_features_all_raw.csv")
df_all.to_csv(raw_path, index=False)

print("Saved:", raw_path)


In [None]:
# -----------------------------
# 4) Preprocesiranje (inf→NaN, imputacija medianom)
# -----------------------------
# Work on a new frame with +/-inf turned into NaN; df_all itself stays untouched.
df_all_clean = df_all.replace([np.inf, -np.inf], np.nan)

# Everything that is not an identifying key is a feature column.
feature_cols = [c for c in df_all_clean.columns if c not in keys]

# Force the features to numeric, then fill the gaps with each column's median.
# NOTE(review): the medians are computed over all rows, i.e. before any
# train/test split - confirm this is acceptable next to the per-fold imputer
# used inside the evaluation pipeline.
numeric_feats = df_all_clean[feature_cols].apply(pd.to_numeric, errors="coerce")
df_all_clean[feature_cols] = numeric_feats.fillna(
    numeric_feats.median(numeric_only=True)
)

preproc_path = os.path.join(OUT_DIR, "mex_features_all_preprocessed.csv")
df_all_clean.to_csv(preproc_path, index=False)

print("Saved:", preproc_path)
df_all_clean.head()


In [None]:
# -----------------------------
# 5) Vizuelizacija (PCA 2D i 3D)
# -----------------------------
from sklearn.preprocessing import LabelEncoder

# Feature matrix (all non-key columns), target labels and subject ids.
X = df_all_clean.drop(columns=["exercise", "subject", "exercise_id"])
y = df_all_clean["exercise"].values
groups = df_all_clean["subject"].values

# standardization used only for the visualization
X_scaled = StandardScaler().fit_transform(X)

# Encode the exercise labels as integers so they can be used as point colors.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# PCA 2D
pca2 = PCA(n_components=2, random_state=42)
Z2 = pca2.fit_transform(X_scaled)

plt.figure(figsize=(8,6))
plt.scatter(Z2[:,0], Z2[:,1], c=y_enc, s=12)
plt.title("PCA 2D – ALL sensors")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.show()

# PCA 3D
pca3 = PCA(n_components=3, random_state=42)
Z3 = pca3.fit_transform(X_scaled)

fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(Z3[:,0], Z3[:,1], Z3[:,2], c=y_enc, s=10)
ax.set_title("PCA 3D – ALL sensors")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
plt.tight_layout()
plt.show()


In [None]:
# -----------------------------
# 6) Evaluacija (≥5 algoritama) + redukcija atributa
#    - baseline: svi atributi
#    - PCA: 95% varijanse
#    - KBest: k=50 (mutual information)
# -----------------------------

# Classifiers to compare, keyed by the short name used in the result table.
MODELS = {
    "LogReg": LogisticRegression(max_iter=3000),
    "SVM_RBF": SVC(kernel="rbf", C=5, gamma="scale"),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "NaiveBayes": GaussianNB(),
    "HistGB": HistGradientBoostingClassifier(random_state=42)
}

def make_pipeline(model, reduction: str):
    """Build the preprocessing + (optional) reduction + model pipeline.

    Every pipeline starts with median imputation and standardization.

    Args:
        model: Estimator used as the final ``"model"`` step.
        reduction: ``"pca"`` adds a PCA step keeping 95% of the variance,
            ``"kbest"`` adds a SelectKBest step (mutual information, k=50);
            any other value adds no reduction step.

    Returns:
        The assembled ``Pipeline`` (not fitted).
    """
    from functools import partial

    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    if reduction == "pca":
        steps.append(("pca", PCA(n_components=0.95, random_state=42)))
    elif reduction == "kbest":
        # mutual_info_classif draws random numbers internally; without a fixed
        # random_state the selected features can change between runs (and
        # between parallel CV workers), so pin the seed like the other steps.
        score_func = partial(mutual_info_classif, random_state=42)
        steps.append(("kbest", SelectKBest(score_func, k=50)))
    steps.append(("model", model))
    return Pipeline(steps)

def evaluate_all_reductions(df):
    """Cross-validate every model in MODELS with every reduction variant.

    The folds come from ``GroupKFold(n_splits=5)`` grouped by the ``subject``
    column, so one subject's rows are never split between train and test.

    Args:
        df: Feature table with ``exercise`` (target), ``subject`` and
            ``exercise_id`` columns; all remaining columns are features.

    Returns:
        DataFrame with columns ``reduction``, ``model``, ``acc`` and ``f1``
        (fold means), sorted by reduction and then by descending F1.
    """
    features = df.drop(columns=["exercise", "subject", "exercise_id"])
    labels = df["exercise"].values
    subject_ids = df["subject"].values

    splitter = GroupKFold(n_splits=5)
    metrics = {"acc": "accuracy", "f1": "f1_weighted"}

    def score(reduction, model_name, estimator):
        # Mean accuracy / weighted F1 over the folds for one combination.
        cv_out = cross_validate(
            make_pipeline(estimator, reduction),
            features,
            labels,
            cv=splitter,
            groups=subject_ids,
            scoring=metrics,
            n_jobs=-1,
        )
        return {
            "reduction": reduction,
            "model": model_name,
            "acc": float(np.mean(cv_out["test_acc"])),
            "f1": float(np.mean(cv_out["test_f1"])),
        }

    records = [
        score(reduction, model_name, estimator)
        for reduction in ["none", "pca", "kbest"]
        for model_name, estimator in MODELS.items()
    ]

    table = pd.DataFrame(records)
    table = table.sort_values(["reduction", "f1"], ascending=[True, False])
    return table.reset_index(drop=True)

res = evaluate_all_reductions(df_all_clean)
res


In [None]:
# Save the model comparison table as CSV in OUT_DIR, without the index column.
res_path = os.path.join(OUT_DIR, "model_comparison_all_reductions.csv")
res.to_csv(res_path, index=False)
print("Saved:", res_path)


In [None]:
# Plot: F1 per model, one line for each reduction variant
plt.figure(figsize=(10,5))
for r in ["none","pca","kbest"]:
    # Sort by model name so every line uses the same x-axis order.
    sub = res[res["reduction"] == r].sort_values("model")
    plt.plot(sub["model"], sub["f1"], marker="o", label=r)

plt.ylabel("F1-score (weighted)")
plt.title("Poređenje: ALL atributi vs PCA vs SelectKBest")
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# -----------------------------
# 7) Treniraj najbolji model na celom skupu (za isporuku)
# -----------------------------
import joblib
from sklearn.base import clone

# Row of the result table with the highest cross-validated F1.
best = res.sort_values("f1", ascending=False).iloc[0]
best_reduction = best["reduction"]
best_model_name = best["model"]

print("Best overall:", best_model_name, "| reduction:", best_reduction, "| acc:", best["acc"], "| f1:", best["f1"])

X = df_all_clean.drop(columns=["exercise", "subject", "exercise_id"])
y = df_all_clean["exercise"].values

# A Pipeline fits the estimator object it is given in place. Clone the entry
# from MODELS so the shared, unfitted instance in that dict is not mutated
# (e.g. if the evaluation cell is re-run afterwards).
best_model = clone(MODELS[best_model_name])
best_pipe = make_pipeline(best_model, best_reduction)
best_pipe.fit(X, y)

# Persist the fitted pipeline (preprocessing + reduction + model) in OUT_DIR.
model_path = os.path.join(OUT_DIR, f"best_model_all_{best_model_name}_{best_reduction}.joblib")
joblib.dump(best_pipe, model_path)
print("Saved:", model_path)


## Šta predati (materijal za reprodukciju)

U folderu `outputs/` dobijaš:
- `mex_features_all_raw.csv` (konstruisani atributi, pre preprocesiranja)
- `mex_features_all_preprocessed.csv` (posle preprocesiranja)
- `model_comparison_all_reductions.csv` (poređenje ≥5 algoritama za 3 varijante atributa)
- `best_model_all_<...>.joblib` (najbolji pipeline)

Za PDF deo (tekstualni deo rada) uvrsti:
- opis skupa podataka i cilja (`exercise`)
- opis feature extraction (statističke karakteristike po senzoru)
- preprocesiranje (inf→NaN, imputacija medijanom, standardizacija)
- validaciju (GroupKFold po `subject`)
- rezultate + grafike (PCA 2D/3D i poređenje F1)

