In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [7]:
# 1. Pfad zur Datei
df = pd.read_excel("/Users/yakubuzdilli/Desktop/UbiComp/Assignment 2/feature_list_all.xlsx")

# Daten prüfen
print(df.shape)
df.head()

(385, 22)


Unnamed: 0,meanFix,minFix,maxFix,varFix,stdFix,meanDis,minDis,maxDis,varDis,stdDisp,...,blinkMean,blinkMin,blinkMax,blinkRate,xDir,yDir,fixDensPerBB,label,duration,participant_id
0,250.882353,100,666,16554.228164,128.663235,0.050347,0.008784,0.174576,0.001026,0.032029,...,2291.5,33,8967,0.4,0.484848,0.606061,369.479986,Inspection,10,1
1,246.742857,100,900,23549.020168,153.4569,0.052125,0.013571,0.146709,0.000947,0.03078,...,0.0,0,0,0.0,0.529412,0.529412,363.711018,Inspection,10,1
2,237.257143,100,1067,36627.431933,191.382946,0.058595,0.028369,0.206161,0.002116,0.045998,...,0.0,0,0,0.0,0.441176,0.558824,266.246748,Inspection,10,1
3,293.137931,100,767,35208.551724,187.639419,0.047417,0.017613,0.114933,0.000828,0.028777,...,0.0,0,0,0.0,0.5,0.5,319.553727,Inspection,10,1
4,199.125,100,566,12614.163462,112.312793,0.045997,0.027946,0.155588,0.000939,0.030645,...,1899.8,33,9134,0.5,0.589744,0.487179,305.83209,Inspection,10,1


In [10]:
# 2. Ziel- und Metaspalten
target = "label"
meta = ["participant_id", "duration"]

# 3. Alle Features bestimmen (alle Spalten ausser label und meta)
all_features = [c for c in df.columns if c not in [target] + meta]

# 4. Features wie im Paper
paper_features = ["meanFix","maxFix","varFix","xDir","yDir","fixDensPerBB"]

# 5. Zwei DataFrames erstellen
all_features_df   = df[all_features + [target]].copy()
paper_df          = df[paper_features + [target]].copy()

print("All features df:", all_features_df.shape)
print("Paper df:", paper_df.shape)

All features df: (385, 20)
Paper df: (385, 7)


Unnamed: 0,meanFix,minFix,maxFix,varFix,stdFix,meanDis,minDis,maxDis,varDis,stdDisp,freqDisPerSec,number_of_blinks,blinkMean,blinkMin,blinkMax,blinkRate,xDir,yDir,fixDensPerBB,label
0,250.882353,100,666,16554.228164,128.663235,0.050347,0.008784,0.174576,0.001026,0.032029,3.4,4,2291.5,33,8967,0.4,0.484848,0.606061,369.479986,Inspection
1,246.742857,100,900,23549.020168,153.4569,0.052125,0.013571,0.146709,0.000947,0.03078,3.5,0,0.0,0,0,0.0,0.529412,0.529412,363.711018,Inspection
2,237.257143,100,1067,36627.431933,191.382946,0.058595,0.028369,0.206161,0.002116,0.045998,3.5,0,0.0,0,0,0.0,0.441176,0.558824,266.246748,Inspection
3,293.137931,100,767,35208.551724,187.639419,0.047417,0.017613,0.114933,0.000828,0.028777,2.9,0,0.0,0,0,0.0,0.5,0.5,319.553727,Inspection
4,199.125,100,566,12614.163462,112.312793,0.045997,0.027946,0.155588,0.000939,0.030645,4.0,5,1899.8,33,9134,0.5,0.589744,0.487179,305.83209,Inspection


In [14]:
# 4 Klassifier definieren
classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=2000),
    "SVM_RBF": SVC(kernel="rbf"),
    "RandomForest": RandomForestClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(random_state=42)
}

def evaluate_models(df_in, label_col="label"):
    X = df_in.drop(columns=[label_col])
    y = df_in[label_col]

    # zufälliger Split, damit alle Teilnehmer vertreten sind
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    results = []
    for name, model in classifiers.items():
        pipe = Pipeline([("scaler", StandardScaler()), ("clf", model)])
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1w = f1_score(y_test, y_pred, average="weighted")
        results.append([name, acc, f1w])
        print(f"\n=== {name} ===")
        print(f"Accuracy: {acc:.3f} | F1-weighted: {f1w:.3f}")
        print(classification_report(y_test, y_pred))
    res_df = pd.DataFrame(results, columns=["Classifier","Accuracy","F1_weighted"])
    return res_df

In [15]:
print("=== Ergebnisse mit allen Features ===")
res_all = evaluate_models(all_features_df)

print("\n=== Ergebnisse mit Paper-Features ===")
res_paper = evaluate_models(paper_df)

# Ergebnisse zusammenführen und vergleichen
comparison = (
    res_all.rename(columns={"Accuracy":"Accuracy_all","F1_weighted":"F1_all"})
    .merge(res_paper, on="Classifier", suffixes=("", "_paper"))
)
comparison

=== Ergebnisse mit allen Features ===

=== LogisticRegression ===
Accuracy: 0.883 | F1-weighted: 0.884
              precision    recall  f1-score   support

  Inspection       0.91      0.77      0.83        26
     Reading       1.00      0.96      0.98        26
      Search       0.77      0.92      0.84        25

    accuracy                           0.88        77
   macro avg       0.89      0.88      0.88        77
weighted avg       0.89      0.88      0.88        77


=== SVM_RBF ===
Accuracy: 0.831 | F1-weighted: 0.830
              precision    recall  f1-score   support

  Inspection       0.80      0.77      0.78        26
     Reading       0.89      0.92      0.91        26
      Search       0.80      0.80      0.80        25

    accuracy                           0.83        77
   macro avg       0.83      0.83      0.83        77
weighted avg       0.83      0.83      0.83        77


=== RandomForest ===
Accuracy: 0.883 | F1-weighted: 0.883
              precisio

Unnamed: 0,Classifier,Accuracy_all,F1_all,Accuracy,F1_weighted
0,LogisticRegression,0.883117,0.883973,0.896104,0.896707
1,SVM_RBF,0.831169,0.830381,0.818182,0.818893
2,RandomForest,0.883117,0.882782,0.896104,0.897582
3,ExtraTrees,0.896104,0.897083,0.857143,0.858161


In [18]:
X = df[all_features]
y = df[target]

rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

importances = pd.Series(rf.feature_importances_, index=all_features).sort_values(ascending=False)
print(importances)

xDir                0.235713
fixDensPerBB        0.188817
meanFix             0.155867
maxDis              0.064717
stdDisp             0.043383
stdFix              0.038371
varFix              0.033240
varDis              0.031295
blinkRate           0.028605
freqDisPerSec       0.024003
number_of_blinks    0.023401
meanDis             0.022295
blinkMax            0.021275
maxFix              0.021134
blinkMean           0.020890
yDir                0.020782
minDis              0.019158
blinkMin            0.007031
minFix              0.000023
dtype: float64


In [20]:
top_features = list(importances.head(8).index)
most_significant_features_df = df[top_features + [target]].copy()
most_significant_features_df.shape

most_significant_features_df.head()

Unnamed: 0,xDir,fixDensPerBB,meanFix,maxDis,stdDisp,stdFix,varFix,varDis,label
0,0.484848,369.479986,250.882353,0.174576,0.032029,128.663235,16554.228164,0.001026,Inspection
1,0.529412,363.711018,246.742857,0.146709,0.03078,153.4569,23549.020168,0.000947,Inspection
2,0.441176,266.246748,237.257143,0.206161,0.045998,191.382946,36627.431933,0.002116,Inspection
3,0.5,319.553727,293.137931,0.114933,0.028777,187.639419,35208.551724,0.000828,Inspection
4,0.589744,305.83209,199.125,0.155588,0.030645,112.312793,12614.163462,0.000939,Inspection


In [21]:
print("=== Ergebnisse mit Most-Significant-Features ===")
res_signif = evaluate_models(most_significant_features_df)

# Gesamter Vergleich
final_results = (
    res_all
    .merge(res_paper, on="Classifier", suffixes=("_all","_paper"))
    .merge(res_signif, on="Classifier", suffixes=("", "_signif"))
)
final_results

=== Ergebnisse mit Most-Significant-Features ===

=== LogisticRegression ===
Accuracy: 0.896 | F1-weighted: 0.896
              precision    recall  f1-score   support

  Inspection       0.91      0.81      0.86        26
     Reading       0.96      0.96      0.96        26
      Search       0.82      0.92      0.87        25

    accuracy                           0.90        77
   macro avg       0.90      0.90      0.90        77
weighted avg       0.90      0.90      0.90        77


=== SVM_RBF ===
Accuracy: 0.883 | F1-weighted: 0.883
              precision    recall  f1-score   support

  Inspection       0.88      0.81      0.84        26
     Reading       0.96      0.92      0.94        26
      Search       0.82      0.92      0.87        25

    accuracy                           0.88        77
   macro avg       0.89      0.88      0.88        77
weighted avg       0.89      0.88      0.88        77


=== RandomForest ===
Accuracy: 0.883 | F1-weighted: 0.884
           

Unnamed: 0,Classifier,Accuracy_all,F1_weighted_all,Accuracy_paper,F1_weighted_paper,Accuracy,F1_weighted
0,LogisticRegression,0.883117,0.883973,0.896104,0.896707,0.896104,0.895894
1,SVM_RBF,0.831169,0.830381,0.818182,0.818893,0.883117,0.88323
2,RandomForest,0.883117,0.882782,0.896104,0.897582,0.883117,0.883721
3,ExtraTrees,0.896104,0.897083,0.857143,0.858161,0.909091,0.909091


In [None]:
# === Interpretation der Ergebnisse ===
# Übersicht:
# 4 Classifier wurden auf drei Feature-Sets getestet:
# 1) all_features_df  -> enthält alle 19 Features
# 2) paper_df         -> enthält die 6 Features aus dem Paper (Bektaş et al., 2023)
# 3) most_significant_features_df -> enthält die wichtigsten Features basierend auf Feature-Importance

# Kurzinterpretation:
# Logistic Regression:
# - Sehr stabile Performance (~0.88–0.90 Accuracy) über alle Feature-Sets hinweg.
# - Kaum Unterschied zwischen allen und den Paper-Features → lineares Modell profitiert kaum von zusätzlichen Features.
# - Gute Baseline, aber nicht das stärkste Modell.

# SVM (RBF):
# - Geringere Genauigkeit bei allen Features (~0.83), noch niedriger mit Paper-Features (~0.82).
# - Deutliche Verbesserung (0.88) mit den signifikantesten Features → Feature-Selektion hilft stark.
# - Hinweis: SVM reagiert empfindlich auf irrelevante Features, daher wirkt Reduktion positiv.

# Random Forest:
# - Solide, stabile Werte (~0.88–0.90), geringe Schwankung zwischen Feature-Sets.
# - Ähnliche Tendenz wie Logistic Regression, aber minimal stärker.
# - Gute Robustheit, aber kein klarer Gewinner.

# Extra Trees:
# - Bestes Modell insgesamt.
# - Accuracy steigt von 0.86 (Paper-Features) über 0.90 (All-Features) bis 0.91 (Signifikanteste Features).
# - Entspricht den Ergebnissen aus dem Paper (Bektaş et al., 2023), wo Extra Trees ebenfalls am besten performte.
# - Profitiert von Feature-Selektion → zusätzliche Features scheinen eher Rauschen einzubringen.

# Fazit:
# - Extra Trees liefert insgesamt die höchste Genauigkeit (0.91) und F1-Werte.
# - Paper-Features sind bereits sehr aussagekräftig, fast gleich gut wie alle Features.
# - Feature-Selektion (signifikanteste Features) kann Performance leicht verbessern.
# - Ensemble-Modelle (RandomForest, ExtraTrees) übertreffen lineare Modelle.


# === Detaillierte Interpretation ExtraTrees ===
# 1) All Features:
#    - Accuracy ~0.90, F1 ~0.90
#    - Reading wird fast perfekt erkannt (F1=0.98)
#    - Inspection & Search etwas durchmischt, aber solide
#    => Modell hat hohe Gesamtleistung, leichte Unschärfe zwischen ähnlichen Aktivitäten.

# 2) Paper Features:
#    - Accuracy ~0.86, F1 ~0.86
#    - Leichter Performanceverlust, v.a. bei "Search"
#    - Zeigt aber, dass die 6 Paper-Features schon sehr informativ sind.

# 3) Most Significant Features:
#    - Accuracy ~0.91, F1 ~0.91
#    - Beste Gesamtleistung: gleichmäßig gute Erkennung aller drei Klassen
#    - Feature-Selektion entfernt unwichtige Merkmale und verbessert Generalisierung.