In [146]:
from itertools import combinations
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneOut, StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import umap
import warnings

warnings.filterwarnings("ignore")

In [147]:
EXPERIMENT = "EPM"

DATASET_PATH = "../../legacy_data/12_classification_dataset/EPM_data.csv"
LABELS_PATH  = "../../legacy_data/12_classification_dataset/video_anxiety_labels_for_ML_allLabels_May_2022.xlsx"

LABELS = [
    "High_Anxiety",
    "Middle_Anxiety",
    "Low_Anxiety"
]

In [148]:
def get_trial_number(video_name: str) -> int:
    result = video_name.split('_')
    return int(result[1])

def preprocess_labels(labels: pd.DataFrame) -> pd.DataFrame:
    labels = labels.dropna()
    encodings = pd.get_dummies(labels["labels"])
    labels = pd.concat([labels, encodings], axis=1)
    labels = labels.drop(columns=["test", "labels"])
    labels = labels.rename(columns={
        "high anxiety": "High_Anxiety",
        "mid anxiety": "Middle_Anxiety",
        "low anxiety": "Low_Anxiety"
    })
    
    return labels

def merge_dataset_and_labels(df: pd.DataFrame, labels: pd.DataFrame) -> pd.DataFrame:
    df = df[df["is_week_6"] == 1]
    df["trial_number"] = df["video_name"].map(get_trial_number)
    df = df.merge(labels, how="inner", on=["set", "trial_number"])
    df.drop(columns="week", inplace=True)
    
    return df

def save_new_dataset(dataset_path: str, result_path: str, experiment="EPM"):
    labels = pd.read_excel(LABELS_PATH, engine="openpyxl")
    labels = labels[labels["test"] == experiment]
    labels = preprocess_labels(labels)
    
    df = pd.read_csv(dataset_path)
    df = merge_dataset_and_labels(df, labels)
    df.to_csv(result_path)

def plot_labels_distribution(df: pd.DataFrame, experiment_name: str):
    targets = ["High_Anxiety", "Middle_Anxiety", "Low_Anxiety"]
    counts = df[targets].sum().values
    
    plt.figure(figsize=(8, 8))
    plt.bar(targets, counts, color=["red", "yellow", "green"])
    plt.title(f"Labels distribution for the {experiment_name}", fontsize=16)
    plt.ylabel("Frequency", fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid(axis="y")
    plt.show()
    
def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
    df.drop(columns=["video_name", "set", "trial_number", "has_treatment", "is_week_6"], inplace=True)
    return df

In [149]:
def read_labels(labels_path):
    labels = pd.read_excel(labels_path, engine="openpyxl")
    labels = labels[labels["test"] == EXPERIMENT]
    labels = preprocess_labels(labels)
    return labels

def get_test_labels(labels, train_labels):
    test_labels = pd.merge(train_labels, labels, on=["set", "week", "trial_number"], how="right", indicator=True).loc[lambda x: x["_merge"] == "right_only"]
    test_labels.drop(columns=["High_Anxiety_x", "Middle_Anxiety_x", "Low_Anxiety_x", "_merge"], inplace=True)
    test_labels.rename(columns={
        "High_Anxiety_y": "High_Anxiety", 
        "Middle_Anxiety_y": "Middle_Anxiety", 
        "Low_Anxiety_y": "Low_Anxiety"
    }, inplace=True)
    return test_labels

def read_dataset(dataset_path, labels):
    df = pd.read_csv(dataset_path)
    df = merge_dataset_and_labels(df, labels)
    df = preprocess_dataset(df)
    return df

def get_labels_dataset(df):
    y = df[LABELS]
    y = [2 - np.argmax(row) for row in y.values]

    df.drop(columns=LABELS, inplace=True)
    df.head()
    return y
    
def get_dataset():
    all_labels = read_labels(LABELS_PATH)
    X = read_dataset(DATASET_PATH, all_labels)
    y = get_labels_dataset(X)
    
    return X, y

In [150]:
X, y = get_dataset()

In [151]:
print("Shape of X:", X.shape)
print("Length of y:", len(y))

Shape of X: (98, 48)
Length of y: 98


## Temporary

In [152]:
X = X[[
    "time_in_open",
    "time_in_closed",
    "latency_to_enter_open",
    "time_head_dipping",
    "frequency_of_entry_to_open",
    "frequency_of_entry_to_closed",
    "distance_traveled_closed",
    "distance_traveled_open"
]]
X

Unnamed: 0,time_in_open,time_in_closed,latency_to_enter_open,time_head_dipping,frequency_of_entry_to_open,frequency_of_entry_to_closed,distance_traveled_closed,distance_traveled_open
0,22.72,251.88,0.00,8.40,0.075033,0.831836,1368.393145,193.277810
1,22.72,251.88,0.00,8.40,0.075033,0.831836,1368.393145,193.277810
2,3.60,278.32,0.00,1.68,0.011891,0.919276,1133.627110,25.454854
3,3.60,278.32,0.00,1.68,0.011891,0.919276,1133.627110,25.454854
4,44.76,227.00,5.40,32.40,0.147820,0.749670,1324.834645,193.881997
...,...,...,...,...,...,...,...,...
93,43.36,232.04,0.00,12.84,0.143178,0.766213,966.924470,468.204088
94,21.72,250.88,81.88,16.28,0.071731,0.828534,1301.832121,87.088105
95,21.72,250.88,81.88,16.28,0.071731,0.828534,1301.832121,87.088105
96,42.76,212.44,0.00,19.56,0.141178,0.701400,1103.958135,260.640166


In [161]:
import pickle

# model = RandomForestClassifier(min_samples_leaf=16, n_estimators=50, random_state=0, n_jobs=-1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, random_state=0, test_size=0.2)
model = pickle.load(open("EPM_classification_model.sav", "rb"))
# model.fit(X_train, y_train)
list(model.predict(X_valid)), y_valid

([0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1],
 [1, 1, 0, 1, 1, 0, 2, 1, 2, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 2])

## Independent Multiclass Classification

In [68]:
def calculate_metrics(model, X, y, metrics):
    skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=0)
    
    scorings = {}
    
    for scoring in metrics:
        if scoring == "neg_root_mean_squared_error":
            sign = -1
        else:
            sign = 1
            
        skf_scores = cross_val_score(model, X, y, cv=skf, scoring=scoring)

        scorings[scoring] = np.round(sign * np.mean(skf_scores), 3)
    
    return scorings

In [69]:
def data_preprocessing(X, y, preprocessing, n_components):
    datasets = {}
    temp_X = X.copy()
    
    if len(preprocessing) == 0:
        print("Error! No preprocessing techniques were provided. Available techniques are: selectkbset, pca, umap, total.")
    
    for technique in preprocessing:
        if technique == "selectkbest":
            selector = SelectKBest(chi2, k=n_components)
            temp_X   = selector.fit_transform(temp_X.replace(-1, 999), y)
            
        scaler = StandardScaler()
        temp_X = scaler.fit_transform(temp_X)
        
        if technique == "pca":
            pca    = PCA(n_components=n_components, random_state=0)
            temp_X = pca.fit_transform(temp_X)
        elif technique == "umap":
            UMAP   = umap.UMAP(n_components=n_components, random_state=0, n_jobs=-1)
            temp_X = UMAP.fit_transform(temp_X)
        elif technique != "selectkbest":
            continue
        
        datasets[technique] = temp_X
        
    return datasets

In [70]:
def test_model(model, X: pd.DataFrame, y, n_components=8, preprocessing=["selectkbest", "pca", "umap"], 
               metrics=["accuracy", "neg_root_mean_squared_error"]) -> pd.DataFrame:
    results = {}
    all_scores = {}
    
    results = pd.DataFrame(columns=[
        "n_components", 
        "dataset_type", 
        "stratified-kfold acc",
        "stratified-kfold rmse"
    ])
    
    for n in range(1, n_components+1):
        datasets = data_preprocessing(X, y, preprocessing, n)
        
        for name, dataset in datasets.items():
            scores = calculate_metrics(model, dataset, y, metrics)
            row_values = [n, name]
            
            for metric, score in scores.items():
                row_values.append(score)
            
            results = results.append({k: v for k, v in zip(results, row_values)}, ignore_index=True)
                
        print(f"Number of components finished: {n} out of {n_components}", end='\r')
                    
    return results

In [71]:
class OrdinalClassifier:
    def __init__(self, **kwargs):
        self.clf = LogisticRegression(**kwargs)
        self.clfs = {}
        
    def fit(self, X, y):
        self.unique_class = np.sort(np.unique(y))
        
        if self.unique_class.shape[0] > 2:
            for i in range(self.unique_class.shape[0]-1):
                binary_y = (y > self.unique_class[i]).astype(np.int8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf
                
    def predict_proba(self, X):
        clfs_predict = {k: self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        
        for i, y in enumerate(self.unique_class):
            if i == 0:
                predicted.append(1 - clfs_predict[y][:, 1])
            elif y in clfs_predict:
                predicted.append(clfs_predict[y-1][:, 1] - clfs_predict[y][:, 1])
            else:
                predicted.append(clfs_predict[y-1][:, 1])
                
        return np.vstack(predicted).T
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)
    
    def score(self, X, y):
        return accuracy_score(y, self.predict(X))
    
    def get_params(self, deep=True):
        out = self.clf.get_params(deep=deep)
        return out

In [73]:
# best: 0.679, 63.9%, 4, selectkbest and pca (default)
model = OrdinalClassifier(n_jobs=-1, random_state=0)

results = test_model(model, X, y)
results.sort_values(["stratified-kfold rmse", "stratified-kfold acc", "n_components"], ascending=True)

Number of components finished: 8 out of 8

Unnamed: 0,n_components,dataset_type,stratified-kfold acc,stratified-kfold rmse
0,1,selectkbest,0.51,0.7
1,1,pca,0.51,0.7
2,1,umap,0.51,0.7
3,2,selectkbest,0.51,0.7
4,2,pca,0.51,0.7
8,3,umap,0.49,0.731
11,4,umap,0.449,0.739
6,3,selectkbest,0.459,0.761
7,3,pca,0.459,0.761
5,2,umap,0.459,0.766


In [93]:
# best: 0.687, 57.5%, 6, umap (default)
model = LogisticRegression(class_weight="balanced", n_jobs=-1, random_state=0)

results = test_model(model, X, y)
results.sort_values(["stratified-kfold rmse", "stratified-kfold acc", "n_components"], ascending=True)

Number of components finished: 8 out of 8

Unnamed: 0,n_components,dataset_type,stratified-kfold acc,stratified-kfold rmse
5,2,umap,0.398,0.989
14,5,umap,0.378,0.993
21,8,selectkbest,0.337,0.999
22,8,pca,0.337,0.999
3,2,selectkbest,0.378,0.999
4,2,pca,0.378,0.999
18,7,selectkbest,0.337,1.002
19,7,pca,0.337,1.002
12,5,selectkbest,0.306,1.005
13,5,pca,0.306,1.005


In [194]:
# best: 0.660, 55.1%, 6, umap (default)
model = RidgeClassifier(random_state=0)

results = test_model(model, X, y)
results.sort_values(["stratified-kfold rmse", "stratified-kfold acc", "n_components"], ascending=True)

Number of components finished: 10 out of 10

Unnamed: 0,n_components,dataset_type,stratified-kfold acc,stratified-kfold rmse
17,6,umap,0.551,0.66
27,10,selectkbest,0.588,0.709
28,10,pca,0.588,0.709
3,2,selectkbest,0.476,0.722
4,2,pca,0.476,0.722
15,6,selectkbest,0.565,0.722
16,6,pca,0.565,0.722
18,7,selectkbest,0.565,0.722
19,7,pca,0.565,0.722
2,1,umap,0.456,0.737


In [76]:
# best: 0.543, 70.4%, 10, pca (default)
model = RandomForestClassifier(max_depth=2, n_jobs=-1, random_state=0)

results = test_model(model, X, y)
results.sort_values(["stratified-kfold rmse", "stratified-kfold acc", "n_components"], ascending=True)

Number of components finished: 8 out of 8

Unnamed: 0,n_components,dataset_type,stratified-kfold acc,stratified-kfold rmse
23,8,umap,0.48,0.717
14,5,umap,0.49,0.725
0,1,selectkbest,0.5,0.725
1,1,pca,0.5,0.725
6,3,selectkbest,0.48,0.739
15,6,selectkbest,0.439,0.747
20,7,umap,0.449,0.757
21,8,selectkbest,0.439,0.765
9,4,selectkbest,0.459,0.765
16,6,pca,0.408,0.766


In [64]:
# best: 0.623, 68.0%, 6, selectkbest or pca (kernel="poly", degree=3)
model = SVC(kernel="poly", degree=5, random_state=0)

results = test_model(model, temp_X, y)
results.sort_values(["stratified-kfold rmse", "stratified-kfold acc", "n_components"], ascending=True)

Number of components finished: 8 out of 8

Unnamed: 0,n_components,dataset_type,stratified-kfold acc,stratified-kfold rmse
2,1,umap,0.51,0.7
5,2,umap,0.51,0.7
0,1,selectkbest,0.5,0.707
1,1,pca,0.5,0.707
15,6,selectkbest,0.49,0.713
16,6,pca,0.49,0.713
18,7,selectkbest,0.49,0.713
19,7,pca,0.49,0.713
21,8,selectkbest,0.49,0.713
22,8,pca,0.49,0.713


In [60]:
# best: 0.564, 66.3%, 10, pca (max_depth=1)
model = DecisionTreeClassifier(random_state=0)

results = test_model(model, X, y)
results.sort_values(["stratified-kfold rmse", "stratified-kfold acc", "n_components"], ascending=True)

Number of components finished: 8 out of 8

Unnamed: 0,n_components,dataset_type,stratified-kfold acc,stratified-kfold rmse
23,8,umap,0.388,0.835
10,4,pca,0.439,0.857
11,4,umap,0.408,0.861
5,2,umap,0.398,0.893
4,2,pca,0.408,0.894
12,5,selectkbest,0.408,0.894
13,5,pca,0.408,0.894
2,1,umap,0.408,0.895
7,3,pca,0.388,0.898
9,4,selectkbest,0.388,0.9


In [74]:
# best: 0.536, 70.7%, 8, selectkbest or pca (n_neighbors=1)
model = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)

results = test_model(model, X, y)
results.sort_values(["stratified-kfold rmse", "stratified-kfold acc", "n_components"], ascending=True)

Number of components finished: 8 out of 8

Unnamed: 0,n_components,dataset_type,stratified-kfold acc,stratified-kfold rmse
5,2,umap,0.429,0.863
12,5,selectkbest,0.408,0.887
13,5,pca,0.408,0.887
18,7,selectkbest,0.408,0.887
19,7,pca,0.408,0.887
22,8,pca,0.408,0.887
23,8,umap,0.408,0.894
21,8,selectkbest,0.388,0.898
17,6,umap,0.408,0.9
15,6,selectkbest,0.418,0.901


In [75]:
X

Unnamed: 0,time_in_open,time_in_closed,latency_to_enter_open,time_head_dipping,frequency_of_entry_to_open,frequency_of_entry_to_closed,distance_traveled_closed,distance_traveled_open,time_in_open0,time_in_closed0,...,distance_traveled_closed4500,distance_traveled_open4500,time_in_open6000,time_in_closed6000,latency_to_open6000,time_head_dipping6000,open_frequency6000,closed_frequency6000,distance_traveled_closed6000,distance_traveled_open6000
0,22.72,251.88,0.00,8.40,0.075033,0.831836,1368.393145,193.277810,21.04,20.20,...,302.289879,10.337348,0.00,60.00,-1.00,0.00,0.000000,1.000000,200.624513,0.000000
1,22.72,251.88,0.00,8.40,0.075033,0.831836,1368.393145,193.277810,21.04,20.20,...,302.289879,10.337348,0.00,60.00,-1.00,0.00,0.000000,1.000000,200.624513,0.000000
2,3.60,278.32,0.00,1.68,0.011891,0.919276,1133.627110,25.454854,3.60,49.24,...,248.301133,0.000000,0.00,57.92,-1.00,0.00,0.000000,0.965333,242.621622,0.000000
3,3.60,278.32,0.00,1.68,0.011891,0.919276,1133.627110,25.454854,3.60,49.24,...,248.301133,0.000000,0.00,57.92,-1.00,0.00,0.000000,0.965333,242.621622,0.000000
4,44.76,227.00,5.40,32.40,0.147820,0.749670,1324.834645,193.881997,5.04,45.00,...,189.461998,76.299754,23.12,32.68,0.00,13.84,0.385333,0.544667,206.101392,83.735127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,43.36,232.04,0.00,12.84,0.143178,0.766213,966.924470,468.204088,19.52,32.40,...,265.509781,2.786211,12.52,42.72,36.60,5.48,0.208667,0.712000,241.935333,144.866829
94,21.72,250.88,81.88,16.28,0.071731,0.828534,1301.832121,87.088105,0.00,54.96,...,277.172111,8.306890,16.20,35.56,3.20,9.00,0.270000,0.592667,181.186073,71.718605
95,21.72,250.88,81.88,16.28,0.071731,0.828534,1301.832121,87.088105,0.00,54.96,...,277.172111,8.306890,16.20,35.56,3.20,9.00,0.270000,0.592667,181.186073,71.718605
96,42.76,212.44,0.00,19.56,0.141178,0.701400,1103.958135,260.640166,8.28,42.80,...,241.214719,2.645144,29.80,25.72,27.16,9.80,0.496667,0.428667,135.367397,154.287681
