# Rumination classification - averaged participants' epochs

### Vectorization with ICA

### Imports

In [None]:
%load_ext lab_black
import os
import pickle
from time import time
import pywt
import mne
import scipy
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import cesium.featurize
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from sklearn.decomposition import FastICA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

import sys

sys.path.append("..")
from utils import *

### Loading data

Load the EEG data together with the data from the rumination questionnaire. By default `create_df_data` loads all information from the given file; to load only selected fields, pass a list of the desired column labels from the CSV file.

In [None]:
# Epoch window bounds (presumably seconds relative to the event - TODO confirm)
tmin = -0.1
tmax = 0.6
# Sampling rate, used below to convert milliseconds into timepoints
signal_frequency = 256
# Marker codes distinguishing error epochs from correct epochs
ERROR, CORRECT = 0, 1
# Seed handed to the randomised estimators of this notebook
random_state = 0

In [None]:
# EEG channel labels; the position of a label in this list defines the channel order.
channels_order_list = """
    Fp1 AF7 AF3 F1  F3  F5  F7  FT7
    FC5 FC3 FC1 C1  C3  C5  T7  TP7
    CP5 CP3 CP1 P1  P3  P5  P7  P9
    PO7 PO3 O1  Iz  Oz  POz Pz  CPz
    Fpz Fp2 AF8 AF4 AFz Fz  F2  F4
    F6  F8  FT8 FC6 FC4 FC2 FCz Cz
    C2  C4  C6  T8  TP8 CP6 CP4 CP2
    P2  P4  P6  P8  P10 PO8 PO4 O2
""".split()

In [None]:
# Map every channel label to its 1-based position in `channels_order_list`.
# The range is derived from the list length so that no label is dropped: the
# list holds 64 labels, a fixed `np.arange(1, 64, 1)` yields only 63 values
# and `zip` silently discards the unmatched last label ("O2").
# NOTE(review): these 1-based values are later used directly as positional
# indices on the channel axis - confirm that the epoch arrays expect that.
channels_dict = dict(
    zip(channels_order_list, np.arange(1, len(channels_order_list) + 1, 1))
)

In [None]:
df_name = "go_nogo_df_mean"
pickled_data_filename = "../../data/" + df_name + ".pkl"
info_filename = "../../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"

# Build the dataframe from the source files only when no cached pickle exists
if not os.path.isfile(pickled_data_filename):
    print("Pickled file not found. Loading data...")
    epochs_df = create_df_data(
        info_filename=info_filename,
        test_participants=False,
        info="all",
        personal=False,
    )
    epochs_df.name = df_name
    # cache the freshly loaded data so that the next run can skip this branch
    epochs_df.to_pickle("../../data/" + epochs_df.name + ".pkl")
    print("Done. Pickle file created")
else:
    print("Pickled file found. Loading pickled data...")
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")

#### Average participants' error and correct epochs

In [None]:
# Collapse the rows of each (id, marker) pair into a single averaged row.
# `sort=False` keeps the groups in their order of first appearance.
averaged_epochs_df = (
    epochs_df.groupby(
        ["id", "marker"],
        sort=False,
    )
    .apply(
        lambda group_df: pd.Series(
            {
                # mean over the group's "epoch" entries
                # (presumably element-wise over equally shaped arrays - TODO confirm)
                "epoch": np.mean(group_df["epoch"]),
                "Rumination Full Scale": np.mean(group_df["Rumination Full Scale"]),
            }
        )
    )
    # turn the (id, marker) group keys back into ordinary columns
    .reset_index()
)

## Training and predictions

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from tempfile import mkdtemp
from sklearn.model_selection import RepeatedKFold


from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
import warnings

# Installs a global "ignore" filter: no warning of any category is shown
# after this line (this includes warnings emitted by the estimators).
warnings.filterwarnings("ignore")


import numpy as np
import scipy.stats

The vectorization pipeline works as follows:
- ICA is computed first, and the CWT is then computed for each ICA component (`ica_n_components = N`).
- For each frequency band of the CWT, the features given in the `feature_dict` parameter (e.g. std or mean) are computed.
- PCA is then computed on the flattened ICA components and features (`outer_components = N`).
- The final feature vector has `outer_components` elements, reduced from `ica_n_components * len(feature_dict) * frequencies`.

#### Standard features for EEG analysis provided by Guo et al. (2012)

In [None]:
def std_signal(t, m, e):
    """Return the standard deviation of the values `m`.

    `t` and `e` are part of the expected signature but are not used.
    """
    spread = np.std(m)
    return spread


def abs_diffs_signal(t, m, e):
    """Return the sum of absolute differences between consecutive values of `m`.

    `t` and `e` are part of the expected signature but are not used.
    """
    steps = np.diff(m)
    return np.sum(np.abs(steps))


def mean_energy_signal(t, m, e):
    """Return the mean of the squared values of `m`.

    `t` and `e` are part of the expected signature but are not used.
    """
    squared = m ** 2
    return np.mean(squared)


def skew_signal(t, m, e):
    """Return the skewness of the values `m` as computed by `scipy.stats.skew`.

    `t` and `e` are part of the expected signature but are not used.
    """
    asymmetry = scipy.stats.skew(m)
    return asymmetry


def mean_signal(t, m, e):
    """Return the arithmetic mean of the values `m`.

    `t` and `e` are part of the expected signature but are not used.
    """
    average = np.mean(m)
    return average

### Classification grid search

In [None]:
# Choose which marker to analyse and derive a readable label for file names and results
dataset = ERROR
dataset_name = {CORRECT: "correct"}.get(dataset, "error")

In [None]:
# Features: the averaged epochs of the selected marker
X_train = np.array(
    averaged_epochs_df.loc[averaged_epochs_df["marker"] == dataset, "epoch"].tolist()
)
# Targets: the questionnaire score of the same rows
y_train = np.array(
    averaged_epochs_df.loc[
        averaged_epochs_df["marker"] == dataset, "Rumination Full Scale"
    ].tolist()
)

In [None]:
# Empty placeholders: they are only passed through to run_experiment
X_test, y_test = [], []

#### Split data by median into two groups: high/low rumination

In [None]:
# Class labels of the two rumination groups
LOW, HIGH = 0, 1
# Threshold separating the two groups
rumination_median = np.median(y_train)

In [None]:
# Binarise the scores in place: strictly below the median -> LOW, otherwise HIGH
y_train[:] = np.where(y_train < rumination_median, LOW, HIGH)

#### Defined data transformers - custom data transformation steps

In [None]:
def ChannelExtractionTransformer(channel_list):
    """Build a transformer that keeps only the channels named by `channel_list`.

    The input is handled as (epochs, channels, timepoints). Every entry of
    `channel_list` is used as a positional index on the channel axis, and the
    output channels follow the order of `channel_list`.
    """

    def transform(X):
        channels_first = np.transpose(X, (1, 0, 2))
        picked = np.array([channels_first[index] for index in channel_list])
        # restore the epochs-first layout
        return np.transpose(picked, (1, 0, 2))

    return FunctionTransformer(func=transform)


# Exchange the first two axes of a 3-D array: epochs x channels x timepoints
# becomes channels x epochs x timepoints, and the other way round.
def ChannelDataSwap():
    """Build a transformer that swaps axes 0 and 1 of a three-dimensional input."""

    def transform(X):
        axes_order = (1, 0, 2)
        return np.transpose(X, axes_order)

    return FunctionTransformer(func=transform)


def IcaPreprocessingTransformer():
    """Build a transformer that lays all epochs end to end for ICA.

    The 2-D items found along the first axis of the input are joined along
    their second axis and the result is transposed, so for an
    (epochs, channels, timepoints) input the rows of the output are
    timepoints and the columns are channels.
    """

    def transform(X):
        joined = np.concatenate(X, axis=1)
        return np.transpose(joined)

    return FunctionTransformer(func=transform)


def IcaPostprocessingTransformer(timepoints_count):
    """Build a transformer that cuts a 2-D ICA output back into epochs.

    The (samples, components) input is transposed and reshaped to
    (components, epochs, timepoints_count); the number of epochs is the
    number of samples divided by `timepoints_count`.
    """

    def transform(X):
        components_first = X.T
        n_components = X.shape[1]
        n_samples = components_first.shape[1]
        n_epochs = int(n_samples / timepoints_count)
        return components_first.reshape(n_components, n_epochs, timepoints_count)

    return FunctionTransformer(func=transform)


def CwtVectorizer(mwt="morl", cwt_density=2, cwt_octaves=6):
    """Build a transformer that applies `cwt` to every epoch of every channel.

    Parameters
    ----------
    mwt : forwarded to `cwt` as its second positional argument.
    cwt_density : forwarded to `cwt` as its third positional argument.
    cwt_octaves : forwarded to `cwt` as `octaves`. The default (6) matches
        the value that used to be hard-coded, so existing calls are unaffected.

    Returns
    -------
    FunctionTransformer whose transform iterates over the first axis of the
    input, then over the items of each entry, and stacks the `cwt` results
    in the same nesting order.
    """

    def transform(X):
        return np.array(
            [
                np.array(
                    [
                        cwt(epoch, mwt, cwt_density, octaves=cwt_octaves)
                        for epoch in data
                    ]
                )
                for data in X
            ]
        )

    return FunctionTransformer(func=transform)


def BinTransformer(step):
    """Build a transformer that averages every channel over consecutive bins.

    Parameters
    ----------
    step : int
        Number of timepoints per bin; must be positive.

    Returns
    -------
    FunctionTransformer whose transform replaces each channel of each epoch
    by the means of its consecutive, non-overlapping windows of `step`
    timepoints. Every complete window is kept, including the one that ends
    exactly at the last timepoint; a trailing incomplete window is dropped.

    Raises
    ------
    ValueError
        If `step` is not positive (the binning could never advance).
    """
    if step <= 0:
        raise ValueError(f"step must be a positive integer, got {step!r}")

    def bin_epoch(epoch):
        return [
            [
                np.mean(channel[start : start + step])
                # `+ 1` keeps the window that ends exactly at len(channel)
                for start in range(0, len(channel) - step + 1, step)
            ]
            for channel in epoch
        ]

    def transform(X):
        return np.array([bin_epoch(epoch) for epoch in X])

    return FunctionTransformer(func=transform)


def CwtFeatureVectorizer(feature_dict):
    """Build a transformer that featurizes every entry of its input with cesium.

    `feature_dict` maps feature names to custom functions; its keys are
    requested as the features to compute and the mapping itself is passed as
    the custom function table.
    """

    def transform(X):
        def featurize(data_cwt):
            feature_table = cesium.featurize.featurize_time_series(
                times=None,
                values=data_cwt,
                errors=None,
                features_to_use=list(feature_dict.keys()),
                custom_functions=feature_dict,
            )
            return feature_table.to_numpy()

        return np.array([featurize(data_cwt) for data_cwt in X])

    return FunctionTransformer(func=transform)


# Converts the energy of every sub-band into its share of the epoch's total energy
def RelativeEnergyTransformer():
    """Build a transformer that divides each epoch's entries by the epoch's sum."""

    def transform(X):
        def to_relative(epoch):
            total_energy = np.sum(epoch)
            return np.array([band_energy / total_energy for band_energy in epoch])

        return np.array([to_relative(epoch) for epoch in X])

    return FunctionTransformer(func=transform)


# Rearranges data from (channels x epochs x features) to (epochs x channels x features)
# and then flattens it to (epochs x channels*features)
def PostprocessingTransformer():
    """Build a transformer that produces one flat feature row per epoch."""

    def transform(X):
        epochs_first = np.stack(X, axis=1)
        n_epochs = epochs_first.shape[0]
        return epochs_first.reshape(n_epochs, -1)

    return FunctionTransformer(func=transform)

Define classification models:

In [None]:
# knn = ("knn", KNeighborsClassifier())
# knn_params = dict(
#     knn__n_neighbors=np.arange(10, 20, 2),
# )

# svc = ("svc", SVC())
# svc_params = dict(
#     svc__kernel=["rbf", "linear", "sigmoid"],
#     svc__C=[0.01],
# )
# decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
# decision_tree_params = dict(
#     decision_tree__criterion=["gini", "entropy"],
#     decision_tree__max_depth=[4, 5],
#     decision_tree__min_samples_leaf=[16],
# )

# lr = ("lr", LogisticRegression())
# lr_params = dict(lr__penalty=["l2"], lr__C=[0.01])

In [None]:
def rate_classifier(
    X_train, y_train, X_test, y_test, classifier, classifier_params, base_steps, cv=5
):
    """Grid-search a pipeline made of `base_steps` followed by `classifier`.

    Parameters
    ----------
    X_train, y_train : data the grid search is fitted on.
    X_test, y_test : accepted for interface compatibility; not used.
    classifier : (name, estimator) pair appended as the last pipeline step.
    classifier_params : parameter grid handed to GridSearchCV.
    base_steps : list of (name, transformer) pipeline steps.
    cv : accepted for interface compatibility; currently NOT used - the
        search always runs a 3-fold StratifiedKFold.
        NOTE(review): decide whether `cv` should drive `n_splits`.

    Returns
    -------
    The fitted GridSearchCV, refitted on the best balanced accuracy.
    """
    # define cross-validation method
    cv_skf = StratifiedKFold(n_splits=3)

    pipeline = Pipeline(steps=base_steps + [classifier])
    grid_search = GridSearchCV(
        pipeline,
        classifier_params,
        cv=cv_skf,
        # a list instead of a set: scikit-learn documents list/tuple/dict for
        # multi-metric scoring and newer releases reject a set
        scoring=["balanced_accuracy", "precision"],
        refit="balanced_accuracy",
        return_train_score=True,
        n_jobs=10,
        verbose=10,
    )
    grid_search.fit(X_train, y_train)

    return grid_search

Calculate the p-value of the estimator against a most-frequent-class baseline (5x2cv paired t-test)

In [None]:
from mlxtend.evaluate import paired_ttest_5x2cv
from sklearn.dummy import DummyClassifier


def calculate_p(estimator, X, y):
    """Compare `estimator` with a most-frequent-class dummy classifier.

    Runs the 5x2cv paired t-test on accuracy with a fixed seed, prints both
    resulting values and returns them as `(t, p)`.
    """
    # check if difference between algorithms is real
    comparison = dict(
        estimator1=DummyClassifier(strategy="most_frequent"),
        estimator2=estimator,
        X=X,
        y=y,
        scoring="accuracy",
        random_seed=0,
    )
    t, p = paired_ttest_5x2cv(**comparison)

    # summarize
    for line in (
        f"     The P-value is = {p:.3f}",
        f"     The t-statistics is = {t:.3f}\n",
    ):
        print(line)

    return t, p

In [None]:
def create_yes_no_array(y_true, y_pred):
    """Return a boolean array that is True where the prediction equals the true label."""
    predicted = np.array(y_pred)
    expected = np.array(y_true)
    matches = predicted == expected
    return np.array(matches)

In [None]:
from sklearn.metrics.cluster import contingency_matrix
from statsmodels.stats.contingency_tables import mcnemar


def calculate_p_mcnemar(yes_no_1, yes_no_2):
    """Run the exact McNemar test on two correct/incorrect arrays.

    The contingency matrix of the two arrays is tested, both resulting
    values are printed and returned as `(statistic, pvalue)`.
    """
    table = np.array(contingency_matrix(yes_no_1, yes_no_2))
    outcome = mcnemar(table, exact=True)
    t, p = outcome.statistic, outcome.pvalue

    for line in (
        f"     The P-value is = {p:.3f}",
        f"     The t-statistics is = {t:.3f}\n",
    ):
        print(line)

    return t, p

Plot validation curves to gain insight into the effect of each hyperparameter

In [None]:
import matplotlib.pyplot as plt


def pooled_var(stds, n=5):
    """Return the pooled standard deviation of equally sized groups.

    Parameters
    ----------
    stds : array-like of per-group standard deviations.
    n : size of each group (default 5, the value that used to be hard-coded).
        Must be greater than 1.

    Returns
    -------
    sqrt( sum((n - 1) * std_i**2) / (k * (n - 1)) ) for the k given groups.
    With equal group sizes this is the root of the mean squared deviation.
    """
    # https://en.wikipedia.org/wiki/Pooled_variance#Pooled_standard_deviation
    stds = np.asarray(stds, dtype=float)
    # the whole `k * (n - 1)` term is the divisor; without the parentheses the
    # sum was divided by k and then multiplied by (n - 1)
    return np.sqrt(np.sum((n - 1) * stds ** 2) / (len(stds) * (n - 1)))


def show_validation_curves(cv_results, grid_params):
    """Plot train and cross-validation balanced accuracy for every grid parameter.

    Parameters
    ----------
    cv_results : mapping convertible to a DataFrame that holds the
        mean/std train/test `balanced_accuracy` columns and one
        `param_<name>` column per searched parameter.
    grid_params : mapping whose keys are the searched parameter names; one
        subplot is drawn per key.

    For every parameter the scores are averaged over all other parameters
    (standard deviations are pooled) and drawn with a shaded +/- one
    deviation band. The figure is shown, nothing is returned.
    """
    df = pd.DataFrame(cv_results)
    results = [
        "mean_test_balanced_accuracy",
        "mean_train_balanced_accuracy",
        "std_test_balanced_accuracy",
        "std_train_balanced_accuracy",
    ]
    aggregations = {
        "mean_train_balanced_accuracy": "mean",
        "mean_test_balanced_accuracy": "mean",
        "std_train_balanced_accuracy": pooled_var,
        "std_test_balanced_accuracy": pooled_var,
    }
    curves = (
        ("train", "Training score", "darkorange"),
        ("test", "Cross-validation score", "navy"),
    )
    lw = 2

    # squeeze=False keeps `axes` indexable even when there is a single parameter
    fig, axes = plt.subplots(
        1,
        len(grid_params),
        figsize=(5 * len(grid_params), 7),
        sharey="row",
        squeeze=False,
    )
    axes = axes[0]
    axes[0].set_ylabel("Score", fontsize=25)

    for ax, param_name in zip(axes, grid_params):
        grouped_df = df.groupby(f"param_{param_name}")[results].agg(aggregations)
        # groupby orders the rows by parameter value, so the x values are taken
        # from the grouped index to stay aligned with the scores even when the
        # grid listed the values in a different order
        x_values = list(grouped_df.index)

        ax.set_xlabel(param_name, fontsize=10)
        ax.set_ylim(0.0, 1.1)
        ax.set_xscale("log")

        for split, label, color in curves:
            mean_scores = grouped_df[f"mean_{split}_balanced_accuracy"]
            std_scores = grouped_df[f"std_{split}_balanced_accuracy"]
            ax.plot(x_values, mean_scores, label=label, color=color, lw=lw)
            ax.fill_between(
                x_values,
                mean_scores - std_scores,
                mean_scores + std_scores,
                alpha=0.2,
                color=color,
                lw=lw,
            )

    handles, labels = axes[0].get_legend_handles_labels()
    fig.suptitle("Validation curves", fontsize=40)
    fig.legend(handles, labels, loc=8, ncol=2, fontsize=20)

    fig.subplots_adjust(bottom=0.25, top=0.85)
    plt.show()

In [None]:
def run_experiment(
    tested_classifiers,
    classifier_params,
    pipeline_name,
    X_train,
    X_test,
    y_train,
    y_test,
    dataset_name,
    base_steps,
    results_df,
    function_name="-",
):
    """Grid-search every classifier and collect one result row per classifier.

    Parameters
    ----------
    tested_classifiers : iterable of ((name, estimator), param_grid) pairs.
    classifier_params : grid shared by all classifiers; each classifier's own
        grid is merged on top of it.
    pipeline_name, dataset_name, function_name : labels stored in the row.
    X_train, y_train : data used for the grid search and the significance test.
    X_test, y_test : passed through to `rate_classifier`.
    base_steps : preprocessing pipeline steps placed before the classifier.
    results_df : DataFrame the new rows are added to.

    Returns
    -------
    A DataFrame holding the rows of `results_df` plus one row per tested
    classifier (the input frame itself is not modified).

    For each classifier the best parameters and scores are printed, the
    best estimator is compared with a dummy baseline and validation curves
    are shown.
    """
    for (classifier, params) in tested_classifiers:
        print(f"Rating {classifier} \n")
        tested_params = {**classifier_params, **params}

        grid_result = rate_classifier(
            X_train,
            y_train,
            X_test,
            y_test,
            classifier,
            tested_params,
            base_steps,
            cv=2,
        )

        # scores of the parameter combination selected by the search
        cv_results = grid_result.cv_results_
        best_estimator_index = grid_result.best_index_
        mean_cv_balanced_accuracy = cv_results["mean_test_balanced_accuracy"][
            best_estimator_index
        ]
        std_cv_balanced_accuracy = cv_results["std_test_balanced_accuracy"][
            best_estimator_index
        ]
        mean_cv_precision = cv_results["mean_test_precision"][best_estimator_index]
        std_cv_precision = cv_results["std_test_precision"][best_estimator_index]
        mean_train_balanced_accuracy = cv_results["mean_train_balanced_accuracy"][
            best_estimator_index
        ]

        print(f"     Best parameters: {grid_result.best_params_}")
        print(
            f"     mean acc: {mean_cv_balanced_accuracy}           ± {round(std_cv_balanced_accuracy,3)}"
        )
        print(f"     mean acc train: {mean_train_balanced_accuracy}")

        t_statistics, p_value = calculate_p(
            grid_result.best_estimator_, X_train, y_train
        )

        show_validation_curves(cv_results, tested_params)

        data = {
            "data_set": dataset_name,
            "pipeline_name": pipeline_name + "-" + function_name,
            "function": function_name,
            "model": classifier[0],
            "parameters": grid_result.best_params_,
            "mean_cv_balanced_accuracy": mean_cv_balanced_accuracy,
            "std_cv_balanced_accuracy": std_cv_balanced_accuracy,
            "mean_cv_precision": mean_cv_precision,
            "std_cv_precision": std_cv_precision,
            "cv_results": cv_results,
            "mean_train_balanced_accuracy": mean_train_balanced_accuracy,
            "p-value": p_value,
            "t-stats": t_statistics,
        }

        # DataFrame.append was removed in pandas 2.0; concatenating a
        # one-row frame is the supported way to add the record
        results_df = pd.concat(
            [results_df, pd.DataFrame([data])], ignore_index=True
        )
    return results_df

Define significant channels - the rest will be excluded

In [None]:
# Channel labels kept for the analysis: the "1", "z" and "2" sites of five rows
red_box = [
    row + site for row in ("F", "FC", "C", "CP", "P") for site in ("1", "z", "2")
]
# Translate the labels into the values stored in channels_dict
significant_channels = [channels_dict[label] for label in red_box]

# Experiments

In [None]:
# Empty frame that accumulates one row per tested classifier across the experiments
results_df = pd.DataFrame()

### Experiment 1
- Models: logistic regression (decision tree, KNN and SVC are defined but commented out of the run)
- vectorize with ICA-cwt-PCA

In [None]:
# Label of this experiment; run_experiment stores it (suffixed with "-" and the function name) in the results
pipeline_name = "ICA_cut_cwt"

In [None]:
# Candidate classifiers as (pipeline step name, estimator) pairs, each with its own grid
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(15, 30, 3)}

# 10
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["rbf", "linear", "sigmoid"],
    "svc__C": [10],
}

decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [6],
    "decision_tree__min_samples_leaf": [15],
}

# 0.001
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [0.001]}

In [None]:
# Grid over the shared preprocessing steps (number of ICA and PCA components)
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),
    "pca__n_components": np.arange(3, 9, 1),
}

In [None]:
# Only logistic regression is rated here; decision_tree, knn and svc are
# defined above but left out of this run.
tested_classifiers = [(lr, lr_params)]

In [None]:
# Preprocessing shared by every classifier of this experiment; the classifier
# step is appended to these steps inside rate_classifier.
base_steps = [
    # keep only the channels listed in significant_channels
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    # lay the epochs end to end: rows become timepoints, columns channels
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # n_components is supplied by the grid through ica__n_components
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        # cut the ICA output back into (components, epochs, timepoints)
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    #     ("channel_data_swap", ChannelDataSwap()),
    # wavelet transform of every epoch of every component
    ("cwt", CwtVectorizer()),
    # one flat feature row per epoch
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    # n_components is supplied by the grid through pca__n_components
    ("pca", PCA(random_state=random_state)),
]

Run experiment:

In [None]:
# Rate every selected classifier and add the outcome to the results table
results_df = run_experiment(
    tested_classifiers=tested_classifiers,
    classifier_params=classifier_params,
    pipeline_name=pipeline_name,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    dataset_name=dataset_name,
    base_steps=base_steps,
    results_df=results_df,
)

In [None]:
# Persist the results gathered so far; the file name depends on the analysed dataset
results_df.to_pickle("../../data/classification_ICA_cut_" + dataset_name + ".pkl")

### Experiment 2

- Models: logistic regression, decision tree, KNN, SVC
- vectorize with ICA-bins-PCA

In [None]:
# Candidate classifiers as (pipeline step name, estimator) pairs, each with its own grid
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(8, 28, 3)}

# 0.01
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["rbf", "linear", "sigmoid"],
    "svc__C": [0.01],
}

decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [15],
}

# 0.1
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [0.1]}

In [None]:
# Bin width in milliseconds and the same width as a whole number of timepoints
step_in_ms = 50
step_tp = int(step_in_ms * signal_frequency / 1000)

In [None]:
# Label of this experiment; run_experiment stores it (suffixed with "-" and the function name) in the results
pipeline_name = "ICA_cut_bins"

In [None]:
# Grid over the shared preprocessing steps (number of ICA and PCA components)
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),
    "pca__n_components": np.arange(3, 9, 1),
}

In [None]:
# Classifiers rated in this experiment, each paired with its own parameter grid
tested_classifiers = list(
    zip(
        (lr, decision_tree, knn, svc),
        (lr_params, decision_tree_params, knn_params, svc_params),
    )
)

In [None]:
# Preprocessing shared by every classifier of this experiment; the classifier
# step is appended to these steps inside rate_classifier.
base_steps = [
    # keep only the channels listed in significant_channels
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    # lay the epochs end to end: rows become timepoints, columns channels
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # n_components is supplied by the grid through ica__n_components
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        # cut the ICA output back into (components, epochs, timepoints)
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    # epochs first, because the binning step iterates over epochs
    ("channel_data_swap", ChannelDataSwap()),
    # average every component over windows of step_tp timepoints
    ("binning", BinTransformer(step=step_tp)),
    # back to components first, the layout the postprocessing step stacks from
    ("data_channel_swap", ChannelDataSwap()),
    # one flat feature row per epoch
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    # n_components is supplied by the grid through pca__n_components
    ("pca", PCA(random_state=random_state)),
]

Run experiment:

In [None]:
# Rate every selected classifier and add the outcome to the results table
results_df = run_experiment(
    tested_classifiers=tested_classifiers,
    classifier_params=classifier_params,
    pipeline_name=pipeline_name,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    dataset_name=dataset_name,
    base_steps=base_steps,
    results_df=results_df,
)

### Experiment 3
- Models: logistic regression, decision tree, KNN, SVC
- vectorize with ICA-PCA

In [None]:
# Label of this experiment; run_experiment stores it (suffixed with "-" and the function name) in the results
pipeline_name = "ICA_cut"

In [None]:
# Candidate classifiers as (pipeline step name, estimator) pairs, each with its own grid
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(8, 24, 3)}

# 0.01
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["rbf", "linear", "sigmoid"],
    "svc__C": [0.01],
}

decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [15],
}

# 0.1
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [0.1]}

In [None]:
# Grid over the shared preprocessing steps (number of ICA and PCA components)
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),
    "pca__n_components": np.arange(3, 9, 1),
}

In [None]:
# Classifiers rated in this experiment, each paired with its own parameter grid
tested_classifiers = list(
    zip(
        (lr, decision_tree, knn, svc),
        (lr_params, decision_tree_params, knn_params, svc_params),
    )
)

In [None]:
# Preprocessing shared by every classifier of this experiment; the classifier
# step is appended to these steps inside rate_classifier.
base_steps = [
    # keep only the channels listed in significant_channels
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    # lay the epochs end to end: rows become timepoints, columns channels
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # n_components is supplied by the grid through ica__n_components
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        # cut the ICA output back into (components, epochs, timepoints)
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    #     ("channel_data_swap", ChannelDataSwap()),
    # one flat feature row per epoch (the raw component signals, no further vectorization)
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    # n_components is supplied by the grid through pca__n_components
    ("pca", PCA(random_state=random_state)),
]

In [None]:
# Rate every selected classifier and add the outcome to the results table
results_df = run_experiment(
    tested_classifiers=tested_classifiers,
    classifier_params=classifier_params,
    pipeline_name=pipeline_name,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    dataset_name=dataset_name,
    base_steps=base_steps,
    results_df=results_df,
)

### Experiment 4

- Models: logistic regression, decision tree, KNN, SVC
- vectorize with ICA-bins-cwt-PCA

Tuned

In [None]:
# Candidate classifiers as (pipeline step name, estimator) pairs, each with its own grid
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(15, 30, 3)}

# 100
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "sigmoid", "poly"],
    "svc__C": [100],
}

decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [15],
}

# 0.0001
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [0.0001]}

In [None]:
# Label of this experiment; run_experiment stores it (suffixed with "-" and the function name) in the results
pipeline_name = "ICA_cut_bins_cwt"

In [None]:
# Bin width in milliseconds and the same width as a whole number of timepoints
step_in_ms = 50
step_tp = int(step_in_ms * signal_frequency / 1000)

In [None]:
# Grid over the shared preprocessing steps (number of ICA and PCA components)
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),
    "pca__n_components": np.arange(3, 8, 1),
}

In [None]:
# Classifiers rated in this experiment, each paired with its own parameter grid
tested_classifiers = list(
    zip(
        (lr, decision_tree, knn, svc),
        (lr_params, decision_tree_params, knn_params, svc_params),
    )
)

In [None]:
# Preprocessing shared by every classifier of this experiment; the classifier
# step is appended to these steps inside rate_classifier.
base_steps = [
    # keep only the channels listed in significant_channels
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    # lay the epochs end to end: rows become timepoints, columns channels
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # n_components is supplied by the grid through ica__n_components
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        # cut the ICA output back into (components, epochs, timepoints)
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    # epochs first, because the binning step iterates over epochs
    ("channel_data_swap", ChannelDataSwap()),
    # average every component over windows of step_tp timepoints
    ("binning", BinTransformer(step=step_tp)),
    # back to components first, the layout the cwt step iterates over
    ("data_channel_swap", ChannelDataSwap()),
    # wavelet transform of every binned epoch of every component
    ("cwt", CwtVectorizer()),
    # one flat feature row per epoch
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    # n_components is supplied by the grid through pca__n_components
    ("pca", PCA(random_state=random_state)),
]

In [None]:
# Rate every selected classifier and add the outcome to the results table
results_df = run_experiment(
    tested_classifiers=tested_classifiers,
    classifier_params=classifier_params,
    pipeline_name=pipeline_name,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    dataset_name=dataset_name,
    base_steps=base_steps,
    results_df=results_df,
)

In [None]:
# Persist the results gathered so far; the file name depends on the analysed dataset
results_df.to_pickle("../../data/classification_ICA_cut_" + dataset_name + ".pkl")

In [None]:
# Same results as a CSV file next to the pickle
results_df.to_csv("../../data/classification_ICA_cut_" + dataset_name + ".csv")

### Experiment 4b
- Models: logistic regression, decision tree, KNN, SVC
- ICA-cwt-functions

In [None]:
# Candidate classifiers as (pipeline step name, estimator) pairs, each with its own grid
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(12, 28, 2)}

# 1
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "sigmoid", "poly"],
    "svc__C": [1],
}

decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [15],
}

# 0.01
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [0.01]}

In [None]:
# Label of this experiment; run_experiment stores it (suffixed with "-" and the function name) in the results
pipeline_name = "ICA_cut_function"

In [None]:
# One single-entry {feature name: function} mapping per feature; each mapping
# drives one run of the feature-based pipeline.
guo_features = [
    {feature_name: feature_function}
    for feature_name, feature_function in (
        ("std", std_signal),
        ("abs_diffs", abs_diffs_signal),
        ("energy", mean_energy_signal),
        ("skew", skew_signal),
        ("mean", mean_signal),
    )
]

In [None]:
# Grid over the shared preprocessing steps (number of ICA and PCA components)
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),
    "pca__n_components": np.arange(3, 9, 1),
}

In [None]:
# Classifiers rated in this experiment, each paired with its own parameter grid
tested_classifiers = list(
    zip(
        (lr, decision_tree, knn, svc),
        (lr_params, decision_tree_params, knn_params, svc_params),
    )
)

In [None]:
# Run the whole experiment once per feature function: each pass builds a
# pipeline that reduces the wavelet output with that single function.
for feature_function_dict in guo_features:
    print(f"Featurize with {feature_function_dict.keys()} function")

    # define base steps
    this_base_steps = [
        # keep only the channels listed in significant_channels
        ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
        # lay the epochs end to end: rows become timepoints, columns channels
        ("ica_preprocessing", IcaPreprocessingTransformer()),
        # n_components is supplied by the grid through ica__n_components
        ("ica", FastICA(random_state=random_state)),
        (
            "ica_postprocessing",
            # cut the ICA output back into (components, epochs, timepoints)
            IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
        ),
        # wavelet transform of every epoch of every component
        ("cwt", CwtVectorizer()),
        # featurize the wavelet output with the current function
        ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
        # one flat feature row per epoch
        ("postprocessing", PostprocessingTransformer()),
        ("scaler", StandardScaler()),
        # n_components is supplied by the grid through pca__n_components
        ("pca", PCA(random_state=random_state)),
    ]

    # rate different models
    results_df = run_experiment(
        tested_classifiers,
        classifier_params,
        pipeline_name,
        X_train,
        X_test,
        y_train,
        y_test,
        dataset_name,
        this_base_steps,
        results_df,
        # the only key of the mapping names the feature in the results
        function_name=list(feature_function_dict.keys())[0],
    )

In [None]:
# Persist the results gathered so far; the file name depends on the analysed dataset
results_df.to_pickle("../../data/classification_ICA_cut_" + dataset_name + ".pkl")

## Experiment 5

In [None]:
# Label of this experiment; run_experiment stores it (suffixed with "-" and the function name) in the results
pipeline_name = "ICA_cut_function_std"

# Single feature used to reduce the wavelet output in this experiment
feature_function_dict = {"std": std_signal}

Tuned

In [None]:
# Candidate classifiers as (pipeline step name, estimator) pairs, each with its own grid
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(15, 30, 3)}

# 1
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "sigmoid", "poly"],
    "svc__C": [1],
}

# 3/20
decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [20],
}

# 0.1
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [0.1]}

In [None]:
# Grid over the shared preprocessing steps (number of ICA and PCA components)
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),
    "pca__n_components": np.arange(3, 9, 1),
}

In [None]:
# Classifiers rated in this experiment, each paired with its own parameter grid
tested_classifiers = list(
    zip(
        (lr, decision_tree, knn, svc),
        (lr_params, decision_tree_params, knn_params, svc_params),
    )
)

In [None]:
# Build the feature-based pipeline for the selected feature function and rate the classifiers
print(f"Featurize with {feature_function_dict.keys()} function")

# define base steps
this_base_steps = [
    # keep only the channels listed in significant_channels
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    # lay the epochs end to end: rows become timepoints, columns channels
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # n_components is supplied by the grid through ica__n_components
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        # cut the ICA output back into (components, epochs, timepoints)
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    # wavelet transform of every epoch of every component
    ("cwt", CwtVectorizer()),
    # featurize the wavelet output with the selected function
    ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
    # one flat feature row per epoch
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    # n_components is supplied by the grid through pca__n_components
    ("pca", PCA(random_state=random_state)),
]

# rate different models
results_df = run_experiment(
    tested_classifiers,
    classifier_params,
    pipeline_name,
    X_train,
    X_test,
    y_train,
    y_test,
    dataset_name,
    this_base_steps,
    results_df,
    # the only key of the mapping names the feature in the results
    function_name=list(feature_function_dict.keys())[0],
)

## Experiment 6

In [None]:
# Label of this experiment; run_experiment stores it (suffixed with "-" and the function name) in the results
pipeline_name = "ICA_cut_function_abs_diff"

# Single feature used to reduce the wavelet output in this experiment
feature_function_dict = {"abs_diffs": abs_diffs_signal}

Tuned

In [None]:
# Candidate classifiers as (pipeline step name, estimator) pairs, each with its own grid
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(15, 30, 3)}

# 10
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "rbf", "poly"],
    "svc__C": [10],
}

decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [20],
}

# 0.01
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [0.01]}

In [None]:
# Grid over the shared preprocessing steps (number of ICA and PCA components)
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),
    "pca__n_components": np.arange(3, 9, 1),
}

In [None]:
# Classifiers rated in this experiment, each paired with its own parameter grid
tested_classifiers = list(
    zip(
        (lr, decision_tree, knn, svc),
        (lr_params, decision_tree_params, knn_params, svc_params),
    )
)

In [None]:
# Build the feature-based pipeline for the selected feature function and rate the classifiers
print(f"Featurize with {feature_function_dict.keys()} function")

# define base steps
this_base_steps = [
    # keep only the channels listed in significant_channels
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    # lay the epochs end to end: rows become timepoints, columns channels
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # n_components is supplied by the grid through ica__n_components
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        # cut the ICA output back into (components, epochs, timepoints)
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    # wavelet transform of every epoch of every component
    ("cwt", CwtVectorizer()),
    # featurize the wavelet output with the selected function
    ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
    # one flat feature row per epoch
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    # n_components is supplied by the grid through pca__n_components
    ("pca", PCA(random_state=random_state)),
]

# rate different models
results_df = run_experiment(
    tested_classifiers,
    classifier_params,
    pipeline_name,
    X_train,
    X_test,
    y_train,
    y_test,
    dataset_name,
    this_base_steps,
    results_df,
    # the only key of the mapping names the feature in the results
    function_name=list(feature_function_dict.keys())[0],
)

### Experiment 7

Tuned

In [None]:
# Label under which this experiment's pipeline is handed to run_experiment.
pipeline_name = "ICA_cut_function_energy"

# Feature function passed to CwtFeatureVectorizer, keyed by its name.
feature_function_dict = dict(energy=mean_energy_signal)

In [None]:
# k-nearest neighbours; the grid covers n_neighbors = 15, 18, 21, 24, 27.
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(15, 30, 3)}

# Support-vector classifier: three kernels, one fixed C.
# Author's note: 1 (matches the C below; presumably the tuned value).
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["rbf", "linear", "poly"],
    "svc__C": [1],
}

# Seeded decision tree; only the split criterion is varied.
decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [25],
}

# Logistic regression with one fixed C.
# Author's note: 0.01 (matches the C below; presumably the tuned value).
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [0.01]}

In [None]:
# Grid shared by all classifiers; keys use the "<step>__<param>" form and
# address the "ica" and "pca" pipeline steps.
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),  # 3, 5, ..., 15
    "pca__n_components": np.arange(3, 9),  # 3, 4, ..., 8
}

In [None]:
# Pair every (name, estimator) step with its own hyper-parameter grid.
tested_classifiers = list(
    zip(
        (lr, decision_tree, knn, svc),
        (lr_params, decision_tree_params, knn_params, svc_params),
    )
)

In [None]:
# Log the feature function(s) used in this run. The keys are joined so the
# plain name is printed instead of the repr of a dict_keys view.
print(f"Featurize with {', '.join(feature_function_dict)} function")

# define base steps (listed in execution order)
this_base_steps = [
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # "ica" and "pca" are the step names addressed by classifier_params
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    ("cwt", CwtVectorizer()),
    ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    ("pca", PCA(random_state=random_state)),
]

# rate different models; the value returned by run_experiment replaces results_df
results_df = run_experiment(
    tested_classifiers,
    classifier_params,
    pipeline_name,
    X_train,
    X_test,
    y_train,
    y_test,
    dataset_name,
    this_base_steps,
    results_df,
    # first key of feature_function_dict, i.e. the feature function's name
    function_name=list(feature_function_dict)[0],
)

### Experiment 8

Tuned

In [None]:
# Label under which this experiment's pipeline is handed to run_experiment.
pipeline_name = "ICA_cut_function_mean"

# Feature function passed to CwtFeatureVectorizer, keyed by its name.
feature_function_dict = dict(mean=mean_signal)

In [None]:
# k-nearest neighbours; the grid covers n_neighbors = 15, 18, 21, 24, 27.
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(15, 30, 3)}

# Support-vector classifier: three kernels, one fixed C.
# Author's note: 1 (matches the C below; presumably the tuned value).
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "rbf", "poly"],
    "svc__C": [1],
}

# Seeded decision tree; only the split criterion is varied.
decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [20],
}

# Logistic regression with one fixed C.
# Author's note: 0.01 (matches the C below; presumably the tuned value).
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [0.01]}

In [None]:
# Grid shared by all classifiers; keys use the "<step>__<param>" form and
# address the "ica" and "pca" pipeline steps.
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),  # 3, 5, ..., 15
    "pca__n_components": np.arange(3, 9),  # 3, 4, ..., 8
}

In [None]:
# Pair every (name, estimator) step with its own hyper-parameter grid.
tested_classifiers = list(
    zip(
        (lr, decision_tree, knn, svc),
        (lr_params, decision_tree_params, knn_params, svc_params),
    )
)

In [None]:
# Log the feature function(s) used in this run. The keys are joined so the
# plain name is printed instead of the repr of a dict_keys view.
print(f"Featurize with {', '.join(feature_function_dict)} function")

# define base steps (listed in execution order)
this_base_steps = [
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # "ica" and "pca" are the step names addressed by classifier_params
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    ("cwt", CwtVectorizer()),
    ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    ("pca", PCA(random_state=random_state)),
]

# rate different models; the value returned by run_experiment replaces results_df
results_df = run_experiment(
    tested_classifiers,
    classifier_params,
    pipeline_name,
    X_train,
    X_test,
    y_train,
    y_test,
    dataset_name,
    this_base_steps,
    results_df,
    # first key of feature_function_dict, i.e. the feature function's name
    function_name=list(feature_function_dict)[0],
)

### Experiment 9

In [None]:
# Label under which this experiment's pipeline is handed to run_experiment.
pipeline_name = "ICA_cut_bins_function_std"

# Feature function passed to CwtFeatureVectorizer, keyed by its name.
feature_function_dict = dict(std=std_signal)

In [None]:
# k-nearest neighbours; the grid covers n_neighbors = 15, 18, 21, 24, 27.
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(15, 30, 3)}

# Author's note (translated): check with a different p-value test, because
# this one gives NaN -> 0.1
# Support-vector classifier: three kernels, one fixed C.
# Author's note: 1 (matches the C below; presumably the tuned value).
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "rbf", "poly"],
    "svc__C": [1],
}

# Seeded decision tree; only the split criterion is varied.
decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [20],
}

# Logistic regression with one fixed C.
# Author's note: 0.1 (matches the C below; presumably the tuned value).
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [0.1]}

In [None]:
# Grid shared by all classifiers; keys use the "<step>__<param>" form and
# address the "ica" and "pca" pipeline steps.
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),  # 3, 5, ..., 15
    "pca__n_components": np.arange(3, 9),  # 3, 4, ..., 8
}

In [None]:
# Pair every (name, estimator) step with its own hyper-parameter grid.
tested_classifiers = list(
    zip(
        (lr, decision_tree, knn, svc),
        (lr_params, decision_tree_params, knn_params, svc_params),
    )
)

In [None]:
# Log the feature function(s) used in this run. The keys are joined so the
# plain name is printed instead of the repr of a dict_keys view.
print(f"Featurize with {', '.join(feature_function_dict)} function")

# define base steps (listed in execution order)
this_base_steps = [
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # "ica" and "pca" are the step names addressed by classifier_params
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    # ChannelDataSwap is applied before and after binning (presumably it swaps
    # two axes and then restores them - confirm against its definition)
    ("channel_data_swap", ChannelDataSwap()),
    ("binning", BinTransformer(step=step_tp)),
    ("data_channel_swap", ChannelDataSwap()),
    ("cwt", CwtVectorizer()),
    ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    ("pca", PCA(random_state=random_state)),
]

# rate different models; the value returned by run_experiment replaces results_df
results_df = run_experiment(
    tested_classifiers,
    classifier_params,
    pipeline_name,
    X_train,
    X_test,
    y_train,
    y_test,
    dataset_name,
    this_base_steps,
    results_df,
    # first key of feature_function_dict, i.e. the feature function's name
    function_name=list(feature_function_dict)[0],
)

### Experiment 10

In [None]:
# Label under which this experiment's pipeline is handed to run_experiment.
pipeline_name = "ICA_cut_bins_function_abs_diff"

# Feature function passed to CwtFeatureVectorizer, keyed by its name.
feature_function_dict = dict(abs_diffs=abs_diffs_signal)

In [None]:
# k-nearest neighbours; the grid covers n_neighbors = 15, 18, 21, 24, 27.
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(15, 30, 3)}

# Support-vector classifier: three kernels, one fixed C.
# Author's note: 1 (matches the C below; presumably the tuned value).
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "rbf", "poly"],
    "svc__C": [1],
}

# Seeded decision tree; only the split criterion is varied.
decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [20],
}

# Logistic regression with one fixed C.
# Author's note: 0.01 (matches the C below; presumably the tuned value).
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [0.01]}

In [None]:
# Grid shared by all classifiers; keys use the "<step>__<param>" form and
# address the "ica" and "pca" pipeline steps.
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),  # 3, 5, ..., 15
    "pca__n_components": np.arange(3, 9),  # 3, 4, ..., 8
}

In [None]:
# Pair every (name, estimator) step with its own hyper-parameter grid.
tested_classifiers = list(
    zip(
        (lr, decision_tree, knn, svc),
        (lr_params, decision_tree_params, knn_params, svc_params),
    )
)

In [None]:
# Log the feature function(s) used in this run. The keys are joined so the
# plain name is printed instead of the repr of a dict_keys view.
print(f"Featurize with {', '.join(feature_function_dict)} function")

# define base steps (listed in execution order)
this_base_steps = [
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # "ica" and "pca" are the step names addressed by classifier_params
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    # ChannelDataSwap is applied before and after binning (presumably it swaps
    # two axes and then restores them - confirm against its definition)
    ("channel_data_swap", ChannelDataSwap()),
    ("binning", BinTransformer(step=step_tp)),
    ("data_channel_swap", ChannelDataSwap()),
    ("cwt", CwtVectorizer()),
    ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    ("pca", PCA(random_state=random_state)),
]

# rate different models; the value returned by run_experiment replaces results_df
results_df = run_experiment(
    tested_classifiers,
    classifier_params,
    pipeline_name,
    X_train,
    X_test,
    y_train,
    y_test,
    dataset_name,
    this_base_steps,
    results_df,
    # first key of feature_function_dict, i.e. the feature function's name
    function_name=list(feature_function_dict)[0],
)

### Experiment 11

In [None]:
# Label under which this experiment's pipeline is handed to run_experiment.
pipeline_name = "ICA_cut_bins_function_energy"

# Feature function passed to CwtFeatureVectorizer, keyed by its name.
feature_function_dict = dict(energy=mean_energy_signal)

In [None]:
# k-nearest neighbours; the grid covers n_neighbors = 15, 18, 21, 24, 27.
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(15, 30, 3)}

# Support-vector classifier: three kernels, one fixed C.
# Author's note: 1 (matches the C below; presumably the tuned value).
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "rbf", "poly"],
    "svc__C": [1],
}

# Seeded decision tree; only the split criterion is varied.
decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [20],
}

# Logistic regression with one fixed C.
# Author's note: 1 (matches the C below; presumably the tuned value).
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [1]}

In [None]:
# Grid shared by all classifiers; keys use the "<step>__<param>" form and
# address the "ica" and "pca" pipeline steps.
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),  # 3, 5, ..., 15
    "pca__n_components": np.arange(3, 9),  # 3, 4, ..., 8
}

In [None]:
# Pair every (name, estimator) step with its own hyper-parameter grid.
tested_classifiers = list(
    zip(
        (lr, decision_tree, knn, svc),
        (lr_params, decision_tree_params, knn_params, svc_params),
    )
)

In [None]:
# Log the feature function(s) used in this run. The keys are joined so the
# plain name is printed instead of the repr of a dict_keys view.
print(f"Featurize with {', '.join(feature_function_dict)} function")

# define base steps (listed in execution order)
this_base_steps = [
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # "ica" and "pca" are the step names addressed by classifier_params
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    # ChannelDataSwap is applied before and after binning (presumably it swaps
    # two axes and then restores them - confirm against its definition)
    ("channel_data_swap", ChannelDataSwap()),
    ("binning", BinTransformer(step=step_tp)),
    ("data_channel_swap", ChannelDataSwap()),
    ("cwt", CwtVectorizer()),
    ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    ("pca", PCA(random_state=random_state)),
]

# rate different models; the value returned by run_experiment replaces results_df
results_df = run_experiment(
    tested_classifiers,
    classifier_params,
    pipeline_name,
    X_train,
    X_test,
    y_train,
    y_test,
    dataset_name,
    this_base_steps,
    results_df,
    # first key of feature_function_dict, i.e. the feature function's name
    function_name=list(feature_function_dict)[0],
)

### Experiment 12

In [None]:
# Label under which this experiment's pipeline is handed to run_experiment.
pipeline_name = "ICA_cut_bins_function_mean"

# Feature function passed to CwtFeatureVectorizer, keyed by its name.
feature_function_dict = dict(mean=mean_signal)

In [None]:
# k-nearest neighbours; the grid covers n_neighbors = 15, 18, 21, 24, 27.
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(15, 30, 3)}

# Support-vector classifier: three kernels, one fixed C.
# Author's note (translated): 10000, p=0.04 and there is no overfitting.
svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "rbf", "poly"],
    "svc__C": [10000],
}

# Seeded decision tree; only the split criterion is varied.
decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [3],
    "decision_tree__min_samples_leaf": [20],
}

# Logistic regression with one fixed C.
# Author's note: 1 (matches the C below; presumably the tuned value).
lr = ("lr", LogisticRegression())
lr_params = {"lr__C": [1]}

In [None]:
# Grid shared by all classifiers; keys use the "<step>__<param>" form and
# address the "ica" and "pca" pipeline steps.
classifier_params = {
    "ica__n_components": np.arange(3, 16, 2),  # 3, 5, ..., 15
    "pca__n_components": np.arange(3, 9),  # 3, 4, ..., 8
}

In [None]:
# Only the SVC is evaluated in this experiment. The other three pairs were
# commented out by the author and stay disabled:
#     (lr, lr_params), (decision_tree, decision_tree_params), (knn, knn_params)
tested_classifiers = [(svc, svc_params)]

In [None]:
# Log the feature function(s) used in this run. The keys are joined so the
# plain name is printed instead of the repr of a dict_keys view.
print(f"Featurize with {', '.join(feature_function_dict)} function")

# define base steps (listed in execution order)
this_base_steps = [
    ("channels_filtering", ChannelExtractionTransformer(significant_channels)),
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # "ica" and "pca" are the step names addressed by classifier_params
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    # ChannelDataSwap is applied before and after binning (presumably it swaps
    # two axes and then restores them - confirm against its definition)
    ("channel_data_swap", ChannelDataSwap()),
    ("binning", BinTransformer(step=step_tp)),
    ("data_channel_swap", ChannelDataSwap()),
    ("cwt", CwtVectorizer()),
    ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    ("pca", PCA(random_state=random_state)),
]

# rate different models; the value returned by run_experiment replaces results_df
results_df = run_experiment(
    tested_classifiers,
    classifier_params,
    pipeline_name,
    X_train,
    X_test,
    y_train,
    y_test,
    dataset_name,
    this_base_steps,
    results_df,
    # first key of feature_function_dict, i.e. the feature function's name
    function_name=list(feature_function_dict)[0],
)

In [None]:
# Show results_df (as last reassigned by run_experiment) as the cell output.
results_df

In [None]:
# Save results_df as a pickle under ../../data; dataset_name is appended to
# the file name, so each dataset gets its own file.
results_df.to_pickle(
    "../../data/classification_ICA_cut_reg_with_add_" + dataset_name + ".pkl"
)

In [None]:
# Save the same results_df as CSV next to the pickle (same base name,
# ".csv" extension).
results_df.to_csv(
    "../../data/classification_ICA_cut_reg_with_add_" + dataset_name + ".csv"
)