In [2]:
import numpy as np
import mne
import os
import matplotlib.pyplot as plt
import scipy

In [None]:
# --- Class labels -------------------------------------------------------------
labels_folder = os.path.join("data", "mi", "labels")

# Gather the labels of every file in the folder. Each file is parsed once and
# its train labels are followed by its test labels, flattened to 1-D.
# Files are visited in sorted order so the label order is deterministic
# (os.listdir order is arbitrary).
label_chunks = []
for label_file in sorted(os.listdir(labels_folder)):
    mat = scipy.io.loadmat(os.path.join(labels_folder, label_file))
    label_chunks.append(
        np.concatenate((mat["y_train"], mat["y_test"]), axis=1).reshape(-1))
all_labels = np.concatenate(label_chunks) if label_chunks else np.array([])

# --- Signal data --------------------------------------------------------------
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only
# load files you trust, or drop the flag if the array is purely numeric.
data = np.load(os.path.join("intermediate", "mi",
               "transformed", "sp1s_aa_100hz.npy"), allow_pickle=True)
n_channels = data.shape[0]  # axis 0 is the channel axis
sampling_rate = 100

# `data` is treated as (channels, samples, epochs) -- TODO confirm against the
# script that writes the .npy file. mne.EpochsArray expects
# (n_epochs, n_channels, n_times), which requires *moving* axes: a plain
# reshape keeps the memory order and would mix channels, samples and epochs.
epoch_data = np.transpose(data, (2, 0, 1))
# Continuous recording: the epochs laid end to end for every channel.
raw_data = np.transpose(data, (0, 2, 1)).reshape(n_channels, -1)

ch_names = ['F3', 'F1', 'Fz', 'F2', 'F4', 'FC5', 'FC3', 'FC1', 'FCz', 'FC2', 'FC4', 'FC6',
            'C5', 'C3', 'C1', 'Cz', 'C2', 'C4', 'C6', 'CP5', 'CP3', 'CP1', 'CPz', 'CP2', 'CP4', 'CP6', 'O1', 'O2']
info = mne.create_info(ch_names=ch_names,
                       ch_types=['eeg'] * n_channels, sfreq=sampling_rate)
raw = mne.io.RawArray(raw_data, info)

# --- Events and epochs --------------------------------------------------------
event_id = dict(left=0, right=1)
# One event at the start of every epoch: the spacing equals the epoch length.
epoch_duration = epoch_data.shape[2] / sampling_rate
events = mne.make_fixed_length_events(raw, duration=epoch_duration)
if len(all_labels) != len(events):
    raise ValueError(
        f"Got {len(all_labels)} labels for {len(events)} epochs; "
        "the label files do not match the loaded recording.")
events[:, 2] = all_labels  # third column carries the class id of each epoch
epochs = mne.EpochsArray(epoch_data, info, events, tmin=0,
                         event_id=event_id, baseline=None)

# # epochs.plot(scalings ='auto') # plot all epochs
# right = epochs['right'].average()
# left = epochs['left'].average()
# left.plot()
# plot PSD of the C3 channel

In [110]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Fractions of the samples assigned to the training and the test partition.
train_size = 0.8
test_size = 1 - train_size

# Load the stored feature table.
features_path = os.path.join(
    "intermediate", "mi", "features", "wavelet_features.npy")
features = np.load(features_path)

# Every column except the last is an input feature; the last column is the
# target passed to the classifiers.
feature_columns = features[:, :-1]
target_column = features[:, -1]

# Seeded split so the partition is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    feature_columns,
    target_column,
    train_size=train_size,
    test_size=test_size,
    random_state=0,
)

In [95]:
# Names of the statistics computed for each coefficient array.
stat_names = ["rms", "mav", "integrated_eeg",
              "simple_square_integral", "var", "aac", "wl", "zc", "ssc"]
l = stat_names  # legacy alias, kept in case other cells still refer to `l`

# Detail levels 1..3, each with horizontal / vertical / diagonal coefficients.
# (The former `i == 0` approximation branch could never run with range(1, 4)
# and referenced `label` before it was bound, so it has been removed.)
detail_levels = range(1, 4)
coef_names = ("cH", "cV", "cD")

# Human-readable column names of the form "L<level>_<coef>_<stat>", ordered by
# level, then coefficient type, then statistic.
# presumably this matches the column order of the feature file -- TODO confirm
feature_labels = [
    f"L{level}_{coef}_{stat}"
    for level in detail_levels
    for coef in coef_names
    for stat in stat_names
]


In [108]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

# Number of top-scoring features kept for the classifiers.
N_BEST_FEATURES = 30

# Score every feature against the class labels with the chi-squared test.
# chi2 only accepts non-negative feature values.
# NOTE(review): this cell overwrites X_train / X_test; re-run the train/test
# split cell before re-running it, otherwise the selection is applied to the
# already reduced matrices.
bestfeatures = SelectKBest(score_func=chi2, k=N_BEST_FEATURES)
fit = bestfeatures.fit(X_train, Y_train)

# Table of (feature name, score), best score first, for inspection.
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(feature_labels)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']
featureScores = featureScores.sort_values(
    by='Score', ascending=False, ignore_index=True)

# Column indices of the best features, highest score first. NaN scores are
# ranked last; a bare argsort would place them at the end of the ascending
# order and therefore pick them as the "best" features.
ranking_scores = np.where(np.isnan(fit.scores_), -np.inf, fit.scores_)
idx = ranking_scores.argsort()[-N_BEST_FEATURES:][::-1]
X_train = X_train[:, idx]
X_test = X_test[:, idx]

# Last expression of the cell so the table is actually rendered.
featureScores.head(N_BEST_FEATURES)

In [111]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Baseline: linear discriminant analysis with default settings, trained on the
# training partition (fit() returns the fitted estimator itself).
clf = LinearDiscriminantAnalysis().fit(X_train, Y_train)

# Mean accuracy on the held-out partition; shown as the cell output.
clf.score(X=X_test, y=Y_test)


0.6309523809523809

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

# Candidate classifiers, each paired with a short display name.
# Stochastic models are seeded so repeated runs give the same numbers.
models = [
    ('LogReg', LogisticRegression(max_iter=20000,)),
    ('RF', RandomForestClassifier(random_state=0)),
    ('SVM', SVC()),
    ('GNB', GaussianNB()),
    ('XGB', XGBClassifier()),
    ("DecisionTree", DecisionTreeClassifier(random_state=0)),
    ("MLPClassifier", MLPClassifier(random_state=0)),
    ("GradientBoostingClassifier", GradientBoostingClassifier(loss='exponential',
                                                              n_estimators=1000, random_state=0))
]

# Metrics collected during cross-validation.
scoring = ['accuracy', 'precision_weighted',
           'recall_weighted', 'f1_weighted', 'roc_auc']

# The same seeded 10-fold splitter is shared by all models, so every model is
# evaluated on identical folds.
kfold = model_selection.KFold(n_splits=10, random_state=0, shuffle=True)

results = []  # cross_validate() output, one entry per model
names = []    # model names, aligned with `results`
for name, model in tqdm(models):
    # Cross-validated scores on the training partition.
    cv_results = model_selection.cross_validate(
        model, X_train, Y_train, cv=kfold, scoring=scoring)
    # Record inside the loop so every model is kept, not only the last one.
    results.append(cv_results)
    names.append(name)

    # Refit on the full training partition and report on the held-out data.
    clf = model.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print(name)
    print(classification_report(
        Y_test, y_pred, target_names=["Left", "Right"]))
