In [1]:
import sys
import os

import pandas as pd
import numpy as np
import natsort
import random as rn
from tqdm import tqdm_notebook as tqdm
import tensorflow as tf
import pyeeg

import matplotlib.pyplot as plt

#Keras

#Sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPRegressor
from sklearn.svm import (SVC, SVR)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.utils.class_weight import compute_class_weight

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import (StratifiedKFold, KFold)

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

from sklearn.manifold import TSNE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

seed=42
np.random.seed(seed)
rn.seed(seed)
tf.set_random_seed(seed)
dir_path = os.getcwd()

  from ._conv import register_converters as _register_converters


In [2]:
def get_train_from_csv(csv_file):
    '''
    get a numpy array y of labels. the order follows the id of 4 second sample. 
    argument: relative path to the csv_file from the source folder.
    '''
    csv_file = os.path.join(dir_path, csv_file)
    print(f"Reading {csv_file}")
    with open(csv_file, 'r') as csvfile:
        train_reader = pd.read_csv(csvfile)
        train_reader.drop(labels="Id", axis=1, inplace=True)
        
        
    return train_reader.values

def get_target_from_csv(csv_file):
    '''
    get a numpy array y of labels. the order follows the id of 4 second sample. 
    argument: relative path to the csv_file from the source folder.
    '''
    csv_file = os.path.join(dir_path,csv_file)
    with open(csv_file, 'r') as csvfile:
        label_reader = pd.read_csv(csvfile)
        #print("Labels: ", label_reader['id'])
        y = label_reader['y']
        
    y = np.array(y)
    return y

def get_features_emg(X):
    all_featues = []
    for i in tqdm(range(X.shape[0])):
        features = list()
        # https://ieeexplore.ieee.org/document/7748960
        x_i = X[i,:]
        # Root Mean Square (RMS): RMS of EMG
        features.append(mean_squared_error(x_i, np.zeros(x_i.shape)))
        
        #Integrated Absolute Value (IAV)
        features.append(np.sum(np.abs(x_i)))
        
        # Mean Absolute Value (MAV): MAV feature can be expressed as
        features.append(np.mean(x_i))
        
        # TBD:
        # Modified Mean Absolute Value type 1
        # Modified Mean Absolute Value type 2
        
        # Simple Square Integral (SSI): SSI is calculated as
        features.append(np.sum(x_i ** 2))
        
        # Variance (VAR): VAR is calculated as
        features.append(np.var(x_i))
        
        #The 3rd, 4th and 5th temporal moments
        features.append(np.mean(x_i ** 3))
        features.append(np.mean(x_i ** 4))
        features.append(np.mean(x_i ** 5))
        
        # TBD
        # v-Order 
        
        # Waveform Length
        features.append(np.sum(np.abs(np.diff(x_i))))
        
        # Average Amplitude Change
        features.append(np.mean(np.abs(np.diff(x_i))))
        
        # Difference Absolute Standard Deviation Value
        features.append(np.sqrt(np.mean(np.power(np.diff(x_i), 2))))
        
        # AX BASIC FEATUERS
        features.append(np.std(x_i))
        features.append(np.min(x_i))
        features.append(np.max(x_i))
        features.append(np.sum(np.abs(x_i) < 0.0005))
        
        
        
        all_featues.append(features)
    return np.array(all_featues)

def get_features_eeg(X):
    all_featues = []
    # NOT SURE ABOUT THIS VALUES 
    # LETS DOUBLE CHECK
    K_MAX = 6
    SAMPLE_PER_SEC = 32
    FREQ_BANDS = list(range(16))
    TAU = 16
    # embedding dimension
    DE = 32
    
    for i in tqdm(range(X.shape[0])):
        features = list()
        # http://pyeeg.sourceforge.net/
        x_i = X[i,:]
        
        # Power Spectral Intensity (PSI) and Relative Intensity Ratio (RIR)	bin_power()	Two 1-D vectors
        
        
        # Petrosian Fractal Dimension (PFD)	pdf()	A scalar
        features.append(pyeeg.pfd(x_i))
        
        # Higuchi Fractal Dimension (HFD)	hfd()	A scalar
        features.append(pyeeg.hfd(x_i, K_MAX))
        
        # Hjorth mobility and complexity	hjorth()	Two scalars
        
        # Spectral Entropy (Shannon's entropy of RIRs)	spectral_entropy()	A scalar
        #features.append(pyeeg.spectral_entropy(x_i, FREQ_BANDS, SAMPLE_PER_SEC))
        
        # SVD Entropy	svd_entropy()	A scalar
        #features.append(pyeeg.svd_entropy(x_i, TAU, DE))
        
        # Fisher Information	fisher_info()	A scalar
        features.append(pyeeg.fisher_info(x_i, TAU, DE))
          
        # Detrended Fluctuation Analysis (DFA)	dfa()	A scalar
        features.append(pyeeg.dfa(x_i))
        
        # Hurst Exponent (Hurst)	hurst()	A scalar
        #features.append(pyeeg.hurst(x_i))
        
        # AX BASIC FEATUERS
        features.append(np.mean(x_i))
        features.append(np.std(x_i))
        features.append(np.min(x_i))
        features.append(np.max(x_i))
        features.append(np.sum(x_i < 0.0005))
        
        
        
        all_featues.append(features)
    return np.array(all_featues)


def get_data_of_rat(X, y, i):
    sample_cnt = int(X.shape[0] / 3)
    if i == 0:
        return X[:sample_cnt, :], y[:sample_cnt]
    if i == 1:
        return X[sample_cnt: 2 * sample_cnt, :], y[sample_cnt: 2 * sample_cnt]
    if i == 2:
        return X[2 * sample_cnt:, :], y[2 * sample_cnt:]
    

In [3]:
# Load the data
# train
train_emg = os.path.join(dir_path,"data/train/train_emg.csv")
train_eeg_1 = os.path.join(dir_path,"data/train/train_eeg1.csv")
train_eeg_2 = os.path.join(dir_path,"data/train/train_eeg2.csv")

# trest
test_emg = os.path.join(dir_path,"data/test/test_emg.csv")
test_eeg_1 = os.path.join(dir_path,"data/test/test_eeg1.csv")
test_eeg_2 = os.path.join(dir_path,"data/test/test_eeg2.csv")

# labels
train_target = os.path.join(dir_path,'data/train/train_labels.csv')

x_train_emg = get_train_from_csv(train_emg) #List of numpy arrays
x_train_eeg_1 = get_train_from_csv(train_eeg_1) #List of numpy arrays
x_train_eeg_2 = get_train_from_csv(train_eeg_2) #List of numpy arrays
y_train = get_target_from_csv(train_target) #Numpy array of labels

x_test_emg = get_train_from_csv(test_emg) #List of numpy arrays
x_test_eeg_1 = get_train_from_csv(test_eeg_1) #List of numpy arrays
x_test_eeg_2 = get_train_from_csv(test_eeg_2) #List of numpy arrays

print(f"Class 1: {np.sum(y_train == 1)}")
print(f"Class 2: {np.sum(y_train == 2)}")
print(f"Class 3: {np.sum(y_train == 3)}")

Reading /home/francesco/Scrivania/AML-18/task5/data/train/train_emg.csv
Reading /home/francesco/Scrivania/AML-18/task5/data/train/train_eeg1.csv
Reading /home/francesco/Scrivania/AML-18/task5/data/train/train_eeg2.csv
Reading /home/francesco/Scrivania/AML-18/task5/data/test/test_emg.csv
Reading /home/francesco/Scrivania/AML-18/task5/data/test/test_eeg1.csv
Reading /home/francesco/Scrivania/AML-18/task5/data/test/test_eeg2.csv
Class 1: 34114
Class 2: 27133
Class 3: 3553


In [4]:
# scale each rat individually
for i in range(3):
    x_train_emg_i, _  = get_data_of_rat(x_train_emg, y_train, i)
    print(np.max(x_train_emg_i))


0.0033
0.0033
0.0033


In [27]:
# compute features

print("Computing features for training set ..")
x_train_emg_feat = get_features_emg(x_train_emg)
x_train_eeg_1_feat = get_features_eeg(x_train_eeg_1)
x_train_eeg_2_feat = get_features_eeg(x_train_eeg_2)

print("Computing features for test set ..")
x_test_emg_feat = get_features_emg(x_test_emg)
x_test_eeg_1_feat = get_features_eeg(x_test_eeg_1)
x_test_eeg_2_feat = get_features_eeg(x_test_eeg_2)

HBox(children=(IntProgress(value=0, max=64800), HTML(value='')))




HBox(children=(IntProgress(value=0, max=64800), HTML(value='')))




HBox(children=(IntProgress(value=0, max=64800), HTML(value='')))




HBox(children=(IntProgress(value=0, max=43200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=43200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=43200), HTML(value='')))




In [28]:
print("\nTRAIN")
print(x_train_eeg_1_feat.shape)
print(x_train_eeg_2_feat.shape)
print(x_train_emg_feat.shape)
print("\nTEST")
print(x_test_eeg_1_feat.shape)
print(x_test_eeg_2_feat.shape)
print(x_test_emg_feat.shape)

X_train = np.concatenate((x_train_emg_feat,x_train_eeg_1_feat,x_train_eeg_2_feat), axis = 1)
print("X train final shape -> ",X_train.shape)
X_test = np.concatenate((x_test_emg_feat,x_test_eeg_1_feat,x_test_eeg_2_feat), axis = 1)
print("X test final shape -> ",X_test.shape)



TRAIN
(64800, 9)
(64800, 9)
(64800, 15)

TEST
(43200, 9)
(43200, 9)
(43200, 15)
X train final shape ->  (64800, 33)
X test final shape ->  (43200, 33)


In [41]:
#Fit scaler on all data
X_total = np.concatenate((X_train, X_test))
print("X total shape -> ",X_total.shape)

scaler = StandardScaler().fit(X_total)

X total shape ->  (108000, 33)


In [46]:
# class weights
classes = np.array([1,2,3])
weights = compute_class_weight("balanced", classes, y_train)
print(weights)
cw = dict(zip(classes, weights))

[0.63317113 0.79607858 6.07936955]


In [47]:
# classifiers for CV
rf = RandomForestClassifier(n_estimators=100, random_state=seed, n_jobs=-1, verbose=False, class_weight = cw)
svc = SVC(class_weight="balanced")

In [48]:
# CROSS VALDATION
kfold = KFold(n_splits=3, shuffle=False, random_state=seed)

print("Start")
bmac_scores_rf = []
bmac_scores_svc = []

for train, valid in kfold.split(X_train):
    # get the folds
    X_train_fold = X_train[train]
    y_train_fold = y_train[train]

    X_valid_fold = X_train[valid]
    y_valid_fold = y_train[valid]
    
    print(X_train_fold.shape)
    print(X_valid_fold.shape)
    
    X_train_fold_scaled = scaler.transform(X_train_fold)
    X_valid_fold_scaled = scaler.transform(X_valid_fold)

   
    # fit classifier
    rf.fit(X_train_fold_scaled, y_train_fold)
    svc.fit(X_train_fold_scaled, y_train_fold)
    
    y_pred_rf = rf.predict(X_valid_fold_scaled)
    y_pred_svc = svc.predict(X_valid_fold_scaled)
    
    bmac_score_rf = balanced_accuracy_score(y_valid_fold, y_pred_rf)
    print(f"{len(bmac_scores_rf)}: current balanced_accuracy_score RF: {bmac_score_rf}")
    bmac_score_svc = balanced_accuracy_score(y_valid_fold, y_pred_svc)
    print(f"{len(bmac_scores_svc)}: current balanced_accuracy_score SVC: {bmac_score_svc}")

    bmac_scores.append(bmac_score_rf)
    bmac_scores_svc.append(bmac_score_svc)

print("========================================")
print(f"RFC BMAC avg score RF {np.mean(bmac_scores_rf)} +/- {np.std(bmac_scores_rf)}" )
print(f"RFC BMAC avg score SVC {np.mean(bmac_scores_svc)} +/- {np.std(bmac_scores_svc)}" )


Start
(43200, 33)
(21600, 33)
0: current balanced_accuracy_score RF: 0.7446124180917901
0: current balanced_accuracy_score SVC: 0.8873178848051705
(43200, 33)
(21600, 33)
0: current balanced_accuracy_score RF: 0.7548386146280349
1: current balanced_accuracy_score SVC: 0.9267494446132698
(43200, 33)
(21600, 33)
0: current balanced_accuracy_score RF: 0.4676051881332905
2: current balanced_accuracy_score SVC: 0.8330195884239023
RFC BMAC avg score RF nan +/- nan
RFC BMAC avg score SVC 0.8823623059474475 +/- 0.03842516368805916


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [30]:
#Scale, fit, predict
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

svc = SVC(class_weight="balanced")
svc.fit(X_train_scaled, y_train)
y_pred = svc.predict(X_test_scaled)
print(y_pred)

X total shape ->  (108000, 33)
[1 1 1 ... 1 1 1]


In [31]:
submission_name = "fv_allfeat.csv"

print(f"Class 1: {np.sum(y_pred == 1)}")
print(f"Class 2: {np.sum(y_pred == 2)}")
print(f"Class 3: {np.sum(y_pred == 3)}")

y_pred_df = pd.DataFrame(y_pred)
y_pred_df = y_pred_df.assign(Id=list(range(y_pred.shape[0])))
y_pred_df.columns = ['y', 'Id']
display(y_pred_df)


submission_folder = os.path.join(dir_path,"submissions/")
csv_file = submission_folder + submission_name

with open(csv_file, 'w') as csv:
    y_pred_df.to_csv(csv,index = False)
"""
Class 1: 23933
Class 2: 18553
Class 3: 714
"""

Class 1: 19527
Class 2: 21259
Class 3: 2414


Unnamed: 0,y,Id
0,1,0
1,1,1
2,1,2
3,1,3
4,1,4
5,1,5
6,1,6
7,1,7
8,1,8
9,1,9


'\nClass 1: 23933\nClass 2: 18553\nClass 3: 714\n'