In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import scipy
from scipy.stats import pearsonr
from scipy import signal as sig
from scipy.io import loadmat
from scipy.signal import ellip, lfilter, filtfilt, find_peaks, butter, sosfiltfilt, sosfilt
from sklearn.model_selection import train_test_split
from scipy.interpolate import interp1d
from scipy.interpolate import CubicSpline

leaderboard_data = loadmat('/Users/carlosaguila/Downloads/drive-download-20220406T170639Z-001/leaderboard_data.mat')
raw_training_data = loadmat('/Users/carlosaguila/Downloads/drive-download-20220406T170639Z-001/raw_training_data.mat')

# glove data for training - per subject
train_dg_s1 = raw_training_data['train_dg'][0][0]
train_dg_s2 = raw_training_data['train_dg'][1][0]
train_dg_s3 = raw_training_data['train_dg'][2][0]

# ecog data for training - per subject
train_ecog_s1 = raw_training_data['train_ecog'][0][0]
train_ecog_s2 = raw_training_data['train_ecog'][1][0]
train_ecog_s3 = raw_training_data['train_ecog'][2][0]

# leaderboard ecog signal per patient
leaderboard_data_s1 = leaderboard_data['leaderboard_ecog'][0][0]
leaderboard_data_s2 = leaderboard_data['leaderboard_ecog'][1][0]
leaderboard_data_s3 = leaderboard_data['leaderboard_ecog'][2][0]

In [2]:
#delete bad channels for subject 1
train_ecog_s1 = np.delete(train_ecog_s1,[54],1)
leaderboard_data_s1 = np.delete(leaderboard_data_s1,[54],1)

In [3]:
#delete bad channels for subject 2
train_ecog_s2 = np.delete(train_ecog_s2,[20,37],1)
leaderboard_data_s2 = np.delete(leaderboard_data_s2,[20,37],1)

In [7]:
# number of windows in signal given winLen and winDisp
def NumWins(x, fs, winLen, winDisp):
    return (len(x) - winLen * fs + winDisp * fs) // (winDisp * fs)


def filter_data(raw_eeg, fs=1000):
    """
    Write a filter function to clean underlying data.
    Filter type and parameters are up to you. Points will be awarded for reasonable filter type, parameters and application.
    Please note there are many acceptable answers, but make sure you aren't throwing out crucial data or adversly
    distorting the underlying data!

    Input:
      raw_eeg (samples x channels): the raw signal
      fs: the sampling rate (1000 for this dataset)
    Output:
      clean_data (samples x channels): the filtered signal
    """
    raw_eeg_t = raw_eeg.transpose()
    filtered = []

    nyq = fs / 2

    # (b, a) = ellip(4, 0.1, 40, 20/nyq, btype='lowpass')
    sos = butter(8, [0.15, 200], btype='bandpass', output='sos', fs=fs)

    for ch_data in raw_eeg_t:
        # filtered_ch = filtfilt(b, a, ch_data)
        filtered_ch = sosfiltfilt(sos, ch_data)
        filtered.append(filtered_ch)

    filtered = np.array(filtered)

    return filtered.transpose()


# line length
def LL(x):
    return np.sum(np.absolute(np.ediff1d(x)))


# energy
def E(x):
    return np.sum(x ** 2)

#RMS
def RMS(x):
    return np.sqrt(np.mean(x**2))

# area
def A(x):
    return np.sum(np.absolute(x))


# spectral amp
def spectral_amplitude(x):
    x_fft = np.fft.fft(x)
    return np.mean(x_fft)

def mean_amplitude_freq(X, fs, lF, uF):
    time_step = 1/fs
    ps = np.abs(np.fft.fft(X)) ** 2
    freqs = np.fft.fftfreq(X.size, time_step)
    mask = np.logical_and(freqs >= lF, freqs <= uF )
    avgValue = ps[mask].mean()
    return avgValue

# number of crossings (zero) - not in
def ZX(x):
    x_demean = x - np.mean(x)
    num_crossings = 0
    for i in range(1, len(x)):
        fromAbove = False
        fromBelow = False
        if x_demean[i - 1] > 0 and x_demean[i] < 0:
            fromAbove = True
        if x_demean[i - 1] < 0 and x_demean[i] > 0:
            fromBelow = True

        if fromAbove or fromBelow:
            num_crossings += 1
    return num_crossings

def MEAN(x):
    return np.mean(x)

def bandpower(x, fs, fmin, fmax):
    f, Pxx = sig.periodogram(x, fs=fs)
    ind_min = np.argmax(f > fmin) - 1
    ind_max = np.argmax(f > fmax) - 1
    return np.trapz(Pxx[ind_min: ind_max], f[ind_min: ind_max])

# gets features, load features you want calculated from here
def get_features(filtered_window, fs=1000):
    """
      Write a function that calculates features for a given filtered window.
      Feel free to use features you have seen before in this class, features that
      have been used in the literature, or design your own!

      Input:
        filtered_window (window_samples x channels): the window of the filtered ecog signal
        fs: sampling rate
      Output:
        features (channels x num_features): the features calculated on each channel for the window
    """

    filtered_window_t = filtered_window.transpose()

    features = []

    for ch in filtered_window_t:
        features.append(np.array([MEAN(ch),
                                  mean_amplitude_freq(ch, fs, 5, 15),
                                  mean_amplitude_freq(ch, fs, 20, 25),
                                  mean_amplitude_freq(ch, fs, 75, 115),
                                  mean_amplitude_freq(ch, fs, 125, 160),
                                  mean_amplitude_freq(ch, fs, 160, 175)
                                  ]))

    features = np.array(features)

    return features

# Bandpower, can try specifying the following:
# Delta: fmin = 0.5, fmax = 4
# Theta: fmin = 4, fmax = 7
# Alpha: fmin = 8, fmax = 12
# Beta: fmin = 12.5, fmax = 30
# Gamma: fmin = 25, fmax = 140
# get_windowed_feats - filters raw ecog signal and finds features

# From the paper suggestion:
# fmin = 5, fmax = 15
# fmin = 20, fmax = 25
# fmin = 75, fmax = 115
# fmin = 125, fmax = 160
# fmin = 160, fmax = 175

def get_windowed_feats(raw_ecog, fs, window_length, window_overlap):
    """
      Write a function which processes data through the steps of filtering and
      feature calculation and returns features. Points will be awarded for completing
      each step appropriately (note that if one of the functions you call within this script
      returns a bad output, you won't be double penalized). Note that you will need
      to run the filter_data and get_features functions within this function.

      Inputs:
        raw_eeg (samples x channels): the raw signal
        fs: the sampling rate (1000 for this dataset)
        window_length: the window's length
        window_overlap: the window's overlap
      Output:
        all_feats (num_windows x (channels x features)): the features for each channel for each time window
          note that this is a 2D array.
    """

    cleaned_ecog = filter_data(raw_ecog)
    num_wins = NumWins(cleaned_ecog.transpose()[0], fs, window_length, window_overlap)
    all_feats_3d = []
    for winStart in np.arange(0, int(num_wins), 1):
        clip = cleaned_ecog[
               int(winStart * window_overlap * fs):int(winStart * window_overlap * fs + (window_length * fs))]
        all_feats_3d.append(get_features(clip))

    num_channels = len(all_feats_3d[0])
    num_features = len(all_feats_3d[0][0])

    all_feats = np.zeros([len(all_feats_3d), num_features * num_channels])

    for k in range(int(len(all_feats_3d))):
        q = flatten_list = [j for sub in all_feats_3d[k] for j in sub]
        all_feats[k, :] = q

    return np.array(all_feats)

def repeat_preds(preds, window_to_time_ratio=50):
    pred_all = []
    for row in preds:
        for i in range(window_to_time_ratio):
            pred_all.append(row)

    # For out problem, it is short 50 entries, so add the last row 50 more times
    for i in range(window_to_time_ratio):
        pred_all.append(row)
    
    return np.array(pred_all)

def interp_preds(preds, time_length):
    # N samples
    preds_sample_orig = np.arange(len(preds))
    
    # T time points
    preds_sample_target = np.arange(time_length)
    
    preds = preds.transpose()

    preds_interp = []
    
    for finger_preds in preds:
        f = interp1d(preds_sample_orig, finger_preds)
        new_preds = f(preds_sample_target)
        preds_interp.append(new_preds)
    
    preds_interp = np.array(preds_interp).transpose()
    
    return preds_interp

def spline_preds(preds, time_length):
    # N samples
    preds_sample_orig = np.arange(len(preds))
    
    # T time points
    preds_sample_target = np.linspace(0,len(preds),time_length)
    print(preds_sample_target)
    preds = preds.transpose()

    preds_interp = []
    
    for finger_preds in preds:
        f = CubicSpline(preds_sample_orig, finger_preds, bc_type='natural')
        new_preds = f(preds_sample_target)
        preds_interp.append(new_preds)
    
    preds_interp = np.array(preds_interp).transpose()
    
    return preds_interp

def compute_corr(preds, truth):
    subj_corr = []
    for i in range(5):
        finger_pred = preds.transpose()[i]
        finger_truth = truth.transpose()[i]
        subj_corr.append(pearsonr(finger_pred, finger_truth)[0])
    
    return subj_corr

In [8]:
all_feats_s1 = get_windowed_feats(train_ecog_s1, 1000, 0.1, 0.05)  # output of get_windowed_feats
all_feats_s2 = get_windowed_feats(train_ecog_s2, 1000, 0.1, 0.05)
all_feats_s3 = get_windowed_feats(train_ecog_s3, 1000, 0.1, 0.05)

feats_LB_s1 = get_windowed_feats(leaderboard_data_s1, 1000, 0.1, 0.05)
feats_LB_s2 = get_windowed_feats(leaderboard_data_s2, 1000, 0.1, 0.05)
feats_LB_s3 = get_windowed_feats(leaderboard_data_s3, 1000, 0.1, 0.05)

train_dg_s1_downsample = train_dg_s1[::50][:-1]
train_dg_s2_downsample = train_dg_s2[::50][:-1]
train_dg_s3_downsample = train_dg_s3[::50][:-1]

In [9]:
#split 0.2/0.8 ecog and downsampled glove data
all_feats_train_s1, all_feats_test_s1, T_train_dg_s1_down, test_dg_s1_down = train_test_split(all_feats_s1, train_dg_s1_downsample, test_size=0.2, random_state=24)
all_feats_train_s2, all_feats_test_s2, T_train_dg_s2_down, test_dg_s2_down = train_test_split(all_feats_s2, train_dg_s2_downsample, test_size=0.2, random_state=24)
all_feats_train_s3, all_feats_test_s3, T_train_dg_s3_down, test_dg_s3_down = train_test_split(all_feats_s3, train_dg_s3_downsample, test_size=0.2, random_state=24)

In [10]:
#mean voltage, and mean amplitude from bandwidths
np.save('/Users/carlosaguila/PycharmProjects/BE521/results_v4/all_feats_s1.npy', all_feats_s1)
np.save('/Users/carlosaguila/PycharmProjects/BE521/results_v4/all_feats_s2.npy', all_feats_s2)
np.save('/Users/carlosaguila/PycharmProjects/BE521/results_v4/all_feats_s3.npy', all_feats_s3)

np.save('/Users/carlosaguila/PycharmProjects/BE521/results_v4/feats_LB_s1.npy', feats_LB_s1)
np.save('/Users/carlosaguila/PycharmProjects/BE521/results_v4/feats_LB_s2.npy', feats_LB_s2)
np.save('/Users/carlosaguila/PycharmProjects/BE521/results_v4/feats_LB_s3.npy', feats_LB_s3)

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
#Subject 1 - train, test, and LB_data.
rfr_reg_s1 = RandomForestRegressor(n_estimators=1000).fit(all_feats_train_s1, T_train_dg_s1_down)
pred_test_s1 = rfr_reg_s1.predict(all_feats_test_s1) #test data

subj1_corr = []
for i in range(5):
    finger_pred = pred_test_s1.transpose()[i]
    finger_truth = test_dg_s1_down.transpose()[i]
    subj1_corr.append(pearsonr(finger_pred, finger_truth)[0])

print(subj1_corr)

model_fname_s1 = '/Users/carlosaguila/PycharmProjects/BE521/results_v4/subject1_rfr_1000.model'
pickle.dump(rfr_reg_s1, open(model_fname_s1, 'wb'))

[0.24007872490208715, 0.30260916710112473, 0.1994356223060803, 0.3328618566023698, 0.19028522095005912]


In [13]:
#Subject 2 - train, test, and LB_data.
rfr_reg_s2 = RandomForestRegressor(n_estimators=1000).fit(all_feats_train_s2, T_train_dg_s2_down)
pred_test_s2 = rfr_reg_s2.predict(all_feats_test_s2) #test data

subj2_corr = []
for i in range(5):
    finger_pred = pred_test_s2.transpose()[i]
    finger_truth = test_dg_s2_down.transpose()[i]
    subj2_corr.append(pearsonr(finger_pred, finger_truth)[0])

print(subj2_corr)

model_fname_s2 = '/Users/carlosaguila/PycharmProjects/BE521/results_v4/subject2_rfr_1000.model'
pickle.dump(rfr_reg_s2, open(model_fname_s2, 'wb'))

KeyboardInterrupt: 

In [None]:
#Subject 3 - train, test, and LB_data.
rfr_reg_s3 = RandomForestRegressor(n_estimators=1000).fit(all_feats_train_s3, T_train_dg_s3_down)
pred_test_s3 = rfr_reg_s3.predict(all_feats_test_s3) #test data

subj3_corr = []
for i in range(5):
    finger_pred = pred_test_s3.transpose()[i]
    finger_truth = test_dg_s3_down.transpose()[i]
    subj3_corr.append(pearsonr(finger_pred, finger_truth)[0])

print(subj3_corr)

model_fname_s3 = '/Users/carlosaguila/PycharmProjects/BE521/results_v4/subject3_rfr_1000.model'
pickle.dump(rfr_reg_s3, open(model_fname_s3, 'wb'))

In [None]:
#leaderboard data - subject 1
LB_pred_s1 = rfr_reg_s1.predict(feats_LB_s1)  # predicting from features from leaderboard data
np.save('/Users/carlosaguila/PycharmProjects/BE521/results_v4/LB_pred_s1.npy', LB_pred_s1)

#leaderboard data - subject 2
LB_pred_s2 = rfr_reg_s2.predict(feats_LB_s2)  # predicting from features from leaderboard data
np.save('/Users/carlosaguila/PycharmProjects/BE521/results_v4/LB_pred_s2.npy', LB_pred_s2)

#leaderboard data - subject 3
LB_pred_s3 = rfr_reg_s3.predict(feats_LB_s3)  # predicting from features from leaderboard data
np.save('/Users/carlosaguila/PycharmProjects/BE521/results_v4/LB_pred_s3.npy', LB_pred_s3)

In [None]:
#spline for leaderboard predictions
pred_s1_test_spline = spline_preds(LB_pred_s1, 147500)
pred_s2_test_spline = spline_preds(LB_pred_s2, 147500)
pred_s3_test_spline = spline_preds(LB_pred_s3, 147500)

In [None]:
from scipy.io import loadmat, savemat

predictions_array = np.zeros((3,1), dtype=object)
predictions_array[0,0] = pred_s1_test_spline
predictions_array[1,0] = pred_s2_test_spline
predictions_array[2,0] = pred_s3_test_spline

savemat('/Users/carlosaguila/PycharmProjects/BE521/results_v4/predictions.mat', {'predicted_dg':predictions_array})