In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from scipy.stats import kurtosis, skew

In [2]:
# Notebook display settings: show up to 100 rows and never truncate columns
# when a DataFrame is rendered.
for _option_name, _option_value in (('display.max_rows', 100),
                                    ('display.max_columns', None)):
    pd.set_option(_option_name, _option_value)

In [3]:
# Load the working DataFrame from a pickle file in the current directory.
# NOTE(review): unpickling can execute arbitrary code -- only load files you
# produced yourself or otherwise trust.
df = pd.read_pickle('prelim_df.pkl')

In [4]:
# Binary label: 1 where 'target' equals 'preictal', 0 for every other value.
# The comparison is vectorised (no per-row Python lambda); the explicit int64
# cast stores the label as integers rather than booleans.
df['target_bin'] = (df['target'] == 'preictal').astype('int64')

In [5]:
def generate_simple_columns(columns, function, func_name, frame=None):
    """Add one derived column per source column by applying ``function``
    to every cell.

    The i-th entry of ``columns`` (counting from 1) produces a new column
    named ``'<func_name>_<i>'``.

    Parameters
    ----------
    columns : iterable
        Labels of the source columns, in the order used for numbering.
    function : callable
        Called once per cell of each source column.
    func_name : str
        Prefix of the generated column names.
    frame : pandas.DataFrame, optional
        Frame to read from and add columns to. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place.
    """
    # Resolve the target lazily so that an explicit frame never touches the global.
    target = df if frame is None else frame
    for index, column in enumerate(columns, start=1):
        target['{}_{}'.format(func_name, index)] = target[column].apply(function)
    return target

In [6]:
def generate_more_columns(columns, function, func_name, frame=None):
    """Add one derived column for every unordered pair of source columns.

    For each pair ``(col, col_2)`` where ``col`` appears before ``col_2`` in
    ``columns``, a column named ``'<func_name>_<col>_<col_2>'`` is created by
    evaluating ``function(a, b)`` row by row through ``np.vectorize``.

    Analysis note kept from the notebook: correlations computed this way
    contained mostly noise.

    Parameters
    ----------
    columns : iterable
        Labels of the source columns. Any iterable works (list, tuple,
        pandas Index); it is copied and never modified.
    function : callable
        Two-argument function applied to the paired cells of each row.
    func_name : str
        Prefix of the generated column names.
    frame : pandas.DataFrame, optional
        Frame to read from and add columns to. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place.
    """
    target = df if frame is None else frame
    names = list(columns)
    # Build the element-wise wrapper once instead of once per column pair.
    pairwise = np.vectorize(function)
    for position, col in enumerate(names[:-1], start=1):
        # Only pair with columns that come later, so each pair appears once.
        for col_2 in names[position:]:
            target['{}_{}_{}'.format(func_name, str(col), str(col_2))] = \
                pairwise(target[col], target[col_2])
    return target

In [7]:
def generate_percentiles(columns, percentile, frame=None):
    """Add, for each source column, the given percentile of every cell.

    Each cell is passed to ``np.percentile``, so cells must be array-like.
    The i-th entry of ``columns`` (counting from 1) produces a column named
    ``'c<i>_p<percentile>'``.

    Parameters
    ----------
    columns : iterable
        Labels of the source columns, in the order used for numbering.
    percentile : float
        Percentile forwarded to ``np.percentile``.
    frame : pandas.DataFrame, optional
        Frame to read from and add columns to. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place.
    """
    target = df if frame is None else frame

    def cell_percentile(values):
        return np.percentile(values, percentile)

    for index, column in enumerate(columns, start=1):
        target['c{}_p{}'.format(index, percentile)] = target[column].apply(cell_percentile)
    return target

In [8]:
def generate_mean_pos(columns, frame=None):
    """Add, for each source column, the mean of the strictly positive values
    in every cell.

    The i-th entry of ``columns`` (counting from 1) produces a column named
    ``'c<i>_mean+'``. A cell with no positive values yields NaN.

    Parameters
    ----------
    columns : iterable
        Labels of the source columns, in the order used for numbering.
    frame : pandas.DataFrame, optional
        Frame to read from and add columns to. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place.
    """
    target = df if frame is None else frame

    def positive_mean(values):
        # asarray lets plain lists/tuples use boolean-mask indexing too.
        values = np.asarray(values)
        positives = values[values > 0]
        # np.mean on an empty selection returns NaN but emits a RuntimeWarning;
        # return NaN directly so a cell without positive values stays silent.
        return np.mean(positives) if positives.size else np.nan

    for index, column in enumerate(columns, start=1):
        target['c{}_{}'.format(index, 'mean+')] = target[column].apply(positive_mean)
    return target

In [9]:
def features_mean(columns, features, frame=None):
    """Add row-wise means across groups of columns.

    ``columns`` and ``features`` are walked in parallel with ``zip``: for each
    group of column labels and its feature name, a column named
    ``'mean_<feature>'`` receives the per-row mean of that group. If the two
    arguments differ in length, the surplus entries of the longer one are
    ignored (``zip`` semantics).

    Parameters
    ----------
    columns : iterable of list
        One list of column labels per feature.
    features : iterable of str
        Feature names used to build the output column names.
    frame : pandas.DataFrame, optional
        Frame to read from and add columns to. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place.
    """
    target = df if frame is None else frame
    for column_set, feature in zip(columns, features):
        target['mean_{}'.format(feature)] = target[column_set].mean(axis=1)
    return target

In [10]:
def get_consecutive_values(array, threshold=50):
    """Return the length of the longest run of consecutive samples whose
    absolute value is strictly below ``threshold``.

    With the default threshold this is the longest stretch of values lying
    strictly between -50 and 50. A run that extends to the very end of the
    input is counted like any other run.

    Parameters
    ----------
    array : array-like
        One-dimensional sequence of numeric samples.
    threshold : float, optional
        Exclusive bound on the absolute value (default 50).

    Returns
    -------
    int
        Length of the longest qualifying run; 0 if no sample qualifies or
        the input is empty.
    """
    below = np.abs(np.asarray(array)) < threshold
    if not below.any():
        return 0
    # Pad with False on both sides so every run, including one touching the
    # first or last sample, has a detectable start and end transition.
    padded = np.concatenate(([False], below, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    # Transitions alternate start, end (exclusive); their differences are the
    # run lengths.
    return int((edges[1::2] - edges[::2]).max())

def consecutive_values_columns(columns, frame=None):
    """Add, for each source column, the result of ``get_consecutive_values``
    applied to every cell.

    The i-th entry of ``columns`` (counting from 1) produces a column named
    ``'c<i>_con'``.

    Parameters
    ----------
    columns : iterable
        Labels of the source columns, in the order used for numbering.
    frame : pandas.DataFrame, optional
        Frame to read from and add columns to. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place.
    """
    target = df if frame is None else frame
    for index, column in enumerate(columns, start=1):
        target['c{}_con'.format(index)] = target[column].apply(get_consecutive_values)
    return target

In [11]:
# Labels of the 16 per-channel source columns: 'ch_01' through 'ch_16'.
channels = ['ch_{:02d}'.format(number) for number in range(1, 17)]

**Generate features**

In [12]:
# Dropped feature: per-channel mean, i.e.
#   generate_simple_columns(channels, np.mean, 'mean')  -- essentially no signal.
# (The earlier draft of that call used the name `channel_columns`, which is
# not defined in the visible cells.)
# Per-channel standard deviation -> columns 'std_1' ... 'std_16'.
df = generate_simple_columns(channels, np.std, 'std')
# Dropped feature: per-channel kurtosis, i.e.
#   generate_simple_columns(channels, kurtosis, 'kurt')  -- no signal.
# Per-channel 1st percentile -> columns 'c1_p1' ... 'c16_p1'.
df = generate_percentiles(channels, 1)
# Per-channel mean of the positive samples -> columns 'c1_mean+' ... 'c16_mean+'.
df = generate_mean_pos(channels)

In [None]:
# Per-channel longest-run feature -> columns 'c1_con' ... 'c16_con'.
# The cross-channel means cell below reads these columns, so this cell must be
# run before it.
df = consecutive_values_columns(channels) # very slow to run

**Generate means across channels from new features**

In [None]:
# Names of the per-channel feature columns (channels numbered 1 through 16),
# grouped by feature type.
std_columns = list(map('std_{}'.format, range(1, 17)))
percentile_columns = list(map('c{}_p1'.format, range(1, 17)))
mean_pos_columns = list(map('c{}_mean+'.format, range(1, 17)))
mean_consecutive_columns = list(map('c{}_con'.format, range(1, 17)))

# One output column 'mean_<feature>' per group; the two lists are paired by
# position.
features = ['std', '1p', 'mean+', 'con']
columns = [std_columns, percentile_columns, mean_pos_columns, mean_consecutive_columns]

df = features_mean(columns, features)

In [None]:
# Display the first row to check that the new feature columns are present.
df.head(1)

In [None]:
# Save the frame, including the feature columns added above, to a pickle file
# in the current directory (an existing file of that name is overwritten).
df.to_pickle('featured_df.pkl')