# Ion channel challenge: rolling statistics features, analytic functions, integral

In this notebook, we generated features on the train and test sets as a whole. This method failed to acknowledge differences between batchs, and the rolling statistics windows of different batches were overlapping.

Later, the features were generated per batch ([notebook 2](https://github.com/berenice-d/Ion-channels/blob/master/notebooks/Ion%20channel%20-%20Feature%20engineering%202.ipynb)).

Finally, we also include an improved way to compute the square root of the signal so negative values are not returning NAs ([notebook 3](https://github.com/berenice-d/Ion-channels/blob/master/notebooks/Ion%20channel%20-%20Feature%20engineering%203.ipynb)).

In [1]:
# Load packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import f1_score
import torch
from tqdm import tqdm
import catboost
from catboost import *

In [2]:
# for each window in the list, calculate the aggregation function twice (
#the second time the winow is centered, not to the left of value being computed)
def feature_engineering(df):
    for window in tqdm(windows):
        # Basic aggregation functions
        # mean 
        name = 'mean_'+str(window)+'_r'
        df[name] = df.signal.rolling(window = window).mean()
        name = 'mean_'+str(window)+'_c'
        df[name] = df.signal.rolling(window = window, center = True).mean()
        # standard dev
        name = 'std_'+str(window)+'_r'
        df[name] = df.signal.rolling(window = window).std()
        name = 'std_'+str(window)+'_c'
        df[name] = df.signal.rolling(window = window, center = True).std()
        # min
        name = 'min_'+str(window)+'_r'
        df[name] = df.signal.rolling(window = window).min()
        name = 'min_'+str(window)+'_c'
        df[name] = df.signal.rolling(window = window, center = True).min()
        # max
        name = 'max_'+str(window)+'_r'
        df[name] = df.signal.rolling(window = window).max()
        name = 'max_'+str(window)+'_c'
        df[name] = df.signal.rolling(window = window, center = True).max()
        # skew 
        name = 'skew_'+str(window)+'_r'
        df[name] = df.signal.rolling(window = window).skew()
        name = 'skew_'+str(window)+'_c'
        df[name] = df.signal.rolling(window = window, center = True).skew() 
        # kurtosis 
        name = 'kurt_'+str(window)+'_r'
        df[name] = df.signal.rolling(window = window).kurt()
        name = 'kurt_'+str(window)+'_c'
        df[name] = df.signal.rolling(window = window, center = True).kurt()     
        
        # Exponentially weighted functions (values closed to value being computed have more weight)
        # mean 
        name = 'mean_'+str(window)+'_ew'
        df[name] = df.signal.ewm(span = window).mean()
        # standard dev
        name = 'std_'+str(window)+'_ew'
        df[name] = df.signal.ewm(span = window).std()
        
        # Span max/min
        name = 'max_min_diff_'+str(window) + '_r'
        df[name] = df['max_'+str(window)+'_r'] - df['min_'+str(window)+'_r']
        name = 'min_max_ratio_'+str(window) + '_r'
        df[name] = (df['max_'+str(window)+'_r'] - df['min_'+str(window)+'_r'])/df.signal
        name = 'min_max_ratio_mean_'+str(window) + '_r'
        df[name] = (df['max_'+str(window)+'_r'] - df['min_'+str(window)+'_r'])/df.signal.rolling(window = window).mean()
        name = 'max_min_diff_'+str(window) + '_c'
        df[name] = df['max_'+str(window)+'_c'] - df['min_'+str(window)+'_c']
        name = 'min_max_ratio_'+str(window) + '_c'
        df[name] = (df['max_'+str(window)+'_c'] - df['min_'+str(window)+'_c'])/df.signal
        name = 'min_max_ratio_mean_'+str(window) + '_c'
        df[name] = (df['max_'+str(window)+'_c'] - df['min_'+str(window)+'_c'])/df.signal.rolling(window = window).mean()

        

In [3]:
def lag_features(df):
    '''Create new feature columns from 'signal' and concatenate them to the existing dataframe'''
    
    add_df = pd.concat([df.signal.shift(1), df.signal.shift(2), df.signal.shift(3), 
                        df.signal.shift(5), df.signal.shift(-1), df.signal.shift(-2), #lag features
                        df.signal**2, np.sqrt(df.signal), df.signal**3, pd.DataFrame(np.gradient(df.signal)), 
                        pd.DataFrame(np.gradient(df.signal)).shift(1), 
                        pd.DataFrame(np.gradient(df.signal)).shift(3), 
                        pd.DataFrame(np.gradient(df.signal)).shift(-1),
                        pd.DataFrame(np.gradient(df.signal)).shift(-3),np.exp(df.signal)], # power features
                       axis=1)
    add_df.columns = ['lag-1', 'lag-2', 'lag-3', 'lag-5', 'lag+1', 'lag+2', 'power2', 'sqroot', 'power3', 
                      'deriv', 'deriv_lag-1', 'deriv_lag-3', 'deriv_lag+1', 'deriv_lag+3', 
                      'exp']
    df = pd.concat([df, add_df], axis=1)
    return df


The signal with kalman filter was downloaded from [michaln](https://www.kaggle.com/michaln/data-without-drift-with-kalman-filter)

In [4]:
# Load data
train = pd.read_csv('../data/external/train_kalman.csv')
test = pd.read_csv('../data/external/test_kalman.csv')

## Generating features - 3rd set

On signal with Kalman filter and broader windows

### Rolling aggregations

In [5]:
windows = [5, 15, 45, 135, 405, 1215]

In [6]:
feature_engineering(train)
train.head()

100%|██████████| 6/6 [00:32<00:00,  5.42s/it]


Unnamed: 0,time,signal,open_channels,mean_5_r,mean_5_c,std_5_r,std_5_c,min_5_r,min_5_c,max_5_r,...,kurt_1215_r,kurt_1215_c,mean_1215_ew,std_1215_ew,max_min_diff_1215_r,min_max_ratio_1215_r,min_max_ratio_mean_1215_r,max_min_diff_1215_c,min_max_ratio_1215_c,min_max_ratio_mean_1215_c
0,0.0001,-2.7607,0,,,,,,,,...,,,-2.7607,,,,,,,
1,0.0002,-2.848,0,,,,,,,,...,,,-2.804386,0.06173,,,,,,
2,0.0003,-2.4243,0,,-2.86158,,0.297445,,-3.1449,,...,,,-2.677482,0.223827,,,,,,
3,0.0004,-3.13,0,,-2.83942,,0.310673,,-3.1449,,...,,,-2.790891,0.290945,,,,,,
4,0.0005,-3.1449,0,-2.86158,-2.80924,0.297445,0.316898,-3.1449,-3.1449,-2.4243,...,,,-2.861926,0.297592,,,,,,


### Generate lag and power features (analytical functions)

In [7]:
train_copy = train.copy()

In [8]:
train = lag_features(train_copy)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [9]:
train.head()

Unnamed: 0,time,signal,open_channels,mean_5_r,mean_5_c,std_5_r,std_5_c,min_5_r,min_5_c,max_5_r,...,lag+2,power2,sqroot,power3,deriv,deriv_lag-1,deriv_lag-3,deriv_lag+1,deriv_lag+3,exp
0,0.0001,-2.7607,0,,,,,,,,...,-2.4243,7.621464,,-21.040577,-0.0873,,,0.1682,-0.3603,0.063247
1,0.0002,-2.848,0,,,,,,,,...,-3.13,8.111104,,-23.100424,0.1682,-0.0873,,-0.141,0.24005,0.05796
2,0.0003,-2.4243,0,,-2.86158,,0.297445,,-3.1449,,...,-3.1449,5.87723,,-14.24817,-0.141,0.1682,,-0.3603,0.2239,0.08854
3,0.0004,-3.13,0,,-2.83942,,0.310673,,-3.1449,,...,-2.6499,9.7969,,-30.664297,-0.3603,-0.141,-0.0873,0.24005,0.0269,0.043718
4,0.0005,-3.1449,0,-2.86158,-2.80924,0.297445,0.316898,-3.1449,-3.1449,-2.4243,...,-2.6971,9.890396,,-31.104306,0.24005,-0.3603,0.1682,0.2239,0.0143,0.043071


In [10]:
# Create a column with signal normalized per batch
train['signal_norm'] = 0
for i in tqdm(range(int(train.shape[0]/500000))):
        # normalize
    mean = train.signal[i*500000:(i+1)*500000].mean()
    train.signal_norm[i*500000:(i+1)*500000] = train.signal[i*500000:(i+1)*500000] - mean


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
100%|██████████| 10/10 [00:08<00:00,  1.11it/s]


In [11]:
train['integration'] = np.cumsum(train.signal_norm)
train['integration_shift10'] = (train['integration'] -train['integration'].shift(10))


In [12]:
train.to_csv('../data/interim/6thset_train.csv', index=True)

In [13]:
del train
del train_copy

In [14]:
X_test = test.copy()
feature_engineering(X_test)
X_test = lag_features(X_test)


100%|██████████| 6/6 [00:13<00:00,  2.17s/it]


In [15]:
X_test['signal_norm'] = 0
for i in tqdm(range(int(X_test.shape[0]/500000))):
        # normalize
    mean = X_test.signal[i*500000:(i+1)*500000].mean()
    X_test.signal_norm[i*500000:(i+1)*500000] = X_test.signal[i*500000:(i+1)*500000] - mean


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
100%|██████████| 4/4 [00:03<00:00,  1.12it/s]


In [16]:
X_test['integration'] = np.cumsum(X_test.signal_norm)
X_test['integration_shift10'] = (X_test['integration'] -X_test['integration'].shift(10))

In [17]:
X_test.head()

Unnamed: 0,time,signal,mean_5_r,mean_5_c,std_5_r,std_5_c,min_5_r,min_5_c,max_5_r,max_5_c,...,power3,deriv,deriv_lag-1,deriv_lag-3,deriv_lag+1,deriv_lag+3,exp,signal_norm,integration,integration_shift10
0,500.0001,-2.6513,,,,,,,,,...,-18.637026,-0.1953,,,-0.10125,0.12065,0.070559,-1.574542,-1.574542,
1,500.0002,-2.8466,,,,,,,,,...,-23.066374,-0.10125,-0.1953,,0.2014,-0.0627,0.058041,-1.769842,-3.344384,
2,500.0003,-2.8538,,-2.6816,,0.172569,,-2.8538,,-2.4438,...,-23.241845,0.2014,-0.10125,,0.12065,-0.06185,0.057625,-1.777042,-5.121426,
3,500.0004,-2.4438,,-2.66518,,0.179922,,-2.8538,,-2.4438,...,-14.594761,0.12065,0.2014,-0.1953,-0.0627,-0.10505,0.08683,-1.367042,-6.488469,
4,500.0005,-2.6125,-2.6816,-2.6431,0.172569,0.157464,-2.8538,-2.8538,-2.4438,-2.4438,...,-17.830721,-0.0627,0.12065,-0.10125,-0.06185,-0.0521,0.073351,-1.535742,-8.024211,


In [18]:
X_test.to_csv('../data/interim/6thset_test.csv', index=True)

In [19]:
del test 