In [1]:
# Import all the libraries needed for the final pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler,MinMaxScaler,Normalizer
import tqdm
from tqdm import tqdm
import warnings
warnings. filterwarnings("ignore") 
from scipy import signal
from biosppy.signals import ecg 
from biosppy.signals import eeg
from biosppy.signals import resp
from scipy.interpolate import interp1d 
import pickle
import joblib
import dask.dataframe as dd

In [3]:
def interpolation_fn(timestamps,biosppy_ts, biosppy_values):
    """Resample a sparse series onto the requested timestamps.

    A piecewise-linear interpolant is fitted through the points
    (biosppy_ts, biosppy_values) and evaluated at every entry of
    `timestamps`. Timestamps that fall outside the fitted range are
    linearly extrapolated instead of raising an error.

    Used to stretch rate series (heart rate, respiration rate, band power)
    so that there is one value for every time step.
    """
    linear_fit = interp1d(
        biosppy_ts,
        biosppy_values,
        kind="linear",
        fill_value="extrapolate",
    )
    resampled = linear_fit(timestamps)
    return resampled

In [20]:
def biosppy(df):
    """Derive every BioSPPy-based feature and attach it to `df`.

    Columns added (each one interpolated onto df["time"] with interpolation_fn):
      * heart_rate -- from ecg.ecg applied to df["ecg"]
      * resp_rate  -- from resp.resp applied to df["r"]
      * <channel>_alpha_low, <channel>_alpha_high, <channel>_theta
        -- from eeg.get_power_features applied to df[eeg_features],
           one column per entry of eeg_features and per band

    All three BioSPPy calls are made with sampling_rate=256.
    `df` is modified in place and is also returned.

    NOTE(review): the signal columns are passed to BioSPPy exactly as they
    appear in `df`; presumably the rows should form a single, time-ordered
    recording -- confirm against the caller.
    """
    # Heart rate from the ECG trace.
    heart=ecg.ecg(df["ecg"],sampling_rate=256,show=False)
    df["heart_rate"]=interpolation_fn(df["time"],heart["heart_rate_ts"],heart["heart_rate"])

    # Respiration rate from the `r` signal.
    breathing=resp.resp(df["r"],sampling_rate=256,show=False)
    df["resp_rate"]=interpolation_fn(df["time"],breathing["resp_rate_ts"],breathing["resp_rate"])

    # Band power for every EEG channel; column k of each band array belongs to eeg_features[k].
    band_power=eeg.get_power_features(df[eeg_features],sampling_rate=256)

    # Band-major order (all alpha_low, then alpha_high, then theta) so the new
    # columns line up with the module-level *_feature name lists.
    # The channel count comes from eeg_features itself instead of a literal 20.
    for band in ("alpha_low","alpha_high","theta"):
        for channel_idx,channel in enumerate(eeg_features):
            df[channel+"_"+band]=interpolation_fn(df["time"],band_power["ts"],band_power[band][:,channel_idx])

    return df

In [8]:
def potential(df):
    """Add the potential difference between neighbouring electrode pairs.

    For every (first, second) pair listed below a new column named
    "<first>_<second>" is written to `df`, holding
    df["eeg_<first>"] - df["eeg_<second>"].
    `df` is modified in place and is also returned.
    """
    electrode_pairs = (
        # first chain
        ("fp1", "f7"), ("f7", "t3"), ("t3", "t5"), ("t5", "o1"),
        ("p3", "o1"), ("c3", "p3"), ("f3", "c3"), ("fp1", "f3"),
        # fz / cz / pz electrodes
        ("fz", "cz"), ("cz", "pz"),
        # second chain
        ("fp2", "f4"), ("f4", "c4"), ("c4", "p4"), ("p4", "o2"),
        ("t6", "o2"), ("t4", "t6"), ("f8", "t4"), ("fp2", "f8"),
    )

    for first, second in electrode_pairs:
        df[first + "_" + second] = df["eeg_" + first] - df["eeg_" + second]

    return df

In [38]:
# The 20 raw EEG channel columns, in the order used to index the band-power arrays.
eeg_features=[
    'eeg_fp1', 'eeg_f7', 'eeg_f8', 'eeg_t4', 'eeg_t6',
    'eeg_t5', 'eeg_t3', 'eeg_fp2', 'eeg_o1', 'eeg_p3',
    'eeg_pz', 'eeg_f3', 'eeg_fz', 'eeg_f4', 'eeg_c4',
    'eeg_p4', 'eeg_poz', 'eeg_c3', 'eeg_cz', 'eeg_o2',
]

# Names of the electrode potential-difference columns.
p_d=[
    "fp1_f7","f7_t3","t3_t5","t5_o1","p3_o1","c3_p3","f3_c3","fp1_f3",
    "fz_cz","cz_pz",
    "fp2_f4","f4_c4","c4_p4","p4_o2","t6_o2","t4_t6","f8_t4","fp2_f8",
]

# Band-power column names: one per EEG channel and frequency band.
alpha_low_feature=[channel+"_alpha_low" for channel in eeg_features]
alpha_high_feature=[channel+"_alpha_high" for channel in eeg_features]
theta_feature=[channel+"_theta" for channel in eeg_features]

# Every feature column in one list; `standardisation` pairs features[i] with std[i],
# so the order here matters.
features=[
    "ecg","r","gsr","resp_rate","heart_rate",
    *eeg_features,
    *p_d,
    *alpha_low_feature,
    *alpha_high_feature,
    *theta_feature,
]
print(len(features))

103


In [10]:
# Load the standardisation objects saved during training; `standardisation`
# calls std[i].transform(...) on the column named features[i].
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects -- only load
# std.npy from a trusted source.
std=np.load("std.npy",allow_pickle=True)# load standardization pickle file that is used in training

In [29]:
def standardisation(X_test):
    """Scale each feature column of the test data with its training-time scaler.

    The column named features[k] is reshaped to a single-column 2-D array,
    passed through std[k].transform, and written back to the same column.
    `X_test` is modified in place and is also returned.
    """
    for position, column in enumerate(features):
        scaler = std[position]
        column_values = X_test[column].values.reshape(-1, 1)
        X_test[column] = scaler.transform(column_values)
    return X_test

In [12]:
def final_pipeline(X_test):
    """Run the full feature pipeline on the test data and return the result.

    Stages, in order:
      1. biosppy         -- heart rate, respiration rate and EEG band-power columns
      2. potential       -- electrode potential-difference columns
      3. standardisation -- scale every feature column with the training scalers
    """
    with_biosppy = biosppy(X_test)
    with_potentials = potential(with_biosppy)
    return standardisation(with_potentials)

## Predicting over test data

In [13]:
# Read the raw test set from test.csv in the working directory.
test_data_set=pd.read_csv("test.csv")

In [14]:
# Drop the columns that are removed before feature engineering, then replace
# any missing value with 0.
columns_to_drop=['id','experiment','seat']
test_data_set=test_data_set.drop(columns=columns_to_drop).fillna(0)

In [15]:
# Preview the first rows of the cleaned test data.
test_data_set.head()

Unnamed: 0,crew,time,eeg_fp1,eeg_f7,eeg_f8,eeg_t4,eeg_t6,eeg_t5,eeg_t3,eeg_fp2,...,eeg_f4,eeg_c4,eeg_p4,eeg_poz,eeg_c3,eeg_cz,eeg_o2,ecg,r,gsr
0,1,0.0,17.8995,6.12783,0.994807,-28.2062,-47.695499,-187.080002,-33.183498,-4.22078,...,-7.04448,-14.4051,-4.03384,-0.393799,31.8381,17.0756,-8.13735,-7323.120117,643.177002,594.778992
1,1,0.0,45.883202,94.749001,23.2908,1.392,2.06094,-5.14529,6.39594,33.420101,...,19.887501,-215.179001,2.11832,7.34094,9.66785,1169.23999,-0.024414,-3335.080078,826.724976,136.235001
2,1,0.003906,33.120098,28.356501,-7.23922,-7.69086,-25.833799,-107.236,12.8452,1.21528,...,-7.64256,-10.3636,10.9505,6.51336,36.0186,24.3566,0.956158,-7323.120117,643.177002,594.778992
3,1,0.003906,43.280102,95.887001,18.702299,-1.43289,-4.2326,-8.02118,7.42743,27.345699,...,13.8266,-214.223007,-4.91354,3.50452,3.87811,1159.400024,-0.047607,-3335.080078,826.724976,136.235001
4,1,0.007812,7.92911,3.46038,-10.8608,-26.366699,-25.894699,37.0079,-50.334202,-11.6764,...,2.04545,-20.788799,-3.61418,-7.53983,3.08397,13.6479,1.47372,-7323.120117,643.177002,594.778992


In [16]:
# Run feature engineering and standardisation on the test data.
# NOTE(review): the result is bound to the same name as the input and the
# pipeline writes into the frame it is given, so running this cell a second
# time would standardise already-standardised columns -- re-run from the
# read_csv cell instead.
test_data_set= final_pipeline(test_data_set)

In [41]:
# Save the engineered test data to disk.
# NOTE(review): the output path has no ".csv" extension.
test_data_set.to_csv("pilot_test_after_fe")

In [42]:
# Load the trained model from disk.
# NOTE(review): joblib files are pickle-based -- only load final_model1.pkl from a trusted source.
model=joblib.load('final_model1.pkl')

In [44]:
# Predict class probabilities for every test row. The whole frame is passed,
# i.e. every column still present after final_pipeline.
submission=model.predict_proba(test_data_set)

In [45]:
# Display the predicted probability array (one row per test row).
submission

array([[9.93866400e-01, 7.81390417e-05, 5.89480913e-03, 1.60651512e-04],
       [9.98167247e-01, 9.70446636e-05, 1.60037395e-03, 1.35334474e-04],
       [9.93866400e-01, 7.81390417e-05, 5.89480913e-03, 1.60651512e-04],
       ...,
       [9.82776081e-01, 4.56551344e-04, 7.91598473e-03, 8.85138305e-03],
       [9.78686650e-01, 6.36923265e-04, 1.82672777e-02, 2.40914888e-03],
       [9.82776081e-01, 4.56551344e-04, 7.91598473e-03, 8.85138305e-03]])

In [46]:
# Build the submission table: a running id followed by one probability column
# per class, taken from the columns of `submission` in order.
# The id range is derived from the number of predictions instead of a
# hard-coded row count, so the cell also works when the test set has a
# different size (e.g. a sub-sample used for a quick run).
# NOTE(review): labels A-D are assumed to follow the model's class order --
# confirm against model.classes_.
final_sub=pd.DataFrame()
final_sub["id"]=np.arange(len(submission))
for position,label in enumerate(["A","B","C","D"]):
    final_sub[label]=submission[:,position]

In [50]:
# Preview the first ten rows of the submission table.
final_sub.head(10)

Unnamed: 0,id,A,B,C,D
0,0,0.993866,7.8e-05,0.005895,0.000161
1,1,0.998167,9.7e-05,0.0016,0.000135
2,2,0.993866,7.8e-05,0.005895,0.000161
3,3,0.998167,9.7e-05,0.0016,0.000135
4,4,0.993866,7.8e-05,0.005895,0.000161
5,5,0.998167,9.7e-05,0.0016,0.000135
6,6,0.993866,7.8e-05,0.005895,0.000161
7,7,0.998167,9.7e-05,0.0016,0.000135
8,8,0.993866,7.8e-05,0.005895,0.000161
9,9,0.998167,9.7e-05,0.0016,0.000135


The final predictions ("submission.csv") are submitted using the Kaggle API. For reference, see this guide:
https://medium.com/@nokknocknok/make-your-kaggle-submissions-with-kaggle-official-api-f49093c04f8a

In [25]:
# Show the image hosted at the URL below; both display classes are imported
# from the public IPython.display module.
from IPython.display import HTML, Image
Image(url="https://imgur.com/pBZP6lz.jpg")