In [1]:
import pandas as pd
import numpy as np

### Function

In [185]:
#==========================================================================================
#making time stamp uniform by Interpolation
from scipy import interpolate
def preprocess(data, freq=50):
    """Resample a tri-axial recording onto a uniform time grid by cubic interpolation.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Timestamp``, ``X``, ``Y`` and ``Z``, with
        ``Timestamp`` increasing and at least four samples (cubic interpolation).
    freq : float, optional
        Target sampling frequency; the grid step is ``1/freq`` in the unit of
        ``Timestamp``. The default of 50 reproduces the original 0.02 step.

    Returns
    -------
    pandas.DataFrame
        Columns ``Timestamp``, ``acc_X``, ``acc_Y``, ``acc_Z`` sampled every
        ``1/freq`` from the first timestamp up to (excluding) the last one.
    """
    step = 1.0 / freq
    # Positional access: the first/last sample must be found even when the frame
    # does not carry a default 0..n-1 index (e.g. after filtering rows).
    start = data.Timestamp.iloc[0]
    stop = data.Timestamp.iloc[-1]
    t1 = np.arange(start, stop, step)
    df = pd.DataFrame({'Timestamp': t1})
    for axis in ['X', 'Y', 'Z']:
        fcubic = interpolate.interp1d(data.Timestamp, data[axis], kind='cubic')
        df[axis] = fcubic(t1)
    df.columns = ['Timestamp', 'acc_X', 'acc_Y', 'acc_Z']
    return df

#making time stamp uniform by Interpolation 
def preprocess_real_smartphone(data):
    """Resample a REAL-PD smartphone recording onto a uniform time grid.

    The grid step is estimated from the data as ``max(Timestamp) / n_samples``
    rounded to 3 decimals, and the signals are interpolated with a cubic spline.

    Parameters
    ----------
    data : pandas.DataFrame
        Recording with columns ``t``, ``x``, ``y``, ``z`` (or already renamed to
        ``Timestamp``, ``X``, ``Y``, ``Z``). The frame is not modified.

    Returns
    -------
    (pandas.DataFrame, float)
        The resampled frame with columns ``Timestamp``, ``acc_X``, ``acc_Y``,
        ``acc_Z`` and the estimated sampling frequency ``round(1/step)``.

    Raises
    ------
    ValueError
        If the estimated step rounds to zero (no usable time grid).
    """
    # Rename on a copy so the caller's frame keeps its original column names.
    data = data.rename(columns={'t':'Timestamp','x':'X','y':'Y','z':'Z'})
    n_samples = data.Timestamp.shape[0]
    # Mean sample spacing, assuming the recording starts near t=0.
    step = np.round(data.Timestamp.max() / n_samples, 3)
    if step <= 0:
        raise ValueError("estimated sampling step is zero; cannot build a time grid")
    freq = round(1 / step, 0)
    # Positional access so a non-default index does not break first/last lookup.
    t1 = np.arange(data.Timestamp.iloc[0], data.Timestamp.iloc[-1], step)
    df = pd.DataFrame({'Timestamp': t1})
    for axis in ['X', 'Y', 'Z']:
        fcubic = interpolate.interp1d(data.Timestamp, data[axis], kind='cubic')
        df[axis] = fcubic(t1)
    df.columns = ['Timestamp', 'acc_X', 'acc_Y', 'acc_Z']
    return df, freq

def preprocess_real_smartwatch(data):
    """Pick one device from a REAL-PD smartwatch recording and resample it to 50 Hz.

    Device selection: the device whose ``x`` signal has the largest variance is
    chosen; if that device contributes at most 20% of all rows, the device with
    the smallest ``x`` variance is used instead. The chosen device's signals are
    then linearly interpolated onto a uniform 0.02 step grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns ``device_id``, ``t``, ``x``, ``y``, ``z``. The frame is not modified.

    Returns
    -------
    (pandas.DataFrame, object, int)
        Resampled frame (``Timestamp``, ``acc_X``, ``acc_Y``, ``acc_Z``), the
        selected ``device_id`` and the sampling frequency (always 50).
    """
    # Per device: variance of x and number of non-null y samples.
    a = data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    most_active = a.x.idxmax()
    deviceid = a.loc[most_active, 'device_id']
    # Scalar lookup by row label (int() on a one-element Series is deprecated in pandas).
    if int(a.loc[most_active, 'y']) <= data.shape[0]*0.2:
        deviceid = a.loc[a.x.idxmin(), 'device_id']

    # drop=True: the old index is not needed and must not collide with an
    # existing 'index' column.
    data = data[data.device_id==deviceid].reset_index(drop=True)
    data = data.rename(columns={'t':'Timestamp','x':'X','y':'Y','z':'Z'})

    freq = 50
    t1 = np.arange(data.Timestamp.iloc[0], data.Timestamp.iloc[-1], 0.02)
    df = pd.DataFrame({'Timestamp': t1})

    for axis in ['X', 'Y', 'Z']:
        # interp1d defaults to linear interpolation
        flinear = interpolate.interp1d(data.Timestamp, data[axis])
        df[axis] = flinear(t1)
    df = df.rename(columns={'X':'acc_X','Y':'acc_Y','Z':'acc_Z'})
    return df[['Timestamp','acc_X','acc_Y','acc_Z']],deviceid,freq

#==========================================================================================
#median filter
from scipy.signal import medfilt # import the median filter function
def median(signal):
    """Smooth a 1-D signal with a median filter of kernel size 3.

    `signal` is any 1-D array-like; the filtered signal is returned as a numpy
    array of the same length.
    """
    samples = np.array(signal)
    return medfilt(samples, kernel_size=3)


#==========================================================================================
#components_selection_one_signal
import math # import math library


def components_selection_one_signal(t_signal,sampling_freq):
    """Split a time signal into a low-frequency (DC/gravity) and a body component.

    The signal is transformed with the FFT, the spectrum is masked per band and
    each band is transformed back with the inverse FFT:

    * DC component:   |f| <= 0.3
    * body component: 0.3 < |f| <= 20

    Frequencies are in the unit implied by `sampling_freq` (samples per time unit).

    Returns
    -------
    tuple
        ``(total_component, t_DC_component, t_body_component, t_noise)``.
        ``total_component`` and ``t_noise`` are not computed and are returned as
        empty lists to keep the four-element layout.
    """
    low_cut = 0.3
    high_cut = 20

    samples = np.array(t_signal)
    spectrum = np.fft.fft(samples)  # complex spectrum of the signal
    # absolute frequency attached to every FFT bin
    abs_freqs = np.abs(np.fft.fftfreq(len(samples), d=1/float(sampling_freq)))

    dc_mask = abs_freqs <= low_cut
    body_mask = (abs_freqs > low_cut) & (abs_freqs <= high_cut)

    # zero every bin outside the band, then go back to the time domain
    t_DC_component = np.fft.ifft(np.where(dc_mask, spectrum, 0)).real
    t_body_component = np.fft.ifft(np.where(body_mask, spectrum, 0)).real

    # placeholders: noise / total components are intentionally not computed
    t_noise = []
    total_component = []

    return (total_component, t_DC_component, t_body_component, t_noise)


#=================================================================================================================
#Define verify gravity function
def mag_3_signals(df):
    """Return the row-wise Euclidean magnitude of `df` as a 1-D numpy array.

    `df` holds one signal per column (e.g. X, Y, Z); each output value is the
    square root of the sum of squares across one row.
    """
    squared = np.square(df)
    row_totals = squared.sum(axis=1)
    return np.array(np.sqrt(row_totals))

def verify_gravity(data, sampling_freq=50):
    """Print the mean magnitude of the gravity (DC) component of a recording.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``acc_X``, ``acc_Y`` and ``acc_Z``.
    sampling_freq : float, optional
        Sampling frequency passed to ``components_selection_one_signal``
        (default 50, the rate used for the resampled data in this notebook).

    Returns
    -------
    None
        The mean value is printed, rounded to 3 decimals.
    """
    # Element [1] of the helper's result is the DC (gravity) component of one axis.
    gravity_axes = [
        components_selection_one_signal(np.array(data[column]), sampling_freq)[1]
        for column in ('acc_X', 'acc_Y', 'acc_Z')
    ]

    # mag_3_signals expects ONE 2-D input with a column per axis, not three arguments.
    grav_acc_mag = mag_3_signals(np.column_stack(gravity_axes))
    print('mean value = ',round((sum(grav_acc_mag) / len(grav_acc_mag)),3),' g')
    
#=================================================================================================================    
#Define jerking and magnitude functions
def jerk_one_signal(signal,sampling_freq):
    """Return the jerk (forward difference scaled by the sampling rate) of a signal.

    Each value is ``(signal[i+1] - signal[i]) * sampling_freq``; rows where the
    difference is undefined (the last one) are dropped, so the result is one
    sample shorter than the input.
    """
    frame = pd.DataFrame(signal)
    forward_diff = frame.shift(-1) - frame
    jerk_frame = (forward_diff * sampling_freq).dropna()
    return np.array(jerk_frame)[:, 0]







In [53]:
#from scipy.signal import butter,filtfilt

### IDs_Labels Data

In [54]:
# All paths below are relative to the notebook's working directory.

# Test data: ID tables for the CIS-PD and REAL-PD studies
cis_pd_testing_id=pd.read_csv('test_data_Id/cis-pd.CIS-PD_Test_Data_IDs.csv')
real_pd_testing_id=pd.read_csv('test_data_Id/real-pd.REAL-PD_Test_Data_IDs.csv')

# Training data: ID + label tables for both studies
cis_pd_training_id=pd.read_csv('data_labels/CIS-PD_Training_Data_IDs_Labels.csv')
real_pd_training_id=pd.read_csv('data_labels/REAL-PD_Training_Data_IDs_Labels.csv')

# Ancillary data: ID + label tables for both studies
cis_pd_ancillary_id=pd.read_csv('data_labels/CIS-PD_Ancillary_Data_IDs_Labels.csv')
real_pd_ancillary_id=pd.read_csv('data_labels/REAL-PD_Ancillary_Data_IDs_Labels.csv')

### CIS-PD Data

In [55]:
import glob
# glob returns paths in arbitrary, filesystem-dependent order; sort them so that
# positional picks such as a[14] select the same file on every run and machine.
a=sorted(glob.glob("training_data/*.csv"))


In [56]:
# Load one training recording (picked by its position in the file list `a`)
# and resample it onto the uniform time grid produced by preprocess().
data=preprocess(pd.read_csv(a[14]))

data.head()

Unnamed: 0,Timestamp,acc_X,acc_Y,acc_Z
0,0.0,0.186035,0.170654,-0.878174
1,0.02,0.148438,0.209961,-0.894287
2,0.04,0.166016,0.219727,-0.925049
3,0.06,0.206299,0.194092,-0.926758
4,0.08,0.173584,0.204102,-0.983154


In [57]:
# number of samples in the resampled recording
data.shape[0]

60000

In [58]:
def time_domain_signal(data,sampling_freq):
    """Derive the 12 time-domain signals used for feature extraction.

    For each of ``acc_X``, ``acc_Y``, ``acc_Z`` the raw signal is split into a
    body and a gravity component, and the jerk of the body component is
    computed. Body and gravity signals lose their last sample so that they line
    up with the (one sample shorter) jerk signal.

    Returns
    -------
    pandas.DataFrame
        9 axial columns (``t_body_acc_*``, ``t_grav_acc_*``, ``t_body_acc_jerk_*``
        for X, Y, Z, in that order) followed by the 3 Euclidean magnitude columns
        ``t_body_acc_mag``, ``t_grav_acc_mag`` and ``t_body_acc_jerk_mag``.
    """
    body, gravity, jerk = {}, {}, {}
    for axis in ['acc_X','acc_Y','acc_Z']:
        raw = np.array(data[axis])  # median filtering is currently disabled
        _, gravity_part, body_part, _ = components_selection_one_signal(raw, sampling_freq)
        body['t_body_'+axis] = body_part[:-1]
        gravity['t_grav_'+axis] = gravity_part[:-1]
        jerk['t_body_acc_jerk_'+axis[-1]] = jerk_one_signal(body_part, sampling_freq)

    # dict order fixes the column order: body XYZ, gravity XYZ, jerk XYZ
    signals = pd.DataFrame({**body, **gravity, **jerk})
    axial_names = list(signals.columns)

    # one magnitude column per 3-axial group, named '<group prefix>mag'
    for start in (0, 3, 6):
        triple = axial_names[start:start+3]
        signals[triple[0][:-1]+'mag'] = mag_3_signals(signals[triple])

    return signals

In [59]:
# Build the 12 time-domain signals (9 axial + 3 magnitude) at a sampling rate of 50;
# one sample is lost to the jerk difference.
time_sig_df=time_domain_signal(data,50)
time_sig_df.shape

(59999, 12)

In [60]:
#from scipy import fftpack # import fftpack to use all fft functions
from numpy.fft import *

# fast_fourier_transform_one_signal 
def fast_fourier_transform_one_signal(t_signal):
    """Return the amplitude spectrum (magnitude of the one-sided real FFT) of a 1-D signal."""
    return np.abs(np.fft.rfft(t_signal))

# fast fourier transform for data frames
def fast_fourier_transform(t_window,sampling_freq):
    """Transform every non-gravity column of a time-domain window to the frequency domain.

    Parameters
    ----------
    t_window : pandas.DataFrame
        Time-domain signals, one per column; column names start with ``t_``.
        Columns whose name contains ``'grav'`` are skipped.
    sampling_freq : float
        Sampling frequency of the window.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The amplitude spectra (columns renamed from ``t_<name>`` to ``f_<name>``)
        and the frequency attached to each spectrum row.
    """
    f_window=pd.DataFrame()
    for column in t_window.columns:
        if 'grav' not in column: # gravity components get no frequency-domain signal
            t_signal=np.array(t_window[column])
            f_window["f_"+column[2:]]=fast_fourier_transform_one_signal(t_signal)
    # Derive the bins from the window length rather than from the loop variable,
    # which is undefined when every column is a gravity column.
    dfreq=np.array(np.fft.rfftfreq(len(t_window), d=1/float(sampling_freq)))
    return f_window,dfreq # return the frequency domain window


In [61]:
# Amplitude spectra of every non-gravity signal, plus the matching frequency bins
freq_sig_df,dfreq=fast_fourier_transform(time_sig_df,50)

In [62]:
# preview the first rows (lowest frequency bins) of the spectra
freq_sig_df.head()

Unnamed: 0,f_body_acc_X,f_body_acc_Y,f_body_acc_Z,f_body_acc_jerk_X,f_body_acc_jerk_Y,f_body_acc_jerk_Z,f_body_acc_mag,f_body_acc_jerk_mag
0,0.052618,0.28433,0.543517,0.045829,29.691091,48.687941,10776.777934,205803.224921
1,0.052618,0.28433,0.543518,0.04583,29.691094,48.687946,3784.454865,45070.684448
2,0.052619,0.284332,0.543522,0.045832,29.691103,48.687963,837.634454,16061.776519
3,0.052622,0.284335,0.543528,0.045835,29.691118,48.687991,1273.150911,26879.153451
4,0.052625,0.284339,0.543536,0.045839,29.691139,48.688031,376.69041,6166.657962


### Common Axial Features Functions

In [63]:
# mean
def mean_axial(df):
    """Return the mean of every column of `df` as a list (one value per column)."""
    values = np.array(df)
    return list(np.mean(values, axis=0))
# std
def std_axial(df):
    """Return the (population) standard deviation of every column of `df` as a list."""
    values = np.array(df)
    return list(np.std(values, axis=0))

# mad
from statsmodels.robust import mad as median_deviation # import the median deviation function
def mad_axial(df):
    """Return the median deviation of every column of `df` as a list.

    The value is computed by ``median_deviation`` (statsmodels.robust.mad)
    along axis 0.
    """
    values = np.array(df)
    per_column = median_deviation(values, axis=0)
    return list(per_column)

# max

def max_axial(df):
    """Return the maximum of every column of `df` as a list."""
    values = np.array(df)
    return list(np.max(values, axis=0))
# min
def min_axial(df):
    """Return the minimum of every column of `df` as a list."""
    values = np.array(df)
    return list(np.min(values, axis=0))
# IQR
from scipy.stats import iqr as IQR # import interquartile range function (Q3(column)-Q1(column))
def IQR_axial(df):
    """Return the interquartile range (Q3 - Q1) of every column of `df` as a list."""
    values = np.array(df)
    per_column = np.apply_along_axis(IQR, 0, values)
    return list(per_column)


# Entropy
from scipy.stats import entropy # import the entropy function
def entropy_axial(df):
    """Return the entropy (scipy.stats.entropy) of every column of `df` as a list.

    The absolute values of each column are used as the distribution weights.
    """
    magnitudes = abs(np.array(df))
    per_column = np.apply_along_axis(entropy, 0, magnitudes)
    return list(per_column)

### Common Magnitude Features Functions

In [64]:
# mean
def mean_mag(mag_column):
    """Return the mean of one magnitude signal as a Python float."""
    return float(np.mean(np.array(mag_column)))

# std: standard deviation of mag column
def std_mag(mag_column):
    """Return the (population) standard deviation of one magnitude signal as a float."""
    return float(np.std(np.array(mag_column)))

# mad: median deviation
def mad_mag(mag_column):
    """Return the median deviation of one magnitude signal as a float.

    The value is computed by ``median_deviation`` (statsmodels.robust.mad).
    """
    values = np.array(mag_column)
    return float(median_deviation(values))

# max
def max_mag(mag_column):
    """Return the largest value of one magnitude signal as a float."""
    return float(np.max(np.array(mag_column)))
# min
def min_mag(mag_column):
    """Return the smallest value of one magnitude signal as a float."""
    return float(np.min(np.array(mag_column)))

# IQR
def IQR_mag(mag_column):
    """Return the interquartile range (Q3 - Q1) of one magnitude signal as a float."""
    values = np.array(mag_column)
    return float(IQR(values))

# Entropy
def entropy_mag(mag_column):
    """Return the entropy (scipy.stats.entropy) of one magnitude signal as a float."""
    values = np.array(mag_column)
    return float(entropy(values))

### Time Axial Features functions

In [65]:
# sma
def t_sma_axial(df):
    """Return the signal magnitude area of a 3-axial group.

    Computed as the sum of the absolute values of all entries of `df`
    divided by 3.
    """
    total_area = float(np.abs(np.array(df)).sum())
    return total_area / float(3)

# energy
def t_energy_axial(df):
    """Return the energy (sum of squared samples) of every column of `df` as a list."""
    values = np.array(df)
    return list(np.sum(values**2, axis=0))

# define the arbugr function
#auto regression coefficients with using burg method with order from 1 to 4
from spectrum import *

##############################################################################################
# This function was taken as-is from: https://github.com/faroit/freezefx/blob/master/fastburg.py
# This function and the original arburg function in the spectrum library generate the same first 3 coefficients
# for all windows; the original Burg method is slow, and for some windows it cannot generate the 4th coefficient.

def _arburg2(X, order):
    """This version is 10 times faster than arburg, but the output rho is not correct.
    returns [1 a0,a1, an-1]
    """
    x = numpy.array(X)
    N = len(x)

    if order == 0.:
        raise ValueError("order must be > 0")

    # Initialisation
    # ------ rho, den
    rho = sum(abs(x)**2.) / N  # Eq 8.21 [Marple]_
    den = rho * 2. * N

    # ------ backward and forward errors
    ef = numpy.zeros(N, dtype=complex)
    eb = numpy.zeros(N, dtype=complex)
    for j in range(0, N):  # eq 8.11
        ef[j] = x[j]
        eb[j] = x[j]

    # AR order to be stored
    a = numpy.zeros(1, dtype=complex)
    a[0] = 1
    # ---- rflection coeff to be stored
    ref = numpy.zeros(order, dtype=complex)

    E = numpy.zeros(order+1)
    E[0] = rho

    for m in range(0, order):
        # print m
        # Calculate the next order reflection (parcor) coefficient
        efp = ef[1:]
        ebp = eb[0:-1]
        # print efp, ebp
        num = -2. * numpy.dot(ebp.conj().transpose(), efp)
        den = numpy.dot(efp.conj().transpose(),  efp)
        den += numpy.dot(ebp,  ebp.conj().transpose())
        ref[m] = num / den

        # Update the forward and backward prediction errors
        ef = efp + ref[m] * ebp
        eb = ebp + ref[m].conj().transpose() * efp

        # Update the AR coeff.
        a.resize(len(a)+1)
        a = a + ref[m] * numpy.flipud(a).conjugate()

        # Update the prediction error
        E[m+1] = numpy.real((1 - ref[m].conj().transpose() * ref[m])) * E[m]
        # print 'REF', ref, num, den
    return a, E[-1], ref

#################################################################################################################

# to generate arburg (order 4) coefficents for 3 columns [X,Y,Z]
def t_arburg_axial(df):
    """Return the order-4 Burg AR coefficients of the first three columns of `df`.

    For each of the three axial signals the real parts of AR1..AR4 are taken
    (the leading 1 of the AR polynomial is dropped). The result is a flat list
    of 12 values: 4 for the first column, then 4 for the second, then 4 for the
    third.
    """
    ar_features = []
    for position in range(3):
        axis_signal = np.array(df[df.columns[position]])
        coefficients = _arburg2(axis_signal, 4)[0]
        ar_features = ar_features + list(coefficients[1:].real)
    return ar_features


from scipy.stats import pearsonr
def t_corr_axial(df):
    """Return the pairwise Pearson correlations of a 3-axial group.

    `df` holds the X, Y and Z signals in its first three columns; the result
    is ``[corr(X, Y), corr(X, Z), corr(Y, Z)]`` as Python floats.
    """
    values = np.array(df)
    axis_pairs = [(0, 1), (0, 2), (1, 2)]
    return [
        float(pearsonr(values[:, first], values[:, second])[0])
        for first, second in axis_pairs
    ]
 
#hurst exponent
def hurst(signal):
    """
    **Experimental**/untested implementation taken from:
    http://drtomstarke.com/index.php/calculation-of-the-hurst-exponent-to-test-for-trend-and-mean-reversion/
    Use at your own risks.

    Estimates the Hurst exponent of `signal` as the slope of a straight-line
    fit, in log10-log10 space, of the standard deviation of the lagged
    differences against the lag, for lags 2..19.
    """
    series = np.array(signal)
    lags = list(range(2, 20))
    # spread of the lagged differences signal[t + lag] - signal[t], one per lag
    spreads = [np.std(np.subtract(series[lag:], series[:-lag])) for lag in lags]
    # the slope of the log-log fit is the exponent
    slope, _intercept = np.polyfit(np.log10(lags), np.log10(spreads), 1)
    return slope


# to generate hurst  for 3 columns [X,Y,Z]
def t_hurst_axial(df):
    """Return the Hurst exponent of every column of `df` as a list."""
    per_column = df.apply(hurst, axis=0)
    return per_column.tolist()


### Time Axial Features PipeLine

In [66]:
def t_axial_features_generation(t_window):
    """Compute all axial time-domain features of one window.

    The first 9 columns of `t_window` are treated as three 3-axial groups
    (X, Y, Z each). For every group 43 values are produced, in this order:
    mean (3), std (3), mad (3), max (3), min (3), sma (1), energy (3), IQR (3),
    entropy (3), AR coefficients (12), correlations (3), Hurst exponents (3).

    Returns
    -------
    list
        129 values = 43 values x 3 groups.
    """
    axial_names = t_window.columns[0:9]
    axial_signals = t_window[axial_names]

    features = []
    for start in (0, 3, 6):
        triple = axial_signals[axial_names[start:start+3]]  # one group: signal[X,Y,Z]

        features.extend(mean_axial(triple))       # 3 values
        features.extend(std_axial(triple))        # 3 values
        features.extend(mad_axial(triple))        # 3 values
        features.extend(max_axial(triple))        # 3 values
        features.extend(min_axial(triple))        # 3 values
        features.append(t_sma_axial(triple))      # 1 value
        features.extend(t_energy_axial(triple))   # 3 values
        features.extend(IQR_axial(triple))        # 3 values
        features.extend(entropy_axial(triple))    # 3 values
        features.extend(t_arburg_axial(triple))   # 12 values (4 per axis)
        features.extend(t_corr_axial(triple))     # 3 values
        features.extend(t_hurst_axial(triple))    # 3 values

    return features
    

In [67]:
# expected: 43 features per 3-axial group x 3 groups
len(t_axial_features_generation(time_sig_df))

129

### Define Time Magnitudes Features functions

In [68]:
# Functions used to generate time magnitude features

# sma: signal magnitude area
def t_sma_mag(mag_column):
    """Return the signal magnitude area (sum of absolute values) of one magnitude signal."""
    return float(np.abs(np.array(mag_column)).sum())

# energy
def t_energy_mag(mag_column):
    """Return the energy (sum of squared samples) of one magnitude signal as a float."""
    values = np.array(mag_column)
    return float(np.sum(values**2))



# arburg: auto regression coefficients using the burg method
def t_arburg_mag(mag_column):
    """Return the order-4 Burg AR coefficients of one magnitude signal.

    The result is the list ``[AR1, AR2, AR3, AR4]`` (real parts; the leading 1
    of the AR polynomial is dropped).
    """
    samples = np.array(mag_column)
    ar_polynomial = _arburg2(samples, 4)[0]
    return list(ar_polynomial[1:].real)

In [69]:
# spot-check the Hurst estimate on a single signal
hurst(time_sig_df.t_body_acc_X)

0.4194307331480186

### Time Magnitude Features Pipeline

In [70]:
def t_mag_features_generation(t_window):
    """Compute all magnitude time-domain features of one window.

    Every column of `t_window` from position 9 onwards is treated as a
    magnitude signal. For each of them 13 values are produced, in this order:
    mean, std, mad, max, min, sma, energy, IQR, entropy, AR1, AR2, AR3, AR4.

    Returns
    -------
    list
        13 values per magnitude column (39 for the 3 magnitude columns built by
        ``time_domain_signal``).
    """
    mag_frame = t_window[t_window.columns[9:]]

    features = []
    for name in mag_frame:  # iterating a DataFrame yields its column names
        signal = mag_frame[name]
        scalar_features = [
            mean_mag(signal),
            std_mag(signal),
            mad_mag(signal),
            max_mag(signal),
            min_mag(signal),
            t_sma_mag(signal),
            t_energy_mag(signal),
            IQR_mag(signal),
            entropy_mag(signal),
        ]
        # 9 scalar features followed by the 4 AR coefficients
        features = features + scalar_features + t_arburg_mag(signal)

    return features
 

In [71]:
# expected: 13 features per magnitude signal x 3 signals
len(t_mag_features_generation(time_sig_df))

39

In [72]:
# total number of time-domain features (axial + magnitude)
len(t_axial_features_generation(time_sig_df))+len(t_mag_features_generation(time_sig_df))

168

### Time Features names Generation

In [73]:
def time_features_names():
    """Return the names of all time-domain features, in generation order.

    For each 3-axial group (body, gravity, body jerk) the names follow the
    layout: mean/std/mad/max/min per axis (15), sma (1), energy/iqr/entropy per
    axis (9), AR1..AR4 per axis (12, grouped by axis), correlations (3) and
    hurst per axis (3), i.e. 43 names per group. For each magnitude signal 13
    names follow (5 + sma + 3 + 4 AR). Total: 129 + 39 = 168 names.

    Note: the magnitude names use the suffix ``_Mag`` while the columns built by
    ``time_domain_signal`` use ``_mag``.
    """
    axial_groups = [['t_body_acc_X','t_body_acc_Y','t_body_acc_Z'],
                    ['t_grav_acc_X','t_grav_acc_Y','t_grav_acc_Z'],
                    ['t_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z']]
    magnitude_signals = ['t_body_acc_Mag','t_grav_acc_Mag','t_body_acc_jerk_Mag']

    basic_stats = ['_mean()','_std()','_mad()','_max()','_min()']
    spread_stats = ['_energy()','_iqr()','_entropy()']
    ar_terms = ['_AR1()','_AR2()','_AR3()','_AR4()']
    correlation_terms = ['_Corr(X,Y)','_Corr(X,Z)','_Corr(Y,Z)']
    hurst_terms = ['_hurst()']

    def per_axis(stats, group):
        # statistic-major order: stat1_X, stat1_Y, stat1_Z, stat2_X, ...
        return [name[:-2] + stat + name[-2:] for stat in stats for name in group]

    names = []
    for group in axial_groups:
        prefix = group[-1][:-2]  # signal name without the '_X'/'_Y'/'_Z' suffix
        names += per_axis(basic_stats, group)
        names.append(prefix + '_sma()')
        names += per_axis(spread_stats, group)
        # AR names are axis-major: AR1..AR4 of X, then of Y, then of Z
        names += [name[0:-2] + term + name[-2:] for name in group for term in ar_terms]
        names += [prefix + pair for pair in correlation_terms]
        names += per_axis(hurst_terms, group)

    for magnitude in magnitude_signals:
        names += [magnitude + stat for stat in basic_stats]
        names.append(magnitude + '_sma()')
        names += [magnitude + stat for stat in spread_stats]
        names += [magnitude + term for term in ar_terms]

    return names

In [74]:
# must equal the number of generated time-domain features
len(time_features_names())

168

### Frequency Axial features functions

In [75]:
# sma
def f_sma_axial(df):
    """Return the signal magnitude area of a 3-axial group of frequency signals.

    Every absolute value is divided by the square root of the number of rows,
    the results are summed over the whole frame and the sum is divided by 3.
    """
    spectrum = np.array(df)
    scale = math.sqrt(spectrum.shape[0])
    scaled_area = float((abs(spectrum) / scale).sum())
    return scaled_area / float(3)


# energy
def f_energy_axial(df):
    """Return the spectral energy of every column of `df` as a list.

    The energy of a column is the sum of its squared values divided by the
    number of rows.
    """
    spectrum = np.array(df)
    n_rows = float(len(spectrum))
    return list(np.sum(spectrum**2, axis=0) / n_rows)

#Max Inds and Mean_Freq Functions
# max_Inds
def f_max_Inds_axial(df,dfreq):
    """Return, for each of the three axes, the frequency at which the spectrum peaks.

    `dfreq[i]` is the frequency attached to row `i` of `df`; the result is the
    list ``[peak_freq_X, peak_freq_Y, peak_freq_Z]``.
    """
    spectrum = np.array(df)
    return [dfreq[spectrum[:, axis].argmax()] for axis in range(3)]

# mean freq()
def f_mean_Freq_axial(df,dfreq):
    """Return the spectrum-weighted mean frequency of each of the three axes.

    For every axis: ``sum(dfreq[i] * spectrum[i]) / sum(spectrum[i])``; the
    result is the list ``[mean_freq_X, mean_freq_Y, mean_freq_Z]``.
    """
    spectrum = np.array(df)
    mean_freqs = []
    for axis in range(3):
        weights = spectrum[:, axis]
        mean_freqs.append(np.dot(dfreq, weights).sum() / float(weights.sum()))
    return mean_freqs


# Skewness & Kurtosis Functions
from scipy.stats import kurtosis       # kurtosis function
from scipy.stats import skew           # skewness function
    
def f_skewness_and_kurtosis_axial(df):
    """Return skewness and kurtosis of every column of `df`, interleaved per column.

    The result is ``[skew_X, kurtosis_X, skew_Y, kurtosis_Y, skew_Z, kurtosis_Z]``
    (scipy.stats.skew / scipy.stats.kurtosis along axis 0).
    """
    spectrum = np.array(df)
    skewness = skew(spectrum, axis=0)
    kurt = kurtosis(spectrum, axis=0)
    # one (skew, kurtosis) row per column, flattened row by row
    return list(np.column_stack((skewness, kurt)).ravel())



#f_one_band_energy
def f_one_band_energy(psd, bands,dfreq):
    """Return the mean power of a spectrum inside consecutive frequency bands.

    `bands` lists the inner band edges; 1 is prepended and 25 appended, so
    ``len(bands) + 1`` half-open bands ``[low, up)`` are evaluated. The power
    is ``|psd|**2`` and `dfreq` gives the frequency of each `psd` entry.
    """
    power = np.abs(np.array(psd))**2
    edges = np.asarray(bands)
    lower_edges = np.concatenate([[1], edges])
    upper_edges = np.concatenate([edges, [25]])
    band_means = []
    for low, up in zip(lower_edges, upper_edges):
        in_band = np.bitwise_and(dfreq >= low, dfreq < up)
        band_means.append(np.mean(power[in_band]))
    return band_means

#spectral_entropy
def spectral_entropy(psd, bands,dfreq):
    """Return the base-2 entropy of the power distribution over frequency bands.

    The power ``|psd|**2`` is normalised to sum to one and summed inside the
    half-open bands ``[low, up)`` built from `bands` with 1 prepended and 25
    appended. Bands with zero power are ignored. The entropy is returned as a
    one-element list.
    """
    power = np.abs(np.array(psd))**2
    power_pdf = power / np.sum(power)  # power as a pdf (normalised to one)
    edges = np.asarray(bands)
    lower_edges = np.concatenate([[1], edges])
    upper_edges = np.concatenate([edges, [25]])
    band_shares = np.asarray([
        np.sum(power_pdf[np.bitwise_and(dfreq >= low, dfreq < up)])
        for low, up in zip(lower_edges, upper_edges)
    ])
    occupied = band_shares[band_shares > 0]
    return [- np.sum(occupied * np.log2(occupied))]

#Bands Energy FUNCTIONS
# Inner band edges for the band-energy features. f_one_band_energy and
# spectral_entropy prepend 1 and append 25 to each list, so B1 yields 8 bands,
# B2 yields 4 bands and B3 yields 2 bands.
B1=[4,7,10,13,16,19,22] 
B2=[7,13,19]
B3=[12]


def f_all_bands_energy_axial(df,dfreq):
    """Return band energies and band entropies of a 3-axial group of spectra.

    `df` holds the X, Y, Z frequency signals in its first three columns. For
    each axis the band energies for B1, B2 and B3 are collected (8 + 4 + 2 = 14
    values) and the spectral entropy for B1, B2 and B3 (3 values).

    Returns
    -------
    list
        All band energies (14 per axis, X then Y then Z) followed by all
        spectral entropies (3 per axis): 42 + 9 = 51 values.
    """
    spectrum = np.array(df)
    band_sets = (B1, B2, B3)
    energies = []
    entropies = []
    for axis in range(0, 3):
        axis_spectrum = spectrum[:, axis]
        for bands in band_sets:
            energies = energies + f_one_band_energy(axis_spectrum, bands, dfreq)
        for bands in band_sets:
            entropies = entropies + spectral_entropy(axis_spectrum, bands, dfreq)
    return energies + entropies

### Frequency axial features PipeLine

In [76]:
def f_axial_features_generation(f_window,dfreq):
    """Compute all axial frequency-domain features of one window.

    The first 6 columns of `f_window` are treated as two 3-axial groups. For
    every group 88 values are produced, in this order: mean (3), std (3),
    mad (3), max (3), min (3), sma (1), energy (3), IQR (3), entropy (3),
    peak frequency (3), mean frequency (3), skewness/kurtosis (6) and band
    energies/entropies (51).

    Returns
    -------
    list
        176 values = 88 values x 2 groups.
    """
    axial_names = f_window.columns[0:6]
    axial_signals = f_window[axial_names]

    features = []
    for start in (0, 3):
        triple = axial_signals[axial_names[start:start+3]]  # one group: f_signal[X,Y,Z]

        features.extend(mean_axial(triple))                        # 3 values
        features.extend(std_axial(triple))                         # 3 values
        features.extend(mad_axial(triple))                         # 3 values
        features.extend(max_axial(triple))                         # 3 values
        features.extend(min_axial(triple))                         # 3 values
        features.append(f_sma_axial(triple))                       # 1 value
        features.extend(f_energy_axial(triple))                    # 3 values
        features.extend(IQR_axial(triple))                         # 3 values
        features.extend(entropy_axial(triple))                     # 3 values
        features.extend(f_max_Inds_axial(triple, dfreq))           # 3 values
        features.extend(f_mean_Freq_axial(triple, dfreq))          # 3 values
        features.extend(f_skewness_and_kurtosis_axial(triple))     # 6 values
        features.extend(f_all_bands_energy_axial(triple, dfreq))   # 51 values

    return features

In [77]:
# expected: 88 features per 3-axial group x 2 groups
len(f_axial_features_generation(freq_sig_df,dfreq))

176

### Define Frequency Magnitudes features functions

In [78]:
# Functions used to generate frequency magnitude features

# sma
def f_sma_mag(mag_column):
    """Signal magnitude area of one frequency-magnitude signal.

    Each absolute sample is divided by sqrt(len(mag_column)) and the scaled
    samples are summed.

    mag_column : 1-D sequence of magnitudes (anything np.array accepts).
    Returns the result as a Python float.
    """
    samples = np.array(mag_column)
    scale = math.sqrt(len(mag_column))
    scaled = abs(samples) / scale
    return float(scaled.sum())

# energy
def f_energy_mag(mag_column):
    """Spectral energy of one frequency-magnitude signal.

    Computed as the sum of the squared samples divided by the number of
    samples, returned as a Python float.
    """
    samples = np.array(mag_column)
    squared_total = (samples * samples).sum()
    return float(squared_total / float(len(samples)))


####### Max Inds and Mean_Freq Functions#######################################


# max_Inds
def f_max_Inds_mag(mag_column,dfreq):
    """Frequency at which one magnitude signal reaches its maximum.

    mag_column : 1-D sequence of magnitudes.
    dfreq      : indexable of frequency values aligned with mag_column.
    Returns dfreq at the position of the largest magnitude (first position
    on ties, as given by argmax) as a Python float.
    """
    peak_position = np.array(mag_column).argmax()
    return float(dfreq[peak_position])

# mean freq()
def f_mean_Freq_mag(mag_column,dfreq):
    """Magnitude-weighted mean frequency of one magnitude signal.

    The dot product of dfreq and the magnitudes is divided by the sum of the
    magnitudes; the result is returned as a Python float.
    """
    magnitudes = np.array(mag_column)
    weighted_total = np.dot(dfreq, magnitudes).sum()
    magnitude_total = float(magnitudes.sum())
    return float(weighted_total / magnitude_total)

###################################################################################

########## Skewness & Kurtosis Functions #######################################

from scipy.stats import skew           # skewness
def f_skewness_mag(mag_column):
    """Skewness of one frequency-magnitude signal.

    Delegates to scipy.stats.skew with its default arguments and returns the
    result as a Python float.
    """
    samples = np.array(mag_column)
    return float(skew(samples))



from scipy.stats import kurtosis       # kurtosis
def f_kurtosis_mag(mag_column):
    """Kurtosis of one frequency-magnitude signal.

    Delegates to scipy.stats.kurtosis with its default arguments and returns
    the result as a Python float.
    """
    samples = np.array(mag_column)
    return float(kurtosis(samples))
##################################################################################

### Define Frequency Magnitude features pipeline

In [79]:
def f_mag_features_generation(f_window,dfreq):
    """Compute the magnitude features of one frequency-domain window.

    The magnitude signals are taken to be the last two columns of f_window.
    For each of them 13 values are produced, in this order: mean, std, mad,
    max, min, sma, energy, IQR, entropy, maxInd, meanFreq, skewness, kurtosis.

    f_window : DataFrame whose last two columns are the magnitude signals.
    dfreq    : frequency values passed on to the maxInd / meanFreq helpers.
    Returns a flat list of 26 values (13 per magnitude column x 2 columns).
    """
    # keep only the two trailing (magnitude) columns of the window
    mag_frame = f_window[f_window.columns[-2:]]

    features = []
    for name in mag_frame: # iterating a DataFrame yields its column labels
        signal = mag_frame[name]
        # 13 values for this magnitude column, in the documented order
        features.extend([
            mean_mag(signal),
            std_mag(signal),
            mad_mag(signal),
            max_mag(signal),
            min_mag(signal),
            f_sma_mag(signal),
            f_energy_mag(signal),
            IQR_mag(signal),
            entropy_mag(signal),
            f_max_Inds_mag(signal,dfreq),
            f_mean_Freq_mag(signal,dfreq),
            f_skewness_mag(signal),
            f_kurtosis_mag(signal),
        ])

    return features
    

In [80]:
len(f_mag_features_generation(freq_sig_df,dfreq))

26

In [81]:
len(f_mag_features_generation(freq_sig_df,dfreq))+len(f_axial_features_generation(freq_sig_df,dfreq))

202

### Frequency features name generation

In [82]:
def frequency_features_names():
    """Build the ordered list of frequency-domain feature names.

    For each 3-axial signal group the names are emitted as:
      1. mean/std/mad/max/min, feature-major (X, Y, Z for each feature)
      2. one sma name for the group
      3. energy/iqr/entropy/maxInd/meanFreq, feature-major
      4. skewness and kurtosis, axis-major (both features for X, then Y, then Z)
      5. the 17 band names, axis-major
    followed, for each magnitude signal, by the 13 scalar feature names.

    Returns a list of 202 strings (2 x 88 axial names + 2 x 13 magnitude names).
    """
    axial_groups = [
        ['f_body_acc_X','f_body_acc_Y','f_body_acc_Z'],
        ['f_body_acc_Jerk_X','f_body_acc_Jerk_Y','f_body_acc_Jerk_Z'],
    ]
    magnitude_signals = ['f_body_acc_Mag','f_body_acc_Jerk_Mag']

    basic_stats = ['_mean()','_std()','_mad()','_max()','_min()']
    spectral_stats = ['_energy()','_iqr()','_entropy()','_maxInd()','_meanFreq()']
    shape_stats = ['_skewness()','_kurtosis()']
    band_stats = [
        '_BE[1-4]','_BE[4-7]','_BE[7-10]','_BE[10-13]',
        '_BE[13-16]','_BE[16-19]','_BE[19-22]','_BE[22-25]',
        '_BE[1-7]','_BE[7-13]','_BE[13-19]','_BE[19-25]',
        '_BE[1-12]','_BE[12-25]','_SE_B1','_SE_B2','_SE_B3',
    ]

    def axial_name(signal, stat):
        # insert the feature tag in front of the trailing axis suffix ("_X", "_Y", "_Z")
        return signal[:-2] + stat + signal[-2:]

    names = []

    for group in axial_groups:
        # feature-major: every axis for one feature before the next feature
        names += [axial_name(signal, stat) for stat in basic_stats for signal in group]

        # a single sma name per group: the last signal's name with its axis suffix removed
        names.append(group[-1][:-2] + '_sma()')

        names += [axial_name(signal, stat) for stat in spectral_stats for signal in group]

        # axis-major: every feature for one axis before the next axis
        names += [axial_name(signal, stat) for signal in group for stat in shape_stats]
        names += [axial_name(signal, stat) for signal in group for stat in band_stats]

    for signal in magnitude_signals:
        names += [signal + stat for stat in basic_stats]
        names.append(signal + '_sma()')
        names += [signal + stat for stat in spectral_stats]
        names += [signal + stat for stat in shape_stats]

    return names

In [83]:
len(frequency_features_names())

202

### Define Additional features functions

In [84]:
############### Angles Functions ####################################
from math import acos # inverse of cosinus function
from math import sqrt # square root function

########Euclidian magnitude 3D############
def magnitude_vector(vector3D): # vector[X,Y,Z]
    """Euclidean norm of a vector: square root of the sum of its squared components."""
    squared_total = (vector3D * vector3D).sum()
    return sqrt(squared_total)

###########angle between two vectors in radian ###############
def angle(vector1, vector2):
    """Angle between two vectors, in radians.

    The cosine is the dot product divided by the product of the two Euclidean
    norms; it is clamped to [-1, 1] before acos is applied.
    Returns a Python float.
    """
    norm_product = float(magnitude_vector(vector1) * magnitude_vector(vector2))
    cosine = np.dot(vector1, vector2) / norm_product

    # keep the cosine inside acos' domain in case it landed slightly outside [-1, 1]
    if cosine < -1:
        cosine = -1
    elif cosine > 1:
        cosine = 1

    return float(acos(cosine))

################## angle_features ############################
def angle_features(t_window): # it returns 5 angles per window
    """Angles between the mean gravity vector and five reference vectors.

    The gravity vector is the column-wise mean of t_grav_acc_[X,Y,Z] over the
    window. The returned list holds, in order, its angle (radians) with:
      0. the mean of t_body_acc_[X,Y,Z]
      1. the mean of t_body_acc_jerk_[X,Y,Z]
      2. the X axis [1,0,0]
      3. the Y axis [0,1,0]
      4. the Z axis [0,0,1]
    """
    def window_mean(columns):
        # column-wise mean of the selected signals as a 1-D array
        return np.array(t_window[columns].mean())

    gravity_mean = window_mean(['t_grav_acc_X','t_grav_acc_Y','t_grav_acc_Z'])

    reference_vectors = [
        window_mean(['t_body_acc_X','t_body_acc_Y','t_body_acc_Z']),
        window_mean(['t_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z']),
        np.array([1,0,0]),
        np.array([0,1,0]),
        np.array([0,0,1]),
    ]

    return [angle(reference, gravity_mean) for reference in reference_vectors]


In [85]:
angle_features(time_sig_df)

[1.8582164377002448,
 1.3007985123846328,
 0.7741285023936461,
 0.8086298496777289,
 1.68035254598756]

### Define Additional features names

In [86]:
# Column names for the five angle values returned by angle_features, in the same order.
angle_columns=['angle0()','angle1()','angle2()','angle3()','angle4()']

In [87]:
len(angle_columns)

5

### Define Datasets generation PipeLine

In [88]:
# Concatenate the time, frequency and angle feature-name lists. The result is used further down
# as the column header of the frames built from the pipeline rows (identifier columns are added separately).
all_columns=time_features_names()+frequency_features_names()+angle_columns

def Dataset_Generation_PipeLine(b):
    """Build the feature row for one smartphone accelerometer recording.

    b : path of the CSV file, read with pd.read_csv.

    Steps: resample the recording (preprocess_real_smartphone), derive the
    time-domain signals, apply the FFT, then compute time, frequency and angle
    features.

    Returns a flat list: time features + frequency features + angle features.
    """
    data,sampling_freq=preprocess_real_smartphone(pd.read_csv(b))
    time_sig_df=time_domain_signal(data,sampling_freq)
    freq_sig_df,dfreq=fast_fourier_transform(time_sig_df,sampling_freq)

    # The column-name list is not rebuilt here: the row is a plain list of values and
    # the caller attaches the header, so recomputing the names per file was wasted work.

    # time-domain features: axial signals first, then magnitude signals
    time_features = t_axial_features_generation(time_sig_df) + t_mag_features_generation(time_sig_df)
    # frequency-domain features: axial signals first, then magnitude signals
    frequency_features = f_axial_features_generation(freq_sig_df,dfreq) + f_mag_features_generation(freq_sig_df,dfreq)

    # angle features computed from the time-domain signals
    additional_features= angle_features(time_sig_df)

    row= time_features + frequency_features + additional_features
    return(row)

def Dataset_Generation_PipeLine_SmartWatch(b):
    """Build the feature row for one smartwatch accelerometer recording.

    b : path of the CSV file, read with pd.read_csv.

    Steps: select a device and resample (preprocess_real_smartwatch), derive
    the time-domain signals, apply the FFT, then compute time, frequency and
    angle features.

    Returns a flat list: [device_id] + time features + frequency features +
    angle features.
    """
    data,device_id,sampling_freq=preprocess_real_smartwatch(pd.read_csv(b))
    time_sig_df=time_domain_signal(data,sampling_freq)
    freq_sig_df,dfreq=fast_fourier_transform(time_sig_df,sampling_freq)

    # The column-name list is not rebuilt here: the row is a plain list of values and
    # the caller attaches the header, so recomputing the names per file was wasted work.

    # time-domain features: axial signals first, then magnitude signals
    time_features = t_axial_features_generation(time_sig_df) + t_mag_features_generation(time_sig_df)
    # frequency-domain features: axial signals first, then magnitude signals
    frequency_features = f_axial_features_generation(freq_sig_df,dfreq) + f_mag_features_generation(freq_sig_df,dfreq)

    # angle features computed from the time-domain signals
    additional_features= angle_features(time_sig_df)

    # the selected device id leads the row, followed by all feature values
    row= [device_id]+time_features + frequency_features + additional_features
    return(row)


In [89]:
from joblib import Parallel, delayed
import multiprocessing
import time

## SmartPhone

### training data

In [101]:
# Compute one feature row per smartphone training CSV, one joblib job per file,
# using as many workers as there are CPU cores; the elapsed time is printed in minutes.
a_real=glob.glob("training_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine)(i) for i in a_real)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# timing noted from an earlier run: 3.4828 Mins

--- 3.4538947860399882 Mins ---


In [102]:
# Assemble the feature rows into a frame and label the columns with the feature names.
df_train=pd.DataFrame(result)
df_train.columns=all_columns
# measurement_id = file name without the directory prefix and the 4-character ".csv" suffix;
# a_real is in the same order as the rows of result
df_train['measurement_id']=[item[len('training_data/smartphone_accelerometer/'):-4] for item in a_real]
print(df_train.shape)
df_train.head()

(526, 376)


Unnamed: 0,t_body_acc_mean()_X,t_body_acc_mean()_Y,t_body_acc_mean()_Z,t_body_acc_std()_X,t_body_acc_std()_Y,t_body_acc_std()_Z,t_body_acc_mad()_X,t_body_acc_mad()_Y,t_body_acc_mad()_Z,t_body_acc_max()_X,...,f_body_acc_Jerk_Mag_maxInd(),f_body_acc_Jerk_Mag_meanFreq(),f_body_acc_Jerk_Mag_skewness(),f_body_acc_Jerk_Mag_kurtosis(),angle0(),angle1(),angle2(),angle3(),angle4(),measurement_id
0,-4.099065e-07,-4.945145e-07,1.272551e-07,0.19431,0.198883,0.427278,0.012449,0.018439,0.013511,5.59487,...,0.0,17.357615,5.054216,51.890984,1.382333,1.561341,1.548855,1.579795,0.023715,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,5.097855e-06,-6.718633e-06,-1.150386e-06,1.030481,1.282587,1.155372,0.616868,0.484965,0.629766,14.792709,...,0.0,13.78915,112.071483,19860.340756,1.97591,1.509345,1.075313,0.650309,1.186596,aba31c29-79ef-4221-9412-156538a2fd4e
2,-5.860476e-06,4.557677e-06,3.055305e-05,0.377389,0.29464,0.527385,0.038554,0.034712,0.080961,8.190283,...,0.0,13.660789,45.981864,5272.703915,0.704855,2.435034,1.392411,0.831631,0.770715,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,-2.010678e-06,3.6499e-06,1.612748e-06,0.118161,0.12521,0.145451,0.057362,0.045675,0.064264,1.95766,...,0.0,14.541016,110.629969,19864.795242,1.69891,1.585013,0.340545,1.246392,1.470836,27eccfc4-e329-4695-aee8-6d706b247191
4,0.0001167462,5.815877e-05,-0.0001062504,0.534147,0.54862,0.690895,0.15096,0.130783,0.169818,24.54042,...,0.0,6.241945,32.72337,2173.427111,2.24168,0.915963,1.848708,1.189339,0.480749,ed560c25-e5c5-4dba-82c7-3fc18c248ce4


In [103]:
# ancillary_data: keep only the smartphone files that contain at least 2000 rows.
# Each file is read in full just to count its rows.
b_real=glob.glob("ancillary_data/smartphone_accelerometer/*.csv")
b_real2=[]
for i in b_real:
    a=pd.read_csv(i).shape[0]
    if a>=2000:
        b_real2.append(i)

In [104]:
# Compute one feature row per retained ancillary smartphone file (b_real2), in parallel.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine)(i) for i in b_real2)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# timing noted from an earlier run: 3.2621 min



--- 3.310788031419118 Mins ---


In [105]:
# Assemble the ancillary feature rows into a frame with the same columns as the training frame.
df_ancillary=pd.DataFrame(result)
df_ancillary.columns=all_columns
# measurement_id = file name without the directory prefix and the ".csv" suffix (same order as b_real2)
df_ancillary['measurement_id']=[item[len('ancillary_data/smartphone_accelerometer/'):-4] for item in b_real2]
print(df_ancillary.shape)
df_ancillary.head()

(348, 376)


Unnamed: 0,t_body_acc_mean()_X,t_body_acc_mean()_Y,t_body_acc_mean()_Z,t_body_acc_std()_X,t_body_acc_std()_Y,t_body_acc_std()_Z,t_body_acc_mad()_X,t_body_acc_mad()_Y,t_body_acc_mad()_Z,t_body_acc_max()_X,...,f_body_acc_Jerk_Mag_maxInd(),f_body_acc_Jerk_Mag_meanFreq(),f_body_acc_Jerk_Mag_skewness(),f_body_acc_Jerk_Mag_kurtosis(),angle0(),angle1(),angle2(),angle3(),angle4(),measurement_id
0,-1.298659e-08,9.704914e-09,1.038516e-09,0.007636,0.008218,0.008296,0.007568,0.008225,0.008204,0.055133,...,0.0,26.329799,488.900054,257778.588612,1.537272,2.903597,1.547309,1.590222,0.030482,eab1cc17-40aa-4e1f-8e4f-64e6d5743509
1,8.817708e-06,8.912955e-06,9.574077e-06,0.431135,0.321677,0.350872,0.093058,0.054945,0.077654,16.611039,...,0.0,13.92747,57.614093,7362.677375,1.589332,1.464762,1.978408,0.618983,2.008286,b1a5fd6d-db9c-4870-a3c0-943e0656d112
2,-0.0001101328,0.0001876883,7.999644e-06,0.634298,0.537531,1.321364,0.193828,0.142518,0.514031,9.650431,...,0.0,5.528199,32.608196,1953.344718,1.633826,1.445278,1.294846,1.524947,0.279931,bca5e12d-9fd6-496b-ac08-9e2472d8b299
3,-1.132866e-05,-4.062197e-06,-3.427088e-06,0.413682,0.369001,0.493479,0.091024,0.073786,0.079791,4.61245,...,0.0,14.752543,94.79209,13908.541928,2.175627,1.127538,1.403095,0.792376,0.806287,9d74f5e1-241a-4f4a-bc6f-2779edf410cd
4,7.763831e-08,1.221371e-07,-1.060009e-07,0.010878,0.008543,0.012374,0.008105,0.006387,0.009856,0.224689,...,0.0,14.331148,169.593004,32042.543284,2.186913,0.555705,1.558688,1.559488,0.016568,49f80736-6b50-44a6-a77b-9b1572334a8c


In [106]:
# Stack training and ancillary smartphone features into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
Frame = pd.concat([df_train, pd.DataFrame(data = df_ancillary)], ignore_index=True)

In [107]:
Frame.shape

(874, 376)

In [111]:
Frame.to_csv('analysis2_realpd_comp_training_abhiroop_tillhurst_smartphone.csv',index=False)

### testing data

In [112]:
# Same feature extraction as for the training files, applied to the smartphone testing files.
a_real=glob.glob("testing_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine)(i) for i in a_real)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# timing noted from an earlier run: 1.1097 Mins

# NOTE: the name df_train is reused here for the *testing* rows; it overwrites the training frame.
df_train=pd.DataFrame(result)
df_train.columns=all_columns
# measurement_id = file name without the directory prefix and the ".csv" suffix
df_train['measurement_id']=[item[len('testing_data/smartphone_accelerometer/'):-4] for item in a_real]
print(df_train.shape)

# keep an independent copy under a test-specific name
Frame_test=df_train.copy()

--- 1.1097432374954224 Mins ---
(169, 376)


In [114]:
Frame_test.to_csv('analysis2_realpd_comp_testing_abhiroop_tillhurst_smartphone.csv',index=False)

In [118]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import ppscore as pps
import lightgbm as lgb
from catboost import CatBoostRegressor, FeaturesData, Pool
from sklearn import preprocessing

In [119]:
# Smartphone accelerometer features (Frame) joined with the label tables on measurement_id.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
label=pd.concat([real_pd_training_id, pd.DataFrame(real_pd_ancillary_id)], ignore_index=True)
df_train=pd.merge(label,Frame,on='measurement_id')
# replace subject ids by integer codes
df_train['subject_id']=preprocessing.LabelEncoder().fit(df_train['subject_id']).transform(df_train['subject_id'])
print(df_train.shape)
df_train.head()

(874, 380)


Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor,t_body_acc_mean()_X,t_body_acc_mean()_Y,t_body_acc_mean()_Z,t_body_acc_std()_X,t_body_acc_std()_Y,...,f_body_acc_Jerk_Mag_entropy(),f_body_acc_Jerk_Mag_maxInd(),f_body_acc_Jerk_Mag_meanFreq(),f_body_acc_Jerk_Mag_skewness(),f_body_acc_Jerk_Mag_kurtosis(),angle0(),angle1(),angle2(),angle3(),angle4()
0,ee053d95-c155-400d-ae42-fe24834ad4a9,2,1.0,0.0,3.0,2e-06,-6e-06,-4e-06,0.370037,0.138497,...,9.902179,0.0,9.78575,118.016913,17569.540743,1.297745,1.786036,0.605897,1.917258,1.095966
1,ce51ee31-8553-4321-9f83-8cd3dabe2f66,2,1.0,0.0,2.0,1.2e-05,-3e-06,-7e-06,0.744393,0.67596,...,10.504205,0.0,18.024094,41.358335,2958.574457,1.652786,1.495809,1.16749,1.69814,0.425195
2,5c42911d-0ebd-47ba-9925-dd5ab1c0ed61,2,1.0,0.0,1.0,-2.7e-05,5e-06,3.7e-05,1.388002,1.075267,...,10.237465,0.0,14.010328,30.98182,2527.258521,2.204363,0.918719,0.215527,1.786323,1.570499
3,274f5bc8-2e4f-4d7c-a546-b65b7d6bd01e,2,0.0,0.0,,9e-06,1.2e-05,9e-06,0.99418,0.631398,...,9.872395,0.0,9.476543,42.59457,3912.692417,1.194747,1.770899,0.15107,1.675052,1.679727
4,ecbeea40-8770-455d-90a6-597e7f896e1b,2,0.0,1.0,0.0,-4.3e-05,-4.7e-05,-6.1e-05,1.302788,0.960859,...,9.885401,0.0,9.648741,54.939414,5976.983118,1.854147,1.293888,0.22502,1.668464,1.772858


In [120]:
# Run the notebook's lightgbm and catboost helpers (defined in another cell) once per
# label column; the cell output shows one test MSE line per call.
lightgbm(df_train,'tremor')
catboost(df_train,'tremor')

lightgbm(df_train,'dyskinesia')
catboost(df_train,'dyskinesia')

lightgbm(df_train,'on_off')
catboost(df_train,'on_off')

lightgbm test_MSE tremor : 0.551
catboost test_MSE tremor : 0.582
lightgbm test_MSE dyskinesia : 0.139
catboost test_MSE dyskinesia : 0.135
lightgbm test_MSE on_off : 0.192
catboost test_MSE on_off : 0.193


# SmartWatch

### Accelerometer

### training

In [186]:
# Keep the training smartwatch-accelerometer files whose selected device has >= 2000 rows.
a_real=glob.glob("training_data/smartwatch_accelerometer/*.csv")
a_real_acc=[]
num=[]
for i in a_real:
    data=pd.read_csv(i)
    # per device: variance of x and number of y readings
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # prefer the device with the largest x variance ...
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # ... unless it holds at most 20% of the rows; then use the smallest-variance device.
    # .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated in pandas.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_acc.append(i)

In [187]:
len(a_real_acc),len(a_real)

(530, 535)

In [188]:
# smartwatch_accelerometer: one feature row per retained training file, computed in parallel
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_SmartWatch)(i) for i in a_real_acc)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# timing noted from an earlier run: 1.8061357021331788 Mins

--- 1.6911399324735006 Mins ---


In [189]:
# Assemble the rows; the first value of every row is the selected device id.
df_train=pd.DataFrame(result)
df_train.columns=['device_id_acc']+all_columns
# measurement_id = file name without the directory prefix and the ".csv" suffix (same order as a_real_acc)
df_train['measurement_id']=[item[len('training_data/smartwatch_accelerometer/'):-4] for item in a_real_acc]
print(df_train.shape)
df_train.head()

(530, 377)


Unnamed: 0,device_id_acc,t_body_acc_mean()_X,t_body_acc_mean()_Y,t_body_acc_mean()_Z,t_body_acc_std()_X,t_body_acc_std()_Y,t_body_acc_std()_Z,t_body_acc_mad()_X,t_body_acc_mad()_Y,t_body_acc_mad()_Z,...,f_body_acc_Jerk_Mag_maxInd(),f_body_acc_Jerk_Mag_meanFreq(),f_body_acc_Jerk_Mag_skewness(),f_body_acc_Jerk_Mag_kurtosis(),angle0(),angle1(),angle2(),angle3(),angle4(),measurement_id
0,2VSP,-4.5e-05,2.6e-05,-6e-06,1.093287,1.196446,1.509475,0.316232,0.372479,0.546717,...,0.0,8.932226,70.200057,8395.906698,2.178365,1.427471,1.478582,3.047426,1.58982,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,4.2e-05,7.9e-05,4e-06,2.437274,2.530655,2.999879,1.697512,1.854427,2.17781,...,0.0,8.905802,110.57826,16375.753738,2.026717,1.802777,0.942807,2.512359,1.536386,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,-1e-06,-1e-05,-3.1e-05,0.813673,0.904853,0.768711,0.104264,0.123573,0.12879,...,0.0,9.190171,64.479268,6330.404436,1.534705,1.621401,1.878845,2.733065,1.311211,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,-2e-06,8e-06,5.9e-05,0.308921,0.336762,0.387006,0.077327,0.075313,0.095581,...,0.0,9.624517,88.317307,11671.306281,1.959777,1.167866,1.364587,2.818863,1.815439,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,1.4e-05,-5e-06,-2e-06,0.558929,0.657292,0.692808,0.144032,0.1444,0.188741,...,0.0,9.98152,51.573305,4979.214104,1.06352,1.936788,1.230037,2.427341,0.97232,f96752b5-850e-4a5a-a74a-69ab4893b6aa


In [190]:
# Keep the ancillary smartwatch-accelerometer files whose selected device has >= 2000 rows.
a_real=glob.glob("ancillary_data/smartwatch_accelerometer/*.csv")
a_real_acc=[]
num=[]
for i in a_real:
    data=pd.read_csv(i)
    # per device: variance of x and number of y readings
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # prefer the device with the largest x variance ...
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # ... unless it holds at most 20% of the rows; then use the smallest-variance device.
    # .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated in pandas.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_acc.append(i)

In [191]:
len(a_real_acc),len(a_real)

(426, 428)

In [192]:
# smartwatch_accelerometer: one feature row per retained ancillary file, computed in parallel
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_SmartWatch)(i) for i in a_real_acc)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# NOTE(review): this timing comment is identical to the one in the training cell and differs
# from the printed output below -- likely a stale copy: 1.8061357021331788 Mins

--- 2.083820815881093 Mins ---


In [193]:
# Assemble the ancillary rows; the first value of every row is the selected device id.
df_ancillary=pd.DataFrame(result)
df_ancillary.columns=['device_id_acc']+all_columns
# measurement_id = file name without the directory prefix and the ".csv" suffix (same order as a_real_acc)
df_ancillary['measurement_id']=[item[len('ancillary_data/smartwatch_accelerometer/'):-4] for item in a_real_acc]
print(df_ancillary.shape)
df_ancillary.head()

(426, 377)


Unnamed: 0,device_id_acc,t_body_acc_mean()_X,t_body_acc_mean()_Y,t_body_acc_mean()_Z,t_body_acc_std()_X,t_body_acc_std()_Y,t_body_acc_std()_Z,t_body_acc_mad()_X,t_body_acc_mad()_Y,t_body_acc_mad()_Z,...,f_body_acc_Jerk_Mag_maxInd(),f_body_acc_Jerk_Mag_meanFreq(),f_body_acc_Jerk_Mag_skewness(),f_body_acc_Jerk_Mag_kurtosis(),angle0(),angle1(),angle2(),angle3(),angle4(),measurement_id
0,2WLT,5.425922e-05,3.3e-05,6.8e-05,0.490945,0.735682,0.673004,0.115697,0.181025,0.129765,...,0.0,9.381558,67.869028,7596.220424,1.359991,1.825833,1.473242,2.488684,0.927745,eab1cc17-40aa-4e1f-8e4f-64e6d5743509
1,2WH7,1.376938e-05,-3e-06,-1.1e-05,0.115186,0.23601,0.126842,0.046773,0.046221,0.05323,...,0.0,9.775164,93.315049,12278.230062,1.446872,1.701135,1.159022,2.458166,1.060529,166ba983-209f-4639-a5a6-d6e66adeba2b
2,2WH5,8.178773e-06,4.8e-05,7.6e-05,0.696954,0.844087,0.927901,0.073609,0.064186,0.091311,...,0.0,9.252131,44.071087,3055.500241,2.001357,1.098007,1.074279,2.645044,1.565727,bca5e12d-9fd6-496b-ac08-9e2472d8b299
3,327T,8.99589e-07,4e-06,3e-06,0.065349,0.080587,0.074796,0.065094,0.061681,0.071341,...,0.0,11.816651,164.767524,28045.486036,1.711799,1.210994,2.38499,1.94061,0.94765,49f80736-6b50-44a6-a77b-9b1572334a8c
4,327T,-2.492534e-05,-1.7e-05,-5e-06,0.312973,0.304496,0.316882,0.087852,0.083471,0.091598,...,0.0,9.414327,98.998922,13588.476752,1.383559,1.907028,1.320616,2.571777,1.070925,26f49660-ce1a-4946-8f83-f88850f03ec1


In [194]:
# Stack training and ancillary smartwatch-accelerometer features with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
Frame_smartwatch_acc = pd.concat([df_train, pd.DataFrame(data = df_ancillary)], ignore_index=True)

In [195]:
Frame_smartwatch_acc.shape

(956, 377)

In [196]:
Frame_smartwatch_acc.head()

Unnamed: 0,device_id_acc,t_body_acc_mean()_X,t_body_acc_mean()_Y,t_body_acc_mean()_Z,t_body_acc_std()_X,t_body_acc_std()_Y,t_body_acc_std()_Z,t_body_acc_mad()_X,t_body_acc_mad()_Y,t_body_acc_mad()_Z,...,f_body_acc_Jerk_Mag_maxInd(),f_body_acc_Jerk_Mag_meanFreq(),f_body_acc_Jerk_Mag_skewness(),f_body_acc_Jerk_Mag_kurtosis(),angle0(),angle1(),angle2(),angle3(),angle4(),measurement_id
0,2VSP,-4.5e-05,2.6e-05,-6e-06,1.093287,1.196446,1.509475,0.316232,0.372479,0.546717,...,0.0,8.932226,70.200057,8395.906698,2.178365,1.427471,1.478582,3.047426,1.58982,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,4.2e-05,7.9e-05,4e-06,2.437274,2.530655,2.999879,1.697512,1.854427,2.17781,...,0.0,8.905802,110.57826,16375.753738,2.026717,1.802777,0.942807,2.512359,1.536386,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,-1e-06,-1e-05,-3.1e-05,0.813673,0.904853,0.768711,0.104264,0.123573,0.12879,...,0.0,9.190171,64.479268,6330.404436,1.534705,1.621401,1.878845,2.733065,1.311211,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,-2e-06,8e-06,5.9e-05,0.308921,0.336762,0.387006,0.077327,0.075313,0.095581,...,0.0,9.624517,88.317307,11671.306281,1.959777,1.167866,1.364587,2.818863,1.815439,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,1.4e-05,-5e-06,-2e-06,0.558929,0.657292,0.692808,0.144032,0.1444,0.188741,...,0.0,9.98152,51.573305,4979.214104,1.06352,1.936788,1.230037,2.427341,0.97232,f96752b5-850e-4a5a-a74a-69ab4893b6aa


### testing

In [197]:
# Testing split: keep the smartwatch-accelerometer files whose selected device has >= 2000 rows.
a_real=glob.glob("testing_data/smartwatch_accelerometer/*.csv")
a_real_acc=[]
num=[]
for i in a_real:
    data=pd.read_csv(i)
    # per device: variance of x and number of y readings
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # prefer the device with the largest x variance ...
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # ... unless it holds at most 20% of the rows; then use the smallest-variance device.
    # .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated in pandas.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_acc.append(i)

print(len(a_real_acc),len(a_real))

# smartwatch_accelerometer: one feature row per retained testing file, computed in parallel
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_SmartWatch)(i) for i in a_real_acc)


# NOTE: the name df_train is reused here for the *testing* rows; it overwrites the training frame.
df_train=pd.DataFrame(result)
df_train.columns=['device_id_acc']+all_columns
# measurement_id = file name without the directory prefix and the ".csv" suffix
df_train['measurement_id']=[item[len('testing_data/smartwatch_accelerometer/'):-4] for item in a_real_acc]
print(df_train.shape)

# keep an independent copy under a test-specific name
Frame_smartwatch_acc_test=df_train.copy()

171 172
(171, 377)


In [198]:
# Smartwatch accelerometer features (Frame_smartwatch_acc) joined with the label tables on measurement_id.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
label=pd.concat([real_pd_training_id, pd.DataFrame(real_pd_ancillary_id)], ignore_index=True)
df_train=pd.merge(label,Frame_smartwatch_acc,on='measurement_id')
# replace subject ids and device ids by integer codes
df_train['subject_id']=preprocessing.LabelEncoder().fit(df_train['subject_id']).transform(df_train['subject_id'])
df_train['device_id_acc']=preprocessing.LabelEncoder().fit(df_train['device_id_acc']).transform(df_train['device_id_acc'])
print(df_train.shape)
df_train.head()

(956, 381)


Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor,device_id_acc,t_body_acc_mean()_X,t_body_acc_mean()_Y,t_body_acc_mean()_Z,t_body_acc_std()_X,...,f_body_acc_Jerk_Mag_entropy(),f_body_acc_Jerk_Mag_maxInd(),f_body_acc_Jerk_Mag_meanFreq(),f_body_acc_Jerk_Mag_skewness(),f_body_acc_Jerk_Mag_kurtosis(),angle0(),angle1(),angle2(),angle3(),angle4()
0,ee053d95-c155-400d-ae42-fe24834ad4a9,2,1.0,0.0,3.0,4,-6e-06,4e-06,-4e-06,0.118387,...,9.961393,0.0,10.774892,132.376736,20757.689213,1.617342,1.512167,2.226377,2.472398,1.455633
1,ce51ee31-8553-4321-9f83-8cd3dabe2f66,2,1.0,0.0,2.0,4,4.1e-05,-5.1e-05,-9e-06,0.469712,...,9.869984,0.0,8.537766,57.652729,5851.838781,1.683599,1.45213,2.181523,2.02573,0.807339
2,d3c89012-3ab9-4014-b577-61ff05e31968,2,1.0,0.0,0.0,0,5.9e-05,-7e-06,1.4e-05,0.775597,...,9.591627,0.0,9.393297,76.932225,8692.865354,1.987189,1.335116,2.204977,2.387588,1.220842
3,5c42911d-0ebd-47ba-9925-dd5ab1c0ed61,2,1.0,0.0,1.0,4,-2.9e-05,1.7e-05,-2.1e-05,0.768933,...,9.930847,0.0,9.437328,102.191731,14366.602821,1.856489,2.717854,2.020207,2.311734,0.931814
4,235472d5-ad2e-4c76-947e-358c9d8c1280,2,1.0,0.0,0.0,0,-7.5e-05,2.8e-05,8e-06,0.657183,...,9.915465,0.0,9.324634,90.772888,12066.117961,1.632536,1.478546,1.690031,2.348133,0.791487


In [199]:
# Run the notebook's lightgbm and catboost helpers (defined in another cell) once per
# label column on the smartwatch-accelerometer frame; the output shows one test MSE line per call.
lightgbm(df_train,'tremor')
catboost(df_train,'tremor')

lightgbm(df_train,'dyskinesia')
catboost(df_train,'dyskinesia')

lightgbm(df_train,'on_off')
catboost(df_train,'on_off')

lightgbm test_MSE tremor : 0.563
catboost test_MSE tremor : 0.559
lightgbm test_MSE dyskinesia : 0.213
catboost test_MSE dyskinesia : 0.212
lightgbm test_MSE on_off : 0.198
catboost test_MSE on_off : 0.208


In [None]:
#All Freq=========================================
lightgbm test_MSE tremor : 0.591
catboost test_MSE tremor : 0.544
lightgbm test_MSE dyskinesia : 0.214
catboost test_MSE dyskinesia : 0.215
lightgbm test_MSE on_off : 0.198
catboost test_MSE on_off : 0.189

#Freq=50=============================================
lightgbm test_MSE tremor : 0.563
catboost test_MSE tremor : 0.559
lightgbm test_MSE dyskinesia : 0.213
catboost test_MSE dyskinesia : 0.212
lightgbm test_MSE on_off : 0.198
catboost test_MSE on_off : 0.208    



## Smartwatch Gyroscope

In [200]:
def preprocess_real_gyroscope(data):
    """Select one device's gyroscope recording and resample it on a uniform 0.02 s grid.

    Device choice: the device whose ``x`` readings have the largest variance
    is preferred; if that device holds at most 20% of all rows, the device
    with the smallest ``x`` variance is used instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw recording with the columns ``device_id``, ``t``, ``x``, ``y``, ``z``.
        The frame passed in is left unmodified.

    Returns
    -------
    tuple
        ``(df, deviceid, freq)``: ``df`` has the columns ``Timestamp``,
        ``acc_X``, ``acc_Y``, ``acc_Z`` sampled every 0.02 s from the first up
        to (but excluding) the last timestamp of the selected device;
        ``deviceid`` is the selected device; ``freq`` is the constant 50.
        The ``acc_*`` names are kept although the values are gyroscope
        readings -- presumably the downstream helpers expect them; verify.
    """
    freq=50

    # per device: variance of x and number of y readings
    device_stats=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    top_row=device_stats.x.idxmax()
    deviceid=device_stats.loc[top_row,'device_id']
    # Read the count from the selected row itself, which yields a scalar; calling int()
    # on a one-element Series (the former lookup by device_id) is deprecated in pandas.
    if int(device_stats.loc[top_row,'y'])<=data.shape[0]*0.2:
        deviceid=device_stats.loc[device_stats.x.idxmin(),'device_id']

    # fresh 0..n-1 index so that Timestamp[0] / Timestamp[n-1] address the first / last row
    data=data[data.device_id==deviceid].reset_index(drop=True)
    data=data.rename(columns={'t':'Timestamp','x':'X','y':'Y','z':'Z'})

    # uniform grid; np.arange excludes the end point, so every grid value lies inside the recording
    t1=np.arange(data.Timestamp[0],data.Timestamp[(data.shape[0])-1],0.02)
    df=pd.DataFrame({'Timestamp':t1})

    for axis in ('X','Y','Z'):
        # interp1d is called without `kind`, i.e. with its default (linear) interpolation
        resample = interpolate.interp1d(data.Timestamp, data[axis])
        df['acc_'+axis]=resample(t1)

    return df[['Timestamp','acc_X','acc_Y','acc_Z']],deviceid,freq


def Dataset_Generation_PipeLine_gyroscope(b):
    """Turn one gyroscope CSV file into a single feature row.

    Parameters
    ----------
    b : str
        Path of the CSV file to read.

    Returns
    -------
    list
        [device_id] followed by the time-domain features, the
        frequency-domain features and the angle features, in that order.
        No column names are attached here; callers label the row themselves.
    """
    data,device_id,sampling_freq=preprocess_real_gyroscope(pd.read_csv(b))
    time_sig_df=time_domain_signal(data,sampling_freq)
    freq_sig_df,dfreq=fast_fourier_transform(time_sig_df,sampling_freq)

    # generate all time features from the time-domain signals
    time_features = t_axial_features_generation(time_sig_df) + t_mag_features_generation(time_sig_df)
    # generate all frequency features from the frequency-domain signals
    frequency_features = f_axial_features_generation(freq_sig_df,dfreq) + f_mag_features_generation(freq_sig_df,dfreq)

    # generate additional (angle) features from the time-domain signals
    additional_features= angle_features(time_sig_df)

    # concatenate all features, prefixed by the id of the device that was kept
    row= [device_id]+time_features + frequency_features + additional_features
    return(row)


### training data

In [201]:
# Keep only the training recordings whose selected device has at least 2000 samples.
a_real=glob.glob("training_data/smartwatch_gyroscope/*.csv")
a_real_gyro=[]
num=[]
for i in a_real:
    data=pd.read_csv(i)
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # Highest x-variance device, unless it holds 20% or less of the rows;
    # then fall back to the lowest-variance device.
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # .iloc[0] pulls out the scalar count: int() on a one-element Series is deprecated in pandas.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_gyro.append(i)

In [202]:
# (files kept after the 2000-sample filter, files found)
len(a_real_gyro),len(a_real)

(530, 535)

In [203]:
# smartwatch_accelerometer
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_gyroscope)(i) for i in a_real_gyro)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
#1.8061357021331788 Mins

--- 2.9883177002271015 Mins ---


In [204]:
# One row per kept training file; feature names are the shared list with 'acc' renamed to 'gyro'.
df_train=pd.DataFrame(result,columns=['device_id_gyro']+[name.replace('acc','gyro') for name in all_columns])
# measurement_id = file name without the directory prefix and the '.csv' suffix.
df_train['measurement_id']=[path[len('training_data/smartwatch_gyroscope/'):-4] for path in a_real_gyro]
# NOTE(review): `tok in 'grav'` asks whether the token is a SUBSTRING of 'grav' (operands look
# reversed; `tok == 'grav'` was probably intended). Behaviour is kept as-is so that the training,
# ancillary and testing frames keep identical columns -- change all three together.
df_train=df_train[[col for col in df_train.columns if all(tok not in 'grav' for tok in col.split('_'))]]
df_train.head()

Unnamed: 0,device_id_gyro,t_body_gyro_mean()_X,t_body_gyro_mean()_Y,t_body_gyro_mean()_Z,t_body_gyro_std()_X,t_body_gyro_std()_Y,t_body_gyro_std()_Z,t_body_gyro_mad()_X,t_body_gyro_mad()_Y,t_body_gyro_mad()_Z,...,f_body_gyro_Jerk_Mag_maxInd(),f_body_gyro_Jerk_Mag_meanFreq(),f_body_gyro_Jerk_Mag_skewness(),f_body_gyro_Jerk_Mag_kurtosis(),angle0(),angle1(),angle2(),angle3(),angle4(),measurement_id
0,2VSP,-8.08012e-06,-3.324895e-06,4.367288e-06,0.868691,0.391412,0.414207,0.177139,0.091405,0.103189,...,0.0,8.149658,57.374883,6181.289833,2.521363,2.155205,1.180774,0.733659,2.154569,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,-4.102466e-06,-1.187798e-06,-8.524528e-06,2.050404,0.890641,0.887509,1.28219,0.65803,0.624217,...,0.0,8.863675,117.926153,17861.628404,1.776384,0.649764,2.362049,1.05639,1.044818,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,-3.105367e-07,-9.208662e-08,-2.640078e-07,0.414637,0.25705,0.317073,0.034547,0.023544,0.023482,...,0.0,8.831006,56.176751,4835.658182,1.595897,1.794558,1.967655,2.210951,0.791589,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,-7.127714e-07,3.723054e-08,3.11016e-07,0.175526,0.097615,0.13266,0.018259,0.010175,0.014362,...,0.0,9.378177,35.582949,2662.389466,0.697318,2.913551,2.171902,0.908074,0.989416,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,-4.32626e-07,3.761519e-08,1.242769e-07,0.344894,0.169855,0.210206,0.058258,0.023167,0.034459,...,0.0,9.503903,48.21878,4586.099451,0.709405,1.729234,2.190499,0.933912,0.981358,f96752b5-850e-4a5a-a74a-69ab4893b6aa


In [205]:
# Keep only the ancillary recordings whose selected device has at least 2000 samples.
a_real=glob.glob("ancillary_data/smartwatch_gyroscope/*.csv")
a_real_gyro=[]
num=[]
for i in a_real:
    data=pd.read_csv(i)
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # Highest x-variance device, unless it holds 20% or less of the rows;
    # then fall back to the lowest-variance device.
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # .iloc[0] pulls out the scalar count: int() on a one-element Series is deprecated in pandas.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_gyro.append(i)
#pd.DataFrame({'num':num}).sort_values(by='num')

In [206]:
# (files kept after the 2000-sample filter, files found)
len(a_real_gyro),len(a_real)

(426, 428)

In [207]:
# smartwatch_accelerometer
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_gyroscope)(i) for i in a_real_gyro)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
#1.8061357021331788 Mins

--- 1.8972928206125894 Mins ---


In [208]:
# One row per kept ancillary file; feature names are the shared list with 'acc' renamed to 'gyro'.
df_ancillary=pd.DataFrame(result,columns=['device_id_gyro']+[name.replace('acc','gyro') for name in all_columns])
# measurement_id = file name without the directory prefix and the '.csv' suffix.
df_ancillary['measurement_id']=[path[len('ancillary_data/smartwatch_gyroscope/'):-4] for path in a_real_gyro]
# NOTE(review): `tok in 'grav'` asks whether the token is a SUBSTRING of 'grav' (operands look
# reversed; `tok == 'grav'` was probably intended). Behaviour is kept as-is so that the training,
# ancillary and testing frames keep identical columns -- change all three together.
df_ancillary=df_ancillary[[col for col in df_ancillary.columns if all(tok not in 'grav' for tok in col.split('_'))]]
#df_ancillary=df_ancillary.drop('device_id_gyro',axis=1)
print(df_ancillary.shape)
df_ancillary.head()

(426, 321)


Unnamed: 0,device_id_gyro,t_body_gyro_mean()_X,t_body_gyro_mean()_Y,t_body_gyro_mean()_Z,t_body_gyro_std()_X,t_body_gyro_std()_Y,t_body_gyro_std()_Z,t_body_gyro_mad()_X,t_body_gyro_mad()_Y,t_body_gyro_mad()_Z,...,f_body_gyro_Jerk_Mag_maxInd(),f_body_gyro_Jerk_Mag_meanFreq(),f_body_gyro_Jerk_Mag_skewness(),f_body_gyro_Jerk_Mag_kurtosis(),angle0(),angle1(),angle2(),angle3(),angle4(),measurement_id
0,2WLT,-5.048148e-05,8.241901e-06,6.431667e-06,0.359878,0.153655,0.194087,0.04523,0.021919,0.030113,...,0.0,9.688592,53.708472,5158.189742,1.333381,1.873317,1.628063,1.14803,0.427127,eab1cc17-40aa-4e1f-8e4f-64e6d5743509
1,2WH7,-1.550726e-07,2.997507e-08,4.269507e-08,0.093352,0.030233,0.031614,0.005101,0.003367,0.002293,...,0.0,8.456616,21.719762,977.283788,0.690522,2.611922,2.131496,0.944352,0.913287,166ba983-209f-4639-a5a6-d6e66adeba2b
2,2WH5,-4.557935e-06,3.19067e-07,-2.749723e-06,0.604687,0.313818,0.296413,0.017963,0.011914,0.01626,...,0.0,8.367207,33.020641,1804.197455,0.58713,2.091755,2.695817,1.980418,1.736736,bca5e12d-9fd6-496b-ac08-9e2472d8b299
3,327T,-5.468461e-08,3.229373e-07,1.056226e-07,0.021726,0.019633,0.014395,0.011019,0.015752,0.009075,...,0.0,8.461756,125.658035,19474.280979,1.450554,0.9606,3.092282,1.59884,1.611346,49f80736-6b50-44a6-a77b-9b1572334a8c
4,327T,-1.299787e-06,-3.873046e-07,-2.455625e-07,0.158736,0.089426,0.102105,0.019906,0.009091,0.012959,...,0.0,8.655759,58.806034,5771.389617,0.190302,2.603398,2.786293,1.926096,1.571183,26f49660-ce1a-4946-8f83-f88850f03ec1


In [209]:
# Stack the training and ancillary gyroscope features into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
Frame_smartwatch_gyro = pd.concat([df_train, df_ancillary], ignore_index=True)
print(Frame_smartwatch_gyro.shape)
Frame_smartwatch_gyro.head()

(956, 321)


Unnamed: 0,device_id_gyro,t_body_gyro_mean()_X,t_body_gyro_mean()_Y,t_body_gyro_mean()_Z,t_body_gyro_std()_X,t_body_gyro_std()_Y,t_body_gyro_std()_Z,t_body_gyro_mad()_X,t_body_gyro_mad()_Y,t_body_gyro_mad()_Z,...,f_body_gyro_Jerk_Mag_maxInd(),f_body_gyro_Jerk_Mag_meanFreq(),f_body_gyro_Jerk_Mag_skewness(),f_body_gyro_Jerk_Mag_kurtosis(),angle0(),angle1(),angle2(),angle3(),angle4(),measurement_id
0,2VSP,-8.08012e-06,-3.324895e-06,4.367288e-06,0.868691,0.391412,0.414207,0.177139,0.091405,0.103189,...,0.0,8.149658,57.374883,6181.289833,2.521363,2.155205,1.180774,0.733659,2.154569,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,-4.102466e-06,-1.187798e-06,-8.524528e-06,2.050404,0.890641,0.887509,1.28219,0.65803,0.624217,...,0.0,8.863675,117.926153,17861.628404,1.776384,0.649764,2.362049,1.05639,1.044818,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,-3.105367e-07,-9.208662e-08,-2.640078e-07,0.414637,0.25705,0.317073,0.034547,0.023544,0.023482,...,0.0,8.831006,56.176751,4835.658182,1.595897,1.794558,1.967655,2.210951,0.791589,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,-7.127714e-07,3.723054e-08,3.11016e-07,0.175526,0.097615,0.13266,0.018259,0.010175,0.014362,...,0.0,9.378177,35.582949,2662.389466,0.697318,2.913551,2.171902,0.908074,0.989416,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,-4.32626e-07,3.761519e-08,1.242769e-07,0.344894,0.169855,0.210206,0.058258,0.023167,0.034459,...,0.0,9.503903,48.21878,4586.099451,0.709405,1.729234,2.190499,0.933912,0.981358,f96752b5-850e-4a5a-a74a-69ab4893b6aa


### testing

In [211]:
# Keep only the testing recordings whose selected device has at least 2000 samples.
a_real=glob.glob("testing_data/smartwatch_gyroscope/*.csv")
a_real_gyro=[]
num=[]
for i in a_real:
    data=pd.read_csv(i)
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # Highest x-variance device, unless it holds 20% or less of the rows;
    # then fall back to the lowest-variance device.
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # .iloc[0] pulls out the scalar count: int() on a one-element Series is deprecated in pandas.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_gyro.append(i)

# smartwatch_gyroscope (testing files): build one feature row per kept file in parallel.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_gyroscope)(i) for i in a_real_gyro)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Previously noted timing (presumably from an earlier run): 1.8061357021331788 Mins

# NOTE: df_train is reused here as a scratch name for the TESTING features.
df_train=pd.DataFrame(result)
df_train.columns=['device_id_gyro']+[i.replace('acc','gyro') for i in all_columns]
df_train['measurement_id']=[item[len('testing_data/smartwatch_gyroscope/'):-4] for item in a_real_gyro]
# NOTE(review): `w in 'grav'` asks whether the token is a SUBSTRING of 'grav' (operands look
# reversed; `w == 'grav'` was probably intended). Left unchanged so the testing columns stay
# identical to the training/ancillary ones -- change all three cells together.
df_train=df_train[[i for i in list(df_train.columns) if not any(w in 'grav' for w in i.split('_'))]]
print(df_train.shape)

Frame_smartwatch_gyro_test=df_train.copy()

--- 0.6322071989377339 Mins ---
(171, 321)


In [212]:
#smartwatch_gyroscope features
# Stack the training and ancillary label tables (pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0), then attach the gyroscope features per measurement.
label=pd.concat([real_pd_training_id, pd.DataFrame(real_pd_ancillary_id)], ignore_index=True)
df_train=pd.merge(label,Frame_smartwatch_gyro,on='measurement_id')
# Encode the string ids as integer codes.
df_train['subject_id']=preprocessing.LabelEncoder().fit(df_train['subject_id']).transform(df_train['subject_id'])
df_train['device_id_gyro']=preprocessing.LabelEncoder().fit(df_train['device_id_gyro']).transform(df_train['device_id_gyro'])
print(df_train.shape)
df_train.head()



(956, 325)


Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor,device_id_gyro,t_body_gyro_mean()_X,t_body_gyro_mean()_Y,t_body_gyro_mean()_Z,t_body_gyro_std()_X,...,f_body_gyro_Jerk_Mag_entropy(),f_body_gyro_Jerk_Mag_maxInd(),f_body_gyro_Jerk_Mag_meanFreq(),f_body_gyro_Jerk_Mag_skewness(),f_body_gyro_Jerk_Mag_kurtosis(),angle0(),angle1(),angle2(),angle3(),angle4()
0,ee053d95-c155-400d-ae42-fe24834ad4a9,2,1.0,0.0,3.0,4,1.493504e-07,-2.057308e-08,-1.936774e-08,0.051077,...,9.902863,0.0,9.20634,67.191496,7670.609654,1.443619,1.971498,1.531665,1.311748,2.879471
1,ce51ee31-8553-4321-9f83-8cd3dabe2f66,2,1.0,0.0,2.0,4,-8.376222e-07,-4.224139e-07,2.67156e-07,0.178178,...,9.924402,0.0,9.360919,46.04177,3802.467583,2.11341,1.052636,1.4294,1.244529,2.78389
2,d3c89012-3ab9-4014-b577-61ff05e31968,2,1.0,0.0,0.0,0,-8.997607e-06,-1.483406e-06,4.661575e-06,0.508392,...,9.618452,0.0,9.010205,67.457024,7176.613631,0.382888,2.465438,2.994276,1.685141,1.478318
3,5c42911d-0ebd-47ba-9925-dd5ab1c0ed61,2,1.0,0.0,1.0,4,-1.70467e-06,-1.930004e-06,6.815845e-06,0.321224,...,9.920529,0.0,9.011484,86.203236,11088.610479,2.660534,1.385936,1.811996,1.361374,2.819379
4,235472d5-ad2e-4c76-947e-358c9d8c1280,2,1.0,0.0,0.0,0,-4.300504e-06,2.096929e-06,3.002186e-06,0.343931,...,9.87749,0.0,8.413563,63.591677,6997.725719,0.81879,0.352701,2.064089,1.886957,0.602229


In [213]:
# Score both regressors on each symptom label in turn: LightGBM first, then CatBoost.
for symptom in ('tremor','dyskinesia','on_off'):
    lightgbm(df_train,symptom)
    catboost(df_train,symptom)

lightgbm test_MSE tremor : 0.534
catboost test_MSE tremor : 0.557
lightgbm test_MSE dyskinesia : 0.192
catboost test_MSE dyskinesia : 0.206
lightgbm test_MSE on_off : 0.193
catboost test_MSE on_off : 0.204


In [None]:
#ALL Freq
# lightgbm test_MSE tremor : 0.526
# catboost test_MSE tremor : 0.584
# lightgbm test_MSE dyskinesia : 0.183
# catboost test_MSE dyskinesia : 0.2
# lightgbm test_MSE on_off : 0.188
# catboost test_MSE on_off : 0.196

#Freq=50
# lightgbm test_MSE tremor : 0.534
# catboost test_MSE tremor : 0.557
# lightgbm test_MSE dyskinesia : 0.192
# catboost test_MSE dyskinesia : 0.206
# lightgbm test_MSE on_off : 0.193
# catboost test_MSE on_off : 0.204

# SmartWatch

In [216]:
# Shapes of the gyroscope and accelerometer feature tables that are merged below.
Frame_smartwatch_gyro.shape,Frame_smartwatch_acc.shape

((956, 321), (956, 377))

In [217]:
# Shapes of the gyroscope and accelerometer testing feature tables that are merged below.
Frame_smartwatch_gyro_test.shape,Frame_smartwatch_acc_test.shape

((171, 321), (171, 377))

In [219]:
# Join accelerometer and gyroscope features on measurement_id and discard the gyroscope
# device id. Column names present in both inputs receive pandas' default _x/_y suffixes.
Frame_smartwatch=(
    pd.merge(Frame_smartwatch_acc,Frame_smartwatch_gyro,on='measurement_id')
    .drop(columns='device_id_gyro')
)
print(Frame_smartwatch.shape)

# Same join for the testing feature tables.
Frame_smartwatch_test=(
    pd.merge(Frame_smartwatch_acc_test,Frame_smartwatch_gyro_test,on='measurement_id')
    .drop(columns='device_id_gyro')
)
print(Frame_smartwatch_test.shape)
Frame_smartwatch_test.head()

(956, 696)
(171, 696)


Unnamed: 0,device_id_acc,t_body_acc_mean()_X,t_body_acc_mean()_Y,t_body_acc_mean()_Z,t_body_acc_std()_X,t_body_acc_std()_Y,t_body_acc_std()_Z,t_body_acc_mad()_X,t_body_acc_mad()_Y,t_body_acc_mad()_Z,...,f_body_gyro_Jerk_Mag_entropy(),f_body_gyro_Jerk_Mag_maxInd(),f_body_gyro_Jerk_Mag_meanFreq(),f_body_gyro_Jerk_Mag_skewness(),f_body_gyro_Jerk_Mag_kurtosis(),angle0()_y,angle1()_y,angle2()_y,angle3()_y,angle4()_y
0,2ZX6,2.781533e-05,-8.935407e-06,3.8e-05,2.474804,2.643943,2.716527,1.610341,1.674962,1.783418,...,9.950424,0.0,9.226285,114.846697,17136.043074,1.956712,1.81273,2.383035,2.255665,1.844431
1,2WH8,-7.366351e-05,2.100833e-05,-4.8e-05,0.705522,0.753662,0.775093,0.123034,0.140916,0.122533,...,9.911633,0.0,8.735523,53.520723,5103.143272,1.72816,1.920036,1.653169,1.55538,3.057784
2,2VSP,-7.772214e-07,1.537878e-05,1e-05,0.758341,0.774186,0.932441,0.217882,0.230607,0.227571,...,9.967489,0.0,10.225977,60.099827,6315.004912,1.136023,0.760462,1.525693,0.060505,1.530492
3,2VSP,8.015321e-06,-2.943306e-06,1.8e-05,0.544454,0.813671,0.841965,0.187693,0.235558,0.294467,...,9.882791,0.0,8.280775,52.660898,5593.044321,1.269218,0.950416,0.669843,0.901517,1.594226
4,2WLT,-1.124787e-05,-3.885093e-07,1e-06,0.738255,1.149816,1.084356,0.369389,0.468508,0.615158,...,9.888309,0.0,8.160465,54.243222,5976.988389,1.58352,1.963404,1.396007,0.914101,0.687733


In [220]:
# Persist the merged smartwatch feature tables (training+ancillary, and testing) without the index column.
Frame_smartwatch.to_csv('analysis2_realpd_comp_training_abhiroop_tillhurst_smartwatch.csv',index=False)
Frame_smartwatch_test.to_csv('analysis2_realpd_comp_testing_abhiroop_tillhurst_smartwatch.csv',index=False)

In [221]:
#smartwatch (accelerometer + gyroscope) features
# Stack the training and ancillary label tables (pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0), then attach the merged smartwatch features per measurement.
label=pd.concat([real_pd_training_id, pd.DataFrame(real_pd_ancillary_id)], ignore_index=True)
df_train=pd.merge(label,Frame_smartwatch,on='measurement_id')
# Encode the string ids as integer codes.
df_train['subject_id']=preprocessing.LabelEncoder().fit(df_train['subject_id']).transform(df_train['subject_id'])
df_train['device_id_acc']=preprocessing.LabelEncoder().fit(df_train['device_id_acc']).transform(df_train['device_id_acc'])
print(df_train.shape)
df_train.head()

(956, 700)


Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor,device_id_acc,t_body_acc_mean()_X,t_body_acc_mean()_Y,t_body_acc_mean()_Z,t_body_acc_std()_X,...,f_body_gyro_Jerk_Mag_entropy(),f_body_gyro_Jerk_Mag_maxInd(),f_body_gyro_Jerk_Mag_meanFreq(),f_body_gyro_Jerk_Mag_skewness(),f_body_gyro_Jerk_Mag_kurtosis(),angle0()_y,angle1()_y,angle2()_y,angle3()_y,angle4()_y
0,ee053d95-c155-400d-ae42-fe24834ad4a9,2,1.0,0.0,3.0,4,-6e-06,4e-06,-4e-06,0.118387,...,9.902863,0.0,9.20634,67.191496,7670.609654,1.443619,1.971498,1.531665,1.311748,2.879471
1,ce51ee31-8553-4321-9f83-8cd3dabe2f66,2,1.0,0.0,2.0,4,4.1e-05,-5.1e-05,-9e-06,0.469712,...,9.924402,0.0,9.360919,46.04177,3802.467583,2.11341,1.052636,1.4294,1.244529,2.78389
2,d3c89012-3ab9-4014-b577-61ff05e31968,2,1.0,0.0,0.0,0,5.9e-05,-7e-06,1.4e-05,0.775597,...,9.618452,0.0,9.010205,67.457024,7176.613631,0.382888,2.465438,2.994276,1.685141,1.478318
3,5c42911d-0ebd-47ba-9925-dd5ab1c0ed61,2,1.0,0.0,1.0,4,-2.9e-05,1.7e-05,-2.1e-05,0.768933,...,9.920529,0.0,9.011484,86.203236,11088.610479,2.660534,1.385936,1.811996,1.361374,2.819379
4,235472d5-ad2e-4c76-947e-358c9d8c1280,2,1.0,0.0,0.0,0,-7.5e-05,2.8e-05,8e-06,0.657183,...,9.87749,0.0,8.413563,63.591677,6997.725719,0.81879,0.352701,2.064089,1.886957,0.602229


In [222]:
# Score both regressors on each symptom label in turn: LightGBM first, then CatBoost.
for symptom in ('tremor','dyskinesia','on_off'):
    lightgbm(df_train,symptom)
    catboost(df_train,symptom)

lightgbm test_MSE tremor : 0.474
catboost test_MSE tremor : 0.557
lightgbm test_MSE dyskinesia : 0.196
catboost test_MSE dyskinesia : 0.198
lightgbm test_MSE on_off : 0.194
catboost test_MSE on_off : 0.199


In [None]:
#======================SmartWatch=============================
# lightgbm test_MSE tremor : 0.474
# catboost test_MSE tremor : 0.557
# lightgbm test_MSE dyskinesia : 0.196
# catboost test_MSE dyskinesia : 0.198
# lightgbm test_MSE on_off : 0.194
# catboost test_MSE on_off : 0.199

In [50]:
#lightgbm
def lightgbm(df4,label):
    """Fit a LightGBM regressor for ``label`` and print its hold-out MSE.

    Parameters
    ----------
    df4 : pandas.DataFrame
        Feature table that also contains the columns 'subject_id',
        'measurement_id', 'on_off', 'dyskinesia' and 'tremor'.
    label : str
        Name of the target column to predict.

    Prints ``lightgbm test_MSE <label> : <mse>`` (rounded to 3 decimals) and
    returns nothing.
    """
    # Rows with tremor == 4 are excluded whatever the target is
    # (NOTE(review): reason not visible here -- confirm with the author).
    df4=df4[df4['tremor']!=4]
    # Only rows that actually have a value for the target can be used.
    train=df4[df4[label].notnull()]

    # Id and label columns are not features.
    features=train.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1)

    # 70/30 split, stratified on the target, fixed seed for repeatability.
    x_train, x_test, y_train, y_test = train_test_split(features, train[label],
                                                        train_size=0.7,
                                                        stratify = train[label],
                                                        random_state=1)

    d_train = lgb.Dataset(x_train, label=y_train)

    params = {
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 5,
        'feature_fraction': 0.8,
    }
    # 150 boosting rounds.
    clf = lgb.train(params, d_train, 150)
    y_pred=clf.predict(x_test)

    # The training-set prediction is only needed for the optional diagnostic
    # below, so it is no longer computed on every call.
    #print('lightgbm train_MSE '+label+' :',round(mean_squared_error(y_train, clf.predict(x_train)),3))
    print('lightgbm test_MSE '+label+' :',round(mean_squared_error(y_test, y_pred),3))
    
    
#Catboost    
#This will give indexes of the categorical features
def categorical_index(df,cols):
    """Return the positional indexes in ``df.columns`` of the names in ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are searched.
    cols : iterable
        Column names to look up.

    Returns
    -------
    list
        One ``df.columns.get_loc`` result per name that exists, in the order
        given by ``cols``. Names that are not columns of ``df`` are skipped.
    """
    cat=[]
    for c in cols:
        try:
            cat.append(df.columns.get_loc(c))
        except KeyError:
            # Name is not a column of df: skip it. Any other error now
            # propagates instead of being silently swallowed.
            continue
    return cat

def catboost(df4,label): 
    """Fit a CatBoost regressor for ``label`` and print its hold-out MSE.

    Parameters
    ----------
    df4 : pandas.DataFrame
        Feature table that also contains the columns 'subject_id',
        'measurement_id', 'on_off', 'dyskinesia' and 'tremor'.
    label : str
        Name of the target column to predict.

    Prints ``catboost test_MSE <label> : <mse>`` (rounded to 3 decimals) and
    returns nothing.
    """
    # Rows with tremor == 4 are excluded whatever the target is
    # (NOTE(review): reason not visible here -- confirm with the author).
    df4=df4[df4['tremor']!=4]
    # Only rows that actually have a value for the target can be used.
    train=df4[~df4[label].isnull()]
    # 70/30 split, stratified on the target, fixed seed for repeatability.
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label], 
                                                        train_size=0.7, 
                                                        stratify = train[label],
                                                        random_state=1)
    # No column is declared categorical (empty name list), so this is an empty
    # list; it is computed once and shared by both pools.
    cat_features=categorical_index(X_train,[])

    train_dataset = Pool(data=X_train,
                         label=y_train,
                         cat_features=cat_features)

    eval_dataset = Pool(data=X_test,
                        label=y_test,
                        cat_features=cat_features)

    model = CatBoostRegressor(iterations=50, learning_rate=0.1, depth=6,verbose=False)
    # Fit model
    model.fit(train_dataset)
    # Get predictions on the held-out split
    preds = model.predict(eval_dataset)

    # The training-set prediction is only needed for the optional diagnostic
    # below, so it is no longer computed on every call.
    #print('catboost train_MSE '+label+' :',round(mean_squared_error(y_train, model.predict(train_dataset)),3))
    print('catboost test_MSE '+label+' :',round(mean_squared_error(y_test, preds),3))