In [None]:
import pandas as pd
import numpy as np

### Function

In [None]:
#==========================================================================================
#making time stamp uniform by Interpolation
from scipy import interpolate
def preprocess(data, freq=50):
    """Resample tri-axial accelerometer data onto a uniform time grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Timestamp', 'X', 'Y' and 'Z'.
    freq : float, optional
        Target sampling frequency; the grid step is ``1/freq``
        (default 50, i.e. a step of 0.02, as in the original implementation).

    Returns
    -------
    pandas.DataFrame
        Columns 'Timestamp', 'acc_X', 'acc_Y', 'acc_Z' sampled at the uniform
        grid ``np.arange(first_timestamp, last_timestamp, 1/freq)``.
    """
    step = 1 / float(freq)
    # Positional access: the first/last *rows* are wanted, whatever the index
    # labels are (a filtered frame does not necessarily start at label 0).
    start = data.Timestamp.iloc[0]
    stop = data.Timestamp.iloc[-1]
    t1 = np.arange(start, stop, step)

    df = pd.DataFrame({'Timestamp': t1})
    for axis in ('X', 'Y', 'Z'):
        # Cubic interpolation of each axis, evaluated on the uniform grid.
        fcubic = interpolate.interp1d(data.Timestamp, data[axis], kind='cubic')
        df['acc_' + axis] = fcubic(t1)
    return df

#making time stamp uniform by Interpolation 
def preprocess_real_smartphone(data):
    data.rename(columns={'t':'Timestamp','x':'X','y':'Y','z':'Z'},inplace=True)
    ls=['X','Y','Z']
    freq=round((1/((data.Timestamp.max()/data.Timestamp.shape[0]).round(3))),0)
    t1=np.arange(data.Timestamp[0],data.Timestamp[(data.shape[0])-1],(data.Timestamp.max()/data.Timestamp.shape[0]).round(3))
    #freq=100
    #t1=np.arange(data.Timestamp[0],data.Timestamp[(data.shape[0])-1],0.01)
    df=pd.DataFrame({'Timestamp':t1})
    for i in ls:
        fcubic = interpolate.interp1d(data.Timestamp, data[i], kind='cubic')
        df[i]=fcubic(t1)
    df.columns=['Timestamp','acc_X','acc_Y','acc_Z']
    return df,freq

def preprocess_real_smartwatch(data):
    """Pick one smartwatch device and resample its signal onto a 50 Hz grid.

    Device selection: the device with the largest variance of 'x' is chosen,
    unless it contributes at most 20% of all rows, in which case the device
    with the smallest variance of 'x' is used instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns 'device_id', 't', 'x', 'y', 'z'. The input frame itself is
        not modified.

    Returns
    -------
    (pandas.DataFrame, object, int)
        The resampled frame ('Timestamp', 'acc_X', 'acc_Y', 'acc_Z'), the
        selected device id, and the sampling frequency (always 50).
    """
    per_device = data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    deviceid = per_device.loc[per_device['x'].idxmax(), 'device_id']
    # Extract the scalar explicitly; int(<Series>) is deprecated in pandas.
    n_selected = per_device.loc[per_device['device_id'] == deviceid, 'y'].iloc[0]
    if n_selected <= data.shape[0]*0.2:
        deviceid = per_device.loc[per_device['x'].idxmin(), 'device_id']

    # drop=True: the old index is not needed and re-inserting it would fail
    # if the frame already had a column called 'index'.
    data = data[data.device_id == deviceid].reset_index(drop=True)
    data = data.rename(columns={'t':'Timestamp','x':'X','y':'Y','z':'Z'})

    freq = 50
    t1 = np.arange(data.Timestamp.iloc[0], data.Timestamp.iloc[-1], 0.02)
    df = pd.DataFrame({'Timestamp': t1})

    for axis in ('X', 'Y', 'Z'):
        # interp1d default kind, i.e. linear interpolation (unlike the
        # cubic interpolation used by the other preprocess functions).
        linear = interpolate.interp1d(data.Timestamp, data[axis])
        df['acc_' + axis] = linear(t1)
    return df[['Timestamp','acc_X','acc_Y','acc_Z']],deviceid,freq

#==========================================================================================
#median filter
from scipy.signal import medfilt # import the median filter function
def median(signal):
    """Apply a 3rd-order median filter (kernel_size=3) to a 1D signal.

    The input is converted to a numpy array first; the filtered array
    has the same length as the input.
    """
    samples = np.array(signal)
    filtered = medfilt(samples, kernel_size=3)
    return filtered


#==========================================================================================
#components_selection_one_signal
import math # import math library


def components_selection_one_signal(t_signal,sampling_freq):
    """Split a time-domain signal into a low-frequency and a body component.

    The signal is transformed with the FFT; bins are then selected by the
    absolute value of their frequency:

    * DC component   : |f| <= 0.3
    * body component : 0.3 < |f| <= 20

    Each selection is transformed back with the inverse FFT and its real
    part is kept.

    Returns
    -------
    tuple
        ``(total_component, t_DC_component, t_body_component, t_noise)``.
        ``total_component`` and ``t_noise`` are currently always empty lists
        (their computation is disabled); they are kept so the tuple layout
        stays the same for callers.
    """
    low_cut = 0.3
    high_cut = 20

    samples = np.array(t_signal)

    # Complex spectrum and the (absolute) frequency attached to every bin.
    spectrum = np.fft.fft(samples)
    abs_freqs = np.abs(np.fft.fftfreq(len(samples), d=1/float(sampling_freq)))

    # Zero out every bin that is outside the wanted range.
    dc_spectrum = np.where(abs_freqs > low_cut, 0, spectrum)
    in_body_band = (abs_freqs > low_cut) & (abs_freqs <= high_cut)
    body_spectrum = np.where(in_body_band, spectrum, 0)

    # Back to the time domain; only the real part is kept.
    t_DC_component = np.fft.ifft(dc_spectrum).real
    t_body_component = np.fft.ifft(body_spectrum).real

    return ([], t_DC_component, t_body_component, [])


#=================================================================================================================
#Define verify gravity function
def mag_3_signals(df):
    """Row-wise Euclidean magnitude: sqrt of the sum of squares along axis 1.

    Returns a 1D numpy array with one magnitude per row of `df`.
    """
    squared = np.square(df)
    row_totals = squared.sum(axis=1)
    return np.array(np.sqrt(row_totals))

def verify_gravity(data, sampling_freq=50):
    """Print the mean magnitude of the low-frequency (gravity) component.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'acc_X', 'acc_Y' and 'acc_Z'.
    sampling_freq : float, optional
        Sampling frequency forwarded to `components_selection_one_signal`
        (default 50, the rate used by `preprocess`).

    Returns
    -------
    None
        The mean value is only printed.
    """
    # Index 1 of the returned tuple is the DC (gravity) component.
    gravity = pd.DataFrame({
        'grav_acc_' + axis: components_selection_one_signal(
            np.array(data['acc_' + axis]), sampling_freq)[1]
        for axis in ('X', 'Y', 'Z')
    })

    # mag_3_signals expects one frame holding the three axes as columns.
    grav_acc_mag = mag_3_signals(gravity)
    print('mean value = ',round((sum(grav_acc_mag) / len(grav_acc_mag)),3),' g')
    
#=================================================================================================================    
#Define jerking and magnitude functions
def jerk_one_signal(signal,sampling_freq):
    """Forward difference of a signal scaled by the sampling frequency.

    Computes ``(signal[i+1] - signal[i]) * sampling_freq``; rows that contain
    NaN (including the last one, which has no successor) are dropped, so the
    result is one sample shorter than a NaN-free input.
    """
    frame = pd.DataFrame(signal)
    forward_difference = frame.shift(-1) - frame
    scaled = (forward_difference * sampling_freq).dropna()
    # The frame has the signal in its first column; return it as a 1D array.
    return np.array(scaled).transpose()[0]

### IDs_Labels Data

In [None]:
# Test data: one table per study (CIS-PD and REAL-PD), read from CSV files
# under the relative directory 'test_data_Id/'.
# NOTE(review): judging by the file names these hold test-set IDs only — confirm.
cis_pd_testing_id=pd.read_csv('test_data_Id/cis-pd.CIS-PD_Test_Data_IDs.csv')
real_pd_testing_id=pd.read_csv('test_data_Id/real-pd.REAL-PD_Test_Data_IDs.csv')

# Training data: read from CSV files under the relative directory 'data_labels/'.
# NOTE(review): file names suggest IDs together with labels — confirm the schema.
cis_pd_training_id=pd.read_csv('data_labels/CIS-PD_Training_Data_IDs_Labels.csv')
real_pd_training_id=pd.read_csv('data_labels/REAL-PD_Training_Data_IDs_Labels.csv')

# Ancillary data: same directory and naming pattern as the training tables.
cis_pd_ancillary_id=pd.read_csv('data_labels/CIS-PD_Ancillary_Data_IDs_Labels.csv')
real_pd_ancillary_id=pd.read_csv('data_labels/REAL-PD_Ancillary_Data_IDs_Labels.csv')

In [None]:
def time_domain_signal(data,sampling_freq):
    """Build the time-domain signal table for one recording.

    For each of 'acc_X', 'acc_Y', 'acc_Z' the signal is split into a gravity
    and a body component (see `components_selection_one_signal`) and the jerk
    of the body component is computed. The body and gravity signals lose
    their last sample so that they line up with the jerk signal.

    Returns
    -------
    pandas.DataFrame
        12 columns: the 9 axial signals (body, gravity, body-jerk; X/Y/Z each)
        followed by the 3 magnitude columns 't_body_acc_mag',
        't_grav_acc_mag' and 't_body_acc_jerk_mag'.
    """
    axes = ('X', 'Y', 'Z')
    group_prefixes = ('t_body_acc_', 't_grav_acc_', 't_body_acc_jerk_')

    signals = {}
    for axis in axes:
        raw = np.array(data['acc_' + axis])  # median filtering is disabled
        _, gravity, body, _ = components_selection_one_signal(raw, sampling_freq)
        jerk = jerk_one_signal(body, sampling_freq)
        signals['t_body_acc_' + axis] = body[:-1]
        signals['t_grav_acc_' + axis] = gravity[:-1]
        signals['t_body_acc_jerk_' + axis] = jerk

    # Columns grouped per signal type so each magnitude uses 3 adjacent axes.
    ordered_names = [prefix + axis for prefix in group_prefixes for axis in axes]
    time_sig_df = pd.DataFrame({name: signals[name] for name in ordered_names})

    # One Euclidean magnitude column per 3-axial group.
    for prefix in group_prefixes:
        members = [prefix + axis for axis in axes]
        time_sig_df[prefix + 'mag'] = mag_3_signals(time_sig_df[members])

    return time_sig_df

In [None]:
#from scipy import fftpack # import fftpack to use all fft functions
from numpy.fft import *

# fast_fourier_transform_one_signal 
def fast_fourier_transform_one_signal(t_signal):
    """Amplitude spectrum of a real signal: absolute value of its rfft."""
    spectrum = np.fft.rfft(t_signal)
    return np.abs(spectrum)

# fast fourier transform for data frames
def fast_fourier_transform(t_window,sampling_freq):
    """Transform every non-gravity column of a time-domain window.

    Parameters
    ----------
    t_window : pandas.DataFrame
        Time-domain signals; columns whose name contains 'grav' are skipped.
    sampling_freq : float
        Sampling frequency used to label the frequency bins.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The frequency-domain window (each kept column renamed from its
        first two characters to the prefix 'f_') and the rfft bin frequencies.
    """
    f_window = pd.DataFrame()
    for column in t_window.columns:
        if 'grav' in column:
            continue  # gravity components are not transformed
        t_signal = np.array(t_window[column])
        f_window["f_" + column[2:]] = fast_fourier_transform_one_signal(t_signal)

    # Use the window length itself instead of the loop variable, which is
    # undefined when the window has no non-gravity column.
    dfreq = np.array(np.fft.rfftfreq(len(t_window), d=1/float(sampling_freq)))
    return f_window,dfreq


### Common Axial Features Functions

In [None]:
# mean
def mean_axial(df):
    """Mean of every column of `df`, returned as a list (one value per column)."""
    values = np.array(df)
    return list(np.mean(values, axis=0))
# std
def std_axial(df):
    """Standard deviation (numpy default, ddof=0) of every column, as a list."""
    values = np.array(df)
    return list(np.std(values, axis=0))

# mad
from statsmodels.robust import mad as median_deviation # import the median deviation function
def mad_axial(df):
    """Median deviation of every column of `df`, as a list.

    Uses `median_deviation` (statsmodels.robust.mad) along axis 0.
    """
    values = np.array(df)
    per_column = median_deviation(values, axis=0)
    return list(per_column)

# max

def max_axial(df):
    """Maximum of every column of `df`, returned as a list."""
    values = np.array(df)
    return list(np.max(values, axis=0))
# min
def min_axial(df):
    """Minimum of every column of `df`, returned as a list."""
    values = np.array(df)
    return list(np.min(values, axis=0))
# IQR
from scipy.stats import iqr as IQR # import interquartile range function (Q3(column)-Q1(column))
def IQR_axial(df):
    """Interquartile range (Q3 - Q1) of every column of `df`, as a list."""
    values = np.array(df)
    # IQR is applied to each column (axis 0) independently.
    per_column = np.apply_along_axis(IQR, 0, values)
    return list(per_column)


# Entropy
from scipy.stats import entropy # import the entropy function
def entropy_axial(df):
    """Entropy of the absolute values of every column of `df`, as a list.

    `scipy.stats.entropy` is applied to each column independently.
    """
    magnitudes = np.abs(np.array(df))
    per_column = np.apply_along_axis(entropy, 0, magnitudes)
    return list(per_column)

### Common Magnitude Features Functions

In [None]:
# mean
def mean_mag(mag_column):
    """Mean of one magnitude column, as a Python float."""
    return float(np.mean(np.array(mag_column)))

# std: standard deviation of mag column
def std_mag(mag_column):
    """Standard deviation (numpy default, ddof=0) of one magnitude column, as a float."""
    return float(np.std(np.array(mag_column)))

# mad: median deviation
def mad_mag(mag_column):
    """Median deviation of one magnitude column, as a Python float.

    Uses `median_deviation` (statsmodels.robust.mad).
    """
    values = np.array(mag_column)
    return float(median_deviation(values))

# max
def max_mag(mag_column):
    """Largest value of one magnitude column, as a Python float."""
    return float(np.max(np.array(mag_column)))
# min
def min_mag(mag_column):
    """Smallest value of one magnitude column, as a Python float."""
    return float(np.min(np.array(mag_column)))

# IQR
def IQR_mag(mag_column):
    """Interquartile range (Q3 - Q1) of one magnitude column, as a Python float."""
    return float(IQR(np.array(mag_column)))

# Entropy
def entropy_mag(mag_column):
    """Entropy (`scipy.stats.entropy`) of one magnitude column, as a Python float."""
    return float(entropy(np.array(mag_column)))

### Time Axial Features functions

In [None]:
# sma
def t_sma_axial(df):
    """Signal magnitude area: sum of all absolute values of `df`, divided by 3."""
    total_area = np.abs(np.array(df)).sum()
    return float(total_area) / 3.0

# energy
def t_energy_axial(df):
    """Energy (sum of squared samples) of every column of `df`, as a list."""
    values = np.array(df)
    per_column = np.square(values).sum(axis=0)
    return list(per_column)

# define the arbugr function
#auto regression coefficients with using burg method with order from 1 to 4
from spectrum import *

##############################################################################################
# This function was taken as-is from: https://github.com/faroit/freezefx/blob/master/fastburg.py
# This function and the original `arburg` function in the `spectrum` library generate the same first 3 coefficients
# for all windows. The original Burg implementation is slow, and for some windows it cannot generate all 4 coefficients.

def _arburg2(X, order):
    """This version is 10 times faster than arburg, but the output rho is not correct.
    returns [1 a0,a1, an-1]
    """
    x = numpy.array(X)
    N = len(x)

    if order == 0.:
        raise ValueError("order must be > 0")

    # Initialisation
    # ------ rho, den
    rho = sum(abs(x)**2.) / N  # Eq 8.21 [Marple]_
    den = rho * 2. * N

    # ------ backward and forward errors
    ef = numpy.zeros(N, dtype=complex)
    eb = numpy.zeros(N, dtype=complex)
    for j in range(0, N):  # eq 8.11
        ef[j] = x[j]
        eb[j] = x[j]

    # AR order to be stored
    a = numpy.zeros(1, dtype=complex)
    a[0] = 1
    # ---- rflection coeff to be stored
    ref = numpy.zeros(order, dtype=complex)

    E = numpy.zeros(order+1)
    E[0] = rho

    for m in range(0, order):
        # print m
        # Calculate the next order reflection (parcor) coefficient
        efp = ef[1:]
        ebp = eb[0:-1]
        # print efp, ebp
        num = -2. * numpy.dot(ebp.conj().transpose(), efp)
        den = numpy.dot(efp.conj().transpose(),  efp)
        den += numpy.dot(ebp,  ebp.conj().transpose())
        ref[m] = num / den

        # Update the forward and backward prediction errors
        ef = efp + ref[m] * ebp
        eb = ebp + ref[m].conj().transpose() * efp

        # Update the AR coeff.
        a.resize(len(a)+1)
        a = a + ref[m] * numpy.flipud(a).conjugate()

        # Update the prediction error
        E[m+1] = numpy.real((1 - ref[m].conj().transpose() * ref[m])) * E[m]
        # print 'REF', ref, num, den
    return a, E[-1], ref

#################################################################################################################

# to generate arburg (order 4) coefficents for 3 columns [X,Y,Z]
def t_arburg_axial(df):
    """Order-4 Burg AR coefficients for the first three columns of `df`.

    Returns a flat list of 12 values: the real parts of AR1..AR4 for the
    first column, then for the second, then for the third.
    """
    # Resolve the three column labels (positions 0, 1, 2) up front.
    labels = [df.columns[position] for position in range(3)]

    AR_vector = []
    for label in labels:
        coefficients = _arburg2(np.array(df[label]), 4)[0]
        # Skip the leading 1 and keep only the real parts.
        AR_vector.extend(coefficients[1:].real)
    return AR_vector


from scipy.stats import pearsonr
def t_corr_axial(df):
    """Pearson correlations between the three axes of a 3-axial signal.

    Returns ``[corr(X, Y), corr(X, Z), corr(Y, Z)]`` where X, Y, Z are the
    first, second and third column of `df`.
    """
    values = np.array(df)
    axis_pairs = ((0, 1), (0, 2), (1, 2))
    return [float(pearsonr(values[:, first], values[:, second])[0])
            for first, second in axis_pairs]
 
#hurst exponent
def hurst(signal):
    """
    Estimate the Hurst exponent of a signal.

    **Experimental**/untested implementation taken from:
    http://drtomstarke.com/index.php/calculation-of-the-hurst-exponent-to-test-for-trend-and-mean-reversion/
    Use at your own risks.

    For every lag from 2 to 19 the standard deviation of the lagged
    differences is computed; the result is the slope of a straight-line fit
    of log10(std) against log10(lag).
    """
    series = np.array(signal)
    lags = list(range(2, 20))

    # Spread of the lagged differences, one value per lag.
    spreads = [np.std(np.subtract(series[lag:], series[:-lag])) for lag in lags]

    # Linear fit on the double-log graph; the slope is the estimate.
    slope, _intercept = np.polyfit(np.log10(lags), np.log10(spreads), 1)
    return slope


# to generate hurst  for 3 columns [X,Y,Z]
def t_hurst_axial(df):
    """Hurst exponent of every column of `df`, returned as a list."""
    per_column = df.apply(hurst, axis=0)
    return list(per_column)


### Time Axial Features PipeLine

In [None]:
def t_axial_features_generation(t_window):
    """Compute all axial features of one time-domain window.

    The first 9 columns of `t_window` are treated as three 3-axial groups
    (columns 0-2, 3-5 and 6-8). For each group the following are
    concatenated, in this order:

    mean (3), std (3), mad (3), max (3), min (3), sma (1), energy (3),
    IQR (3), entropy (3), AR coefficients (12), correlations (3), hurst (3)

    i.e. 43 values per group and 129 values in total.
    """
    axial_columns = t_window.columns[0:9]
    axial_df = t_window[axial_columns]

    t_axial_features = []
    for first in (0, 3, 6):
        # One group of 3-axial signals: signal_name[X, Y, Z]
        triple = axial_df[axial_columns[first:first+3]]

        group_features = (
            mean_axial(triple)
            + std_axial(triple)
            + mad_axial(triple)
            + max_axial(triple)
            + min_axial(triple)
            + [t_sma_axial(triple)]
            + t_energy_axial(triple)
            + IQR_axial(triple)
            + entropy_axial(triple)
            + t_arburg_axial(triple)
            + t_corr_axial(triple)
            + t_hurst_axial(triple)
        )
        t_axial_features.extend(group_features)

    return t_axial_features
    

### Define Time Magnitudes Features functions

In [None]:
# Functions used to generate time magnitude features

# sma: signal magnitude area
def t_sma_mag(mag_column):
    """Signal magnitude area of one magnitude column: sum of absolute values."""
    return float(np.abs(np.array(mag_column)).sum())

# energy
def t_energy_mag(mag_column):
    """Energy of one magnitude column: sum of the squared samples, as a float."""
    values = np.array(mag_column)
    return float(np.square(values).sum())



# arburg: auto regression coefficients using the burg method
def t_arburg_mag(mag_column):
    """Order-4 Burg AR coefficients of one magnitude column.

    Returns the real parts of AR1..AR4 as a list (the leading 1 is dropped).
    """
    coefficients, _, _ = _arburg2(np.array(mag_column), 4)
    return list(coefficients[1:].real)

### Time Magnitude Features Pipeline

In [None]:
def t_mag_features_generation(t_window):
    """Compute all magnitude features of one time-domain window.

    Every column from position 9 onwards is treated as a magnitude signal.
    For each of them 13 values are produced, in this order:

    mean, std, mad, max, min, sma, energy, IQR, entropy, AR1, AR2, AR3, AR4

    The values of all magnitude columns are concatenated into one flat list.
    """
    mag_frame = t_window[t_window.columns[9:]]

    t_mag_features = []
    for name in mag_frame:
        signal = mag_frame[name]
        scalar_features = [
            mean_mag(signal),
            std_mag(signal),
            mad_mag(signal),
            max_mag(signal),
            min_mag(signal),
            t_sma_mag(signal),
            t_energy_mag(signal),
            IQR_mag(signal),
            entropy_mag(signal),
        ]
        t_mag_features.extend(scalar_features + t_arburg_mag(signal))

    return t_mag_features
 

### Time Features names Generation

In [None]:
def time_features_names():
    """Generate the names of all time-domain features.

    For each 3-axial group (body, gravity, body-jerk) 43 names are produced:
    mean/std/mad/max/min per axis, one sma, energy/iqr/entropy per axis,
    AR1..AR4 per axis, three correlations and one hurst per axis.
    For each magnitude signal 13 names are produced:
    mean/std/mad/max/min, sma, energy/iqr/entropy and AR1..AR4.

    Returns a list of 168 names (3 x 43 + 3 x 13).
    """
    # time domain axial signals' names
    axial_groups = [['t_body_acc_X','t_body_acc_Y','t_body_acc_Z'],
                    ['t_grav_acc_X','t_grav_acc_Y','t_grav_acc_Z'],
                    ['t_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z']]

    # time domain magnitude signals' names
    magnitude_signals = ['t_body_acc_Mag','t_grav_acc_Mag','t_body_acc_jerk_Mag']

    # functions' names
    basic_stats = ['_mean()','_std()','_mad()','_max()','_min()']
    spread_stats = ['_energy()','_iqr()','_entropy()']
    ar_coefficients = ['_AR1()','_AR2()','_AR3()','_AR4()']
    correlations = ['_Corr(X,Y)','_Corr(X,Z)','_Corr(Y,Z)']
    hurst_stats = ['_hurst()']

    def per_axis(signal, stat):
        # 't_body_acc_X' + '_mean()' -> 't_body_acc_mean()_X'
        return signal[:-2] + stat + signal[-2:]

    features = []

    for group in axial_groups:
        # Name shared by the three axes, e.g. 't_body_acc'.
        group_name = group[-1][:-2]

        # statistic-major: mean X, Y, Z, then std X, Y, Z, ...
        features += [per_axis(signal, stat) for stat in basic_stats for signal in group]
        features.append(group_name + '_sma()')
        features += [per_axis(signal, stat) for stat in spread_stats for signal in group]
        # axis-major: AR1..AR4 of X, then of Y, then of Z
        features += [per_axis(signal, stat) for signal in group for stat in ar_coefficients]
        features += [group_name + pair for pair in correlations]
        features += [per_axis(signal, stat) for stat in hurst_stats for signal in group]

    for signal in magnitude_signals:
        features += [signal + stat for stat in basic_stats]
        features.append(signal + '_sma()')
        features += [signal + stat for stat in spread_stats]
        features += [signal + stat for stat in ar_coefficients]

    return features

### Frequency Axial features functions

In [None]:
# sma
def f_sma_axial(df):
    """Signal magnitude area of a 3-axial frequency signal.

    Every absolute value is divided by sqrt(number of rows); the sum over
    all entries is then divided by 3.
    """
    values = np.array(df)
    row_scale = math.sqrt(values.shape[0])
    scaled_total = (np.abs(values) / row_scale).sum()
    return float(scaled_total) / 3.0


# energy
def f_energy_axial(df):
    """Spectral energy of every column: sum of squares divided by the row count.

    Returns a list ``[energy(signal_X), energy(signal_Y), energy(signal_Z)]``.
    """
    values = np.array(df)
    n_rows = float(values.shape[0])
    return list(np.square(values).sum(axis=0) / n_rows)

#Max Inds and Mean_Freq Functions
# max_Inds
def f_max_Inds_axial(df,dfreq):
    """Frequency of the largest component of each of the first three columns.

    For every axis the position of the maximum is looked up in `dfreq`;
    the result is the list ``[freq_X, freq_Y, freq_Z]``.
    """
    spectra = np.array(df)
    return [dfreq[spectra[:, axis].argmax()] for axis in range(3)]

# mean freq()
def f_mean_Freq_axial(df,dfreq):
    """Amplitude-weighted mean frequency of each of the first three columns.

    For every axis: ``sum(freq_i * f_signal[i]) / sum(f_signal[i])``.
    Returns the list ``[mean_freq_X, mean_freq_Y, mean_freq_Z]``.
    """
    spectra = np.array(df)
    mean_freq_vector = []
    for axis in range(3):
        amplitudes = spectra[:, axis]
        weighted_total = np.dot(dfreq, amplitudes).sum()
        mean_freq_vector.append(weighted_total / float(amplitudes.sum()))
    return mean_freq_vector


# Skewness & Kurtosis Functions
from scipy.stats import kurtosis       # kurtosis function
from scipy.stats import skew           # skewness function
    
def f_skewness_and_kurtosis_axial(df):
    """Skewness and kurtosis of every column, interleaved per column.

    Returns ``[skew_1, kurtosis_1, skew_2, kurtosis_2, ...]`` following the
    column order of `df`.
    """
    values = np.array(df)
    skews = skew(values, axis=0)
    kurtoses = kurtosis(values, axis=0)

    interleaved = []
    for column_skew, column_kurtosis in zip(skews, kurtoses):
        interleaved.extend((column_skew, column_kurtosis))
    return interleaved



#f_one_band_energy
def f_one_band_energy(psd, bands,dfreq, f_min=1, f_max=25):
    """Mean power of a spectrum inside consecutive frequency bands.

    Parameters
    ----------
    psd : array-like
        Spectrum amplitudes; the power is ``abs(psd)**2``.
    bands : array-like
        Interior band edges. Together with the outer limits they define the
        half-open bands ``[f_min, b0), [b0, b1), ..., [b_last, f_max)``.
    dfreq : array-like
        Frequency of every entry of `psd`.
    f_min, f_max : float, optional
        Outer limits of the first and last band. The defaults (1 and 25)
        are the values that used to be hard-coded; 25 is the Nyquist
        frequency of a 50 Hz signal, so pass other limits for other rates.

    Returns
    -------
    list
        One mean power per band (``len(bands) + 1`` values). A band that
        contains no frequency yields NaN (numpy's mean of an empty slice).
    """
    power = np.abs(np.array(psd))**2
    freqs = np.asarray(dfreq)  # also accept plain lists
    edges = np.concatenate([[f_min], np.asarray(bands), [f_max]])
    return [np.mean(power[(freqs >= low) & (freqs < up)])
            for low, up in zip(edges[:-1], edges[1:])]

#spectral_entropy
def spectral_entropy(psd, bands,dfreq, f_min=1, f_max=25):
    """Shannon entropy (base 2) of the power distribution over frequency bands.

    The power ``abs(psd)**2`` is normalised to sum to one, summed inside the
    half-open bands ``[f_min, b0), [b0, b1), ..., [b_last, f_max)``, and the
    entropy ``-sum(p * log2(p))`` is computed over the bands with non-zero
    power.

    Parameters
    ----------
    psd : array-like
        Spectrum amplitudes.
    bands : array-like
        Interior band edges.
    dfreq : array-like
        Frequency of every entry of `psd`.
    f_min, f_max : float, optional
        Outer limits of the first and last band. The defaults (1 and 25)
        are the values that used to be hard-coded.

    Returns
    -------
    list
        A one-element list holding the entropy, so that results can be
        concatenated with ``+`` by the callers.
    """
    power = np.abs(np.array(psd))**2
    power_pdf = power/np.sum(power)  # power as a pdf (normalised to one)
    freqs = np.asarray(dfreq)  # also accept plain lists
    edges = np.concatenate([[f_min], np.asarray(bands), [f_max]])

    power_per_band = np.asarray([np.sum(power_pdf[(freqs >= low) & (freqs < up)])
                                 for low, up in zip(edges[:-1], edges[1:])])
    # Empty bands contribute nothing (and log2(0) must be avoided).
    power_per_band = power_per_band[power_per_band > 0]
    return [- np.sum(power_per_band * np.log2(power_per_band))]

#Bands Energy FUNCTIONS
# Interior band edges handed as `bands` to f_one_band_energy / spectral_entropy
# (via f_all_bands_energy_axial). Those functions add the outer limits 1 and 25,
# so B1 yields 8 bands, B2 yields 4 bands and B3 yields 2 bands.
# NOTE(review): the edges are compared against `dfreq`, presumably in Hz — confirm.
B1=[4,7,10,13,16,19,22] 
B2=[7,13,19]
B3=[12]


def f_all_bands_energy_axial(df,dfreq):
    """Band energies and band spectral entropies of a 3-axial frequency signal.

    `df` holds the three axial f_signals [X, Y, Z] as its first three columns.
    For each axis the band sets B1, B2 and B3 are evaluated in that order.

    Returns a flat list: first the band energies of X, Y and Z
    (B1 + B2 + B3 bands per axis), then the spectral entropies of X, Y and Z
    (one value per band set and axis).
    """
    spectra = np.array(df)
    band_sets = (B1, B2, B3)

    energies = []
    entropies = []
    for axis in range(3):
        axis_spectrum = spectra[:, axis]
        for bands in band_sets:
            energies += f_one_band_energy(axis_spectrum, bands, dfreq)
        for bands in band_sets:
            entropies += spectral_entropy(axis_spectrum, bands, dfreq)
    return energies + entropies

### Frequency axial features PipeLine

In [None]:
def f_axial_features_generation(f_window,dfreq):
    """Compute all axial features of one frequency-domain window.

    The first 6 columns of `f_window` are treated as two 3-axial groups
    (columns 0-2 and 3-5). For each group the following are concatenated,
    in this order:

    mean (3), std (3), mad (3), max (3), min (3), sma (1), energy (3),
    IQR (3), entropy (3), max_inds (3), mean_Freq (3),
    skewness/kurtosis (6), band energies and band entropies.

    With the band sets B1, B2 and B3 as defined in this file the last item
    has 51 values, giving 88 values per group.
    """
    axial_columns = f_window.columns[0:6]
    axial_df = f_window[axial_columns]

    f_all_axial_features = []
    for first in (0, 3):
        # One group of 3-axial frequency signals
        triple = axial_df[axial_columns[first:first+3]]

        group_features = (
            mean_axial(triple)
            + std_axial(triple)
            + mad_axial(triple)
            + max_axial(triple)
            + min_axial(triple)
            + [f_sma_axial(triple)]
            + f_energy_axial(triple)
            + IQR_axial(triple)
            + entropy_axial(triple)
            + f_max_Inds_axial(triple, dfreq)
            + f_mean_Freq_axial(triple, dfreq)
            + f_skewness_and_kurtosis_axial(triple)
            + f_all_bands_energy_axial(triple, dfreq)
        )
        f_all_axial_features.extend(group_features)

    return f_all_axial_features

### Define Frequency Magnitudes features functions

In [None]:
# Functions used to generate frequency magnitude features

# sma
def f_sma_mag(mag_column):
    """Signal magnitude area of one frequency magnitude column.

    Every absolute value is divided by sqrt(len(mag_column)) before summing.
    """
    scale = math.sqrt(len(mag_column))
    scaled = np.abs(np.array(mag_column)) / scale
    return float(scaled.sum())

# energy
def f_energy_mag(mag_column):
    """Spectral energy of one magnitude column: sum of squares / length."""
    values = np.array(mag_column)
    total = np.square(values).sum()
    return float(total / float(len(values)))


####### Max Inds and Mean_Freq Functions#######################################


# max_Inds
def f_max_Inds_mag(mag_column,dfreq):
    """Frequency (from `dfreq`) at which the magnitude column is largest."""
    peak_position = np.array(mag_column).argmax()
    return float(dfreq[peak_position])

# mean freq()
def f_mean_Freq_mag(mag_column,dfreq):
    """Amplitude-weighted mean frequency of one magnitude column.

    ``sum(freq_i * mag[i]) / sum(mag[i])``, returned as a Python float.
    """
    amplitudes = np.array(mag_column)
    weighted_total = np.dot(dfreq, amplitudes).sum()
    return float(weighted_total / float(amplitudes.sum()))

###################################################################################

########## Skewness & Kurtosis Functions #######################################

from scipy.stats import skew           # skewness
def f_skewness_mag(mag_column):
    """Return the skewness of one frequency-magnitude column as a Python float.

    The column is converted to an array and passed to ``scipy.stats.skew``
    with its default arguments.
    """
    return float(skew(np.array(mag_column)))



from scipy.stats import kurtosis       # kurtosis
def f_kurtosis_mag(mag_column):
    """Return the kurtosis of one frequency-magnitude column as a Python float.

    The column is converted to an array and passed to ``scipy.stats.kurtosis``
    with its default arguments.
    """
    return float(kurtosis(np.array(mag_column)))
##################################################################################

### Define Frequency Magnitude features pipeline

In [None]:
def f_mag_features_generation(f_window,dfreq):
    """Build the feature list for the frequency-magnitude columns of a window.

    The last two columns of ``f_window`` are treated as the magnitude signals.
    For each of them 13 values are computed, in this order: mean, std, mad,
    max, min, sma, energy, IQR, entropy, maxInd, meanFreq, skewness, kurtosis.

    Returns a flat list holding 13 values per magnitude column (26 values for
    the two columns), first column first.
    """
    magnitude_frame = f_window[f_window.columns[-2:]]

    features = []
    for name in magnitude_frame:  # iterating a DataFrame yields its column labels
        signal = magnitude_frame[name]
        features.extend([
            mean_mag(signal),
            std_mag(signal),
            mad_mag(signal),
            max_mag(signal),
            min_mag(signal),
            f_sma_mag(signal),
            f_energy_mag(signal),
            IQR_mag(signal),
            entropy_mag(signal),
            f_max_Inds_mag(signal, dfreq),
            f_mean_Freq_mag(signal, dfreq),
            f_skewness_mag(signal),
            f_kurtosis_mag(signal),
        ])

    return features
    

### Frequency features name generation

In [None]:
def frequency_features_names():
    """Return the ordered list of frequency-domain feature names.

    For each 3-axial signal group the order is: the five basic statistics
    (feature-major: X, Y, Z for each statistic), one ``_sma()`` name, the five
    spectral statistics (feature-major), skewness/kurtosis (axis-major), and
    finally the band/spectral-energy names (axis-major). The magnitude signals
    follow, each with basic statistics, ``_sma()``, spectral statistics,
    skewness and kurtosis.
    """
    # frequency axial signal names
    axial_groups = [
        ['f_body_acc_X', 'f_body_acc_Y', 'f_body_acc_Z'],
        ['f_body_acc_Jerk_X', 'f_body_acc_Jerk_Y', 'f_body_acc_Jerk_Z'],
    ]
    # frequency magnitude signal names
    magnitude_signals = ['f_body_acc_Mag', 'f_body_acc_Jerk_Mag']

    basic_stats = ['_mean()', '_std()', '_mad()', '_max()', '_min()']
    spectral_stats = ['_energy()', '_iqr()', '_entropy()', '_maxInd()', '_meanFreq()']
    shape_stats = ['_skewness()', '_kurtosis()']
    band_stats = [
        '_BE[1-4]', '_BE[4-7]', '_BE[7-10]', '_BE[10-13]',
        '_BE[13-16]', '_BE[16-19]', '_BE[19-22]', '_BE[22-25]',
        '_BE[1-7]', '_BE[7-13]', '_BE[13-19]', '_BE[19-25]',
        '_BE[1-12]', '_BE[12-25]', '_SE_B1', '_SE_B2', '_SE_B3',
    ]

    def axis_name(signal, feature):
        # Insert the feature tag just before the trailing axis suffix ('_X', '_Y', '_Z').
        return signal[:-2] + feature + signal[-2:]

    names = []

    for group in axial_groups:
        # feature-major: every axis for the first statistic, then the next statistic, ...
        names.extend(axis_name(signal, feature) for feature in basic_stats for signal in group)
        # one sma name per group, built from the group's common prefix
        names.append(group[-1][:-2] + '_sma()')
        names.extend(axis_name(signal, feature) for feature in spectral_stats for signal in group)
        # axis-major: every feature for X, then Y, then Z
        names.extend(axis_name(signal, feature) for signal in group for feature in shape_stats)
        names.extend(axis_name(signal, feature) for signal in group for feature in band_stats)

    for signal in magnitude_signals:
        names.extend(signal + feature for feature in basic_stats)
        names.append(signal + '_sma()')
        names.extend(signal + feature for feature in spectral_stats)
        names.extend(signal + feature for feature in shape_stats)

    return names

### Define Additional features functions

In [None]:
############### Angles Functions ####################################
from math import acos # inverse of cosinus function
from math import sqrt # square root function

########Euclidian magnitude 3D############
def magnitude_vector(vector3D): # vector[X,Y,Z]
    """Return the Euclidean norm of ``vector3D`` (square root of the sum of squares)."""
    squared_norm = (vector3D * vector3D).sum()
    return sqrt(squared_norm)

###########angle between two vectors in radian ###############
def angle(vector1, vector2):
    """Return the angle between two vectors, in radians.

    The cosine is the dot product of the vectors divided by the product of
    their Euclidean norms; it is limited to [-1, 1] before ``acos`` is applied.
    """
    norm_product = float(magnitude_vector(vector1) * magnitude_vector(vector2))
    cosine = np.dot(vector1, vector2) / norm_product

    # Keep the cosine inside the domain accepted by acos.
    if cosine < -1:
        cosine = -1
    elif cosine > 1:
        cosine = 1

    return float(acos(cosine))

################## angle_features ############################
def angle_features(t_window):
    """Return the five angle features of one time-domain window.

    Every angle is measured against the mean gravity vector (column means of
    ``t_grav_acc_X/Y/Z``). In order, the other vector is:

    0. the mean of ``t_body_acc_X/Y/Z``
    1. the mean of ``t_body_acc_jerk_X/Y/Z``
    2. the X axis ``[1, 0, 0]``
    3. the Y axis ``[0, 1, 0]``
    4. the Z axis ``[0, 0, 1]``
    """
    gravity_mean = np.array(t_window[['t_grav_acc_X', 't_grav_acc_Y', 't_grav_acc_Z']].mean())

    signal_groups = (
        ['t_body_acc_X', 't_body_acc_Y', 't_body_acc_Z'],
        ['t_body_acc_jerk_X', 't_body_acc_jerk_Y', 't_body_acc_jerk_Z'],
    )
    unit_axes = ([1, 0, 0], [0, 1, 0], [0, 0, 1])

    angles = []
    # angles 0-1: mean signal vectors against mean gravity
    for group in signal_groups:
        signal_mean = np.array(t_window[group].mean())
        angles.append(angle(signal_mean, gravity_mean))
    # angles 2-4: coordinate axes against mean gravity
    for axis in unit_axes:
        angles.append(angle(np.array(axis), gravity_mean))

    return angles


### Define Additional features names

In [None]:
# Column names for the five values returned by angle_features: angle0() ... angle4()
angle_columns=['angle%d()' % position for position in range(5)]

### Define Datasets generation PipeLine

In [None]:
# Ordered list of feature column names: time-domain names, then frequency-domain names, then the angle names.
# This matches the order in which the pipeline functions below concatenate their feature lists.
all_columns=time_features_names()+frequency_features_names()+angle_columns

def Dataset_Generation_PipeLine(b):
    """Turn one smartphone accelerometer CSV into a single flat feature row.

    Parameters
    ----------
    b : path (or anything ``pd.read_csv`` accepts) of the recording to process.

    Returns
    -------
    list
        Time-domain features, followed by frequency-domain features, followed
        by the angle features.
    """
    data,sampling_freq=preprocess_real_smartphone(pd.read_csv(b))
    time_sig_df=time_domain_signal(data,sampling_freq)
    freq_sig_df,dfreq=fast_fourier_transform(time_sig_df,sampling_freq)

    # The column-name list is not rebuilt here: it was never used inside this
    # function and recomputing it for every file was wasted work.

    # time-domain features: axial signals first, then magnitude signals
    time_features = t_axial_features_generation(time_sig_df) + t_mag_features_generation(time_sig_df)
    # frequency-domain features: axial signals first, then magnitude signals
    frequency_features = f_axial_features_generation(freq_sig_df,dfreq) + f_mag_features_generation(freq_sig_df,dfreq)
    # angle features computed from the time-domain signals
    additional_features= angle_features(time_sig_df)

    return time_features + frequency_features + additional_features

def Dataset_Generation_PipeLine_SmartWatch(b):
    """Turn one smartwatch accelerometer CSV into a single flat feature row.

    Parameters
    ----------
    b : path (or anything ``pd.read_csv`` accepts) of the recording to process.

    Returns
    -------
    list
        The device id selected by ``preprocess_real_smartwatch`` as first
        element, followed by time-domain features, frequency-domain features
        and the angle features.
    """
    data,device_id,sampling_freq=preprocess_real_smartwatch(pd.read_csv(b))
    time_sig_df=time_domain_signal(data,sampling_freq)
    freq_sig_df,dfreq=fast_fourier_transform(time_sig_df,sampling_freq)

    # The column-name list is not rebuilt here: it was never used inside this
    # function and recomputing it for every file was wasted work.

    # time-domain features: axial signals first, then magnitude signals
    time_features = t_axial_features_generation(time_sig_df) + t_mag_features_generation(time_sig_df)
    # frequency-domain features: axial signals first, then magnitude signals
    frequency_features = f_axial_features_generation(freq_sig_df,dfreq) + f_mag_features_generation(freq_sig_df,dfreq)
    # angle features computed from the time-domain signals
    additional_features= angle_features(time_sig_df)

    return [device_id] + time_features + frequency_features + additional_features


In [None]:
from joblib import Parallel, delayed
import multiprocessing
import time
import glob

## SmartPhone

In [None]:
# training data
# Extract one feature row per smartphone accelerometer training file, using all CPU cores.
a_real=glob.glob("training_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
# One feature list per file; the next cell pairs the rows with a_real by position.
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine)(i) for i in a_real)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Elapsed time noted from an earlier run: 3.4828 Mins

In [None]:
# Build the training feature table: one row per file, columns named by all_columns.
df_train=pd.DataFrame(result)
df_train.columns=all_columns
# measurement_id = file name without the directory prefix and the 4-character '.csv' suffix
df_train['measurement_id']=[item[len('training_data/smartphone_accelerometer/'):-4] for item in a_real]
print(df_train.shape)
df_train.head()

In [None]:
#ancillary_data
# Keep only the ancillary smartphone files that contain at least 2000 rows.
b_real=glob.glob("ancillary_data/smartphone_accelerometer/*.csv")
b_real2=[]
for i in b_real:
    a=pd.read_csv(i).shape[0]  # number of rows in this file
    if a>=2000:
        b_real2.append(i)

In [None]:
# Extract one feature row per retained ancillary smartphone file, using all CPU cores.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine)(i) for i in b_real2)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Elapsed time noted from an earlier run: 3.2621 min

In [None]:
# Build the ancillary feature table: one row per retained file, columns named by all_columns.
df_ancillary=pd.DataFrame(result)
df_ancillary.columns=all_columns
# measurement_id = file name without the directory prefix and the 4-character '.csv' suffix
df_ancillary['measurement_id']=[item[len('ancillary_data/smartphone_accelerometer/'):-4] for item in b_real2]
print(df_ancillary.shape)
df_ancillary.head()

In [None]:
# Stack training and ancillary smartphone features into one table with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
Frame = pd.concat([df_train, df_ancillary], ignore_index=True)

In [None]:
#export part1 features of training data from smartphone signal for realpd
# Frame holds the training rows followed by the ancillary rows; the index is not written.
Frame.to_csv('analysis2_realpd_comp_training_abhiroop_tillhurst_smartphone.csv',index=False)

In [None]:
# testing data
# Extract one feature row per smartphone accelerometer testing file, using all CPU cores.
a_real=glob.glob("testing_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine)(i) for i in a_real)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Elapsed time noted from an earlier run: 1.1097 Mins

# NOTE(review): df_train is reused here for the *testing* rows, overwriting the training table.
df_train=pd.DataFrame(result)
df_train.columns=all_columns
# measurement_id = file name without the directory prefix and the 4-character '.csv' suffix
df_train['measurement_id']=[item[len('testing_data/smartphone_accelerometer/'):-4] for item in a_real]
print(df_train.shape)
# Independent copy, so later reassignments of df_train do not affect the testing table.
Frame_test=df_train.copy()

In [None]:
#export part1 features of testing data from smartphone signal for realpd
# The index is not written to the file.
Frame_test.to_csv('analysis2_realpd_comp_testing_abhiroop_tillhurst_smartphone.csv',index=False)

# SmartWatch

### Accelerometer

In [None]:
#training
# Keep only the smartwatch accelerometer training files whose selected device
# contributes at least 2000 rows.
a_real=glob.glob("training_data/smartwatch_accelerometer/*.csv")
a_real_acc=[]
num=[]  # rows of the selected device, one entry per file in a_real
for i in a_real:
    data=pd.read_csv(i)
    # per device: variance of x and number of non-null y values
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # start with the device whose x signal has the largest variance ...
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # ... but fall back to the smallest-variance device when the first choice
    # accounts for 20% or less of the file's rows.
    # .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_acc.append(i)

In [None]:
# smartwatch_accelerometer
# Extract one feature row (device id first) per retained training file, using all CPU cores.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_SmartWatch)(i) for i in a_real_acc)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Elapsed time noted from an earlier run: 1.8061357021331788 Mins

In [None]:
# Build the smartwatch accelerometer training table; the first column is the selected device id.
df_train=pd.DataFrame(result)
df_train.columns=['device_id_acc']+all_columns
# measurement_id = file name without the directory prefix and the 4-character '.csv' suffix
df_train['measurement_id']=[item[len('training_data/smartwatch_accelerometer/'):-4] for item in a_real_acc]
print(df_train.shape)
df_train.head()

In [None]:
#ancillary
# Keep only the smartwatch accelerometer ancillary files whose selected device
# contributes at least 2000 rows.
a_real=glob.glob("ancillary_data/smartwatch_accelerometer/*.csv")
a_real_acc=[]
num=[]  # rows of the selected device, one entry per file in a_real
for i in a_real:
    data=pd.read_csv(i)
    # per device: variance of x and number of non-null y values
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # start with the device whose x signal has the largest variance ...
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # ... but fall back to the smallest-variance device when the first choice
    # accounts for 20% or less of the file's rows.
    # .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_acc.append(i)

In [None]:
# smartwatch_accelerometer
# Extract one feature row (device id first) per retained ancillary file, using all CPU cores.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_SmartWatch)(i) for i in a_real_acc)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# NOTE(review): identical to the timing noted in the training cell - possibly copied; re-measure.
#1.8061357021331788 Mins

In [None]:
# Build the smartwatch accelerometer ancillary table; the first column is the selected device id.
df_ancillary=pd.DataFrame(result)
df_ancillary.columns=['device_id_acc']+all_columns
# measurement_id = file name without the directory prefix and the 4-character '.csv' suffix
df_ancillary['measurement_id']=[item[len('ancillary_data/smartwatch_accelerometer/'):-4] for item in a_real_acc]
print(df_ancillary.shape)
df_ancillary.head()

In [None]:
# Stack training and ancillary smartwatch accelerometer features with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
Frame_smartwatch_acc = pd.concat([df_train, df_ancillary], ignore_index=True)

In [None]:
# testing
# Keep only the smartwatch accelerometer testing files whose selected device
# contributes at least 2000 rows.
a_real=glob.glob("testing_data/smartwatch_accelerometer/*.csv")
a_real_acc=[]
num=[]  # rows of the selected device, one entry per file in a_real
for i in a_real:
    data=pd.read_csv(i)
    # per device: variance of x and number of non-null y values
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # start with the device whose x signal has the largest variance ...
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # ... but fall back to the smallest-variance device when the first choice
    # accounts for 20% or less of the file's rows.
    # .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_acc.append(i)

# retained files vs. all files found
print(len(a_real_acc),len(a_real))

# smartwatch_accelerometer
# Extract one feature row (device id first) per retained file, using all CPU cores.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_SmartWatch)(i) for i in a_real_acc)


# NOTE(review): df_train is reused here for the *testing* rows.
df_train=pd.DataFrame(result)
df_train.columns=['device_id_acc']+all_columns
# measurement_id = file name without the directory prefix and the 4-character '.csv' suffix
df_train['measurement_id']=[item[len('testing_data/smartwatch_accelerometer/'):-4] for item in a_real_acc]
print(df_train.shape)

# Independent copy, so later reassignments of df_train do not affect the testing table.
Frame_smartwatch_acc_test=df_train.copy()

### Gyroscope

In [None]:
def preprocess_real_gyroscope(data):
    """Select one device from a smartwatch recording and resample it to 50 Hz.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw recording with the columns ``device_id``, ``t``, ``x``, ``y``, ``z``.

    Returns
    -------
    tuple
        ``(df, deviceid, freq)`` where ``df`` has the columns ``Timestamp``,
        ``acc_X``, ``acc_Y``, ``acc_Z`` sampled every 0.02 time units,
        ``deviceid`` is the selected device and ``freq`` is 50.

    Notes
    -----
    Device selection: the device whose ``x`` signal has the largest variance
    is preferred; if it accounts for 20% or less of all rows, the device with
    the smallest variance is used instead.
    """
    # per device: variance of x and number of non-null y values
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated.
    sample_count=int(a.loc[a.device_id==deviceid,'y'].iloc[0])
    if sample_count<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']

    # drop=True keeps the old index from becoming a stray 'index' column;
    # the chained rename avoids mutating the frame in place.
    data=(data[data.device_id==deviceid]
          .reset_index(drop=True)
          .rename(columns={'t':'Timestamp','x':'X','y':'Y','z':'Z'}))

    freq=50
    # uniform grid from the first timestamp up to (excluding) the last one, step 0.02
    t1=np.arange(data.Timestamp[0],data.Timestamp[data.shape[0]-1],0.02)
    df=pd.DataFrame({'Timestamp':t1})

    for axis in ['X','Y','Z']:
        # interp1d with its default settings (linear interpolation)
        resample=interpolate.interp1d(data.Timestamp, data[axis])
        df['acc_'+axis]=resample(t1)
    return df[['Timestamp','acc_X','acc_Y','acc_Z']],deviceid,freq


def Dataset_Generation_PipeLine_gyroscope(b):
    """Turn one smartwatch gyroscope CSV into a single flat feature row.

    Parameters
    ----------
    b : path (or anything ``pd.read_csv`` accepts) of the recording to process.

    Returns
    -------
    list
        The device id selected by ``preprocess_real_gyroscope`` as first
        element, followed by time-domain features, frequency-domain features
        and the angle features.
    """
    data,device_id,sampling_freq=preprocess_real_gyroscope(pd.read_csv(b))
    time_sig_df=time_domain_signal(data,sampling_freq)
    freq_sig_df,dfreq=fast_fourier_transform(time_sig_df,sampling_freq)

    # The column-name list is not rebuilt here: it was never used inside this
    # function and recomputing it for every file was wasted work.

    # time-domain features: axial signals first, then magnitude signals
    time_features = t_axial_features_generation(time_sig_df) + t_mag_features_generation(time_sig_df)
    # frequency-domain features: axial signals first, then magnitude signals
    frequency_features = f_axial_features_generation(freq_sig_df,dfreq) + f_mag_features_generation(freq_sig_df,dfreq)
    # angle features computed from the time-domain signals
    additional_features= angle_features(time_sig_df)

    return [device_id] + time_features + frequency_features + additional_features


In [None]:
#training data
# Keep only the smartwatch gyroscope training files whose selected device
# contributes at least 2000 rows.
a_real=glob.glob("training_data/smartwatch_gyroscope/*.csv")
a_real_gyro=[]
num=[]  # rows of the selected device, one entry per file in a_real
for i in a_real:
    data=pd.read_csv(i)
    # per device: variance of x and number of non-null y values
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # start with the device whose x signal has the largest variance ...
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # ... but fall back to the smallest-variance device when the first choice
    # accounts for 20% or less of the file's rows.
    # .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_gyro.append(i)

In [None]:
# smartwatch gyroscope (training)
# Extract one feature row (device id first) per retained gyroscope file, using all CPU cores.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_gyroscope)(i) for i in a_real_gyro)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# NOTE(review): same figure as the accelerometer cells - possibly copied; re-measure.
#1.8061357021331788 Mins

In [None]:
# Build the smartwatch gyroscope training table; the first column is the selected device id.
df_train=pd.DataFrame(result)
# Reuse the accelerometer column names with 'acc' replaced by 'gyro'.
df_train.columns=['device_id_gyro']+[i.replace('acc','gyro') for i in all_columns]
# measurement_id = file name without the directory prefix and the 4-character '.csv' suffix
df_train['measurement_id']=[item[len('training_data/smartwatch_gyroscope/'):-4] for item in a_real_gyro]
# Drop every column whose '_'-separated name contains the token 'grav'.
# (The previous test `w in 'grav'` was a substring check against the literal,
# so tokens such as '' or 'a' would also have matched.)
df_train=df_train[[col for col in df_train.columns if 'grav' not in col.split('_')]]
df_train.head()

In [None]:
# ancillary
# Keep only the smartwatch gyroscope ancillary files whose selected device
# contributes at least 2000 rows.
a_real=glob.glob("ancillary_data/smartwatch_gyroscope/*.csv")
a_real_gyro=[]
num=[]  # rows of the selected device, one entry per file in a_real
for i in a_real:
    data=pd.read_csv(i)
    # per device: variance of x and number of non-null y values
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # start with the device whose x signal has the largest variance ...
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # ... but fall back to the smallest-variance device when the first choice
    # accounts for 20% or less of the file's rows.
    # .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_gyro.append(i)
# To inspect the per-file row counts: pd.DataFrame({'num':num}).sort_values(by='num')

In [None]:
# smartwatch gyroscope (ancillary)
# Extract one feature row (device id first) per retained gyroscope file, using all CPU cores.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_gyroscope)(i) for i in a_real_gyro)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# NOTE(review): same figure as the accelerometer cells - possibly copied; re-measure.
#1.8061357021331788 Mins

In [None]:
# Build the smartwatch gyroscope ancillary table; the first column is the selected device id.
df_ancillary=pd.DataFrame(result)
# Reuse the accelerometer column names with 'acc' replaced by 'gyro'.
df_ancillary.columns=['device_id_gyro']+[i.replace('acc','gyro') for i in all_columns]
# measurement_id = file name without the directory prefix and the 4-character '.csv' suffix
df_ancillary['measurement_id']=[item[len('ancillary_data/smartwatch_gyroscope/'):-4] for item in a_real_gyro]
# Drop every column whose '_'-separated name contains the token 'grav'.
# (The previous test `w in 'grav'` was a substring check against the literal,
# so tokens such as '' or 'a' would also have matched.)
df_ancillary=df_ancillary[[col for col in df_ancillary.columns if 'grav' not in col.split('_')]]
print(df_ancillary.shape)
df_ancillary.head()

In [None]:
# Stack training and ancillary smartwatch gyroscope features with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
Frame_smartwatch_gyro = pd.concat([df_train, df_ancillary], ignore_index=True)
print(Frame_smartwatch_gyro.shape)
Frame_smartwatch_gyro.head()

In [None]:
#testing
# Keep only the smartwatch gyroscope testing files whose selected device
# contributes at least 2000 rows.
a_real=glob.glob("testing_data/smartwatch_gyroscope/*.csv")
a_real_gyro=[]
num=[]  # rows of the selected device, one entry per file in a_real
for i in a_real:
    data=pd.read_csv(i)
    # per device: variance of x and number of non-null y values
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    # start with the device whose x signal has the largest variance ...
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # ... but fall back to the smallest-variance device when the first choice
    # accounts for 20% or less of the file's rows.
    # .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated.
    if int(a.loc[a.device_id==deviceid,'y'].iloc[0])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_gyro.append(i)

# smartwatch gyroscope
# Extract one feature row (device id first) per retained file, using all CPU cores.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_gyroscope)(i) for i in a_real_gyro)
print("--- %s Mins ---" % ((time.time() - start_time)/60))

# NOTE(review): df_train is reused here for the *testing* rows.
df_train=pd.DataFrame(result)
# Reuse the accelerometer column names with 'acc' replaced by 'gyro'.
df_train.columns=['device_id_gyro']+[i.replace('acc','gyro') for i in all_columns]
# measurement_id = file name without the directory prefix and the 4-character '.csv' suffix
df_train['measurement_id']=[item[len('testing_data/smartwatch_gyroscope/'):-4] for item in a_real_gyro]
# Drop every column whose '_'-separated name contains the token 'grav'.
# (The previous test `w in 'grav'` was a substring check against the literal,
# so tokens such as '' or 'a' would also have matched.)
df_train=df_train[[col for col in df_train.columns if 'grav' not in col.split('_')]]
print(df_train.shape)

# Independent copy, so later reassignments of df_train do not affect the testing table.
Frame_smartwatch_gyro_test=df_train.copy()

In [None]:
#Merging smartwatch_accelerometer and gyroscope for training and testing data
# pd.merge is called without `how`, i.e. with its default inner join: only
# measurement_ids present in both the accelerometer and the gyroscope table are kept.
Frame_smartwatch=pd.merge(Frame_smartwatch_acc,Frame_smartwatch_gyro,on='measurement_id')
# Only the gyroscope device id is dropped; 'device_id_acc' stays in the table.
Frame_smartwatch=Frame_smartwatch.drop('device_id_gyro',axis=1)

# Same merge and column drop for the testing tables.
Frame_smartwatch_test=pd.merge(Frame_smartwatch_acc_test,Frame_smartwatch_gyro_test,on='measurement_id')
Frame_smartwatch_test=Frame_smartwatch_test.drop('device_id_gyro',axis=1)

In [None]:
#export part1 features of training data from smartwatch signal for realpd
# Merged accelerometer + gyroscope features (training and ancillary rows); the index is not written.
Frame_smartwatch.to_csv('analysis2_realpd_comp_training_abhiroop_tillhurst_smartwatch.csv',index=False)

#export part1 features of testing data from smartwatch signal for realpd
# Merged accelerometer + gyroscope features of the testing files; the index is not written.
Frame_smartwatch_test.to_csv('analysis2_realpd_comp_testing_abhiroop_tillhurst_smartwatch.csv',index=False)