In [1]:
import pandas as pd
import numpy as np
import pywt
import matplotlib.pyplot as plt
from scipy import interpolate
import glob
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm


import heapq
from scipy.signal import argrelextrema

from scipy import signal
from scipy.ndimage.interpolation import shift
import pywt
from numpy.fft import fft
from numpy import zeros, floor, log10, log, mean, array, sqrt, vstack, cumsum, ones, log2, std
from numpy.linalg import svd, lstsq
import time

from sklearn.decomposition import PCA
import time
from joblib import Parallel, delayed
import multiprocessing

from sklearn.model_selection import train_test_split
import lightgbm as lgb
from catboost import CatBoostRegressor, FeaturesData, Pool
from sklearn.metrics import mean_squared_error

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
#==========================================================================================
#making time stamp uniform by Interpolation

def preprocess(data, freq=50):
    """Resample a raw accelerometer recording onto a uniform time grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Timestamp', 'X', 'Y' and 'Z'. The index is
        used label-wise (first row is looked up as label 0, last row as label
        ``len(data) - 1``), exactly as before.
    freq : float, optional
        Target sampling frequency, in samples per unit of 'Timestamp'.
        Defaults to 50, which reproduces the previous fixed 0.02 step.

    Returns
    -------
    pandas.DataFrame
        Columns 'Timestamp', 'acc_X', 'acc_Y', 'acc_Z' sampled every
        ``1 / freq`` from the first timestamp up to (excluding) the last one.
    """
    step = 1 / freq
    # The grid stops before the last timestamp so interp1d never extrapolates.
    t1 = np.arange(data.Timestamp[0], data.Timestamp[data.shape[0] - 1], step)
    df = pd.DataFrame({'Timestamp': t1})
    for axis in ['X', 'Y', 'Z']:
        fcubic = interpolate.interp1d(data.Timestamp, data[axis], kind='cubic')
        df['acc_' + axis] = fcubic(t1)
    return df



def preprocess_real_smartphone(data):
    data.rename(columns={'t':'Timestamp','x':'X','y':'Y','z':'Z'},inplace=True)
    ls=['X','Y','Z']
    freq=round((1/((data.Timestamp.max()/data.Timestamp.shape[0]).round(3))),0)
    t1=np.arange(data.Timestamp[0],data.Timestamp[(data.shape[0])-1],(data.Timestamp.max()/data.Timestamp.shape[0]).round(3))
    #t1=np.arange(data.Timestamp[0],data.Timestamp[(data.shape[0])-1],0.01)
    df=pd.DataFrame({'Timestamp':t1})
    for i in ls:
        fcubic = interpolate.interp1d(data.Timestamp, data[i], kind='cubic')
        df[i]=fcubic(t1)
    df.columns=['Timestamp','acc_X','acc_Y','acc_Z']
    return df,freq


#==========================================================================================
#median filter
from scipy.signal import medfilt # import the median filter function
def median(signal):
    """Return `signal` smoothed by a median filter of kernel size 3.

    `signal` is converted to a NumPy array first; the filtered array is returned.
    """
    samples = np.array(signal)
    smoothed = medfilt(samples, kernel_size=3)
    return smoothed


#==========================================================================================
#components_selection_one_signal
import math # import math library


def components_selection_one_signal(t_signal, sampling_freq, freq1=0.3, freq2=20):
    """Split a time-domain signal into DC, body and noise bands via the FFT.

    Parameters
    ----------
    t_signal : array-like, 1-D
        Signal samples.
    sampling_freq : float
        Sampling frequency of `t_signal`.
    freq1 : float, optional
        Upper edge of the DC band (inclusive), same unit as `sampling_freq`.
        Defaults to 0.3, the previously hard-coded value.
    freq2 : float, optional
        Upper edge of the body band (inclusive); everything above it is
        treated as noise. Defaults to 20, the previously hard-coded value.

    Returns
    -------
    tuple of numpy.ndarray
        ``(total_component, t_DC_component, t_body_component, t_noise)`` where
        ``total_component`` is the input minus the noise band and the other
        three are the real parts of the inverse FFT of each band.
    """
    t_signal = np.array(t_signal)

    # Spectrum of the signal and the absolute frequency of every FFT bin.
    f_signal = np.fft.fft(t_signal)
    freqs = np.abs(np.fft.fftfreq(len(t_signal), d=1 / float(sampling_freq)))

    # Band masks: |f| <= freq1 -> DC, freq1 < |f| <= freq2 -> body, |f| > freq2 -> noise.
    dc_band = freqs <= freq1
    noise_band = freqs > freq2
    body_band = ~(dc_band | noise_band)

    # Zero every bin outside the band, then go back to the time domain.
    t_DC_component = np.fft.ifft(np.where(dc_band, f_signal, 0)).real
    t_body_component = np.fft.ifft(np.where(body_band, f_signal, 0)).real
    t_noise = np.fft.ifft(np.where(noise_band, f_signal, 0)).real

    # Total component = original signal with the noise band removed.
    total_component = t_signal - t_noise

    return (total_component, t_DC_component, t_body_component, t_noise)


#=================================================================================================================
#Define verify gravity function
def mag_3_signals(df):
    """Row-wise Euclidean magnitude: square every entry, sum along axis 1, take the root."""
    sum_of_squares = np.square(df).sum(axis=1)
    magnitude = np.sqrt(sum_of_squares)
    return np.array(magnitude)

def verify_gravity(data, sampling_freq=50):
    """Print the mean magnitude of the low-frequency (gravity) component.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'acc_X', 'acc_Y' and 'acc_Z'.
    sampling_freq : float, optional
        Sampling frequency passed on to `components_selection_one_signal`.
        The previous version omitted this required argument (and passed three
        positional arrays to `mag_3_signals`), so it always raised TypeError.
        Defaults to 50.
    """
    # Element [1] of the returned tuple is the DC (gravity) component of each axis.
    gravity_axes = [
        components_selection_one_signal(np.array(data[column]), sampling_freq)[1]
        for column in ('acc_X', 'acc_Y', 'acc_Z')
    ]

    # mag_3_signals expects one 2-D object with one column per axis.
    grav_acc_mag = mag_3_signals(np.column_stack(gravity_axes))
    print('mean value = ',round((sum(grav_acc_mag) / len(grav_acc_mag)),3),' g')
    
#=================================================================================================================    
#Define jerking and magnitude functions
def jerk_one_signal(signal,sampling_freq):
    """Forward difference of `signal` scaled by `sampling_freq`.

    Each output sample is ``(signal[k+1] - signal[k]) * sampling_freq``; rows
    containing NaN (including the last one, which has no successor) are
    dropped, so the result is one sample shorter than a NaN-free input.
    """
    frame = pd.DataFrame(signal)
    forward_delta = frame.shift(-1) - frame
    scaled = (forward_delta * sampling_freq).dropna()
    # First (only) column of the frame as a flat array.
    return np.array(scaled).transpose()[0]

def time_domain_signal(data,sampling_freq):
    """Build the 12 time-domain signals used for feature extraction.

    For each of 'acc_X', 'acc_Y', 'acc_Z' the body component, the gravity (DC)
    component and the jerk of the body component are computed (9 axial
    signals); one Euclidean-magnitude column is then appended per group of
    three axes. Body and gravity signals drop their last sample so that they
    line up with the jerk signal. The median filter step is currently
    disabled, so the raw samples are used.
    """
    per_axis = {}
    for column in ['acc_X','acc_Y','acc_Z']:
        samples = np.array(data[column])
        _, grav_acc, body_acc, _ = components_selection_one_signal(samples, sampling_freq)
        per_axis['t_body_'+column] = body_acc[:-1]
        per_axis['t_grav_'+column] = grav_acc[:-1]
        per_axis['t_body_acc_jerk_'+column[-1]] = jerk_one_signal(body_acc, sampling_freq)

    # Group the 9 axial signals by kind (X, Y, Z together) so magnitudes can be
    # taken over consecutive triples.
    new_columns_ordered = [
        't_body_acc_X','t_body_acc_Y','t_body_acc_Z',
        't_grav_acc_X','t_grav_acc_Y','t_grav_acc_Z',
        't_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z',
    ]
    time_sig_df = pd.DataFrame(per_axis)[new_columns_ordered]

    # One magnitude column per triple, named after the triple with the axis
    # letter replaced by 'mag'.
    for start in range(0, 9, 3):
        triple = new_columns_ordered[start:start+3]
        time_sig_df[triple[0][:-1]+'mag'] = mag_3_signals(time_sig_df[triple])

    return time_sig_df

In [3]:
# Collect the paths of all smartphone accelerometer recordings in the training set.
# NOTE(review): glob does not guarantee a stable ordering, so a positional pick such as
# a_real[14] in the next cell may select a different file on another machine — confirm
# whether the list should be sorted.
a_real=glob.glob("training_data/smartphone_accelerometer/*.csv")

In [4]:
# Smoke test on a single recording (entry 14 of the glob result): resample it onto a
# uniform time grid, then inspect the shape and the first rows.
data,sampling_freq=preprocess_real_smartphone(pd.read_csv(a_real[14]))
print(data.shape)
data.head()

(239999, 4)


Unnamed: 0,Timestamp,acc_X,acc_Y,acc_Z
0,0.0,0.3304,0.486023,9.694122
1,0.005,0.308852,0.4956,9.703699
2,0.01,0.317626,0.496629,9.697804
3,0.015,0.31364,0.452504,9.679757
4,0.02,0.296881,0.474052,9.691729


In [5]:
# Derive the body / gravity / jerk signals and their magnitudes for the sample recording.
# The result has one row fewer than `data` because the jerk is a forward difference.
time_sig_df=time_domain_signal(data,sampling_freq)
time_sig_df.shape

(239998, 12)

In [6]:
# Preview the derived signals (9 axial columns followed by 3 magnitude columns).
time_sig_df.head()

Unnamed: 0,t_body_acc_X,t_body_acc_Y,t_body_acc_Z,t_grav_acc_X,t_grav_acc_Y,t_grav_acc_Z,t_body_acc_jerk_X,t_body_acc_jerk_Y,t_body_acc_jerk_Z,t_body_acc_mag,t_grav_acc_mag,t_body_acc_jerk_mag
0,0.016403,0.015019,-0.008986,0.241659,0.438172,9.706851,5.394058,3.139477,-0.236475,0.023987,9.71974,6.245646
1,0.043373,0.030716,-0.010169,0.242028,0.438459,9.706896,4.24604,2.321064,-0.221879,0.054112,9.719807,4.844112
2,0.064603,0.042321,-0.011278,0.242397,0.438746,9.706941,2.62254,1.375334,-0.09512,0.078051,9.719875,2.96282
3,0.077716,0.049198,-0.011754,0.242766,0.439033,9.706986,0.880836,0.502243,0.091331,0.092727,9.719942,1.018067
4,0.08212,0.051709,-0.011297,0.243135,0.43932,9.707032,-0.618131,-0.141429,0.269791,0.0977,9.720009,0.689112


In [7]:
#Fractal Dimension using Katz FD algorithm
def katz(data):
    """Katz fractal dimension of a 1-D waveform.

    The samples are treated as points ``(k, data[k])``. With ``n`` the number
    of steps, ``L`` the total curve length and ``d`` the largest distance from
    the first point, the result is ``log10(n) / (log10(d / L) + log10(n))``.

    `data` is converted to a NumPy array first, so a pandas Series is handled
    positionally even when its index does not start at 0 (the previous
    ``data[0]`` was a label lookup).
    """
    data = np.asarray(data, dtype=float)
    n = len(data) - 1
    L = np.hypot(np.diff(data), 1).sum()  # sum of distances between successive points
    d = np.hypot(data - data[0], np.arange(len(data))).max()  # furthest distance from the first point
    return np.log10(n) / (np.log10(d / L) + np.log10(n))




#Coefficient of variation
def coeff_var(a):
    """Coefficient of variation: population standard deviation divided by the mean."""
    spread = np.std(a)
    centre = np.mean(a)
    return spread / centre

#Mean and variance of Vertex to Vertex Slope
def slope(x):
    """Mean and standard deviation of the vertex-to-vertex slope.

    The vertices are the strict local maxima and minima of `x` (as found by
    `argrelextrema`), ordered by position. The slope between two consecutive
    vertices is the amplitude difference divided by the index difference.

    `x` is converted to a NumPy array, so a pandas Series is indexed
    positionally regardless of its index labels.

    Returns
    -------
    list
        ``[mean_slope, std_slope]``; both are NaN when fewer than two vertices
        exist (no slope can be formed).
    """
    values = np.asarray(x)
    t_max = argrelextrema(values, np.greater)[0]
    t_min = argrelextrema(values, np.less)[0]
    t = np.sort(np.concatenate((t_max, t_min), axis=0))  # vertices in time order

    if len(t) < 2:
        return [np.nan, np.nan]

    res = np.diff(values[t]) / np.diff(t)
    res = res[~np.isnan(res)]
    return [np.mean(res), np.std(res)]


#Hjorth Parameter
def hjorth(input):
    """Hjorth parameters of a signal.

    Returns ``[activity, mobility, complexity]`` where activity is the
    variance of the signal, mobility is ``sqrt(var(diff) / var(signal))`` and
    complexity is the mobility of the first difference divided by the
    mobility of the signal.
    """
    first_deriv = np.diff(input)
    second_deriv = np.diff(first_deriv)

    activity = np.var(input)
    first_deriv_var = np.var(first_deriv)
    mobility = np.sqrt(first_deriv_var / activity)
    complexity = np.sqrt(np.var(second_deriv) / first_deriv_var) / mobility

    return [activity, mobility, complexity]


#Kurtosis
def kurtosis(a):
    """Excess kurtosis of `a` using population moments.

    Computes ``mean(((a - mean(a)) / std(a)) ** 4) - 3`` with NumPy's default
    (population, ddof=0) standard deviation. Vectorized replacement for the
    previous per-sample Python loop.
    """
    values = np.asarray(a, dtype=float)
    standardized = (values - np.mean(values)) / np.std(values)
    return np.mean(standardized ** 4) - 3


#Second difference Mean,Max,std
def sec_diff(b):
    """Mean, max and standard deviation of the absolute second difference.

    `b` must be a pandas Series (or DataFrame). The absolute first difference
    is taken, then the absolute difference of that; the leading NaNs produced
    by the shifts are ignored by the pandas reductions.

    Returns
    -------
    list
        ``[mean, max, std]`` of the absolute second difference.
    """
    first = abs(b - b.shift(1))           # absolute 1st differences
    second = abs(first - first.shift(1))  # absolute 2nd differences
    return [second.mean(), second.max(), second.std()]

#Skewness
def skewness(a):
    """Biased sample skewness of `a` along axis 0 (scipy.stats.skew with bias=True)."""
    # Import from the public scipy.stats namespace; the former `scipy.stats.stats`
    # access path is a deprecated private module.
    from scipy.stats import skew
    return skew(a, axis=0, bias=True)


#First Difference Mean,Max,std
def first_diff_mean(a):
    """Return ``[mean, max, std]`` of the absolute first difference of a pandas Series."""
    abs_step = abs(a - a.shift(1))  # |a[k] - a[k-1]|, first entry is NaN
    return [getattr(abs_step, stat)() for stat in ('mean', 'max', 'std')]

#First Difference
def first_diff(a):
    """First difference ``a[k] - a[k-1]`` as a pandas Series (first entry is NaN).

    A pandas Series is used as is; any other input is wrapped in a DataFrame
    and its column 0 is used.
    """
    # isinstance also accepts Series subclasses, unlike comparing str(type(a)).
    if not isinstance(a, pd.Series):
        a = pd.DataFrame(a)[0]
    return a - a.shift(1)


#wavelet features
def wavelet_features(epoch):
    """Single-level 'coif1' DWT statistics of `epoch`.

    Returns a list of 8 values: mean, std, energy and entropy of the
    approximation coefficients, followed by the same four statistics of the
    detail coefficients. Energy is the sum of squared coefficients; entropy is
    ``|sum(c**2 * log(c**2))|`` over the non-zero squared coefficients.
    """
    def _band_stats(coeffs):
        # Squared coefficients; zeros are excluded from the entropy term
        # because log(0) is undefined.
        squared = np.square(coeffs)
        nonzero_squared = np.array([value for value in squared.tolist() if value != 0])
        return [
            np.mean(coeffs),
            abs(np.std(coeffs)),
            abs(np.sum(squared)),
            abs(np.sum(nonzero_squared * np.log(nonzero_squared))),
        ]

    # Approximation (cA) and detail (cD) coefficients of the wavelet transform.
    cA_values, cD_values = pywt.dwt(epoch, 'coif1')
    return _band_stats(cA_values) + _band_stats(cD_values)


In [8]:
def features_names():
    """Return the 252 feature names, in the order `features_generation` emits values.

    Names are ``<signal><suffix>``: for each of the 12 time-domain signals, the
    21 feature suffixes are listed in extraction order (Katz FD, coefficient
    of variation, slope, Hjorth, second difference, first difference, wavelet).
    """
    signal_names = [
        't_body_acc_X','t_body_acc_Y','t_body_acc_Z',
        't_grav_acc_X','t_grav_acc_Y','t_grav_acc_Z',
        't_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z',
        't_body_acc_Mag','t_grav_acc_Mag','t_body_acc_jerk_Mag',
    ]

    feature_suffixes = [
        # scalar features
        '_katz()','_coeff_var()',
        # vertex-to-vertex slope
        '_slope_mean()','_slope_std()',
        # Hjorth parameters
        '_hjorth_activity()','_hjorth_mobility()','_hjorth_complexity()',
        # second and first differences
        '_sec_diff_mean()','_sec_diff_max()','_sec_diff_std()',
        '_first_diff_mean()','_first_diff_max()','_first_diff_std()',
        # wavelet statistics
        '_wavelet_cA_mean()','_wavelet_cA_std()','_wavelet_cA_Energy()','_wavelet_Entropy_A()',
        '_wavelet_cD_mean()','_wavelet_cD_std()','_wavelet_cD_Energy()','_wavelet_Entropy_D()',
    ]

    return [name + suffix for name in signal_names for suffix in feature_suffixes]

In [9]:
def features_generation(t_window):
    """Flatten all per-column features of a time-domain window into one list.

    Every column of `t_window` contributes 21 values, in this order: Katz
    fractal dimension (1), coefficient of variation (1), slope mean/std (2),
    Hjorth parameters (3), second-difference stats (3), first-difference
    stats (3) and wavelet stats (8). Columns are processed in frame order.
    """
    multi_value_extractors = (slope, hjorth, sec_diff, first_diff_mean, wavelet_features)

    t_mag_features = []
    for col in t_window.columns:
        series = t_window[col]
        # single-value features first
        t_mag_features.append(katz(series))
        t_mag_features.append(coeff_var(series))
        # then the list-returning extractors, in naming order
        for extractor in multi_value_extractors:
            t_mag_features.extend(extractor(series))

    return t_mag_features
 

def Dataset_Generation_PipeLine(b):
    """Turn one smartphone recording (CSV path `b`) into a flat feature row.

    Steps: read the CSV, resample it uniformly, derive the time-domain
    signals, then extract the features of every signal.
    """
    raw_recording = pd.read_csv(b)
    resampled, sampling_freq = preprocess_real_smartphone(raw_recording)
    signals = time_domain_signal(resampled, sampling_freq)
    return features_generation(signals)




In [10]:
from joblib import Parallel, delayed
import multiprocessing
import time

# Smartphone

### training

In [13]:
# Extract one feature row per smartphone training recording, using one joblib
# worker per CPU core. The next cell pairs `result` with `a_real` position by
# position to attach the measurement ids.
a_real=glob.glob("training_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine)(i) for i in a_real)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Wall-clock time of the recorded run: 20.544 min

--- 20.54461863040924 Mins ---


In [14]:
# Assemble the training feature table: one row per file, columns named by
# features_names(), plus the measurement id taken from the file name
# (directory prefix and the 4-character ".csv" suffix stripped).
training_data=pd.DataFrame(result)
training_data.columns=features_names()
training_data['measurement_id']=[item[len('training_data/smartphone_accelerometer/'):-4] for item in a_real]
print(training_data.shape)
training_data.head()

(526, 253)


Unnamed: 0,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),t_body_acc_X_hjorth_activity(),t_body_acc_X_hjorth_mobility(),t_body_acc_X_hjorth_complexity(),t_body_acc_X_sec_diff_mean(),t_body_acc_X_sec_diff_max(),t_body_acc_X_sec_diff_std(),...,t_body_acc_jerk_Mag_first_diff_std(),t_body_acc_jerk_Mag_wavelet_cA_mean(),t_body_acc_jerk_Mag_wavelet_cA_std(),t_body_acc_jerk_Mag_wavelet_cA_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_A(),t_body_acc_jerk_Mag_wavelet_cD_mean(),t_body_acc_jerk_Mag_wavelet_cD_std(),t_body_acc_jerk_Mag_wavelet_cD_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,1.000046,-474035.467275,-1.3e-05,0.026462,0.037756,0.18201,4.080593,0.004415,1.830736,0.02086,...,29.383219,4.103529,48.020357,139369900.0,2186716000.0,-0.000474,22.197886,29565260.0,433466900.0,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,1.003821,202140.199789,-0.000983,0.328246,1.061892,0.340858,2.03215,0.096267,4.764164,0.179184,...,22.172472,47.003156,65.87771,392956700.0,4251168000.0,-0.001581,16.295724,15933300.0,142432400.0,aba31c29-79ef-4221-9412-156538a2fd4e
2,1.000324,-64395.594048,2.4e-05,0.083277,0.142422,0.281528,2.455005,0.014698,4.92732,0.059402,...,10.46108,11.883184,30.390764,63889580.0,723126300.0,-0.001341,7.486253,3362695.0,30971150.0,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,1.00006,-58766.906591,6e-05,0.030454,0.013962,0.321067,1.839911,0.010312,0.72694,0.016041,...,2.63972,5.301139,7.415292,4985401.0,34368530.0,-0.001355,1.945919,227200.0,1203258.0,27eccfc4-e329-4695-aee8-6d706b247191
4,1.002672,4575.288448,0.002206,0.464255,0.285314,0.66293,2.166208,0.079449,24.713101,0.293832,...,16.007185,13.712178,33.357373,26908370.0,293452700.0,0.04687,11.206863,2598204.0,23439910.0,ed560c25-e5c5-4dba-82c7-3fc18c248ce4


In [15]:
#ancillary_data
# Ancillary smartphone recordings: keep only files with at least 2000 rows,
# then extract features in parallel exactly as for the training set.
b_real=glob.glob("ancillary_data/smartphone_accelerometer/*.csv")
b_real2=[]
for i in b_real:
    a=pd.read_csv(i).shape[0]  # number of rows in this recording
    if a>=2000:
        b_real2.append(i)


start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine)(i) for i in b_real2)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Wall-clock time of the recorded run: 11.363 min



--- 11.363233252366383 Mins ---


In [16]:
# Assemble the ancillary feature table; ids come from the filtered file list
# b_real2, which is the list the features were computed from.
df_ancillary=pd.DataFrame(result)
df_ancillary.columns=features_names()
df_ancillary['measurement_id']=[item[len('ancillary_data/smartphone_accelerometer/'):-4] for item in b_real2]
print(df_ancillary.shape)
df_ancillary.head()

(348, 253)


Unnamed: 0,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),t_body_acc_X_hjorth_activity(),t_body_acc_X_hjorth_mobility(),t_body_acc_X_hjorth_complexity(),t_body_acc_X_sec_diff_mean(),t_body_acc_X_sec_diff_max(),t_body_acc_X_sec_diff_std(),...,t_body_acc_jerk_Mag_first_diff_std(),t_body_acc_jerk_Mag_wavelet_cA_mean(),t_body_acc_jerk_Mag_wavelet_cA_std(),t_body_acc_jerk_Mag_wavelet_cA_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_A(),t_body_acc_jerk_Mag_wavelet_cD_mean(),t_body_acc_jerk_Mag_wavelet_cD_std(),t_body_acc_jerk_Mag_wavelet_cD_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,1.0,-587966.420697,-7.193507e-07,0.000887,5.8e-05,0.1329,1.403225,0.000143,0.001494,0.000111,...,0.060253,1.199134,0.515406,511073.2,431907.9,2e-06,0.02521,190.6641,1127.702,eab1cc17-40aa-4e1f-8e4f-64e6d5743509
1,1.000315,48894.202072,0.0004516919,0.103361,0.185877,0.242232,3.154527,0.016163,7.566209,0.061289,...,7.190025,9.733553,21.477499,33362060.0,327404400.0,-0.004947,5.046609,1528123.0,13338290.0,b1a5fd6d-db9c-4870-a3c0-943e0656d112
2,1.003249,-5759.39098,0.002671911,0.53095,0.402334,0.537203,2.177747,0.071791,13.667867,0.270522,...,11.462667,10.4359,26.593493,15667910.0,154366000.0,0.021199,7.896549,1197109.0,9339204.0,bca5e12d-9fd6-496b-ac08-9e2472d8b299
3,1.000655,-36516.433629,-0.0004700092,0.107656,0.171133,0.312366,2.11692,0.032236,1.688237,0.063908,...,12.296451,21.899163,37.612554,113660400.0,1099945000.0,-0.001263,8.930533,4785424.0,33692170.0,9d74f5e1-241a-4f4a-bc6f-2779edf410cd
4,1.000004,140114.724393,-1.428941e-05,0.00802,0.000118,0.823587,1.460451,0.004942,0.158282,0.005557,...,0.513419,1.153107,0.73574,74842.47,115969.0,-0.00224,0.465897,8683.034,3455.195,49f80736-6b50-44a6-a77b-9b1572334a8c


In [17]:
# Stack training and ancillary feature rows into one table with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
Frame = pd.concat([training_data, df_ancillary], ignore_index=True)
print(Frame.shape)
Frame.head()

(874, 253)


Unnamed: 0,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),t_body_acc_X_hjorth_activity(),t_body_acc_X_hjorth_mobility(),t_body_acc_X_hjorth_complexity(),t_body_acc_X_sec_diff_mean(),t_body_acc_X_sec_diff_max(),t_body_acc_X_sec_diff_std(),...,t_body_acc_jerk_Mag_first_diff_std(),t_body_acc_jerk_Mag_wavelet_cA_mean(),t_body_acc_jerk_Mag_wavelet_cA_std(),t_body_acc_jerk_Mag_wavelet_cA_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_A(),t_body_acc_jerk_Mag_wavelet_cD_mean(),t_body_acc_jerk_Mag_wavelet_cD_std(),t_body_acc_jerk_Mag_wavelet_cD_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,1.000046,-474035.467275,-1.3e-05,0.026462,0.037756,0.18201,4.080593,0.004415,1.830736,0.02086,...,29.383219,4.103529,48.020357,139369900.0,2186716000.0,-0.000474,22.197886,29565260.0,433466900.0,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,1.003821,202140.199789,-0.000983,0.328246,1.061892,0.340858,2.03215,0.096267,4.764164,0.179184,...,22.172472,47.003156,65.87771,392956700.0,4251168000.0,-0.001581,16.295724,15933300.0,142432400.0,aba31c29-79ef-4221-9412-156538a2fd4e
2,1.000324,-64395.594048,2.4e-05,0.083277,0.142422,0.281528,2.455005,0.014698,4.92732,0.059402,...,10.46108,11.883184,30.390764,63889580.0,723126300.0,-0.001341,7.486253,3362695.0,30971150.0,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,1.00006,-58766.906591,6e-05,0.030454,0.013962,0.321067,1.839911,0.010312,0.72694,0.016041,...,2.63972,5.301139,7.415292,4985401.0,34368530.0,-0.001355,1.945919,227200.0,1203258.0,27eccfc4-e329-4695-aee8-6d706b247191
4,1.002672,4575.288448,0.002206,0.464255,0.285314,0.66293,2.166208,0.079449,24.713101,0.293832,...,16.007185,13.712178,33.357373,26908370.0,293452700.0,0.04687,11.206863,2598204.0,23439910.0,ed560c25-e5c5-4dba-82c7-3fc18c248ce4


In [19]:
# Persist the combined training + ancillary smartphone feature table (index not written).
Frame.to_csv('realpd_comp_training_abhiroop_lastfeatures_smartphone.csv',index=False)

### testing

In [18]:
# Extract the same features for the smartphone test recordings.
# NOTE(review): this cell reuses the names `a_real`, `result` and `training_data`
# for test data, overwriting the training objects of the same name; only
# `Frame_test` should be used downstream — confirm nothing later relies on
# `training_data` still holding training rows.
a_real=glob.glob("testing_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine)(i) for i in a_real)

training_data=pd.DataFrame(result)
training_data.columns=features_names()
training_data['measurement_id']=[item[len('testing_data/smartphone_accelerometer/'):-4] for item in a_real]

# Independent copy holding the test features.
Frame_test = training_data.copy()
print(Frame_test.shape)
Frame_test.head()



(169, 253)


Unnamed: 0,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),t_body_acc_X_hjorth_activity(),t_body_acc_X_hjorth_mobility(),t_body_acc_X_hjorth_complexity(),t_body_acc_X_sec_diff_mean(),t_body_acc_X_sec_diff_max(),t_body_acc_X_sec_diff_std(),...,t_body_acc_jerk_Mag_first_diff_std(),t_body_acc_jerk_Mag_wavelet_cA_mean(),t_body_acc_jerk_Mag_wavelet_cA_std(),t_body_acc_jerk_Mag_wavelet_cA_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_A(),t_body_acc_jerk_Mag_wavelet_cD_mean(),t_body_acc_jerk_Mag_wavelet_cD_std(),t_body_acc_jerk_Mag_wavelet_cD_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,1.007186,1259028.0,0.004715,0.500324,2.343522,0.402286,1.602876,0.11337,13.101362,0.318258,...,32.06666,49.346485,98.683105,730418000.0,8851149000.0,-0.044703,21.235846,27058240.0,260716400.0,7860035d-c9df-44e9-ba0e-0855dfad90f4
1,1.011013,223499.6,-0.00322,0.524701,9.778053,0.227365,2.470623,0.134461,8.508648,0.318067,...,45.986081,86.334489,167.132938,2123259000.0,27029890000.0,0.063188,31.691755,60263280.0,596033500.0,8abf7688-c6bb-488f-bb9d-c359a5f86b35
2,1.000591,22032.82,3.8e-05,0.112981,0.196748,0.298925,2.207169,0.027153,3.00566,0.068131,...,14.352802,15.243349,37.291417,97382190.0,1129841000.0,-0.001102,9.159239,5033583.0,49762180.0,5b76c255-d8dd-40d3-82e9-745cb943607f
3,1.000083,-15957.35,-4.7e-05,0.049266,0.016428,0.430944,2.033974,0.005352,4.437498,0.038832,...,7.375219,2.791829,17.730166,19329510.0,233905900.0,0.00635,5.786445,2009013.0,21005530.0,c4f0d5dc-c1db-4cc5-89d1-26d7fee32299
4,1.000197,190550.5,2.8e-05,0.06686,0.038342,0.388051,1.772939,0.015282,3.667435,0.04056,...,4.485009,7.468222,13.97219,15060040.0,134065200.0,0.000939,3.200937,614770.0,4450083.0,6bb1ba19-25ac-4889-8e5b-8e4215784e82


In [20]:
# Persist the smartphone test feature table (index not written).
Frame_test.to_csv('realpd_comp_testing_abhiroop_lastfeatures_smartphone.csv',index=False)

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import ppscore as pps
import lightgbm as lgb
from catboost import CatBoostRegressor, FeaturesData, Pool
from sklearn import preprocessing

# Measurement-id tables for the two cohorts (CIS-PD and REAL-PD).

# Test data: measurement ids only
cis_pd_testing_id=pd.read_csv('test_data_Id/cis-pd.CIS-PD_Test_Data_IDs.csv')
real_pd_testing_id=pd.read_csv('test_data_Id/real-pd.REAL-PD_Test_Data_IDs.csv')

# Training data: measurement ids with labels
cis_pd_training_id=pd.read_csv('data_labels/CIS-PD_Training_Data_IDs_Labels.csv')
real_pd_training_id=pd.read_csv('data_labels/REAL-PD_Training_Data_IDs_Labels.csv')

# Ancillary data: measurement ids with labels
cis_pd_ancillary_id=pd.read_csv('data_labels/CIS-PD_Ancillary_Data_IDs_Labels.csv')
real_pd_ancillary_id=pd.read_csv('data_labels/REAL-PD_Ancillary_Data_IDs_Labels.csv')

In [22]:
# Smartphone accelerometer features joined with the REAL-PD labels.
# Training and ancillary label tables are stacked first (pd.concat replaces
# DataFrame.append, removed in pandas 2.0), then inner-joined with the feature
# table on measurement_id; subject ids are finally encoded as integers.
label=pd.concat([real_pd_training_id,real_pd_ancillary_id],ignore_index=True)
df_train=pd.merge(label,Frame,on='measurement_id')
df_train['subject_id']=preprocessing.LabelEncoder().fit(df_train['subject_id']).transform(df_train['subject_id'])
print(df_train.shape)
df_train.head()

(874, 257)


Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),t_body_acc_X_hjorth_activity(),...,t_body_acc_jerk_Mag_first_diff_max(),t_body_acc_jerk_Mag_first_diff_std(),t_body_acc_jerk_Mag_wavelet_cA_mean(),t_body_acc_jerk_Mag_wavelet_cA_std(),t_body_acc_jerk_Mag_wavelet_cA_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_A(),t_body_acc_jerk_Mag_wavelet_cD_mean(),t_body_acc_jerk_Mag_wavelet_cD_std(),t_body_acc_jerk_Mag_wavelet_cD_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_D()
0,ee053d95-c155-400d-ae42-fe24834ad4a9,2,1.0,0.0,3.0,1.001259,190525.170377,-0.000161,0.179777,0.136928,...,1000.83168,12.201951,19.186292,16.089191,17914580.0,146415800.0,-0.070816,11.577819,3830236.0,29757160.0
1,ce51ee31-8553-4321-9f83-8cd3dabe2f66,2,1.0,0.0,2.0,1.003508,64305.547204,0.000172,0.410412,0.55412,...,2080.319703,54.846396,37.624867,127.513167,1178337000.0,16182870000.0,0.001988,38.812255,100425100.0,1162400000.0
2,5c42911d-0ebd-47ba-9925-dd5ab1c0ed61,2,1.0,0.0,1.0,1.012685,-51797.351931,-0.000411,1.089542,1.926549,...,4484.661304,95.071079,58.5549,171.905829,1522172000.0,21884810000.0,0.135727,70.624774,230210500.0,2957325000.0
3,274f5bc8-2e4f-4d7c-a546-b65b7d6bd01e,2,0.0,0.0,,1.016144,114164.595367,-0.001818,1.192039,0.988393,...,3359.982425,71.654705,48.245759,104.387826,377849600.0,4850456000.0,-0.329438,53.889976,82979890.0,990522900.0
4,ecbeea40-8770-455d-90a6-597e7f896e1b,2,0.0,1.0,0.0,1.027986,-30492.945031,-0.001418,1.628427,1.697255,...,5291.00432,99.530744,76.610624,138.317694,714352800.0,9206557000.0,-0.31492,75.458064,162695200.0,1994225000.0


In [23]:
#lightgbm
def lightgbm(df4,label):
    """Fit a LightGBM regressor for `label` and print its hold-out MSE.

    Rows whose 'tremor' value equals 4 are always discarded (whatever `label`
    is), as are rows where `label` is missing. The remaining rows are split
    70/30, stratified on `label`, with a fixed random_state of 1. The id and
    label columns are excluded from the features. Nothing is returned.
    """
    usable = df4[df4['tremor']!=4]
    usable = usable[~usable[label].isnull()]

    non_feature_columns = ['subject_id','measurement_id','on_off','dyskinesia','tremor']
    features = usable.drop(non_feature_columns, axis = 1)
    target = usable[label]

    # Stratified train / hold-out split.
    x_train, x_test, y_train, y_test = train_test_split(features, target,
                                                        train_size=0.7,
                                                        stratify = target,
                                                        random_state=1)

    params = {
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 5,
        'feature_fraction': 0.8,
    }
    # 150 boosting rounds.
    clf = lgb.train(params, lgb.Dataset(x_train, label=y_train), 150)

    test_pred = clf.predict(x_test)
    train_pred = clf.predict(x_train)  # only needed by the disabled train-MSE print below

    #print('lightgbm train_MSE '+label+' :',round(mean_squared_error(y_train, train_pred),3))
    print('lightgbm test_MSE '+label+' :',round(mean_squared_error(y_test, test_pred),3))
    
    
#Catboost    
#This will give indexes of the categorical features
def categorical_index(df,cols):
    """Return the positional indexes of the names in `cols` that are columns of `df`.

    Names that are not columns of `df` are skipped; the output keeps the order
    of `cols`.
    """
    cat=[]
    for c in cols:
        try:
            cat.append(df.columns.get_loc(c))
        except KeyError:
            # Column absent from df: skip it. Other errors are no longer
            # swallowed, unlike the former bare `except`.
            continue
    return cat

def catboost(df4,label):
    """Fit a CatBoost regressor for `label` and print its hold-out MSE.

    Uses the same row filtering and 70/30 stratified split (random_state=1) as
    `lightgbm`: rows with 'tremor' equal to 4 or a missing `label` are
    dropped, and the id and label columns are excluded from the features.
    No categorical features are declared. Nothing is returned.
    """
    usable = df4[df4['tremor']!=4]
    usable = usable[~usable[label].isnull()]

    non_feature_columns = ['subject_id','measurement_id','on_off','dyskinesia','tremor']
    X_train, X_test, y_train, y_test = train_test_split(usable.drop(non_feature_columns, axis = 1),
                                                        usable[label],
                                                        train_size=0.7,
                                                        stratify = usable[label],
                                                        random_state=1)

    # Both pools take their categorical-feature positions from the training columns.
    train_dataset = Pool(data=X_train, label=y_train,
                         cat_features=categorical_index(X_train,[]))
    eval_dataset = Pool(data=X_test, label=y_test,
                        cat_features=categorical_index(X_train,[]))

    model = CatBoostRegressor(iterations=50, learning_rate=0.1, depth=6,verbose=False)
    model.fit(train_dataset)

    test_pred = model.predict(eval_dataset)
    train_pred = model.predict(train_dataset)  # only needed by the disabled train-MSE print below

    #print('catboost train_MSE '+label+' :',round(mean_squared_error(y_train, train_pred),3))
    print('catboost test_MSE '+label+' :',round(mean_squared_error(y_test, test_pred),3))

In [24]:
# Hold-out MSE of both regressors for each of the three labels
# (tremor, dyskinesia, on/off) on the smartphone accelerometer features.
lightgbm(df_train,'tremor')
catboost(df_train,'tremor')

lightgbm(df_train,'dyskinesia')
catboost(df_train,'dyskinesia')

lightgbm(df_train,'on_off')
catboost(df_train,'on_off')

lightgbm test_MSE tremor : 0.603
catboost test_MSE tremor : 0.636
lightgbm test_MSE dyskinesia : 0.132
catboost test_MSE dyskinesia : 0.142
lightgbm test_MSE on_off : 0.201
catboost test_MSE on_off : 0.196


In [None]:
# Reference results copied from the run above (smartphone accelerometer features):
# lightgbm test_MSE tremor : 0.603
# catboost test_MSE tremor : 0.636
# lightgbm test_MSE dyskinesia : 0.132
# catboost test_MSE dyskinesia : 0.142
# lightgbm test_MSE on_off : 0.201
# catboost test_MSE on_off : 0.196

# SmartWatch

## Accelerometer

In [25]:
def preprocess_real_smartwatch(data):
    """Pick one device from a raw recording and resample it onto a uniform time grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw recording with the columns 'device_id', 't', 'x', 'y' and 'z'.

    Returns
    -------
    tuple
        ``(df, deviceid, freq)`` where ``df`` has the columns
        'Timestamp', 'acc_X', 'acc_Y', 'acc_Z' sampled every 0.02 units of
        ``t``, ``deviceid`` is the selected device and ``freq`` is 50.
    """
    # Per-device variance of x and number of non-null y samples.
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # If the highest-variance device holds at most 20% of all rows, fall back
    # to the lowest-variance device. `.iloc[0]` extracts the scalar count;
    # calling int() on a one-element Series is deprecated in recent pandas.
    if a.loc[a.device_id==deviceid,'y'].iloc[0]<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']

    data=data[data.device_id==deviceid].reset_index(drop=True)
    data=data.rename(columns={'t':'Timestamp','x':'X','y':'Y','z':'Z'})

    freq=50
    # Uniform grid from the first timestamp up to (excluding) the last one.
    t1=np.arange(data.Timestamp.iloc[0],data.Timestamp.iloc[-1],0.02)
    df=pd.DataFrame({'Timestamp':t1})

    for axis in ['X','Y','Z']:
        # interp1d is used with its default `kind` (linear per the SciPy docs).
        resample = interpolate.interp1d(data.Timestamp, data[axis])
        df['acc_'+axis]=resample(t1)
    return df[['Timestamp','acc_X','acc_Y','acc_Z']],deviceid,freq

def Dataset_Generation_PipeLine_smartwatch_acc(b):
    """Turn one smartwatch accelerometer CSV into a single feature row.

    `b` is the path of the CSV file. The result is a list whose first item is
    the selected device id, followed by the values returned by
    `features_generation`.
    """
    raw=pd.read_csv(b)
    signals,watch_id,fs=preprocess_real_smartwatch(raw)
    # Derive the time-domain signals, then compute the features on them.
    feature_row=features_generation(time_domain_signal(signals,fs))
    return [watch_id]+feature_row


In [26]:
# Keep only the training accelerometer files whose selected device has >= 2000 samples.
a_real=glob.glob("training_data/smartwatch_accelerometer/*.csv")
a_real_acc=[]
num=[]  # sample count of the selected device, one entry per file
for i in a_real:
    data=pd.read_csv(i)
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # `.iloc[0]` extracts the scalar count; int() on a one-element Series is
    # deprecated in recent pandas.
    if a.loc[a.device_id==deviceid,'y'].iloc[0]<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_acc.append(i)

In [27]:
# Files kept after filtering vs. files found by the glob.
(len(a_real_acc), len(a_real))

(530, 535)

In [28]:
# Build one feature row per accepted file, using one worker per CPU core.
# A previous run took about 12.1 minutes.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(
    delayed(Dataset_Generation_PipeLine_smartwatch_acc)(csv_path) for csv_path in a_real_acc
)
print("--- %s Mins ---" % ((time.time() - start_time)/60))

--- 12.63281877040863 Mins ---


In [29]:
# Assemble the accelerometer feature table for the training recordings.
training_data=pd.DataFrame(result)
training_data.columns=['device_id_acc']+features_names()
# measurement_id is the file name without its directory prefix and '.csv'.
# The prefix must match the glob directory used to build `a_real_acc`.
training_data['measurement_id']=[item[len('training_data/smartwatch_accelerometer/'):-4] for item in a_real_acc]
print(training_data.shape)
training_data.head()

(530, 254)


Unnamed: 0,device_id_acc,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),t_body_acc_X_hjorth_activity(),t_body_acc_X_hjorth_mobility(),t_body_acc_X_hjorth_complexity(),t_body_acc_X_sec_diff_mean(),t_body_acc_X_sec_diff_max(),...,t_body_acc_jerk_Mag_first_diff_std(),t_body_acc_jerk_Mag_wavelet_cA_mean(),t_body_acc_jerk_Mag_wavelet_cA_std(),t_body_acc_jerk_Mag_wavelet_cA_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_A(),t_body_acc_jerk_Mag_wavelet_cD_mean(),t_body_acc_jerk_Mag_wavelet_cD_std(),t_body_acc_jerk_Mag_wavelet_cD_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,2VSP,1.004,-24121.66893,-4.3e-05,0.274105,1.195276,0.322246,3.282729,0.122494,7.748725,...,24.246612,33.13461,52.77573,116499300.0,1218558000.0,-0.128587,18.07618,9803271.0,83513280.0,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,1.020188,58603.225645,0.000753,0.942833,5.940303,0.402234,2.863228,0.413388,24.422981,...,52.917483,101.232822,108.388156,659904300.0,7820029000.0,0.337225,42.453446,54074070.0,544307500.0,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,1.002798,-545517.997583,0.001156,0.228635,0.662064,0.37394,2.764917,0.088138,5.926416,...,15.393561,19.739822,36.4435,51535400.0,511380200.0,0.05277,11.645678,4068874.0,32542780.0,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,1.000423,-144341.044526,7.9e-05,0.08897,0.095432,0.330273,4.100591,0.046259,3.171407,...,6.341025,9.828324,12.725178,7756042.0,61891710.0,-0.010777,5.004624,751416.4,4854295.0,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,1.001493,39152.736486,2e-06,0.198279,0.312401,0.41512,3.209567,0.070423,6.303599,...,15.475185,16.717025,31.942369,38995810.0,395997800.0,-0.046825,11.916498,4260438.0,37495780.0,f96752b5-850e-4a5a-a74a-69ab4893b6aa


In [30]:
# Keep only the ancillary accelerometer files whose selected device has >= 2000 samples.
a_real=glob.glob("ancillary_data/smartwatch_accelerometer/*.csv")
a_real_acc=[]
num=[]  # sample count of the selected device, one entry per file
for i in a_real:
    data=pd.read_csv(i)
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # `.iloc[0]` extracts the scalar count; int() on a one-element Series is
    # deprecated in recent pandas.
    if a.loc[a.device_id==deviceid,'y'].iloc[0]<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_acc.append(i)

In [31]:
# Files kept after filtering vs. files found by the glob.
(len(a_real_acc), len(a_real))

(426, 428)

In [32]:
# Build one feature row per accepted ancillary file, using one worker per CPU core.
# A previous run took about 10.2 minutes.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(
    delayed(Dataset_Generation_PipeLine_smartwatch_acc)(csv_path) for csv_path in a_real_acc
)
print("--- %s Mins ---" % ((time.time() - start_time)/60))

--- 10.093303966522218 Mins ---


In [33]:
# Assemble the accelerometer feature table for the ancillary recordings.
df_ancillary=pd.DataFrame(result, columns=['device_id_acc']+features_names())
# measurement_id is the file name without its directory prefix and '.csv'.
df_ancillary['measurement_id']=[
    csv_path[len('ancillary_data/smartwatch_accelerometer/'):-4] for csv_path in a_real_acc
]
print(df_ancillary.shape)
df_ancillary.head()

(426, 254)


Unnamed: 0,device_id_acc,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),t_body_acc_X_hjorth_activity(),t_body_acc_X_hjorth_mobility(),t_body_acc_X_hjorth_complexity(),t_body_acc_X_sec_diff_mean(),t_body_acc_X_sec_diff_max(),...,t_body_acc_jerk_Mag_first_diff_std(),t_body_acc_jerk_Mag_wavelet_cA_mean(),t_body_acc_jerk_Mag_wavelet_cA_std(),t_body_acc_jerk_Mag_wavelet_cA_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_A(),t_body_acc_jerk_Mag_wavelet_cD_mean(),t_body_acc_jerk_Mag_wavelet_cD_std(),t_body_acc_jerk_Mag_wavelet_cD_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,2WLT,1.002863,9048.142586,6.5e-05,0.291877,0.241027,0.622906,2.416673,0.102471,7.29396,...,17.302626,23.742797,38.617638,61655380.0,605952400.0,-0.074499,13.231625,5252794.0,40764710.0,eab1cc17-40aa-4e1f-8e4f-64e6d5743509
1,2WH7,1.000218,8365.349311,-0.000165,0.062554,0.013268,0.62081,2.071451,0.034911,1.926634,...,3.734114,6.488661,7.611613,2771591.0,19156450.0,-0.003879,3.048874,257536.0,1349995.0,166ba983-209f-4639-a5a6-d6e66adeba2b
2,2WH5,1.002679,85215.011311,0.000217,0.276775,0.485745,0.473479,2.669775,0.084079,12.410336,...,19.363182,18.086702,44.079662,57495970.0,623650400.0,0.037479,14.065864,5010945.0,44345230.0,bca5e12d-9fd6-496b-ac08-9e2472d8b299
3,327T,1.000114,72642.747488,-0.000149,0.048892,0.00427,0.767195,1.906556,0.032456,0.229525,...,1.63512,5.893948,2.136874,1179185.0,4613253.0,-0.014784,1.838239,101383.6,207965.6,49f80736-6b50-44a6-a77b-9b1572334a8c
4,327T,1.000442,-12556.426237,0.000123,0.085459,0.097952,0.333752,3.496981,0.045014,3.616567,...,4.708086,8.936002,10.397893,5639237.0,40774610.0,0.02604,3.858474,446669.8,2331490.0,26f49660-ce1a-4946-8f83-f88850f03ec1


In [34]:
# Stack the training and ancillary accelerometer features into one table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
Frame_smartwatch_acc = pd.concat([training_data, pd.DataFrame(data = df_ancillary)], ignore_index=True)
print(Frame_smartwatch_acc.shape)
Frame_smartwatch_acc.head()

(956, 254)


Unnamed: 0,device_id_acc,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),t_body_acc_X_hjorth_activity(),t_body_acc_X_hjorth_mobility(),t_body_acc_X_hjorth_complexity(),t_body_acc_X_sec_diff_mean(),t_body_acc_X_sec_diff_max(),...,t_body_acc_jerk_Mag_first_diff_std(),t_body_acc_jerk_Mag_wavelet_cA_mean(),t_body_acc_jerk_Mag_wavelet_cA_std(),t_body_acc_jerk_Mag_wavelet_cA_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_A(),t_body_acc_jerk_Mag_wavelet_cD_mean(),t_body_acc_jerk_Mag_wavelet_cD_std(),t_body_acc_jerk_Mag_wavelet_cD_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,2VSP,1.004,-24121.66893,-4.3e-05,0.274105,1.195276,0.322246,3.282729,0.122494,7.748725,...,24.246612,33.13461,52.77573,116499300.0,1218558000.0,-0.128587,18.07618,9803271.0,83513280.0,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,1.020188,58603.225645,0.000753,0.942833,5.940303,0.402234,2.863228,0.413388,24.422981,...,52.917483,101.232822,108.388156,659904300.0,7820029000.0,0.337225,42.453446,54074070.0,544307500.0,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,1.002798,-545517.997583,0.001156,0.228635,0.662064,0.37394,2.764917,0.088138,5.926416,...,15.393561,19.739822,36.4435,51535400.0,511380200.0,0.05277,11.645678,4068874.0,32542780.0,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,1.000423,-144341.044526,7.9e-05,0.08897,0.095432,0.330273,4.100591,0.046259,3.171407,...,6.341025,9.828324,12.725178,7756042.0,61891710.0,-0.010777,5.004624,751416.4,4854295.0,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,1.001493,39152.736486,2e-06,0.198279,0.312401,0.41512,3.209567,0.070423,6.303599,...,15.475185,16.717025,31.942369,38995810.0,395997800.0,-0.046825,11.916498,4260438.0,37495780.0,f96752b5-850e-4a5a-a74a-69ab4893b6aa


### test

In [35]:
# Keep only the test accelerometer files whose selected device has >= 2000 samples.
a_real=glob.glob("testing_data/smartwatch_accelerometer/*.csv")
a_real_acc=[]
num=[]  # sample count of the selected device, one entry per file
for i in a_real:
    data=pd.read_csv(i)
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # `.iloc[0]` extracts the scalar count; int() on a one-element Series is
    # deprecated in recent pandas.
    if a.loc[a.device_id==deviceid,'y'].iloc[0]<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_acc.append(i)

print(len(a_real_acc),len(a_real))

# Build one feature row per accepted file, using one worker per CPU core.
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_smartwatch_acc)(i) for i in a_real_acc)

# NOTE: `training_data` is reused here as a scratch name for the test rows.
training_data=pd.DataFrame(result)
training_data.columns=['device_id_acc']+features_names()
# The prefix must match the glob directory used above.
training_data['measurement_id']=[item[len('testing_data/smartwatch_accelerometer/'):-4] for item in a_real_acc]

Frame_smartwatch_acc_test = training_data.copy()
print(Frame_smartwatch_acc_test.shape)
Frame_smartwatch_acc_test.head()

171 172
(171, 254)


Unnamed: 0,device_id_acc,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),t_body_acc_X_hjorth_activity(),t_body_acc_X_hjorth_mobility(),t_body_acc_X_hjorth_complexity(),t_body_acc_X_sec_diff_mean(),t_body_acc_X_sec_diff_max(),...,t_body_acc_jerk_Mag_first_diff_std(),t_body_acc_jerk_Mag_wavelet_cA_mean(),t_body_acc_jerk_Mag_wavelet_cA_std(),t_body_acc_jerk_Mag_wavelet_cA_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_A(),t_body_acc_jerk_Mag_wavelet_cD_mean(),t_body_acc_jerk_Mag_wavelet_cD_std(),t_body_acc_jerk_Mag_wavelet_cD_Energy(),t_body_acc_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,2ZX6,1.023602,88972.650089,0.002406,1.065472,6.124654,0.471469,2.507854,0.465609,55.776289,...,65.279374,113.248347,120.61834,821246500.0,9766180000.0,-0.169012,54.844725,90242180.0,984505200.0,7860035d-c9df-44e9-ba0e-0855dfad90f4
1,2WH8,1.002468,-9577.625612,0.00062,0.25194,0.497761,0.419816,2.870803,0.09278,14.599304,...,18.905019,20.222352,40.507956,61497190.0,681839400.0,-0.175286,14.146314,6004668.0,59154540.0,0c593c3f-6636-4f0f-b9b1-f489c1cd3852
2,2VSP,1.004413,-975708.091118,-0.000236,0.360364,0.575081,0.488049,3.12799,0.149894,7.094872,...,20.596159,31.794656,43.792402,87866100.0,889800700.0,0.122876,16.042862,7722171.0,62060240.0,8abf7688-c6bb-488f-bb9d-c359a5f86b35
3,2VSP,1.000875,67926.655283,-0.000399,0.143272,0.29643,0.294123,4.18596,0.05995,8.542191,...,9.12239,13.973823,20.252368,18164000.0,161236000.0,-0.088866,6.878839,1419885.0,9548193.0,c4f0d5dc-c1db-4cc5-89d1-26d7fee32299
4,2WLT,1.002326,-65635.078837,0.000161,0.274812,0.54502,0.395441,3.18318,0.100699,13.317348,...,27.905948,27.226059,48.889858,93950560.0,1090616000.0,0.068849,20.474143,12576700.0,140661300.0,6bb1ba19-25ac-4889-8e5b-8e4215784e82


## Gyroscope

In [36]:
def preprocess_real_gyroscope(data):
    """Pick one device from a raw gyroscope recording and resample it uniformly.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw recording with the columns 'device_id', 't', 'x', 'y' and 'z'.

    Returns
    -------
    tuple
        ``(df, deviceid, freq)`` where ``df`` has the columns
        'Timestamp', 'acc_X', 'acc_Y', 'acc_Z' sampled every 0.02 units of
        ``t``, ``deviceid`` is the selected device and ``freq`` is 50.
        The output columns keep the 'acc_' prefix even though the input is
        gyroscope data.
    """
    # Per-device variance of x and number of non-null y samples.
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # If the highest-variance device holds at most 20% of all rows, fall back
    # to the lowest-variance device. `.iloc[0]` extracts the scalar count;
    # calling int() on a one-element Series is deprecated in recent pandas.
    if a.loc[a.device_id==deviceid,'y'].iloc[0]<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']

    data=data[data.device_id==deviceid].reset_index(drop=True)
    data=data.rename(columns={'t':'Timestamp','x':'X','y':'Y','z':'Z'})

    freq=50
    # Uniform grid from the first timestamp up to (excluding) the last one.
    t1=np.arange(data.Timestamp.iloc[0],data.Timestamp.iloc[-1],0.02)
    df=pd.DataFrame({'Timestamp':t1})

    for axis in ['X','Y','Z']:
        # interp1d is used with its default `kind` (linear per the SciPy docs).
        resample = interpolate.interp1d(data.Timestamp, data[axis])
        df['acc_'+axis]=resample(t1)
    return df[['Timestamp','acc_X','acc_Y','acc_Z']],deviceid,freq

def Dataset_Generation_PipeLine_smartwatch_gyro(b):
    """Turn one smartwatch gyroscope CSV into a single feature row.

    `b` is the path of the CSV file. The four 't_grav_acc_*' signals are
    dropped before the features are computed. The result is a list whose
    first item is the selected device id, followed by the values returned by
    `features_generation`.
    """
    raw=pd.read_csv(b)
    signals,watch_id,fs=preprocess_real_gyroscope(raw)
    gravity_cols=['t_grav_acc_X','t_grav_acc_Y','t_grav_acc_Z','t_grav_acc_mag']
    body_signals=time_domain_signal(signals,fs).drop(gravity_cols,axis=1)
    feature_row=features_generation(body_signals)
    return [watch_id]+feature_row

In [37]:
# Gyroscope feature names: the accelerometer names with 'acc' replaced by
# 'gyro', minus every name that has 'grav' as one of its '_'-separated tokens.
fn_gyro=[i.replace('acc','gyro') for i in features_names()]
# Test that 'grav' IS a token. The previous `w in 'grav'` asked whether each
# token was a substring of 'grav', which also matches '', 'a', 'gr', ...
fn_gyro=[i for i in fn_gyro if 'grav' not in i.split('_')]

In [38]:
# Keep only the training gyroscope files whose selected device has >= 2000 samples.
a_real=glob.glob("training_data/smartwatch_gyroscope/*.csv")
a_real_gyro=[]
num=[]  # sample count of the selected device, one entry per file
for i in a_real:
    data=pd.read_csv(i)
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # `.iloc[0]` extracts the scalar count; int() on a one-element Series is
    # deprecated in recent pandas.
    if a.loc[a.device_id==deviceid,'y'].iloc[0]<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_gyro.append(i)

In [39]:
# Files kept after filtering vs. files found by the glob.
(len(a_real_gyro), len(a_real))

(530, 535)

In [40]:
# Smartwatch gyroscope: build one feature row per accepted training file,
# using one worker per CPU core. A previous run took about 11.9 minutes.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(
    delayed(Dataset_Generation_PipeLine_smartwatch_gyro)(csv_path) for csv_path in a_real_gyro
)
print("--- %s Mins ---" % ((time.time() - start_time)/60))

--- 11.90065267086029 Mins ---


In [41]:
# Assemble the gyroscope feature table for the training recordings.
training_data=pd.DataFrame(result, columns=['device_id_gyro']+fn_gyro)
# measurement_id is the file name without its directory prefix and '.csv'.
training_data['measurement_id']=[
    csv_path[len('training_data/smartwatch_gyroscope/'):-4] for csv_path in a_real_gyro
]
print(training_data.shape)
training_data.head()

(530, 170)


Unnamed: 0,device_id_gyro,t_body_gyro_X_katz(),t_body_gyro_X_coeff_var(),t_body_gyro_X_slope_mean(),t_body_gyro_X_slope_std(),t_body_gyro_X_hjorth_activity(),t_body_gyro_X_hjorth_mobility(),t_body_gyro_X_hjorth_complexity(),t_body_gyro_X_sec_diff_mean(),t_body_gyro_X_sec_diff_max(),...,t_body_gyro_jerk_Mag_first_diff_std(),t_body_gyro_jerk_Mag_wavelet_cA_mean(),t_body_gyro_jerk_Mag_wavelet_cA_std(),t_body_gyro_jerk_Mag_wavelet_cA_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_A(),t_body_gyro_jerk_Mag_wavelet_cD_mean(),t_body_gyro_jerk_Mag_wavelet_cD_std(),t_body_gyro_jerk_Mag_wavelet_cD_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,2VSP,1.00265,-107509.7,-5.9e-05,0.216099,0.754624,0.323896,2.439451,0.067972,5.778713,...,7.972038,10.147031,18.88379,13787250.0,116742100.0,0.021184,5.664321,962581.6,6456324.0,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,1.016804,-499797.9,-0.005478,0.684763,4.204157,0.39217,2.252779,0.324338,20.884087,...,21.441469,41.31825,42.222093,104700600.0,1000636000.0,0.017429,17.923587,9637979.0,77015270.0,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,1.00111,-1335227.0,0.000793,0.13959,0.171924,0.414643,2.156222,0.039527,2.625508,...,5.278354,5.233988,11.835444,5024339.0,38734720.0,0.016609,3.842511,442969.8,2659358.0,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,1.000122,-246259.1,-1.4e-05,0.04339,0.03081,0.322568,3.439114,0.012659,3.22367,...,2.086422,1.487288,3.971671,539621.8,3635382.0,0.005732,1.463915,64296.68,351279.0,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,1.000468,-797211.0,3e-06,0.081894,0.118952,0.309429,3.487643,0.03189,2.545157,...,3.6723,3.638508,7.491964,2081187.0,14267690.0,-0.008545,2.772051,230545.5,1182145.0,f96752b5-850e-4a5a-a74a-69ab4893b6aa


In [42]:
# Keep only the ancillary gyroscope files whose selected device has >= 2000 samples.
a_real=glob.glob("ancillary_data/smartwatch_gyroscope/*.csv")
a_real_gyro=[]
num=[]  # sample count of the selected device, one entry per file
for i in a_real:
    data=pd.read_csv(i)
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # `.iloc[0]` extracts the scalar count; int() on a one-element Series is
    # deprecated in recent pandas.
    if a.loc[a.device_id==deviceid,'y'].iloc[0]<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_gyro.append(i)
# To inspect the per-file sample counts: pd.DataFrame({'num':num}).sort_values(by='num')

In [43]:
# Files kept after filtering vs. files found by the glob.
(len(a_real_gyro), len(a_real))

(426, 428)

In [44]:
# Smartwatch gyroscope: build one feature row per accepted ancillary file,
# using one worker per CPU core. A previous run took about 8.8 minutes.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(
    delayed(Dataset_Generation_PipeLine_smartwatch_gyro)(csv_path) for csv_path in a_real_gyro
)
print("--- %s Mins ---" % ((time.time() - start_time)/60))

--- 8.788118120034536 Mins ---


In [45]:
# Assemble the gyroscope feature table for the ancillary recordings.
df_ancillary=pd.DataFrame(result, columns=['device_id_gyro']+fn_gyro)
# measurement_id is the file name without its directory prefix and '.csv'.
df_ancillary['measurement_id']=[
    csv_path[len('ancillary_data/smartwatch_gyroscope/'):-4] for csv_path in a_real_gyro
]
print(df_ancillary.shape)
df_ancillary.head()

(426, 170)


Unnamed: 0,device_id_gyro,t_body_gyro_X_katz(),t_body_gyro_X_coeff_var(),t_body_gyro_X_slope_mean(),t_body_gyro_X_slope_std(),t_body_gyro_X_hjorth_activity(),t_body_gyro_X_hjorth_mobility(),t_body_gyro_X_hjorth_complexity(),t_body_gyro_X_sec_diff_mean(),t_body_gyro_X_sec_diff_max(),...,t_body_gyro_jerk_Mag_first_diff_std(),t_body_gyro_jerk_Mag_wavelet_cA_mean(),t_body_gyro_jerk_Mag_wavelet_cA_std(),t_body_gyro_jerk_Mag_wavelet_cA_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_A(),t_body_gyro_jerk_Mag_wavelet_cD_mean(),t_body_gyro_jerk_Mag_wavelet_cD_std(),t_body_gyro_jerk_Mag_wavelet_cD_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,2WLT,1.001518,-7128.906056,0.000426,0.189003,0.129512,0.561299,2.641192,0.059124,2.652827,...,7.141569,7.33172,14.415857,7847655.0,61881740.0,0.002375,5.38971,871527.436365,5372407.0,eab1cc17-40aa-4e1f-8e4f-64e6d5743509
1,2WH7,1.000033,-601988.101763,6.2e-05,0.020153,0.008715,0.302088,2.914152,0.004172,1.400709,...,0.934095,0.546636,2.072678,127294.4,777711.9,0.001082,0.64552,11544.192736,51626.63,166ba983-209f-4639-a5a6-d6e66adeba2b
2,2WH5,1.001486,-132666.862639,-0.000288,0.164944,0.365646,0.363827,2.27788,0.035475,4.992081,...,6.952026,5.249308,17.532807,8483394.0,79034030.0,0.060075,4.866248,599844.050366,4365862.0,bca5e12d-9fd6-496b-ac08-9e2472d8b299
3,327T,1.000003,-397294.625718,5.5e-05,0.00628,0.000472,0.374125,2.306019,0.003537,0.142443,...,0.214192,0.552312,0.505713,16824.39,10896.87,0.001267,0.183083,1005.665506,1967.664,49f80736-6b50-44a6-a77b-9b1572334a8c
4,327T,1.000147,-122124.650848,-0.000266,0.043249,0.025197,0.364137,2.26764,0.015788,0.809859,...,1.627956,2.008968,3.842988,564154.1,3044145.0,0.010548,1.19455,42813.228949,139768.7,26f49660-ce1a-4946-8f83-f88850f03ec1


In [46]:
# Stack the training and ancillary gyroscope features into one table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
Frame_smartwatch_gyro = pd.concat([training_data, pd.DataFrame(data = df_ancillary)], ignore_index=True)
print(Frame_smartwatch_gyro.shape)
Frame_smartwatch_gyro.head()

(956, 170)


Unnamed: 0,device_id_gyro,t_body_gyro_X_katz(),t_body_gyro_X_coeff_var(),t_body_gyro_X_slope_mean(),t_body_gyro_X_slope_std(),t_body_gyro_X_hjorth_activity(),t_body_gyro_X_hjorth_mobility(),t_body_gyro_X_hjorth_complexity(),t_body_gyro_X_sec_diff_mean(),t_body_gyro_X_sec_diff_max(),...,t_body_gyro_jerk_Mag_first_diff_std(),t_body_gyro_jerk_Mag_wavelet_cA_mean(),t_body_gyro_jerk_Mag_wavelet_cA_std(),t_body_gyro_jerk_Mag_wavelet_cA_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_A(),t_body_gyro_jerk_Mag_wavelet_cD_mean(),t_body_gyro_jerk_Mag_wavelet_cD_std(),t_body_gyro_jerk_Mag_wavelet_cD_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,2VSP,1.00265,-107509.7,-5.9e-05,0.216099,0.754624,0.323896,2.439451,0.067972,5.778713,...,7.972038,10.147031,18.88379,13787250.0,116742100.0,0.021184,5.664321,962581.6,6456324.0,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,1.016804,-499797.9,-0.005478,0.684763,4.204157,0.39217,2.252779,0.324338,20.884087,...,21.441469,41.31825,42.222093,104700600.0,1000636000.0,0.017429,17.923587,9637979.0,77015270.0,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,1.00111,-1335227.0,0.000793,0.13959,0.171924,0.414643,2.156222,0.039527,2.625508,...,5.278354,5.233988,11.835444,5024339.0,38734720.0,0.016609,3.842511,442969.8,2659358.0,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,1.000122,-246259.1,-1.4e-05,0.04339,0.03081,0.322568,3.439114,0.012659,3.22367,...,2.086422,1.487288,3.971671,539621.8,3635382.0,0.005732,1.463915,64296.68,351279.0,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,1.000468,-797211.0,3e-06,0.081894,0.118952,0.309429,3.487643,0.03189,2.545157,...,3.6723,3.638508,7.491964,2081187.0,14267690.0,-0.008545,2.772051,230545.5,1182145.0,f96752b5-850e-4a5a-a74a-69ab4893b6aa


In [47]:
# Shape of the accelerometer table, shown for comparison before the merge with the gyroscope table.
Frame_smartwatch_acc.shape

(956, 254)

In [48]:
# Join accelerometer and gyroscope features on measurement_id and keep only
# the accelerometer device id column.
Frame_smartwatch=(
    pd.merge(Frame_smartwatch_acc,Frame_smartwatch_gyro,on='measurement_id')
    .drop(columns='device_id_gyro')
)
print(Frame_smartwatch.shape)
Frame_smartwatch.head()

(956, 422)


Unnamed: 0,device_id_acc,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),t_body_acc_X_hjorth_activity(),t_body_acc_X_hjorth_mobility(),t_body_acc_X_hjorth_complexity(),t_body_acc_X_sec_diff_mean(),t_body_acc_X_sec_diff_max(),...,t_body_gyro_jerk_Mag_first_diff_max(),t_body_gyro_jerk_Mag_first_diff_std(),t_body_gyro_jerk_Mag_wavelet_cA_mean(),t_body_gyro_jerk_Mag_wavelet_cA_std(),t_body_gyro_jerk_Mag_wavelet_cA_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_A(),t_body_gyro_jerk_Mag_wavelet_cD_mean(),t_body_gyro_jerk_Mag_wavelet_cD_std(),t_body_gyro_jerk_Mag_wavelet_cD_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_D()
0,2VSP,1.004,-24121.66893,-4.3e-05,0.274105,1.195276,0.322246,3.282729,0.122494,7.748725,...,290.197266,7.972038,10.147031,18.88379,13787250.0,116742100.0,0.021184,5.664321,962581.6,6456324.0
1,2WLT,1.020188,58603.225645,0.000753,0.942833,5.940303,0.402234,2.863228,0.413388,24.422981,...,898.104244,21.441469,41.31825,42.222093,104700600.0,1000636000.0,0.017429,17.923587,9637979.0,77015270.0
2,2ZX3,1.002798,-545517.997583,0.001156,0.228635,0.662064,0.37394,2.764917,0.088138,5.926416,...,123.21471,5.278354,5.233988,11.835444,5024339.0,38734720.0,0.016609,3.842511,442969.8,2659358.0
3,2WH7,1.000423,-144341.044526,7.9e-05,0.08897,0.095432,0.330273,4.100591,0.046259,3.171407,...,166.230191,2.086422,1.487288,3.971671,539621.8,3635382.0,0.005732,1.463915,64296.68,351279.0
4,2WH7,1.001493,39152.736486,2e-06,0.198279,0.312401,0.41512,3.209567,0.070423,6.303599,...,123.358364,3.6723,3.638508,7.491964,2081187.0,14267690.0,-0.008545,2.772051,230545.5,1182145.0


In [52]:
# Write the merged training/ancillary smartwatch feature table to CSV, without the index column.
Frame_smartwatch.to_csv('realpd_comp_training_abhiroop_lastfeatures_smartwatch.csv',index=False)

### test

In [49]:
# Keep only the test gyroscope files whose selected device has >= 2000 samples.
a_real=glob.glob("testing_data/smartwatch_gyroscope/*.csv")
a_real_gyro=[]
num=[]  # sample count of the selected device, one entry per file
for i in a_real:
    data=pd.read_csv(i)
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    deviceid=a.loc[a.x.idxmax(),'device_id']
    # `.iloc[0]` extracts the scalar count; int() on a one-element Series is
    # deprecated in recent pandas.
    if a.loc[a.device_id==deviceid,'y'].iloc[0]<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    data=data[data.device_id==deviceid].reset_index(drop=True)
    num.append(data.shape[0])
    if data.shape[0]>=2000:
        a_real_gyro.append(i)

print(len(a_real_gyro),len(a_real))

# Smartwatch gyroscope: build one feature row per accepted test file,
# using one worker per CPU core.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(Dataset_Generation_PipeLine_smartwatch_gyro)(i) for i in a_real_gyro)
print("--- %s Mins ---" % ((time.time() - start_time)/60))

# NOTE: `training_data` is reused here as a scratch name for the test rows.
training_data=pd.DataFrame(result)
training_data.columns=['device_id_gyro']+fn_gyro
training_data['measurement_id']=[item[len('testing_data/smartwatch_gyroscope/'):-4] for item in a_real_gyro]

Frame_smartwatch_gyro_test = training_data.copy()
print(Frame_smartwatch_gyro_test.shape)
Frame_smartwatch_gyro_test.head()

171 172
--- 4.558271133899689 Mins ---
(171, 170)


Unnamed: 0,device_id_gyro,t_body_gyro_X_katz(),t_body_gyro_X_coeff_var(),t_body_gyro_X_slope_mean(),t_body_gyro_X_slope_std(),t_body_gyro_X_hjorth_activity(),t_body_gyro_X_hjorth_mobility(),t_body_gyro_X_hjorth_complexity(),t_body_gyro_X_sec_diff_mean(),t_body_gyro_X_sec_diff_max(),...,t_body_gyro_jerk_Mag_first_diff_std(),t_body_gyro_jerk_Mag_wavelet_cA_mean(),t_body_gyro_jerk_Mag_wavelet_cA_std(),t_body_gyro_jerk_Mag_wavelet_cA_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_A(),t_body_gyro_jerk_Mag_wavelet_cD_mean(),t_body_gyro_jerk_Mag_wavelet_cD_std(),t_body_gyro_jerk_Mag_wavelet_cD_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_D(),measurement_id
0,2ZX6,1.011536,-1564920.0,-0.000162,0.530383,2.045313,0.434484,2.233903,0.273384,6.63441,...,16.76453,33.986253,34.416915,70190020.0,635424400.0,0.061358,14.301418,6136235.0,43630510.0,7860035d-c9df-44e9-ba0e-0855dfad90f4
1,2WH8,1.001157,-70297.6,-0.000121,0.136866,0.230913,0.371402,2.357704,0.04284,3.599173,...,6.244261,6.485373,13.668825,6867134.0,56099580.0,-0.041733,4.458588,596442.4,4129949.0,0c593c3f-6636-4f0f-b9b1-f489c1cd3852
2,2VSP,1.00129,-738256.7,-0.000262,0.171516,0.240422,0.388788,3.528422,0.062065,6.49481,...,7.030368,6.613199,12.010681,5640102.0,43490610.0,-0.009378,5.461172,894794.4,5832560.0,8abf7688-c6bb-488f-bb9d-c359a5f86b35
3,2VSP,1.000711,943555.2,0.000855,0.114326,0.1878,0.312182,2.756888,0.034086,2.175633,...,3.939849,4.672803,9.126461,3154032.0,23114650.0,0.022121,2.850972,243872.2,1328356.0,c4f0d5dc-c1db-4cc5-89d1-26d7fee32299
4,2WLT,1.001833,314851.7,-0.000216,0.233392,0.498586,0.37649,2.34727,0.065643,16.303188,...,7.968947,9.294405,17.95646,12265430.0,118424600.0,-0.033677,5.722445,982490.8,7796013.0,6bb1ba19-25ac-4889-8e5b-8e4215784e82


In [50]:
# Join the test accelerometer and gyroscope features on measurement_id and
# keep only the accelerometer device id column.
Frame_smartwatch_test=(
    pd.merge(Frame_smartwatch_acc_test,Frame_smartwatch_gyro_test,on='measurement_id')
    .drop(columns='device_id_gyro')
)
print(Frame_smartwatch_test.shape)
Frame_smartwatch_test.head()

(171, 422)


Unnamed: 0,device_id_acc,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),t_body_acc_X_hjorth_activity(),t_body_acc_X_hjorth_mobility(),t_body_acc_X_hjorth_complexity(),t_body_acc_X_sec_diff_mean(),t_body_acc_X_sec_diff_max(),...,t_body_gyro_jerk_Mag_first_diff_max(),t_body_gyro_jerk_Mag_first_diff_std(),t_body_gyro_jerk_Mag_wavelet_cA_mean(),t_body_gyro_jerk_Mag_wavelet_cA_std(),t_body_gyro_jerk_Mag_wavelet_cA_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_A(),t_body_gyro_jerk_Mag_wavelet_cD_mean(),t_body_gyro_jerk_Mag_wavelet_cD_std(),t_body_gyro_jerk_Mag_wavelet_cD_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_D()
0,2ZX6,1.023602,88972.650089,0.002406,1.065472,6.124654,0.471469,2.507854,0.465609,55.776289,...,331.143793,16.76453,33.986253,34.416915,70190020.0,635424400.0,0.061358,14.301418,6136235.0,43630510.0
1,2WH8,1.002468,-9577.625612,0.00062,0.25194,0.497761,0.419816,2.870803,0.09278,14.599304,...,365.273467,6.244261,6.485373,13.668825,6867134.0,56099580.0,-0.041733,4.458588,596442.4,4129949.0
2,2VSP,1.004413,-975708.091118,-0.000236,0.360364,0.575081,0.488049,3.12799,0.149894,7.094872,...,296.280466,7.030368,6.613199,12.010681,5640102.0,43490610.0,-0.009378,5.461172,894794.4,5832560.0
3,2VSP,1.000875,67926.655283,-0.000399,0.143272,0.29643,0.294123,4.18596,0.05995,8.542191,...,117.671729,3.939849,4.672803,9.126461,3154032.0,23114650.0,0.022121,2.850972,243872.2,1328356.0
4,2WLT,1.002326,-65635.078837,0.000161,0.274812,0.54502,0.395441,3.18318,0.100699,13.317348,...,718.572156,7.968947,9.294405,17.95646,12265430.0,118424600.0,-0.033677,5.722445,982490.8,7796013.0


In [51]:
# Write the merged test smartwatch feature table to CSV, without the index column.
Frame_smartwatch_test.to_csv('realpd_comp_testing_abhiroop_lastfeatures_smartwatch.csv',index=False)

In [53]:
# Training table for the smartwatch models: the training and ancillary label
# tables are stacked and joined with the smartwatch features (Frame_smartwatch)
# on measurement_id.
# NOTE: DataFrame.append was removed in pandas 2.0; pd.concat is the
# equivalent, version-independent way to stack the two label tables.
label = pd.concat(
    [real_pd_training_id, pd.DataFrame(real_pd_ancillary_id)],
    ignore_index=True,
)
df_train = pd.merge(label, Frame_smartwatch, on='measurement_id')

# Replace the identifier columns with integer codes.
for id_col in ('subject_id', 'device_id_acc'):
    df_train[id_col] = preprocessing.LabelEncoder().fit_transform(df_train[id_col])

print(df_train.shape)
df_train.head()

(956, 426)


Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor,device_id_acc,t_body_acc_X_katz(),t_body_acc_X_coeff_var(),t_body_acc_X_slope_mean(),t_body_acc_X_slope_std(),...,t_body_gyro_jerk_Mag_first_diff_max(),t_body_gyro_jerk_Mag_first_diff_std(),t_body_gyro_jerk_Mag_wavelet_cA_mean(),t_body_gyro_jerk_Mag_wavelet_cA_std(),t_body_gyro_jerk_Mag_wavelet_cA_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_A(),t_body_gyro_jerk_Mag_wavelet_cD_mean(),t_body_gyro_jerk_Mag_wavelet_cD_std(),t_body_gyro_jerk_Mag_wavelet_cD_Energy(),t_body_gyro_jerk_Mag_wavelet_Entropy_D()
0,ee053d95-c155-400d-ae42-fe24834ad4a9,2,1.0,0.0,3.0,4,1.000188,-20526.340895,0.000181,0.059866,...,33.323622,0.786707,1.06863,1.733941,124459.8,547944.7,-0.013069,0.622528,11631.76052,25068.41
1,ce51ee31-8553-4321-9f83-8cd3dabe2f66,2,1.0,0.0,2.0,4,1.001944,11333.770237,-0.00097,0.286324,...,210.500683,3.859154,3.362561,7.865499,2195260.0,16561530.0,-0.015964,2.84237,242387.685434,1556088.0
2,d3c89012-3ab9-4014-b577-61ff05e31968,2,1.0,0.0,0.0,0,1.004018,13156.478439,-0.003303,0.291894,...,206.728928,6.262726,8.760759,12.930031,5385388.0,40773120.0,-0.056107,4.843852,518059.908885,3097808.0
3,5c42911d-0ebd-47ba-9925-dd5ab1c0ed61,2,1.0,0.0,1.0,4,1.003646,-26119.934209,-0.002956,0.294796,...,162.089955,4.724539,7.269784,9.893913,4522329.0,31399390.0,-0.014247,3.743357,420401.804357,2218043.0
4,235472d5-ad2e-4c76-947e-358c9d8c1280,2,1.0,0.0,0.0,0,1.001315,-8780.809628,-0.00049,0.138259,...,110.916249,3.106651,4.29141,7.42928,2208386.0,14331780.0,0.01172,2.308537,159889.77307,748331.0


In [54]:
# Fit and report both regressors for every label, in the same order as before:
# lightgbm first, then catboost, for tremor, dyskinesia and on_off.
for target_label in ('tremor', 'dyskinesia', 'on_off'):
    lightgbm(df_train, target_label)
    catboost(df_train, target_label)

lightgbm test_MSE tremor : 0.571
catboost test_MSE tremor : 0.584
lightgbm test_MSE dyskinesia : 0.193
catboost test_MSE dyskinesia : 0.211
lightgbm test_MSE on_off : 0.205
catboost test_MSE on_off : 0.217


In [None]:
# Recorded test MSE values (pasted output text, kept as comments so that this
# cell is valid Python and does not raise a SyntaxError on Restart & Run All):
# lightgbm test_MSE tremor : 0.571
# catboost test_MSE tremor : 0.584
# lightgbm test_MSE dyskinesia : 0.193
# catboost test_MSE dyskinesia : 0.211
# lightgbm test_MSE on_off : 0.205
# catboost test_MSE on_off : 0.217

## Clinical Data

In [169]:
# Load participant demographics and encode Gender numerically
# (Male -> 1, Female -> 0; any other value becomes NaN via Series.map).
real_pd_demo = pd.read_csv('clinical_data/REAL-PD_Demographics.csv')
real_pd_demo = real_pd_demo.assign(
    Gender=real_pd_demo['Gender'].map({'Male': 1, 'Female': 0})
)
print(real_pd_demo.shape)
real_pd_demo.head()


(22, 3)


Unnamed: 0,subject_id,Age,Gender
0,hbv002,65,1
1,hbv012,57,0
2,hbv013,68,0
3,hbv014,67,1
4,hbv016,71,1


In [170]:
# Load smartphone metadata, drop the columns that are not used as features,
# and turn the remaining categorical columns into integer codes.
real_pd_smartphone = pd.read_csv('clinical_data/REAL-PD_Smartphone_Metadata.csv')
real_pd_smartphone = real_pd_smartphone.drop(
    columns=['smartphone_model', 'android_version', 'Smartphone_Location_Other']
)

# Brand: 1 for Samsung, 0 for everything else.
real_pd_smartphone['smartphone_brand'] = np.where(
    real_pd_smartphone['smartphone_brand'] == 'Samsung', 1, 0
)

# Location: Front pocket -> 1, Back pocket -> 2, anything else -> 3.
real_pd_smartphone['Most_Common_Smartphone_Location'] = np.select(
    [
        (real_pd_smartphone['Most_Common_Smartphone_Location'] == 'Front pocket').values,
        (real_pd_smartphone['Most_Common_Smartphone_Location'] == 'Back pocket').values,
    ],
    [1, 2],
    default=3,
)
print(real_pd_smartphone.shape)
real_pd_smartphone.head()

(22, 3)


Unnamed: 0,subject_id,smartphone_brand,Most_Common_Smartphone_Location
0,hbv002,1,1
1,hbv012,1,1
2,hbv013,1,1
3,hbv014,1,1
4,hbv016,0,1


In [171]:
# Load the UPDRS Part I / II / IV scores; no transformation is applied here.
real_pd_updrs_part1_2_4 = pd.read_csv(
    filepath_or_buffer='clinical_data/REAL-PD_UPDRS_Part1_2_4.csv'
)
print(real_pd_updrs_part1_2_4.shape)
real_pd_updrs_part1_2_4.head(5)


(22, 9)


Unnamed: 0,subject_id,UPDRS_PartI_Total,UPDRS_PartII_Total,UPDRS_4.1,UPDRS_4.2,UPDRS_4.3,UPDRS_4.4,UPDRS_4.5,UPDRS_4.6
0,hbv002,8,13,1,0,1,3,1,0
1,hbv012,11,4,0,0,1,0,2,2
2,hbv013,10,13,1,0,2,3,3,1
3,hbv014,9,13,0,0,1,1,3,0
4,hbv016,7,8,0,0,0,4,1,0


In [172]:
# Load the UPDRS Part III item scores; missing entries are replaced with 1.
real_pd_updrs_part3 = pd.read_csv('clinical_data/REAL-PD_UPDRS_Part3.csv').fillna(1)

# Composite scores: each one is the row-wise mean of a group of item columns.
updrs3_composites = {
    'Rigidity': [
        'UPDRS_3.3 Neck',
        'UPDRS_3.3 Right Upper Extremity',
        'UPDRS_3.3 Left Upper Extremity',
        'UPDRS_3.3 Right Lower Extremity',
        'UPDRS_3.3 Left Lower Extremity',
    ],
    'Tapping': [
        'UPDRS_3.4 Right Hand',
        'UPDRS_3.4 Left Hand',
        'UPDRS_3.7 Right Foot',
        'UPDRS_3.7 Left Foot',
    ],
    'Body Tremor': [
        'UPDRS_3.15 Right Hand',
        'UPDRS_3.15 Left Hand',
        'UPDRS_3.16 Right Hand',
        'UPDRS_3.16 Left Hand',
        'UPDRS_3.17 Right Upper Extremity',
        'UPDRS_3.17 Left Upper Extremity',
        'UPDRS_3.17 Right Lower Extremity',
        'UPDRS_3.17 Left Lower Extremity',
        'UPDRS_3.17 Lip-Jaw',
    ],
    'Movement': [
        'UPDRS_3.5 Right Hand',
        'UPDRS_3.5 Left Hand',
        'UPDRS_3.6 Right Hand',
        'UPDRS_3.6 Left Hand',
    ],
}
for composite_name, item_cols in updrs3_composites.items():
    real_pd_updrs_part3[composite_name] = real_pd_updrs_part3[item_cols].mean(axis=1)

# Work on a copy without the state column and shift every score by +1
# (presumably so that pct_change never divides by a zero score -- TODO confirm).
type1 = real_pd_updrs_part3.drop(columns=['ParticipantState'])
numeric_cols = [col for col in type1.columns if col != 'subject_id']
type1[numeric_cols] = type1[numeric_cols] + 1

# Relative change between consecutive rows of the same subject.
# NOTE(review): the result depends on the row order within each subject in the
# CSV (no explicit sort on ParticipantState) -- confirm the file ordering.
type2 = type1.groupby('subject_id').pct_change()
# pct_change drops the grouping key; restore it by index alignment.
type2['subject_id'] = real_pd_updrs_part3['subject_id']
# Keep the largest change per subject; subjects with no computable change get 0.
type2 = type2.groupby('subject_id').max().reset_index().fillna(0)
type2.columns = ['subject_id'] + ['PerChange_' + col for col in type2.columns[1:]]

# Final table: the 'Off'-state scores of each subject plus the PerChange_* columns.
real_pd_updrs_part3 = (
    real_pd_updrs_part3
    .loc[real_pd_updrs_part3['ParticipantState'] == 'Off']
    .drop(columns=['ParticipantState'])
    .merge(type2, on='subject_id')
)
print(real_pd_updrs_part3.shape)
real_pd_updrs_part3.head()

(22, 81)


Unnamed: 0,subject_id,UPDRS_3.1,UPDRS_3.2,UPDRS_3.3 Neck,UPDRS_3.3 Right Upper Extremity,UPDRS_3.3 Left Upper Extremity,UPDRS_3.3 Right Lower Extremity,UPDRS_3.3 Left Lower Extremity,UPDRS_3.4 Right Hand,UPDRS_3.4 Left Hand,...,PerChange_UPDRS_3.17 Left Lower Extremity,PerChange_UPDRS_3.17 Lip-Jaw,PerChange_UPDRS_3.18,PerChange_UPDRS_3.19A,PerChange_UPDRS_3.19B,PerChange_UPDRS_3.20,PerChange_Rigidity,PerChange_Tapping,PerChange_Body Tremor,PerChange_Movement
0,hbv002,3,3,0,3,3,0,0,2,2,...,0.0,0.0,0.0,1.0,-0.5,0.0,-0.272727,-0.333333,-0.214286,0.2
1,hbv012,1,1,0,1,0,1,0,1,2,...,-0.666667,0.0,-0.25,0.0,0.0,0.0,-0.142857,-0.1,-0.55,-0.181818
2,hbv013,1,2,2,3,2,0,1,3,3,...,0.0,0.0,-0.75,1.0,-0.5,0.0,-0.384615,-0.333333,-0.583333,-0.25
3,hbv014,2,1,1,1,2,0,0,2,3,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.111111,-0.416667,-0.166667,0.0
4,hbv016,2,2,0,3,2,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.090909,0.166667,0.076923,0.090909


In [173]:
# Assemble one clinical feature row per subject by joining the demographics,
# smartphone metadata and both UPDRS tables on subject_id.
clinical_data = (
    real_pd_demo
    .merge(real_pd_smartphone, on='subject_id')
    .merge(real_pd_updrs_part1_2_4, on='subject_id')
    .merge(real_pd_updrs_part3, on='subject_id')
)
print(clinical_data.shape)
clinical_data.head()

(22, 93)


Unnamed: 0,subject_id,Age,Gender,smartphone_brand,Most_Common_Smartphone_Location,UPDRS_PartI_Total,UPDRS_PartII_Total,UPDRS_4.1,UPDRS_4.2,UPDRS_4.3,...,PerChange_UPDRS_3.17 Left Lower Extremity,PerChange_UPDRS_3.17 Lip-Jaw,PerChange_UPDRS_3.18,PerChange_UPDRS_3.19A,PerChange_UPDRS_3.19B,PerChange_UPDRS_3.20,PerChange_Rigidity,PerChange_Tapping,PerChange_Body Tremor,PerChange_Movement
0,hbv002,65,1,1,1,8,13,1,0,1,...,0.0,0.0,0.0,1.0,-0.5,0.0,-0.272727,-0.333333,-0.214286,0.2
1,hbv012,57,0,1,1,11,4,0,0,1,...,-0.666667,0.0,-0.25,0.0,0.0,0.0,-0.142857,-0.1,-0.55,-0.181818
2,hbv013,68,0,1,1,10,13,1,0,2,...,0.0,0.0,-0.75,1.0,-0.5,0.0,-0.384615,-0.333333,-0.583333,-0.25
3,hbv014,67,1,1,1,9,13,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.111111,-0.416667,-0.166667,0.0
4,hbv016,71,1,0,1,7,8,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.090909,0.166667,0.076923,0.090909


In [176]:
# Persist the merged clinical table (row index is not written).
clinical_data.to_csv(
    path_or_buf='realpd_clinical_preprocessed.csv',
    index=False,
)

In [178]:
# Training table for the clinical-data models: the training and ancillary
# label tables are stacked and joined with the per-subject clinical features
# (clinical_data) on subject_id.
# NOTE: DataFrame.append was removed in pandas 2.0; pd.concat is the
# equivalent, version-independent way to stack the two label tables.
label = pd.concat(
    [real_pd_training_id, pd.DataFrame(real_pd_ancillary_id)],
    ignore_index=True,
)
df_train = pd.merge(label, clinical_data, on='subject_id')

# Replace the subject identifier with integer codes.
df_train['subject_id'] = preprocessing.LabelEncoder().fit_transform(df_train['subject_id'])

print(df_train.shape)
df_train.head()

(1081, 97)


Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor,Age,Gender,smartphone_brand,Most_Common_Smartphone_Location,UPDRS_PartI_Total,...,PerChange_UPDRS_3.17 Left Lower Extremity,PerChange_UPDRS_3.17 Lip-Jaw,PerChange_UPDRS_3.18,PerChange_UPDRS_3.19A,PerChange_UPDRS_3.19B,PerChange_UPDRS_3.20,PerChange_Rigidity,PerChange_Tapping,PerChange_Body Tremor,PerChange_Movement
0,ee053d95-c155-400d-ae42-fe24834ad4a9,2,1.0,0.0,3.0,68,0,1,1,10,...,0.0,0.0,-0.75,1.0,-0.5,0.0,-0.384615,-0.333333,-0.583333,-0.25
1,ce51ee31-8553-4321-9f83-8cd3dabe2f66,2,1.0,0.0,2.0,68,0,1,1,10,...,0.0,0.0,-0.75,1.0,-0.5,0.0,-0.384615,-0.333333,-0.583333,-0.25
2,d3c89012-3ab9-4014-b577-61ff05e31968,2,1.0,0.0,0.0,68,0,1,1,10,...,0.0,0.0,-0.75,1.0,-0.5,0.0,-0.384615,-0.333333,-0.583333,-0.25
3,5c42911d-0ebd-47ba-9925-dd5ab1c0ed61,2,1.0,0.0,1.0,68,0,1,1,10,...,0.0,0.0,-0.75,1.0,-0.5,0.0,-0.384615,-0.333333,-0.583333,-0.25
4,235472d5-ad2e-4c76-947e-358c9d8c1280,2,1.0,0.0,0.0,68,0,1,1,10,...,0.0,0.0,-0.75,1.0,-0.5,0.0,-0.384615,-0.333333,-0.583333,-0.25


In [180]:
# Fit and report both regressors for every label, in the same order as before:
# lightgbm first, then catboost, for tremor, dyskinesia and on_off.
for target_label in ('tremor', 'dyskinesia', 'on_off'):
    lightgbm(df_train, target_label)
    catboost(df_train, target_label)

lightgbm test_MSE tremor : 0.391
catboost test_MSE tremor : 0.391
lightgbm test_MSE dyskinesia : 0.127
catboost test_MSE dyskinesia : 0.126
lightgbm test_MSE on_off : 0.153
catboost test_MSE on_off : 0.154
