In [None]:
import pandas as pd
import numpy as np
import pywt
import matplotlib.pyplot as plt
from scipy import interpolate
import glob
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

from sklearn.decomposition import PCA
import time
from joblib import Parallel, delayed
import multiprocessing

from sklearn.model_selection import train_test_split
import lightgbm as lgb
from catboost import CatBoostRegressor, FeaturesData, Pool
from sklearn.metrics import mean_squared_error

In [None]:
#==========================================================================================
#making time stamp uniform by Interpolation

def preprocess(data):
    """Resample a raw recording onto a uniform time grid with step 1/50.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a time column ``t`` and the axis columns ``x``, ``y``, ``z``.
        ``t`` is presumably in seconds (the step corresponds to 50 Hz) - TODO confirm.

    Returns
    -------
    pandas.DataFrame
        Columns ``Timestamp``, ``acc_X``, ``acc_Y``, ``acc_Z``. ``Timestamp`` starts
        at the first sample time and stops before the last one; each axis is
        cubic-interpolated onto that grid.
    """
    freq=50
    # First/last sample times are taken by position so the function also works
    # on frames whose index is not 0..n-1 (e.g. after row filtering).
    t1=np.arange(data.t.iloc[0],data.t.iloc[-1],1/freq)
    df=pd.DataFrame({'Timestamp':t1})
    for axis in ['x','y','z']:
        fcubic = interpolate.interp1d(data.t, data[axis], kind='cubic')
        df['acc_'+axis.upper()]=fcubic(t1)
    return df


def preprocess2(data):
    ls=['x','y','z']
    freq=round((1/((data.t.max()/data.t.shape[0]).round(3))),0)
    t1=np.arange(data.t[0],data.t[(data.shape[0])-1],(data.t.max()/data.t.shape[0]).round(3))
    df=pd.DataFrame({'Timestamp':t1})
    for i in ls:
        fcubic = interpolate.interp1d(data.t, data[i], kind='cubic')
        df[i]=fcubic(t1)
    df.columns=['Timestamp','acc_X','acc_Y','acc_Z']
    return df,freq


def preprocess_real_smartwatch(data):
    """Pick one device from a smartwatch recording and resample it with step 0.02.

    Device selection: the device with the largest variance of ``x`` is chosen,
    unless it contributes at most 20% of all rows, in which case the device
    with the smallest variance of ``x`` is used instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``device_id``, ``t``, ``x``, ``y`` and ``z`` columns.

    Returns
    -------
    (pandas.DataFrame, object, int)
        The resampled frame (columns ``Timestamp``, ``acc_X``, ``acc_Y``,
        ``acc_Z``), the selected device id, and the fixed frequency ``50``.
    """
    stats=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    top=stats.x.idxmax()
    deviceid=stats.loc[top,'device_id']
    # Scalar lookup: calling int() on a one-element Series is deprecated in
    # recent pandas versions.
    if int(stats.loc[top,'y'])<=data.shape[0]*0.2:
        deviceid=stats.loc[stats.x.idxmin(),'device_id']

    # drop=True: the old index is not needed and, if kept, would clash with an
    # existing column named 'index'.
    data=data[data.device_id==deviceid].reset_index(drop=True)
    data=data.rename(columns={'t':'Timestamp','x':'X','y':'Y','z':'Z'})

    freq=50
    t1=np.arange(data.Timestamp.iloc[0],data.Timestamp.iloc[-1],0.02)
    df=pd.DataFrame({'Timestamp':t1})

    for axis in ['X','Y','Z']:
        # interp1d without ``kind`` is *linear* interpolation (unlike the
        # cubic interpolation used by preprocess/preprocess2).
        flinear = interpolate.interp1d(data.Timestamp, data[axis])
        df['acc_'+axis]=flinear(t1)
    return df[['Timestamp','acc_X','acc_Y','acc_Z']],deviceid,freq

#==========================================================================================
#median filter
from scipy.signal import medfilt # import the median filter function
def median(signal):
    """Smooth a 1-D signal with a median filter of order 3.

    signal: 1-D sequence or numpy array (one column).
    Returns the filtered signal as a numpy array of the same length.
    """
    samples = np.array(signal)
    return medfilt(samples, kernel_size=3)


#==========================================================================================
#components_selection_one_signal
import math # import math library


def components_selection_one_signal(t_signal,sampling_freq,freq1=0.3,freq2=20):
    """Split a 1-D signal into DC, body and noise components with an FFT mask.

    The signal is transformed with ``np.fft.fft``; each frequency bin is
    assigned to exactly one band by its absolute frequency ``f``:

    * DC / gravity: ``f <= freq1``
    * body:         ``freq1 < f <= freq2``
    * noise:        ``f > freq2``

    Each band is transformed back with the inverse FFT (real part only).

    Parameters
    ----------
    t_signal : array-like, 1-D
        Time-domain samples.
    sampling_freq : float
        Sampling frequency of ``t_signal``; the cutoffs are in the same unit.
    freq1, freq2 : float, optional
        Lower and upper cutoff. Defaults (0.3 and 20) reproduce the previously
        hard-coded values.

    Returns
    -------
    tuple of numpy.ndarray
        ``(total_component, t_DC_component, t_body_component, t_noise)`` where
        ``total_component`` is the input minus the noise component.
    """
    t_signal=np.array(t_signal)

    # Signal in the frequency domain (complex values).
    f_signal=np.fft.fft(t_signal)

    # Absolute frequency of every FFT bin (negative bins mirror positive ones).
    freqs=np.abs(np.fft.fftfreq(len(t_signal), d=1/float(sampling_freq)))

    dc_mask=freqs<=freq1
    noise_mask=freqs>freq2
    body_mask=~(dc_mask|noise_mask)

    # Zero every bin outside the band, then go back to the time domain.
    t_DC_component=np.fft.ifft(np.where(dc_mask,f_signal,0)).real
    t_body_component=np.fft.ifft(np.where(body_mask,f_signal,0)).real
    t_noise=np.fft.ifft(np.where(noise_mask,f_signal,0)).real

    # Total component = original signal with the noise band removed.
    total_component=t_signal-t_noise

    return (total_component,t_DC_component,t_body_component,t_noise)


#=================================================================================================================
#Define verify gravity function
def mag_3_signals(df): # Euclidean magnitude
    """Return the row-wise Euclidean norm of ``df`` as a numpy array.

    ``df`` holds one signal per column; every row is reduced to
    ``sqrt(sum of squared column values)``.
    """
    sum_of_squares=np.square(df).sum(axis=1)
    return np.array(np.sqrt(sum_of_squares))

def verify_gravity(data,sampling_freq=50):
    """Print the mean magnitude of the gravity (DC) component of a recording.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``acc_X``, ``acc_Y`` and ``acc_Z``.
    sampling_freq : float, optional
        Sampling frequency forwarded to ``components_selection_one_signal``.
        Defaults to 50, the rate used by ``preprocess``.

    Returns
    -------
    None
        The mean value is only printed.
    """
    # Gravity component (index 1 of the returned tuple) of each axis, stacked
    # as the columns of one 2-D array so that mag_3_signals can reduce each row.
    grav_components=np.column_stack([
        components_selection_one_signal(np.array(data[axis]),sampling_freq)[1]
        for axis in ('acc_X','acc_Y','acc_Z')
    ])

    # calculating gravity magnitude signal
    grav_acc_mag=mag_3_signals(grav_components)
    print('mean value = ',round((sum(grav_acc_mag) / len(grav_acc_mag)),3),' g')
    
#=================================================================================================================    
#Define jerking and magnitude functions
def jerk_one_signal(signal,sampling_freq):
    """Forward difference of ``signal`` scaled by ``sampling_freq``.

    Element ``k`` of the result is ``(signal[k+1] - signal[k]) * sampling_freq``;
    the last sample has no successor and is dropped, so the output is one
    element shorter than the input.
    """
    frame=pd.DataFrame(signal)
    forward_diff=frame.shift(-1)-frame
    scaled=(forward_diff*sampling_freq).dropna()
    return np.array(scaled).transpose()[0]

## ID and label files

In [None]:
# Load the measurement-ID / label tables for both cohorts (CIS-PD and REAL-PD).
# NOTE: none of these frames is referenced again in the visible part of this notebook.

#Test Data
cis_pd_testing_id=pd.read_csv('test_data_Id/cis-pd.CIS-PD_Test_Data_IDs.csv')
real_pd_testing_id=pd.read_csv('test_data_Id/real-pd.REAL-PD_Test_Data_IDs.csv')

#Training Data
cis_pd_training_id=pd.read_csv('data_labels/CIS-PD_Training_Data_IDs_Labels.csv')
real_pd_training_id=pd.read_csv('data_labels/REAL-PD_Training_Data_IDs_Labels.csv')

#Ancillary Data
cis_pd_ancillary_id=pd.read_csv('data_labels/CIS-PD_Ancillary_Data_IDs_Labels.csv')
real_pd_ancillary_id=pd.read_csv('data_labels/REAL-PD_Ancillary_Data_IDs_Labels.csv')

In [None]:
def time_domain_signal(data,sampling_freq):
    """Derive the time-domain body, gravity and jerk signals plus their magnitudes.

    For each axis column (``acc_X``, ``acc_Y``, ``acc_Z``) the signal is split
    with ``components_selection_one_signal``; the body component is also
    differentiated with ``jerk_one_signal``. Body and gravity signals lose
    their last sample so that they have the same length as the jerk signal.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``acc_X``, ``acc_Y`` and ``acc_Z``.
    sampling_freq : float
        Sampling frequency of ``data``.

    Returns
    -------
    pandas.DataFrame
        Twelve columns: the nine axial signals ``t_body_acc_[XYZ]``,
        ``t_grav_acc_[XYZ]``, ``t_body_acc_jerk_[XYZ]`` followed by
        ``t_body_acc_mag``, ``t_grav_acc_mag`` and ``t_body_acc_jerk_mag``.
    """
    signals={}
    for column in ['acc_X','acc_Y','acc_Z']:
        # The median pre-filter (``median``) is intentionally not applied here;
        # the raw samples are used as they are.
        t_signal=np.array(data[column])
        _,grav_acc,body_acc,_=components_selection_one_signal(t_signal,sampling_freq)
        signals['t_body_'+column]=body_acc[:-1]
        signals['t_grav_'+column]=grav_acc[:-1]
        signals['t_body_acc_jerk_'+column[-1]]=jerk_one_signal(body_acc,sampling_freq)

    # Axial signals grouped by kind so that every run of three consecutive
    # columns is one X/Y/Z triple.
    new_columns_ordered=['t_body_acc_X','t_body_acc_Y','t_body_acc_Z',
                              't_grav_acc_X','t_grav_acc_Y','t_grav_acc_Z',
                              't_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z']

    # Build the frame once, already in the final order. Adding columns to a
    # reordered slice of another frame triggers SettingWithCopyWarning.
    time_sig_df=pd.DataFrame({name:signals[name] for name in new_columns_ordered})

    # Magnitude Features
    for i in range(0,9,3):
        # e.g. 't_body_acc_X' -> 't_body_acc_mag'
        mag_col_name=new_columns_ordered[i][:-1]+'mag'
        time_sig_df[mag_col_name]=mag_3_signals(time_sig_df[new_columns_ordered[i:i+3]])

    return time_sig_df

In [None]:
# Run the pipeline once on a single smartphone training recording.
# Later cells read `time_sig_df.columns` from this run to name the feature columns.
# NOTE(review): a[14] is a fixed pick and needs at least 15 matching files;
# glob order is not guaranteed to be stable across machines - confirm this is intended.
a=glob.glob("training_data/smartphone_accelerometer/*.csv")
data=preprocess(pd.read_csv(a[14]))
time_sig_df=time_domain_signal(data,50)

In [None]:
def _wavelet_pca_features(time_sig_df,columns,wavelet_name,num=16):
    """Reduce the continuous wavelet transform of each signal to ``num`` values.

    For every column in ``columns`` the CWT is computed at scales 1..``num``;
    the resulting (``num`` x samples) coefficient matrix is projected onto its
    first principal component, giving one value per scale.

    Returns a flat list of ``len(columns) * num`` values, ordered signal by
    signal and, within a signal, by increasing scale.
    """
    scales=np.arange(1,num+1)
    # Fixed seed: with its default 'auto' solver, PCA may choose the randomized
    # SVD for wide inputs, which would make the features differ between runs.
    pca=PCA(n_components=1,random_state=0)
    features=[]
    for column in columns:
        coeff,_=pywt.cwt(time_sig_df[column],scales,wavelet_name)
        features.extend(pca.fit_transform(coeff).flatten())
    return features


def wavelet(dfl,ls):
    """Wavelet features of one smartphone accelerometer file.

    dfl: path of the CSV file (read with ``pd.read_csv`` and resampled by
         ``preprocess2``).
    ls:  wavelet name handed to ``pywt.cwt`` (the notebook uses ``'morl'``).

    Returns a list with 16 values for each column of ``time_domain_signal``.
    """
    df2,sampling_freq=preprocess2(pd.read_csv(dfl))
    time_sig_df=time_domain_signal(df2,sampling_freq)
    return _wavelet_pca_features(time_sig_df,time_sig_df.columns,ls)


def wavelet_real_smartwatch(dfl,ls):
    """Wavelet features of one smartwatch accelerometer file.

    Same as ``wavelet`` but the file is prepared by
    ``preprocess_real_smartwatch``; the selected device id is prepended to the
    returned feature list.
    """
    df2,devide_id,sampling_freq=preprocess_real_smartwatch(pd.read_csv(dfl))
    time_sig_df=time_domain_signal(df2,sampling_freq)
    return [devide_id]+_wavelet_pca_features(time_sig_df,time_sig_df.columns,ls)


def wavelet_real_smartwatch_gyro(dfl,ls):
    """Wavelet features of one smartwatch gyroscope file.

    Like ``wavelet_real_smartwatch`` but only the body and jerk signals are
    used (the gravity columns are skipped). The column names still carry the
    ``acc`` tag because they come from ``time_domain_signal``.
    """
    df2,devide_id,sampling_freq=preprocess_real_smartwatch(pd.read_csv(dfl))
    time_sig_df=time_domain_signal(df2,sampling_freq)
    columns=['t_body_acc_X','t_body_acc_Y','t_body_acc_Z','t_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z','t_body_acc_mag','t_body_acc_jerk_mag']
    return [devide_id]+_wavelet_pca_features(time_sig_df,columns,ls)


# Smartphone

In [None]:
# training
# Compute the wavelet features of every smartphone training recording, one
# joblib task per file. The next cell pairs the rows of `result` with the
# paths in `a` by position.
a=glob.glob("training_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Elapsed time noted from an earlier run: --- 9.026943965752919 Mins ---

In [None]:
# Turn the per-file feature lists into a frame: one column per
# (signal, scale 1..16) pair plus the measurement id taken from the file name.
columns=list(time_sig_df.columns)
df_train=pd.DataFrame(result)
df_train.columns=["wavelet_coeff_" + suit + '_' + str(scale) for suit in columns for scale in range(1,17)]
df_train['measurement_id']=[item[len('training_data/smartphone_accelerometer/'):-4] for item in a]
df_train.head()

In [None]:
#ancillary_data
# Same computation as for the training files, applied to the ancillary
# smartphone recordings; `a` and `result` are overwritten.
a=glob.glob("ancillary_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Elapsed time noted from an earlier run: --- 7.226141568024953 Mins ---

In [None]:
# Ancillary counterpart of df_train; `columns` is the signal list set when
# the training frame was built.
df_train2=pd.DataFrame(result)
df_train2.columns=["wavelet_coeff_" + suit + '_' + str(scale) for suit in columns for scale in range(1,17)]
df_train2['measurement_id']=[item[len('ancillary_data/smartphone_accelerometer/'):-4] for item in a]
df_train2.head()

In [None]:
# Stack the training rows and the ancillary rows into one frame.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
Frame = pd.concat([df_train, df_train2], ignore_index=True)

In [None]:
#export part2 features from smartphone data of training data for realpd
# `Frame` holds the training rows followed by the ancillary rows.
Frame.to_csv('realpd_wavelet_features_smartphone_training.csv',index=False)

In [None]:
# test
# Wavelet features of the smartphone testing recordings, one joblib task per file.
a=glob.glob("testing_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(path,'morl') for path in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))

# NOTE: `df_train` is reused as the variable name although it now holds the
# testing features; the export cell below reads it under this name.
columns=list(time_sig_df.columns)
df_train=pd.DataFrame(result)
df_train.columns=["wavelet_coeff_" + suit + '_' + str(scale) for suit in columns for scale in range(1,17)]
df_train['measurement_id']=[item[len('testing_data/smartphone_accelerometer/'):-4] for item in a]
print(df_train.shape)
df_train.head()

In [None]:
#export part2 features from smartphone data of testing data for realpd
# NOTE: at this point `df_train` holds the testing features (the name is reused).
df_train.to_csv('realpd_wavelet_features_smartphone_testing.csv',index=False)

# Smartwatch

## Accelerometer

In [None]:
# training
# Wavelet features of every smartwatch accelerometer training recording.
# Each result row starts with the selected device id (see wavelet_real_smartwatch).
a=glob.glob("training_data/smartwatch_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Elapsed time noted from an earlier run: --- 7.979812415440877 Mins ---

In [None]:
# Frame of smartwatch accelerometer training features: device id first, then
# one column per (signal, scale 1..16) pair, then the measurement id.
columns=list(time_sig_df.columns)
df_train=pd.DataFrame(result)
df_train.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
# Strip the directory that was actually globbed. The previous literal named the
# smartphone directory and only worked because both names have the same length.
df_train['measurement_id']=[item[len('training_data/smartwatch_accelerometer/'):-4] for item in a]
df_train.head()

In [None]:
#ancillary
# Same computation for the ancillary smartwatch accelerometer recordings;
# `a` and `result` are overwritten.
a=glob.glob("ancillary_data/smartwatch_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Elapsed time noted from an earlier run: --- 2.26669598420461 Mins ---

In [None]:
# Ancillary counterpart of the smartwatch accelerometer training frame;
# `columns` is the signal list set when that frame was built.
df_train2=pd.DataFrame(result)
df_train2.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
# Strip the directory that was actually globbed. The previous literal named the
# smartphone directory and only worked because both names have the same length.
df_train2['measurement_id']=[item[len('ancillary_data/smartwatch_accelerometer/'):-4] for item in a]
df_train2.head()

In [None]:
#merging training and ancillary
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
Frame_smartwatch_accelerometer = pd.concat([df_train, df_train2], ignore_index=True)
# The first column holds the device id; tag it as accelerometer so it does not
# clash with the gyroscope device id when both frames are merged.
Frame_smartwatch_accelerometer.columns=['device_id_acc']+list(Frame_smartwatch_accelerometer.columns)[1:]

In [None]:
# testing
# Wavelet features of the smartwatch accelerometer testing recordings.
a=glob.glob("testing_data/smartwatch_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))

columns=list(time_sig_df.columns)
df_train=pd.DataFrame(result)
df_train.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
# Strip the directory that was actually globbed. The previous literal named the
# smartphone directory and only worked because both names have the same length.
df_train['measurement_id']=[item[len('testing_data/smartwatch_accelerometer/'):-4] for item in a]
Frame_smartwatch_accelerometer_test=df_train.copy()
# Tag the device id column as accelerometer (first column of the frame).
Frame_smartwatch_accelerometer_test.columns=['device_id_acc']+list(Frame_smartwatch_accelerometer_test.columns)[1:]
print(Frame_smartwatch_accelerometer_test.shape)
Frame_smartwatch_accelerometer_test.head()

## Gyroscope

In [None]:
#training
# Wavelet features of every smartwatch gyroscope training recording.
# Each result row starts with the selected device id (see wavelet_real_smartwatch_gyro).
a=glob.glob("training_data/smartwatch_gyroscope/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch_gyro)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Elapsed time noted from an earlier run: --- 5.405052049954732 Mins ---

In [None]:
# The eight signals used by wavelet_real_smartwatch_gyro, in the same order,
# so that the generated column names line up with the feature values.
columns=['t_body_acc_X','t_body_acc_Y','t_body_acc_Z','t_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z','t_body_acc_mag','t_body_acc_jerk_mag']
df_train=pd.DataFrame(result)
df_train.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(scale) for suit in columns for scale in range(1,17)]
df_train['measurement_id']=[item[len('training_data/smartwatch_gyroscope/'):-4] for item in a]
df_train.head()

In [None]:
#ancillary
# Same computation for the ancillary smartwatch gyroscope recordings;
# `a` and `result` are overwritten.
a=glob.glob("ancillary_data/smartwatch_gyroscope/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch_gyro)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# NOTE(review): the timing below is identical to the one recorded for the
# accelerometer ancillary run and may be a copied comment - confirm.
#--- 2.26669598420461 Mins ---

In [None]:
# Ancillary counterpart of the gyroscope training frame; `columns` is the
# signal list set when that frame was built.
df_train2=pd.DataFrame(result)
df_train2.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(scale) for suit in columns for scale in range(1,17)]
df_train2['measurement_id']=[item[len('ancillary_data/smartwatch_gyroscope/'):-4] for item in a]
df_train2.head()

In [None]:
#merging training and ancillary
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
Frame_smartwatch_gyro = pd.concat([df_train, df_train2], ignore_index=True)
# The feature names were generated with an 'acc' tag; relabel them as 'gyro'.
Frame_smartwatch_gyro.columns=[i.replace('acc','gyro') for i in list(Frame_smartwatch_gyro.columns)]
# The first column holds the device id; tag it as gyroscope.
Frame_smartwatch_gyro.columns=['device_id_gyro']+list(Frame_smartwatch_gyro.columns)[1:]

In [None]:
# testing
# Wavelet features of the smartwatch gyroscope testing recordings.
a=glob.glob("testing_data/smartwatch_gyroscope/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch_gyro)(path,'morl') for path in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))

# Same eight signals, in the same order, as used by wavelet_real_smartwatch_gyro.
columns=['t_body_acc_X','t_body_acc_Y','t_body_acc_Z','t_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z','t_body_acc_mag','t_body_acc_jerk_mag']
df_train=pd.DataFrame(result)
df_train.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(scale) for suit in columns for scale in range(1,17)]
df_train['measurement_id']=[item[len('testing_data/smartwatch_gyroscope/'):-4] for item in a]

# Copy for the merge step: 'acc' in the feature names becomes 'gyro', and the
# leading device id column is tagged as gyroscope.
Frame_smartwatch_gyro_test=df_train.copy()
Frame_smartwatch_gyro_test.columns=['device_id_gyro']+[name.replace('acc','gyro') for name in list(Frame_smartwatch_gyro_test.columns)[1:]]

In [None]:
# Merge smartwatch accelerometer and gyroscope features on measurement_id.
# pd.merge defaults to an inner join, so only measurements present in both
# frames are kept; the gyroscope device id is dropped afterwards.
Frame_smartwatch=(
    pd.merge(Frame_smartwatch_accelerometer,Frame_smartwatch_gyro,on='measurement_id')
    .drop('device_id_gyro',axis=1)
)

Frame_smartwatch_test=(
    pd.merge(Frame_smartwatch_accelerometer_test,Frame_smartwatch_gyro_test,on='measurement_id')
    .drop('device_id_gyro',axis=1)
)

In [None]:
#export part2 features from smartwatch of training data for realpd
# Frame_smartwatch is the merged accelerometer + gyroscope frame.
Frame_smartwatch.to_csv('realpd_wavelet_features_smartwatch_training.csv',index=False)

#export part2 features from smartwatch of testing data for realpd
Frame_smartwatch_test.to_csv('realpd_wavelet_features_smartwatch_testing.csv',index=False)