In [11]:
import pandas as pd
import numpy as np
import pywt
import matplotlib.pyplot as plt
from scipy import interpolate
import glob
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

from sklearn.decomposition import PCA
import time
from joblib import Parallel, delayed
import multiprocessing

from sklearn.model_selection import train_test_split
import lightgbm as lgb
from catboost import CatBoostRegressor, FeaturesData, Pool
from sklearn.metrics import mean_squared_error

In [39]:
#==========================================================================================
#making time stamp uniform by Interpolation

def preprocess(data):
    freq=50
    ls=['x','y','z']
    t1=np.arange(data.t[0],data.t[(data.shape[0])-1],0.02)
    df=pd.DataFrame({'Timestamp':t1})
    for i in ls:
        fcubic = interpolate.interp1d(data.t, data[i], kind='cubic')
        df[i]=fcubic(t1)
    df.columns=['Timestamp','acc_X','acc_Y','acc_Z']
    return df


def preprocess2(data):
    ls=['x','y','z']
    freq=round((1/((data.t.max()/data.t.shape[0]).round(3))),0)
    t1=np.arange(data.t[0],data.t[(data.shape[0])-1],(data.t.max()/data.t.shape[0]).round(3))
    df=pd.DataFrame({'Timestamp':t1})
    for i in ls:
        fcubic = interpolate.interp1d(data.t, data[i], kind='cubic')
        df[i]=fcubic(t1)
    df.columns=['Timestamp','acc_X','acc_Y','acc_Z']
    return df,freq


def preprocess_real_smartwatch(data):
    
    a=data.groupby('device_id').agg({'x':'var','y':'count'}).reset_index()
    deviceid=a.loc[a.x.idxmax(),'device_id']
    if int(a.loc[a.device_id==deviceid,'y'])<=data.shape[0]*0.2:
        deviceid=a.loc[a.x.idxmin(),'device_id']
    
    data=data[data.device_id==deviceid].reset_index()
    data.rename(columns={'t':'Timestamp','x':'X','y':'Y','z':'Z'},inplace=True)
   
    ls=['X','Y','Z']
    freq=50
    #freq=round((1/((data.Timestamp.max()/data.Timestamp.shape[0]).round(3))),0)
    t1=np.arange(data.Timestamp[0],data.Timestamp[(data.shape[0])-1],0.02)
    #t1=np.arange(data.Timestamp[0],data.Timestamp[(data.shape[0])-1],(data.Timestamp.max()/data.Timestamp.shape[0]).round(3))
    df=pd.DataFrame({'Timestamp':t1})
        
    for i in ls:
        fcubic = interpolate.interp1d(data.Timestamp, data[i])
        df[i]=fcubic(t1)
    df.rename(columns={'X':'acc_X','Y':'acc_Y','Z':'acc_Z'},inplace=True)
    return df[['Timestamp','acc_X','acc_Y','acc_Z']],deviceid,freq

#==========================================================================================
#median filter
from scipy.signal import medfilt # import the median filter function
def median(signal):# input: numpy array 1D (one column)  
    #applying the median filter
    return  medfilt(np.array(signal), kernel_size=3) # applying the median filter order3(kernel_size=3)


#==========================================================================================
#components_selection_one_signal
import math # import math library


def components_selection_one_signal(t_signal,sampling_freq):
    nyq=sampling_freq/float(2) # nyq is the nyquist frequency equal to the half of the sampling frequency[50/2= 25 Hz]

    freq1 = 0.3
    freq2 = 20

    t_signal=np.array(t_signal)
    t_signal_length=len(t_signal) # number of points in a t_signal
    
    # the t_signal in frequency domain after applying fft
    f_signal=np.fft.fft(t_signal) # 1D numpy array contains complex values (in C)
    
    # generate frequencies associated to f_signal complex values
    freqs=np.array(np.fft.fftfreq(t_signal_length, d=1/float(sampling_freq))) # frequency values between [-25hz:+25hz]
        
    df=pd.DataFrame({'freq':abs(freqs),'amplitute':f_signal})
    df['f_DC_signal']=np.where(df.freq>freq1,0,df.amplitute)
    df['f_noise_signal']=np.where(df.freq<=freq2,0,df.amplitute)
    df['f_body_signal']=np.where(df.freq<=freq1,0,np.where(df.freq>freq2,0,df.amplitute))

    
    # Inverse the transformation of signals in freq domain #
    # applying the inverse fft(ifft) to signals in freq domain and put them in float format
    t_DC_component= np.fft.ifft(np.array(df['f_DC_signal'])).real
    t_body_component= np.fft.ifft(np.array(df['f_body_signal'])).real
    t_noise=np.fft.ifft(np.array(df['f_noise_signal'])).real
    
    total_component=t_signal-t_noise # extracting the total component(filtered from noise) 
                                     #  by substracting noise from t_signal (the original signal).
    
    # return outputs mentioned earlier
    return (total_component,t_DC_component,t_body_component,t_noise) 


#=================================================================================================================
#Define verify gravity function
def mag_3_signals(df): # Euclidian magnitude
    return np.array(np.sqrt(np.square(df).sum(axis=1)))

def verify_gravity(data):
    
    acc_x=np.array(data['acc_X']) # copy acc_X column from dataframe in raw_dic having the key mentioned above
    acc_y=np.array(data['acc_Y'])# copy acc_Y column  from dataframe in raw_dic having the key mentioned above
    acc_z=np.array(data['acc_Z'])# copy acc_Z column  from dataframe in raw_dic having the key mentioned above

    # apply the filtering method to acc_[X,Y,Z] and store gravity components
    grav_acc_X=components_selection_one_signal(acc_x)[1] 
    grav_acc_Y=components_selection_one_signal(acc_y)[1]
    grav_acc_Z=components_selection_one_signal(acc_z)[1]
    
    # calculating gravity magnitude signal
    grav_acc_mag=mag_3_signals(grav_acc_X, grav_acc_Y,grav_acc_Z)
    print('mean value = ',round((sum(grav_acc_mag) / len(grav_acc_mag)),3),' g')
    
#=================================================================================================================    
#Define jerking and magnitude functions
def jerk_one_signal(signal,sampling_freq):
    signal=pd.DataFrame(signal)
    jerk=(signal.shift(-1)-signal)*sampling_freq
    return np.array(jerk.dropna()).transpose()[0]





#==========
#model
#lightgbm
def lightgbm(df4,label):
    train=df4[~df4[label].isnull()]

    # Splitting the dataset into the Training set and Test set
    x_train, x_test, y_train, y_test = train_test_split(train.drop(['measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label], 
                                                        train_size=0.7, 
                                                        stratify = train[label],
                                                        random_state=1)

    d_train = lgb.Dataset(x_train, label=y_train)

    params = {}
    params['learning_rate'] = 0.03
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'regression'
    params['metric'] = 'rmse'
    params['max_depth'] = 5
    params['feature_fraction'] = 0.8
    clf = lgb.train(params, d_train, 150)
    y_pred=clf.predict(x_test)
    y_pred2=clf.predict(x_train)

    #print('lightgbm train_MSE '+label+' :',round(mean_squared_error(y_train, y_pred2),3))
    print('lightgbm test_MSE '+label+' :',round(mean_squared_error(y_test, y_pred),3))
    
    
#Catboost    
#This will give indexes of the categorical features
def categorical_index(df,cols):
    cat=[]
    for c in cols:
        try:
            cat.append(df.columns.get_loc(c))
        except:
            pass # doing nothing on exception
    return cat

def catboost(df4,label):    
    train=df4[~df4[label].isnull()]
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label], 
                                                        train_size=0.7, 
                                                        stratify = train[label],
                                                        random_state=1)
    train_dataset = Pool(data=X_train,
                         label=y_train,
                         cat_features=categorical_index(X_train,['subject_id','Gender']))

    eval_dataset = Pool(data=X_test,
                        label=y_test,
                        cat_features=categorical_index(X_train,['subject_id','Gender']))



    model = CatBoostRegressor(iterations=50, learning_rate=0.1, depth=6,verbose=False)
    # Fit model
    model.fit(train_dataset)
    # Get predictions
    preds = model.predict(eval_dataset)
    preds2 = model.predict(train_dataset)

    #print('catboost train_MSE '+label+' :',round(mean_squared_error(y_train, preds2),3))
    print('catboost test_MSE '+label+' :',round(mean_squared_error(y_test, preds),3))

## Id label

In [13]:
#Test Data
cis_pd_testing_id=pd.read_csv('test_data_Id/cis-pd.CIS-PD_Test_Data_IDs.csv')
real_pd_testing_id=pd.read_csv('test_data_Id/real-pd.REAL-PD_Test_Data_IDs.csv')

#Training Data
cis_pd_training_id=pd.read_csv('data_labels/CIS-PD_Training_Data_IDs_Labels.csv')
real_pd_training_id=pd.read_csv('data_labels/REAL-PD_Training_Data_IDs_Labels.csv')

#Ancillary Data
cis_pd_ancillary_id=pd.read_csv('data_labels/CIS-PD_Ancillary_Data_IDs_Labels.csv')
real_pd_ancillary_id=pd.read_csv('data_labels/REAL-PD_Ancillary_Data_IDs_Labels.csv')

In [14]:

a=glob.glob("training_data/smartphone_accelerometer/*.csv")

In [15]:
data=preprocess(pd.read_csv(a[14]))
print(data.shape)
data.head()

(60000, 4)


Unnamed: 0,Timestamp,acc_X,acc_Y,acc_Z
0,0.0,0.3304,0.486023,9.694122
1,0.02,0.296881,0.474052,9.691729
2,0.04,0.320148,0.463147,9.697291
3,0.06,0.295887,0.49636,9.695011
4,0.08,0.300927,0.47452,9.700669


In [16]:
def time_domain_signal(data,sampling_freq):
    time_sig_df=pd.DataFrame()
    for column in ['acc_X','acc_Y','acc_Z']:
        t_signal=np.array(data[column])
        #med_filtred=median(t_signal)
        med_filtred=(t_signal)
        _,grav_acc,body_acc,_=components_selection_one_signal(med_filtred,sampling_freq)
        body_acc_jerk=jerk_one_signal(body_acc,sampling_freq)
        time_sig_df['t_body_'+column]=body_acc[:-1]
        time_sig_df['t_grav_'+column]= grav_acc[:-1]
        time_sig_df['t_body_acc_jerk_'+column[-1]]=body_acc_jerk

    # all 15 axial signals generated above are reordered to facilitate magnitudes signals generation
    new_columns_ordered=['t_body_acc_X','t_body_acc_Y','t_body_acc_Z',
                              't_grav_acc_X','t_grav_acc_Y','t_grav_acc_Z',
                              't_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z']


    # create new dataframe to order columns
    time_sig_df=time_sig_df[new_columns_ordered]

    # Magnitude Features
    for i in range(0,9,3):
        mag_col_name=new_columns_ordered[i][:-1]+'mag'# Create the magnitude column name related to each 3-axial signals
        time_sig_df[mag_col_name]=mag_3_signals(time_sig_df[new_columns_ordered[i:i+3]]) # store the signal_mag with its appropriate column name

    return(time_sig_df)

In [18]:
time_sig_df=time_domain_signal(data,50)
time_sig_df.shape

(59999, 12)

In [19]:
time_sig_df.head()

Unnamed: 0,t_body_acc_X,t_body_acc_Y,t_body_acc_Z,t_grav_acc_X,t_grav_acc_Y,t_grav_acc_Z,t_body_acc_jerk_X,t_body_acc_jerk_Y,t_body_acc_jerk_Z,t_body_acc_mag,t_grav_acc_mag,t_body_acc_jerk_mag
0,0.061591,0.043569,-0.015099,0.242497,0.43806,9.707419,0.815859,-0.275069,-0.01122,0.07694,9.720323,0.861054
1,0.077908,0.038067,-0.015324,0.243941,0.439185,9.707617,-1.177281,-0.844242,0.297738,0.088055,9.720608,1.478981
2,0.054363,0.021183,-0.009369,0.245383,0.440309,9.707814,0.425086,1.682655,-0.315949,0.059091,9.720891,1.764044
3,0.062864,0.054836,-0.015688,0.246823,0.441432,9.708009,-0.847576,-1.056236,0.581125,0.084882,9.721174,1.473678
4,0.045913,0.033711,-0.004065,0.24826,0.442554,9.708202,0.016949,-0.878736,0.130021,0.057105,9.721455,0.888464


In [25]:
def wavelet(dfl,ls):
    num=16
    scales= np.arange(1,num+1)
    pca = PCA(n_components=1)
    df2,sampling_freq=preprocess2(pd.read_csv(dfl))
    time_sig_df=time_domain_signal(df2,sampling_freq)
    wavelet_coeff=[]
    columns=time_sig_df.columns
    for i in columns:
        coeff, freq = pywt.cwt(time_sig_df[i],scales,ls)
        wavelet_coeff.append(list(pca.fit_transform(coeff).flatten()))
    return list(np.reshape(wavelet_coeff, (1,len(columns)*num))[0])

def wavelet_real_smartwatch(dfl,ls):
    num=16
    scales= np.arange(1,num+1)
    pca = PCA(n_components=1)
    df2,devide_id,sampling_freq=preprocess_real_smartwatch(pd.read_csv(dfl))
    time_sig_df=time_domain_signal(df2,sampling_freq)
    wavelet_coeff=[]
    columns=time_sig_df.columns
    #columns=['t_body_acc_Z','t_grav_acc_Z','t_body_acc_jerk_Z','t_body_acc_mag','t_grav_acc_mag','t_body_acc_jerk_mag']
    for i in columns:
        coeff, freq = pywt.cwt(time_sig_df[i],scales,ls)
        wavelet_coeff.append(list(pca.fit_transform(coeff).flatten()))
    return [devide_id]+list(np.reshape(wavelet_coeff, (1,len(columns)*num))[0])


def wavelet_real_smartwatch_gyro(dfl,ls):
    num=16
    scales= np.arange(1,num+1)
    pca = PCA(n_components=1)
    df2,devide_id,sampling_freq=preprocess_real_smartwatch(pd.read_csv(dfl))
    time_sig_df=time_domain_signal(df2,sampling_freq)
    wavelet_coeff=[]
    #columns=time_sig_df.columns
    columns=['t_body_acc_X','t_body_acc_Y','t_body_acc_Z','t_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z','t_body_acc_mag','t_body_acc_jerk_mag']
    for i in columns:
        coeff, freq = pywt.cwt(time_sig_df[i],scales,ls)
        wavelet_coeff.append(list(pca.fit_transform(coeff).flatten()))
    return [devide_id]+list(np.reshape(wavelet_coeff, (1,len(columns)*num))[0])


## smartphone_accelerometer

### training

In [26]:
a=glob.glob("training_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
#--- 9.026943965752919 Mins ---



--- 17.48899003267288 Mins ---


In [27]:
columns=list(time_sig_df.columns)
df_train=pd.DataFrame(result)
df_train.columns=["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
df_train['measurement_id']=[item[len('training_data/smartphone_accelerometer/'):-4] for item in a]
df_train.head()


Unnamed: 0,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,wavelet_coeff_t_body_acc_X_10,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,-14.266961,-14.3571,-14.77353,-16.147987,-17.927389,-19.373473,-19.506617,-16.946947,-11.578639,-3.085964,...,-2290.872877,-1710.852053,-530.508939,743.226444,2289.973967,3773.337811,5014.606191,6317.580419,6850.369824,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,-209.412386,-207.598598,-210.733522,-219.716633,-233.10094,-243.424726,-233.129552,-186.557926,-106.818287,6.514707,...,-4972.942315,-2606.296602,521.219779,3750.301402,6844.482744,9119.782867,10489.181697,10960.157786,10485.979059,aba31c29-79ef-4221-9412-156538a2fd4e
2,-65.245414,-64.646133,-65.60466,-68.322222,-72.389279,-75.438649,-71.390649,-54.92185,-27.176236,10.290326,...,-2321.485513,-1356.614337,-9.409414,1495.745113,3032.046067,4280.492821,5126.888645,5530.296057,5434.346245,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,-30.392137,-29.874333,-30.107837,-30.407058,-30.737255,-31.091492,-29.752231,-24.615892,-15.569642,-1.808304,...,-527.489215,-274.251731,50.819398,371.145961,681.126171,908.970841,1058.720731,1127.400643,1107.976754,27eccfc4-e329-4695-aee8-6d706b247191
4,16.82645,22.324748,29.000863,45.876664,65.691681,75.582783,63.664849,36.210584,7.57301,-18.792894,...,-2014.240753,-1342.347913,-372.72363,640.398707,1691.094758,2638.382894,3435.757068,4004.181796,4209.226082,ed560c25-e5c5-4dba-82c7-3fc18c248ce4


In [28]:
a=glob.glob("ancillary_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
#--- 7.226141568024953 Mins ---

--- 13.877576784292858 Mins ---


In [29]:
df_train2=pd.DataFrame(result)
df_train2.columns=["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
df_train2['measurement_id']=[item[len('ancillary_data/smartphone_accelerometer/'):-4] for item in a]
df_train2.head()


Unnamed: 0,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,wavelet_coeff_t_body_acc_X_10,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,-0.862582,-0.829417,-0.837372,-0.828164,-0.806349,-0.819354,-0.837263,-0.773561,-0.748035,-0.595266,...,-171.575417,-79.291744,50.170111,192.779169,316.688926,394.410264,415.125446,379.882477,306.769306,eab1cc17-40aa-4e1f-8e4f-64e6d5743509
1,-49.076596,-48.83792,-50.063581,-53.382303,-56.39696,-57.826841,-56.258334,-48.826989,-36.656761,-16.479608,...,-1568.621663,-933.105874,-41.377504,902.537816,1843.063322,2585.940009,3098.92762,3375.965824,3388.291311,b1a5fd6d-db9c-4870-a3c0-943e0656d112
2,-42.034266,-44.667868,-47.612942,-56.962247,-73.421047,-91.54413,-97.69765,-81.87336,-47.554411,-1.32635,...,-1499.494965,-811.765072,43.296286,771.135947,1402.975469,1818.374175,2084.545179,2204.486413,2191.771722,bca5e12d-9fd6-496b-ac08-9e2472d8b299
3,-59.27927,-59.280333,-60.25378,-63.562363,-70.32202,-80.083062,-85.022264,-75.354578,-50.407792,-11.135897,...,-2413.011119,-1378.546133,34.456315,1575.478355,3155.670207,4416.626394,5273.580549,5745.688165,5704.609188,9d74f5e1-241a-4f4a-bc6f-2779edf410cd
4,-0.847759,-1.041468,-1.379905,-1.920664,-2.371906,-2.580869,-2.32691,-1.619987,-0.631937,0.47939,...,-86.329533,-48.765132,3.574355,57.464464,107.587199,144.179289,166.170103,173.943439,167.709392,49f80736-6b50-44a6-a77b-9b1572334a8c


In [30]:
Frame = df_train.append(pd.DataFrame(df_train2), ignore_index=True)

In [31]:
Frame.shape

(877, 193)

In [42]:
Frame.to_csv('realpd_wavelet_features_smartphone_training.csv',index=False)

### test

In [40]:
a=glob.glob("testing_data/smartphone_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
#--- 9.026943965752919 Mins ---

columns=list(time_sig_df.columns)
df_train=pd.DataFrame(result)
df_train.columns=["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
df_train['measurement_id']=[item[len('testing_data/smartphone_accelerometer/'):-4] for item in a]
print(df_train.shape)
df_train.head()




--- 5.677277064323425 Mins ---
(169, 193)


Unnamed: 0,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,wavelet_coeff_t_body_acc_X_10,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,-413.835668,-407.961277,-412.489104,-422.76087,-437.745209,-448.216871,-418.66744,-313.586499,-136.813717,97.109836,...,-6097.173511,-2816.969782,1224.958819,5283.938737,9178.995513,11960.04868,13703.653957,14351.344104,13908.471748,7860035d-c9df-44e9-ba0e-0855dfad90f4
1,-378.493913,-373.65967,-377.948703,-387.885105,-405.181772,-428.011171,-430.120194,-378.031065,-274.916779,-102.231795,...,-9161.776125,-2905.943395,4593.792441,11632.438005,17380.704331,20431.75141,21183.649728,20153.519859,17839.06161,8abf7688-c6bb-488f-bb9d-c359a5f86b35
2,-82.649847,-81.749772,-82.778568,-85.407236,-89.627282,-93.884235,-90.969162,-73.733664,-43.16834,0.580687,...,-2827.603925,-1786.787049,-267.932454,1439.080225,3250.698957,4823.910193,6045.161258,6879.393378,7166.770383,5b76c255-d8dd-40d3-82e9-745cb943607f
3,6.912332,8.395854,10.119802,16.942758,25.616088,29.590308,26.787528,18.231901,7.097077,-5.08481,...,-1026.459248,-630.679285,-75.721534,522.349676,1190.091646,1756.937208,2159.365992,2444.234916,2485.103636,c4f0d5dc-c1db-4cc5-89d1-26d7fee32299
4,-44.985963,-44.617567,-45.269944,-47.138997,-50.064158,-52.761853,-51.101246,-41.064003,-23.155259,2.134287,...,-1006.355382,-644.219405,-125.701203,454.096015,1078.341711,1625.645277,2052.139361,2341.377752,2440.530371,6bb1ba19-25ac-4889-8e5b-8e4215784e82


In [43]:
df_train.to_csv('realpd_wavelet_features_smartphone_testing.csv',index=False)

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import ppscore as pps
import lightgbm as lgb
from catboost import CatBoostRegressor, FeaturesData, Pool
from sklearn import preprocessing

In [34]:
#smartwatch_gyroscope features
label=real_pd_training_id.append(pd.DataFrame(real_pd_ancillary_id), ignore_index=True)
df_train=pd.merge(label,Frame,on='measurement_id')
df_train['subject_id']=preprocessing.LabelEncoder().fit(df_train['subject_id']).transform(df_train['subject_id'])
print(df_train.shape)
df_train.head()

(877, 197)


Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,...,wavelet_coeff_t_body_acc_jerk_mag_7,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16
0,ee053d95-c155-400d-ae42-fe24834ad4a9,2,1.0,0.0,3.0,-11.640204,-15.356673,-19.521645,-22.708042,-24.018429,...,-1544.920579,-1234.077362,-683.184955,43.801573,748.946057,1408.463108,1871.690006,2144.944353,2238.759882,2138.383795
1,ce51ee31-8553-4321-9f83-8cd3dabe2f66,2,1.0,0.0,2.0,-125.976071,-134.851817,-144.560424,-187.814125,-255.142271,...,-8965.150656,-6760.157997,-2820.854779,2172.922139,7353.533362,12196.511994,15718.646779,17126.766705,17923.513352,16700.861561
2,5c42911d-0ebd-47ba-9925-dd5ab1c0ed61,2,1.0,0.0,1.0,18.859606,66.621929,149.849683,313.768146,415.113127,...,-16156.398707,-12985.00018,-7520.53003,495.187965,8947.264148,17045.277839,23198.737614,26916.76936,28401.831795,27197.940868
3,274f5bc8-2e4f-4d7c-a546-b65b7d6bd01e,2,0.0,0.0,,-37.638428,-5.23234,33.032425,112.027435,189.348891,...,-9641.652242,-7826.196176,-4784.695791,-599.133943,3753.192007,8056.220138,11485.581043,13907.325792,15154.203448,15225.297397
4,ecbeea40-8770-455d-90a6-597e7f896e1b,2,0.0,1.0,0.0,-86.566097,-69.858881,-53.022302,37.10778,168.795331,...,-12713.164208,-9231.443553,-4210.601634,2102.266389,8236.144219,13624.677666,17071.048243,18794.262383,18818.973292,17526.61926


In [37]:
lightgbm(df_train,'tremor')
catboost(df_train,'tremor')

lightgbm(df_train,'dyskinesia')
catboost(df_train,'dyskinesia')

lightgbm(df_train,'on_off')
catboost(df_train,'on_off')

lightgbm test_MSE tremor : 0.621
catboost test_MSE tremor : 0.651
lightgbm test_MSE dyskinesia : 0.17
catboost test_MSE dyskinesia : 0.179
lightgbm test_MSE on_off : 0.171
catboost test_MSE on_off : 0.179


In [None]:
#======================================SMARTPHONE ACCELEROMETER============================================
#from taking 100 freq by using all signals
lightgbm test_MSE tremor : 0.621
catboost test_MSE tremor : 0.651
lightgbm test_MSE dyskinesia : 0.17
catboost test_MSE dyskinesia : 0.179
lightgbm test_MSE on_off : 0.171
catboost test_MSE on_off : 0.179

# Smartwatch

## accelerometer

### training

In [44]:
a=glob.glob("training_data/smartwatch_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
#--- 7.979812415440877 Mins ---

--- 7.877879766623179 Mins ---


In [45]:
columns=list(time_sig_df.columns)
#columns=['t_body_acc_Z','t_grav_acc_Z','t_body_acc_jerk_Z','t_body_acc_mag','t_grav_acc_mag','t_body_acc_jerk_mag']
df_train=pd.DataFrame(result)
df_train.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
df_train['measurement_id']=[item[len('training_data/smartphone_accelerometer/'):-4] for item in a]
df_train.head()

Unnamed: 0,device_id,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,2VSP,-113.258712,-113.019573,-116.083608,-120.742291,-125.918915,-131.568461,-129.136442,-110.120035,-76.528121,...,-3440.704215,-1942.043152,99.926157,2155.897882,4069.803786,5462.714772,6319.101839,6631.959181,6420.441516,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,-249.900665,-252.93525,-265.333014,-283.369728,-296.963511,-308.13516,-300.278343,-252.071547,-162.753267,...,-8454.306848,-5415.25701,-994.210188,3836.810801,8737.092921,12853.804036,15934.662572,17816.297915,18293.438036,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,-78.992213,-80.052549,-83.272378,-90.86343,-100.756648,-109.653792,-108.305448,-89.78608,-58.011133,...,-1620.715474,-652.553003,511.432398,1560.328674,2456.473465,3006.728712,3305.114858,3366.528001,3195.355123,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,-21.94018,-22.312575,-23.469052,-24.400411,-24.561085,-24.73642,-23.797719,-20.140356,-14.041502,...,-838.91442,-557.140826,-147.336419,293.746229,756.379351,1163.937157,1500.782982,1747.339932,1855.760023,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,-8.251838,-11.045222,-12.718602,-18.216305,-32.211938,-57.165884,-77.954877,-74.557405,-48.514476,...,-1193.394106,-538.061314,274.67995,1036.962882,1736.579912,2177.518873,2378.085803,2446.08075,2264.505616,f96752b5-850e-4a5a-a74a-69ab4893b6aa


In [46]:
a=glob.glob("ancillary_data/smartwatch_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
#--- 2.26669598420461 Mins ---

--- 6.3832910815874735 Mins ---


In [47]:
df_train2=pd.DataFrame(result)
df_train2.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
df_train2['measurement_id']=[item[len('ancillary_data/smartphone_accelerometer/'):-4] for item in a]
df_train2.head()

Unnamed: 0,device_id,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,2WLT,-45.640426,-48.030838,-50.83405,-54.832421,-59.337969,-63.134132,-62.471301,-55.473618,-41.993026,...,-2379.432387,-1649.365962,-531.588399,685.369015,1971.815842,3091.551101,3992.049595,4617.564145,4843.060573,eab1cc17-40aa-4e1f-8e4f-64e6d5743509
1,2WH7,-4.877789,-6.117715,-7.784857,-11.839776,-15.854575,-17.34046,-15.653711,-11.156013,-5.231624,...,-443.590676,-225.713258,38.353528,263.586922,463.427218,584.838023,655.400931,678.138855,660.176581,166ba983-209f-4639-a5a6-d6e66adeba2b
2,2WH5,-58.766105,-60.7666,-65.352813,-71.859004,-76.997149,-80.568829,-77.130978,-63.048825,-40.865819,...,-2006.811257,-926.075336,450.752854,1739.323247,2836.087942,3503.988236,3833.311154,3837.25191,3579.062894,bca5e12d-9fd6-496b-ac08-9e2472d8b299
3,327T,-4.907578,-5.850618,-7.11699,-8.651272,-9.784561,-9.877606,-8.177598,-4.883016,-0.642661,...,-186.728904,-81.626581,42.582721,152.224778,251.759977,310.977699,342.417464,347.190702,322.807612,49f80736-6b50-44a6-a77b-9b1572334a8c
4,327T,-29.311308,-29.58689,-30.693973,-32.603873,-34.948431,-36.795504,-35.069913,-27.421843,-14.513702,...,-592.243327,-228.61507,206.296223,582.44524,887.385172,1048.994752,1102.738596,1055.287417,932.803821,26f49660-ce1a-4946-8f83-f88850f03ec1


In [48]:
Frame_smartwatch_accelerometer = df_train.append(pd.DataFrame(df_train2), ignore_index=True)

In [49]:
Frame_smartwatch_accelerometer.shape

(963, 194)

In [50]:
Frame_smartwatch_accelerometer.head()

Unnamed: 0,device_id,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,2VSP,-113.258712,-113.019573,-116.083608,-120.742291,-125.918915,-131.568461,-129.136442,-110.120035,-76.528121,...,-3440.704215,-1942.043152,99.926157,2155.897882,4069.803786,5462.714772,6319.101839,6631.959181,6420.441516,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,-249.900665,-252.93525,-265.333014,-283.369728,-296.963511,-308.13516,-300.278343,-252.071547,-162.753267,...,-8454.306848,-5415.25701,-994.210188,3836.810801,8737.092921,12853.804036,15934.662572,17816.297915,18293.438036,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,-78.992213,-80.052549,-83.272378,-90.86343,-100.756648,-109.653792,-108.305448,-89.78608,-58.011133,...,-1620.715474,-652.553003,511.432398,1560.328674,2456.473465,3006.728712,3305.114858,3366.528001,3195.355123,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,-21.94018,-22.312575,-23.469052,-24.400411,-24.561085,-24.73642,-23.797719,-20.140356,-14.041502,...,-838.91442,-557.140826,-147.336419,293.746229,756.379351,1163.937157,1500.782982,1747.339932,1855.760023,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,-8.251838,-11.045222,-12.718602,-18.216305,-32.211938,-57.165884,-77.954877,-74.557405,-48.514476,...,-1193.394106,-538.061314,274.67995,1036.962882,1736.579912,2177.518873,2378.085803,2446.08075,2264.505616,f96752b5-850e-4a5a-a74a-69ab4893b6aa


In [51]:
Frame_smartwatch_accelerometer.columns=['device_id_acc']+list(Frame_smartwatch_accelerometer.columns)[1:]

In [52]:
Frame_smartwatch_accelerometer.head()

Unnamed: 0,device_id_acc,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,2VSP,-113.258712,-113.019573,-116.083608,-120.742291,-125.918915,-131.568461,-129.136442,-110.120035,-76.528121,...,-3440.704215,-1942.043152,99.926157,2155.897882,4069.803786,5462.714772,6319.101839,6631.959181,6420.441516,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,-249.900665,-252.93525,-265.333014,-283.369728,-296.963511,-308.13516,-300.278343,-252.071547,-162.753267,...,-8454.306848,-5415.25701,-994.210188,3836.810801,8737.092921,12853.804036,15934.662572,17816.297915,18293.438036,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,-78.992213,-80.052549,-83.272378,-90.86343,-100.756648,-109.653792,-108.305448,-89.78608,-58.011133,...,-1620.715474,-652.553003,511.432398,1560.328674,2456.473465,3006.728712,3305.114858,3366.528001,3195.355123,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,-21.94018,-22.312575,-23.469052,-24.400411,-24.561085,-24.73642,-23.797719,-20.140356,-14.041502,...,-838.91442,-557.140826,-147.336419,293.746229,756.379351,1163.937157,1500.782982,1747.339932,1855.760023,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,-8.251838,-11.045222,-12.718602,-18.216305,-32.211938,-57.165884,-77.954877,-74.557405,-48.514476,...,-1193.394106,-538.061314,274.67995,1036.962882,1736.579912,2177.518873,2378.085803,2446.08075,2264.505616,f96752b5-850e-4a5a-a74a-69ab4893b6aa


### testing

In [53]:
a=glob.glob("testing_data/smartwatch_accelerometer/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
#--- 7.979812415440877 Mins ---

columns=list(time_sig_df.columns)
#columns=['t_body_acc_Z','t_grav_acc_Z','t_body_acc_jerk_Z','t_body_acc_mag','t_grav_acc_mag','t_body_acc_jerk_mag']
df_train=pd.DataFrame(result)
df_train.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
df_train['measurement_id']=[item[len('testing_data/smartphone_accelerometer/'):-4] for item in a]
Frame_smartwatch_accelerometer_test=df_train.copy()
Frame_smartwatch_accelerometer_test.columns=['device_id_acc']+list(Frame_smartwatch_accelerometer_test.columns)[1:]
print(Frame_smartwatch_accelerometer_test.shape)
Frame_smartwatch_accelerometer_test.head()

--- 2.9160772840181988 Mins ---
(172, 194)


Unnamed: 0,device_id_acc,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,2ZX6,-358.487285,-359.87136,-374.769313,-391.889269,-400.036003,-402.552111,-375.8429,-298.071937,-171.651657,...,-8741.616011,-4421.835244,1172.625811,6632.240139,11544.601756,14851.511292,16650.790957,16925.16512,15912.328391,7860035d-c9df-44e9-ba0e-0855dfad90f4
1,2WH8,-81.521457,-82.275248,-85.577635,-90.644295,-95.450561,-98.862744,-94.065148,-75.204958,-43.900045,...,-3028.448271,-1616.8865,273.281135,2122.041316,3756.334275,4866.793873,5471.750354,5596.574931,5338.435317,0c593c3f-6636-4f0f-b9b1-f489c1cd3852
2,2VSP,-53.202921,-56.641826,-61.313636,-68.21029,-69.143251,-66.09978,-59.784017,-48.878664,-34.387097,...,-1806.126635,-1096.783845,-167.067573,835.482393,1942.488414,2910.228332,3703.956722,4356.627984,4644.585372,8abf7688-c6bb-488f-bb9d-c359a5f86b35
3,2VSP,-38.000119,-38.53022,-40.283796,-42.632755,-44.875023,-46.783389,-45.098781,-36.871526,-23.001916,...,-1295.99623,-632.625631,219.880577,1034.192987,1749.620924,2223.713387,2474.159722,2513.205602,2359.761639,c4f0d5dc-c1db-4cc5-89d1-26d7fee32299
4,2WLT,-82.926281,-83.283854,-86.132753,-89.339772,-91.632529,-93.570386,-90.25611,-75.82475,-50.142683,...,-3724.964607,-2121.047088,95.080179,2310.379054,4337.290622,5742.321509,6537.40843,6687.066351,6264.040414,6bb1ba19-25ac-4889-8e5b-8e4215784e82


In [58]:
#smartwatch_gyroscope features
label=real_pd_training_id.append(pd.DataFrame(real_pd_ancillary_id), ignore_index=True)
df_train=pd.merge(label,Frame_smartwatch_accelerometer,on='measurement_id')
df_train['subject_id']=preprocessing.LabelEncoder().fit(df_train['subject_id']).transform(df_train['subject_id'])
df_train['device_id_acc']=preprocessing.LabelEncoder().fit(df_train['device_id_acc']).transform(df_train['device_id_acc'])
print(df_train.shape)
df_train.head()

(963, 198)


Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor,device_id_acc,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,...,wavelet_coeff_t_body_acc_jerk_mag_7,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16
0,ee053d95-c155-400d-ae42-fe24834ad4a9,2,1.0,0.0,3.0,4,3.602402,4.876646,6.483259,10.077689,...,-492.377019,-378.757159,-186.186162,48.574061,247.827128,423.090183,526.281753,579.749826,588.049709,552.097503
1,ce51ee31-8553-4321-9f83-8cd3dabe2f66,2,1.0,0.0,2.0,4,-34.138102,-38.478251,-45.019545,-57.137746,...,-2602.693115,-2087.210906,-1164.528841,107.192749,1378.671289,2532.976306,3344.325476,3810.665461,3931.322995,3747.056612
2,d3c89012-3ab9-4014-b577-61ff05e31968,2,1.0,0.0,0.0,0,-91.452146,-91.980055,-96.220705,-103.397261,...,-2197.310912,-1671.580615,-868.721574,188.788032,1248.335571,2246.977826,2959.042869,3388.740044,3522.297053,3352.601536
3,5c42911d-0ebd-47ba-9925-dd5ab1c0ed61,2,1.0,0.0,1.0,4,-62.412121,-65.498066,-71.235087,-85.648581,...,-2736.176656,-2142.928246,-1193.744651,76.722081,1362.553352,2590.998448,3508.723037,4107.631546,4379.729799,4305.767543
4,235472d5-ad2e-4c76-947e-358c9d8c1280,2,1.0,0.0,0.0,0,-58.416876,-58.501862,-60.42081,-64.011159,...,-1494.34482,-1135.261524,-576.559092,142.00788,837.999142,1479.526638,1922.24322,2184.127142,2266.592553,2169.010897


In [59]:
lightgbm(df_train,'tremor')
catboost(df_train,'tremor')

lightgbm(df_train,'dyskinesia')
catboost(df_train,'dyskinesia')

lightgbm(df_train,'on_off')
catboost(df_train,'on_off')

lightgbm test_MSE tremor : 0.583
catboost test_MSE tremor : 0.623
lightgbm test_MSE dyskinesia : 0.233
catboost test_MSE dyskinesia : 0.239
lightgbm test_MSE on_off : 0.234
catboost test_MSE on_off : 0.229


In [None]:
#======================================SMARTWATCH ACCELEROMETER============================================
#from taking 50 freq and high  variance device by using all signals
lightgbm test_MSE tremor : 0.583
catboost test_MSE tremor : 0.623
lightgbm test_MSE dyskinesia : 0.233
catboost test_MSE dyskinesia : 0.239
lightgbm test_MSE on_off : 0.234
catboost test_MSE on_off : 0.229

## smartwatch_gyroscope

In [60]:
a=glob.glob("training_data/smartwatch_gyroscope/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch_gyro)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
#--- 5.405052049954732 Mins ---

--- 5.8055768489837645 Mins ---


In [61]:
columns=['t_body_acc_X','t_body_acc_Y','t_body_acc_Z','t_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z','t_body_acc_mag','t_body_acc_jerk_mag']
df_train=pd.DataFrame(result)
df_train.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
df_train['measurement_id']=[item[len('training_data/smartwatch_gyroscope/'):-4] for item in a]
df_train.head()

Unnamed: 0,device_id,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,2VSP,-118.063207,-116.847264,-118.546913,-121.779043,-126.468353,-132.041237,-130.159435,-112.202973,-79.350495,...,-1141.032621,-588.847756,92.716358,731.176554,1333.844056,1759.882074,2036.045696,2147.643574,2093.034877,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,-224.14338,-225.945966,-233.141864,-250.872233,-276.393304,-308.231336,-321.827035,-285.951381,-197.576774,...,-3161.50957,-1752.773113,117.760585,1988.714905,3793.559649,5123.258104,5988.937858,6387.202671,6257.878473,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,-39.084788,-40.167471,-42.075651,-48.234513,-58.048567,-66.661939,-66.402844,-53.324486,-31.060424,...,-610.791299,-312.952484,61.189827,417.398594,760.290596,1007.96634,1173.759975,1258.386267,1240.551904,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,-17.65084,-17.609973,-18.055261,-18.698336,-19.48093,-20.246683,-19.700282,-16.57596,-11.199493,...,-214.650905,-116.228203,10.170531,133.05984,252.933786,342.453576,405.912929,441.931419,439.803617,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,-30.911573,-30.959446,-31.829413,-33.299444,-35.177732,-37.690744,-37.922881,-32.682598,-22.610535,...,-313.52917,-128.575538,87.638295,279.266402,454.602105,561.516048,617.583142,631.24246,588.972126,f96752b5-850e-4a5a-a74a-69ab4893b6aa


In [62]:
a=glob.glob("ancillary_data/smartwatch_gyroscope/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch_gyro)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
#--- 2.26669598420461 Mins ---

--- 4.697981866200765 Mins ---


In [63]:
df_train2=pd.DataFrame(result)
df_train2.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
df_train2['measurement_id']=[item[len('ancillary_data/smartwatch_gyroscope/'):-4] for item in a]
df_train2.head()

Unnamed: 0,device_id,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,2WLT,-35.506404,-36.593648,-37.973016,-40.65039,-43.939729,-46.623588,-46.048025,-41.104516,-32.56084,...,-877.821314,-589.620135,-158.22909,288.575775,749.877078,1129.014525,1416.473047,1607.332903,1658.500235,eab1cc17-40aa-4e1f-8e4f-64e6d5743509
1,2WH7,-8.763748,-8.767822,-9.020221,-9.647574,-10.397176,-10.885554,-10.511306,-8.759548,-5.970753,...,-105.899149,-47.646798,23.753386,87.896835,143.971276,179.265237,197.687803,200.154834,185.084313,166ba983-209f-4639-a5a6-d6e66adeba2b
2,2WH5,-77.609392,-77.014893,-78.457626,-81.389234,-85.115533,-88.639747,-86.591156,-73.35023,-49.60895,...,-1014.424663,-585.789277,-11.553488,548.984542,1079.11058,1469.00792,1712.193781,1802.52668,1739.188796,bca5e12d-9fd6-496b-ac08-9e2472d8b299
3,327T,-4.098664,-4.042425,-4.097283,-4.150738,-4.196609,-4.26639,-4.102013,-3.386345,-2.110806,...,-46.846555,-28.452381,-4.083202,21.001207,46.954341,68.393401,83.139068,89.73352,87.766875,49f80736-6b50-44a6-a77b-9b1572334a8c
4,327T,-15.829323,-16.07211,-16.538134,-18.160534,-21.529311,-25.376629,-25.797031,-20.528126,-10.951145,...,-177.651902,-74.087787,45.512457,149.024229,243.40216,302.928829,339.384358,356.912296,349.595504,26f49660-ce1a-4946-8f83-f88850f03ec1


In [64]:
Frame_smartwatch_gyro = df_train.append(pd.DataFrame(df_train2), ignore_index=True)

In [65]:
Frame_smartwatch_gyro.shape

(963, 130)

In [66]:
Frame_smartwatch_gyro.columns=[i.replace('acc','gyro') for i in list(Frame_smartwatch_gyro.columns)]

In [67]:
Frame_smartwatch_gyro.columns=['device_id_gyro']+list(Frame_smartwatch_gyro.columns)[1:]

In [68]:
Frame_smartwatch_gyro.head()

Unnamed: 0,device_id_gyro,wavelet_coeff_t_body_gyro_X_1,wavelet_coeff_t_body_gyro_X_2,wavelet_coeff_t_body_gyro_X_3,wavelet_coeff_t_body_gyro_X_4,wavelet_coeff_t_body_gyro_X_5,wavelet_coeff_t_body_gyro_X_6,wavelet_coeff_t_body_gyro_X_7,wavelet_coeff_t_body_gyro_X_8,wavelet_coeff_t_body_gyro_X_9,...,wavelet_coeff_t_body_gyro_jerk_mag_8,wavelet_coeff_t_body_gyro_jerk_mag_9,wavelet_coeff_t_body_gyro_jerk_mag_10,wavelet_coeff_t_body_gyro_jerk_mag_11,wavelet_coeff_t_body_gyro_jerk_mag_12,wavelet_coeff_t_body_gyro_jerk_mag_13,wavelet_coeff_t_body_gyro_jerk_mag_14,wavelet_coeff_t_body_gyro_jerk_mag_15,wavelet_coeff_t_body_gyro_jerk_mag_16,measurement_id
0,2VSP,-118.063207,-116.847264,-118.546913,-121.779043,-126.468353,-132.041237,-130.159435,-112.202973,-79.350495,...,-1141.032621,-588.847756,92.716358,731.176554,1333.844056,1759.882074,2036.045696,2147.643574,2093.034877,05205dc3-9c82-4b6b-921e-b6c2edd2c993
1,2WLT,-224.14338,-225.945966,-233.141864,-250.872233,-276.393304,-308.231336,-321.827035,-285.951381,-197.576774,...,-3161.50957,-1752.773113,117.760585,1988.714905,3793.559649,5123.258104,5988.937858,6387.202671,6257.878473,aba31c29-79ef-4221-9412-156538a2fd4e
2,2ZX3,-39.084788,-40.167471,-42.075651,-48.234513,-58.048567,-66.661939,-66.402844,-53.324486,-31.060424,...,-610.791299,-312.952484,61.189827,417.398594,760.290596,1007.96634,1173.759975,1258.386267,1240.551904,6cab466b-3c95-4cc0-ba1b-8650f5a12504
3,2WH7,-17.65084,-17.609973,-18.055261,-18.698336,-19.48093,-20.246683,-19.700282,-16.57596,-11.199493,...,-214.650905,-116.228203,10.170531,133.05984,252.933786,342.453576,405.912929,441.931419,439.803617,ed560c25-e5c5-4dba-82c7-3fc18c248ce4
4,2WH7,-30.911573,-30.959446,-31.829413,-33.299444,-35.177732,-37.690744,-37.922881,-32.682598,-22.610535,...,-313.52917,-128.575538,87.638295,279.266402,454.602105,561.516048,617.583142,631.24246,588.972126,f96752b5-850e-4a5a-a74a-69ab4893b6aa


### testing

In [69]:
a=glob.glob("testing_data/smartwatch_gyroscope/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet_real_smartwatch_gyro)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
#--- 5.405052049954732 Mins ---

columns=['t_body_acc_X','t_body_acc_Y','t_body_acc_Z','t_body_acc_jerk_X','t_body_acc_jerk_Y','t_body_acc_jerk_Z','t_body_acc_mag','t_body_acc_jerk_mag']
df_train=pd.DataFrame(result)
df_train.columns=['device_id']+["wavelet_coeff_" + suit + '_' + str(i) for suit,i in zip(list(np.repeat(columns,16)),list(np.arange(1,17))*len(columns))]
df_train['measurement_id']=[item[len('testing_data/smartwatch_gyroscope/'):-4] for item in a]

Frame_smartwatch_gyro_test=df_train.copy()
Frame_smartwatch_gyro_test.columns=[i.replace('acc','gyro') for i in list(Frame_smartwatch_gyro_test.columns)]
Frame_smartwatch_gyro_test.columns=['device_id_gyro']+list(Frame_smartwatch_gyro_test.columns)[1:]
Frame_smartwatch_gyro_test.head()


--- 1.9309370477994283 Mins ---


Unnamed: 0,device_id_gyro,wavelet_coeff_t_body_gyro_X_1,wavelet_coeff_t_body_gyro_X_2,wavelet_coeff_t_body_gyro_X_3,wavelet_coeff_t_body_gyro_X_4,wavelet_coeff_t_body_gyro_X_5,wavelet_coeff_t_body_gyro_X_6,wavelet_coeff_t_body_gyro_X_7,wavelet_coeff_t_body_gyro_X_8,wavelet_coeff_t_body_gyro_X_9,...,wavelet_coeff_t_body_gyro_jerk_mag_8,wavelet_coeff_t_body_gyro_jerk_mag_9,wavelet_coeff_t_body_gyro_jerk_mag_10,wavelet_coeff_t_body_gyro_jerk_mag_11,wavelet_coeff_t_body_gyro_jerk_mag_12,wavelet_coeff_t_body_gyro_jerk_mag_13,wavelet_coeff_t_body_gyro_jerk_mag_14,wavelet_coeff_t_body_gyro_jerk_mag_15,wavelet_coeff_t_body_gyro_jerk_mag_16,measurement_id
0,2ZX6,-187.042069,-188.028206,-195.817017,-211.116811,-223.552712,-231.304679,-221.427794,-180.364696,-111.167308,...,-2160.784901,-951.068017,523.160278,1867.791626,3054.017285,3787.954768,4176.104949,4235.560527,3987.683289,7860035d-c9df-44e9-ba0e-0855dfad90f4
1,2WH8,-72.829678,-72.275208,-73.707227,-76.588987,-80.075606,-82.234069,-77.532116,-62.072309,-37.153267,...,-840.028763,-448.616623,64.619698,577.522006,1059.269031,1406.866411,1616.49264,1680.338359,1606.094225,0c593c3f-6636-4f0f-b9b1-f489c1cd3852
2,2VSP,-44.042313,-44.655908,-46.587055,-49.718794,-50.695081,-49.408062,-45.721336,-38.050304,-26.682427,...,-539.610771,-295.333652,23.676774,335.116113,652.120869,881.976523,1040.485148,1142.262154,1140.983659,8abf7688-c6bb-488f-bb9d-c359a5f86b35
3,2VSP,-45.6639,-45.493277,-46.58106,-48.951446,-52.049489,-54.961862,-54.022399,-45.819513,-31.434465,...,-452.950775,-144.471037,211.42205,519.897099,768.662569,896.701354,941.594627,911.183613,825.57166,c4f0d5dc-c1db-4cc5-89d1-26d7fee32299
4,2WLT,-109.60484,-108.705076,-110.743513,-114.609447,-119.418292,-122.719646,-117.345732,-97.09644,-62.535868,...,-1515.208986,-1105.23288,-395.194305,389.429036,1170.254471,1776.163228,2164.321079,2358.156695,2327.388258,6bb1ba19-25ac-4889-8e5b-8e4215784e82


In [70]:
Frame_smartwatch_gyro_test.shape

(172, 130)

In [72]:
#smartwatch_gyroscope features
label=real_pd_training_id.append(pd.DataFrame(real_pd_ancillary_id), ignore_index=True)
df_train=pd.merge(label,Frame_smartwatch_gyro,on='measurement_id')
df_train['subject_id']=preprocessing.LabelEncoder().fit(df_train['subject_id']).transform(df_train['subject_id'])
df_train['device_id_gyro']=preprocessing.LabelEncoder().fit(df_train['device_id_gyro']).transform(df_train['device_id_gyro'])
print(df_train.shape)
df_train.head()

(963, 134)


Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor,device_id_gyro,wavelet_coeff_t_body_gyro_X_1,wavelet_coeff_t_body_gyro_X_2,wavelet_coeff_t_body_gyro_X_3,wavelet_coeff_t_body_gyro_X_4,...,wavelet_coeff_t_body_gyro_jerk_mag_7,wavelet_coeff_t_body_gyro_jerk_mag_8,wavelet_coeff_t_body_gyro_jerk_mag_9,wavelet_coeff_t_body_gyro_jerk_mag_10,wavelet_coeff_t_body_gyro_jerk_mag_11,wavelet_coeff_t_body_gyro_jerk_mag_12,wavelet_coeff_t_body_gyro_jerk_mag_13,wavelet_coeff_t_body_gyro_jerk_mag_14,wavelet_coeff_t_body_gyro_jerk_mag_15,wavelet_coeff_t_body_gyro_jerk_mag_16
0,ee053d95-c155-400d-ae42-fe24834ad4a9,2,1.0,0.0,3.0,4,-2.38069,-2.778062,-3.138025,-4.560209,...,-121.274532,-108.173043,-71.430831,-17.553158,36.576496,92.43877,137.601482,169.840752,189.875326,192.331687
1,ce51ee31-8553-4321-9f83-8cd3dabe2f66,2,1.0,0.0,2.0,4,8.419639,10.196829,12.118192,19.464114,...,-491.18731,-356.895739,-146.330305,118.70312,363.698319,573.324023,696.530204,747.395866,735.0741,661.268319
2,d3c89012-3ab9-4014-b577-61ff05e31968,2,1.0,0.0,0.0,0,-39.476311,-40.572579,-42.783798,-49.385175,...,-852.584827,-624.363989,-289.900538,130.567562,534.086424,904.861322,1144.786326,1266.696876,1276.887464,1177.535025
3,5c42911d-0ebd-47ba-9925-dd5ab1c0ed61,2,1.0,0.0,1.0,4,13.56167,16.342996,19.104336,30.182116,...,-720.82271,-561.758117,-310.522171,24.327086,361.754346,698.687568,952.102167,1118.905548,1200.447207,1175.023765
4,235472d5-ad2e-4c76-947e-358c9d8c1280,2,1.0,0.0,0.0,0,-30.391662,-30.672154,-31.773359,-34.790737,...,-540.404494,-396.45978,-191.518836,59.731279,293.567135,510.550236,657.313111,747.201713,778.617798,751.854043


In [73]:
lightgbm(df_train,'tremor')
catboost(df_train,'tremor')

lightgbm(df_train,'dyskinesia')
catboost(df_train,'dyskinesia')

lightgbm(df_train,'on_off')
catboost(df_train,'on_off')

lightgbm test_MSE tremor : 0.637
catboost test_MSE tremor : 0.646
lightgbm test_MSE dyskinesia : 0.237
catboost test_MSE dyskinesia : 0.228
lightgbm test_MSE on_off : 0.208
catboost test_MSE on_off : 0.213


In [None]:
#======================================SMARTWATCH GYROSCOPE============================================
#from taking 50 freq and high  variance device by using gyroscope signals
lightgbm test_MSE tremor : 0.637
catboost test_MSE tremor : 0.646
lightgbm test_MSE dyskinesia : 0.237
catboost test_MSE dyskinesia : 0.228
lightgbm test_MSE on_off : 0.208
catboost test_MSE on_off : 0.213

## SmartWatch

In [74]:
Frame_smartwatch_gyro.shape,Frame_smartwatch_accelerometer.shape

((963, 130), (963, 194))

In [75]:
Frame_smartwatch_gyro_test.shape,Frame_smartwatch_accelerometer_test.shape

((172, 130), (172, 194))

In [79]:
Frame_smartwatch=pd.merge(Frame_smartwatch_accelerometer,Frame_smartwatch_gyro,on='measurement_id')
Frame_smartwatch=Frame_smartwatch.drop('device_id_gyro',axis=1)
print(Frame_smartwatch.shape)

Frame_smartwatch_test=pd.merge(Frame_smartwatch_accelerometer_test,Frame_smartwatch_gyro_test,on='measurement_id')
Frame_smartwatch_test=Frame_smartwatch_test.drop('device_id_gyro',axis=1)
print(Frame_smartwatch_test.shape)

(963, 322)
(172, 322)


In [80]:
Frame_smartwatch.to_csv('realpd_wavelet_features_smartwatch_training.csv',index=False)
Frame_smartwatch_test.to_csv('realpd_wavelet_features_smartwatch_testing.csv',index=False)


In [81]:
#smartwatch_accelerometer features
label=real_pd_training_id.append(pd.DataFrame(real_pd_ancillary_id), ignore_index=True)
df_train=pd.merge(label,Frame_smartwatch,on='measurement_id')
df_train['subject_id']=preprocessing.LabelEncoder().fit(df_train['subject_id']).transform(df_train['subject_id'])
df_train['device_id_acc']=preprocessing.LabelEncoder().fit(df_train['device_id_acc']).transform(df_train['device_id_acc'])


In [82]:
lightgbm(df_train,'tremor')
catboost(df_train,'tremor')

lightgbm(df_train,'dyskinesia')
catboost(df_train,'dyskinesia')

lightgbm(df_train,'on_off')
catboost(df_train,'on_off')

lightgbm test_MSE tremor : 0.578
catboost test_MSE tremor : 0.629
lightgbm test_MSE dyskinesia : 0.225
catboost test_MSE dyskinesia : 0.224
lightgbm test_MSE on_off : 0.219
catboost test_MSE on_off : 0.223


In [None]:
lightgbm test_MSE tremor : 0.578
catboost test_MSE tremor : 0.629
lightgbm test_MSE dyskinesia : 0.225
catboost test_MSE dyskinesia : 0.224
lightgbm test_MSE on_off : 0.219
catboost test_MSE on_off : 0.223

# Data Analysis

In [307]:
import pandas as pd
import numpy as np

from scipy import interpolate
import glob
import pywt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm

import time
from joblib import Parallel, delayed
import multiprocessing

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import ppscore as pps
import lightgbm as lgb
from catboost import CatBoostRegressor, FeaturesData, Pool
from sklearn import preprocessing


# Build a Classification model

In [56]:
#lightgbm
def lightgbm(df4,label):
    df4=df4[df4['tremor']!=4]
    train=df4[~df4[label].isnull()]

    # Splitting the dataset into the Training set and Test set
    x_train, x_test, y_train, y_test = train_test_split(train.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label], 
                                                        train_size=0.7, 
                                                        stratify = train[label],
                                                        random_state=1)

    d_train = lgb.Dataset(x_train, label=y_train)

    params = {}
    params['learning_rate'] = 0.03
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'regression'
    params['metric'] = 'rmse'
    params['max_depth'] = 5
    params['feature_fraction'] = 0.8
    clf = lgb.train(params, d_train, 150)
    y_pred=clf.predict(x_test)
    y_pred2=clf.predict(x_train)

    #print('lightgbm train_MSE '+label+' :',round(mean_squared_error(y_train, y_pred2),3))
    print('lightgbm test_MSE '+label+' :',round(mean_squared_error(y_test, y_pred),3))
    
    
#Catboost    
#This will give indexes of the categorical features
def categorical_index(df,cols):
    cat=[]
    for c in cols:
        try:
            cat.append(df.columns.get_loc(c))
        except:
            pass # doing nothing on exception
    return cat

def catboost(df4,label): 
    df4=df4[df4['tremor']!=4]
    train=df4[~df4[label].isnull()]
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label], 
                                                        train_size=0.7, 
                                                        stratify = train[label],
                                                        random_state=1)
    train_dataset = Pool(data=X_train,
                         label=y_train,
                         cat_features=categorical_index(X_train,[]))

    eval_dataset = Pool(data=X_test,
                        label=y_test,
                        cat_features=categorical_index(X_train,[]))



    model = CatBoostRegressor(iterations=50, learning_rate=0.1, depth=6,verbose=False)
    # Fit model
    model.fit(train_dataset)
    # Get predictions
    preds = model.predict(eval_dataset)
    preds2 = model.predict(train_dataset)

    #print('catboost train_MSE '+label+' :',round(mean_squared_error(y_train, preds2),3))
    print('catboost test_MSE '+label+' :',round(mean_squared_error(y_test, preds),3))

In [402]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def get_lgbm_varimp(df4,label, max_vars=50):
    df4=df4[df4['tremor']!=4]
    train=df4[~df4[label].isnull()]

    # Splitting the dataset into the Training set and Test set
    x_train, x_test, y_train, y_test = train_test_split(train.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label], 
                                                        train_size=0.7, 
                                                        stratify = train[label],
                                                        random_state=1)

    d_train = lgb.Dataset(x_train, label=y_train)

    params = {}
    params['learning_rate'] = 0.03
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'regression'
    params['metric'] = 'rmse'
    params['max_depth'] = 5
    params['feature_fraction'] = 0.8
    clf = lgb.train(params, d_train, 150)
    
    X=x_train
    model=clf
    num=max_vars
    
    feature_imp = pd.DataFrame({'Value':model.feature_importance(),'Feature':X.columns})
    plt.figure(figsize=(40, 40))
    sns.set(font_scale = 5)
    sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", 
                                                        ascending=False)[0:num])
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')
    plt.show()

In [None]:
get_lgbm_varimp(df_train,'tremor')