In [1]:
import pandas as pd
import numpy as np
import pywt
import matplotlib.pyplot as plt
from scipy import interpolate
import glob
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

from sklearn.decomposition import PCA
import time
from joblib import Parallel, delayed
import multiprocessing

from sklearn.model_selection import train_test_split
import lightgbm as lgb
from catboost import CatBoostRegressor, FeaturesData, Pool
from sklearn.metrics import mean_squared_error

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
#==========================================================================================
#making time stamp uniform by Interpolation

def preprocess(data, freq=50):
    """Resample a raw tri-axial recording onto a uniform time grid.

    A new time axis is generated from the first to the last timestamp of
    ``data`` with a constant step of ``1/freq`` and each axis is evaluated on
    it with a cubic interpolant.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Timestamp``, ``X``, ``Y`` and ``Z``.
        Rows are read positionally, so the index labels of ``data`` are
        irrelevant (a filtered or re-indexed frame works too).
    freq : float, optional
        Target sampling frequency. The default of 50 reproduces the original
        fixed step of 0.02 (presumably Hz / seconds -- confirm against the
        units of ``Timestamp``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Timestamp``, ``acc_X``, ``acc_Y``, ``acc_Z`` sampled on the
        uniform grid.
    """
    timestamps = data['Timestamp'].to_numpy()
    start, stop = timestamps[0], timestamps[-1]

    t1 = np.arange(start, stop, 1.0 / freq)
    # With a float step np.arange can overshoot `stop` by a rounding error;
    # interp1d refuses to extrapolate, so drop any such point.
    t1 = t1[t1 <= stop]

    df = pd.DataFrame({'Timestamp': t1})
    for axis in ('X', 'Y', 'Z'):
        fcubic = interpolate.interp1d(timestamps, data[axis].to_numpy(), kind='cubic')
        df['acc_' + axis] = fcubic(t1)
    return df

#==========================================================================================
#median filter
from scipy.signal import medfilt # import the median filter function
def median(signal):
    """Smooth a 1-D signal with a median filter of order 3.

    ``signal`` is any 1-D array-like; the filtered samples are returned as a
    numpy array of the same length.
    """
    samples = np.array(signal)
    filtered = medfilt(samples, kernel_size=3)
    return filtered


#==========================================================================================
#components_selection_one_signal
import math # import math library


def components_selection_one_signal(t_signal, sampling_freq=50, freq1=0.3, freq2=20):
    """Split a time-domain signal into DC, body and noise bands via the FFT.

    The signal is transformed with ``np.fft.fft``; its spectrum is partitioned
    by absolute frequency into three disjoint bands, and each band is brought
    back to the time domain with ``np.fft.ifft`` (real part only).

    Parameters
    ----------
    t_signal : array-like, 1-D
        Time-domain samples.
    sampling_freq : float, optional
        Sampling frequency used to label the FFT bins (default 50).
    freq1 : float, optional
        Upper edge (inclusive) of the DC band (default 0.3).
    freq2 : float, optional
        Upper edge (inclusive) of the body band; everything above it is
        treated as noise (default 20).

    Returns
    -------
    tuple of numpy.ndarray
        ``(total_component, t_DC_component, t_body_component, t_noise)`` where
        ``total_component`` is the input with the noise band subtracted,
        ``t_DC_component`` holds ``|f| <= freq1``,
        ``t_body_component`` holds ``freq1 < |f| <= freq2`` and
        ``t_noise`` holds ``|f| > freq2``.
    """
    t_signal = np.array(t_signal)

    # Spectrum and the absolute frequency attached to each FFT bin.
    f_signal = np.fft.fft(t_signal)
    freqs = np.abs(np.fft.fftfreq(len(t_signal), d=1 / float(sampling_freq)))

    # Three mutually exclusive masks that together cover every bin.
    dc_band = freqs <= freq1
    noise_band = freqs > freq2
    body_band = ~(dc_band | noise_band)

    # Zero everything outside the band, then go back to the time domain.
    t_DC_component = np.fft.ifft(np.where(dc_band, f_signal, 0)).real
    t_body_component = np.fft.ifft(np.where(body_band, f_signal, 0)).real
    t_noise = np.fft.ifft(np.where(noise_band, f_signal, 0)).real

    # Signal filtered from noise: original minus the high-frequency band.
    total_component = t_signal - t_noise

    return (total_component, t_DC_component, t_body_component, t_noise)


#=================================================================================================================
#Define verify gravity function
def mag_3_signals(df):
    """Row-wise Euclidean magnitude.

    ``df`` is a 2-D table (one row per sample, one column per axis); the
    result is a 1-D numpy array holding the square root of the sum of squares
    of each row.
    """
    squared = np.square(df)
    row_energy = squared.sum(axis=1)
    return np.array(np.sqrt(row_energy))

def verify_gravity(data):
    """Print the mean magnitude of the gravity (DC) component of a recording.

    For each of the columns ``acc_X``, ``acc_Y`` and ``acc_Z`` of ``data`` the
    DC component returned by ``components_selection_one_signal`` (element 1 of
    its result) is extracted; the per-sample Euclidean magnitude of the three
    components is then averaged and printed, rounded to 3 decimals.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``acc_X``, ``acc_Y`` and ``acc_Z``.

    Returns
    -------
    None
        The result is only printed.
    """
    gravity_axes = [
        components_selection_one_signal(np.array(data[column]))[1]
        for column in ('acc_X', 'acc_Y', 'acc_Z')
    ]

    # mag_3_signals takes ONE 2-D argument (rows = samples, columns = axes);
    # passing the three arrays separately raised a TypeError.
    grav_acc_mag = mag_3_signals(np.column_stack(gravity_axes))
    print('mean value = ',round((sum(grav_acc_mag) / len(grav_acc_mag)),3),' g')
    
#=================================================================================================================    
#Define jerking and magnitude functions
def jerk_one_signal(signal):
    """Forward-difference derivative of a 1-D signal with a fixed step of 0.02.

    Each output sample is ``(signal[k+1] - signal[k]) / 0.02``; rows whose
    difference is NaN (at least the last one, which has no successor) are
    dropped, so the result is shorter than the input.
    """
    frame = pd.DataFrame(signal)
    forward_diff = frame.shift(-1) - frame
    jerk = (forward_diff / 0.02).dropna()
    # Single-column frame -> 1-D array.
    return np.array(jerk).transpose()[0]





#==========
#model
#lightgbm
def lightgbm(df4,label):
    """Fit a LightGBM regressor for ``label`` and print its hold-out MSE.

    Rows of ``df4`` whose ``label`` value is null are discarded. The columns
    ``measurement_id``, ``on_off``, ``dyskinesia`` and ``tremor`` are removed
    from the feature matrix, the remaining rows are split 70/30 (stratified
    on the label, ``random_state=1``) and a gradient-boosted model is trained
    for 150 rounds on the training part. Nothing is returned.
    """
    labelled = df4[~df4[label].isnull()]
    features = labelled.drop(['measurement_id','on_off','dyskinesia','tremor'], axis = 1)
    target = labelled[label]

    # Hold out 30 % of the labelled rows, keeping label proportions equal.
    x_train, x_test, y_train, y_test = train_test_split(
        features,
        target,
        train_size=0.7,
        stratify=target,
        random_state=1,
    )

    params = {
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 5,
        'feature_fraction': 0.8,
    }
    booster = lgb.train(params, lgb.Dataset(x_train, label=y_train), 150)

    test_pred = booster.predict(x_test)
    # Only consumed by the disabled train-MSE report below.
    train_pred = booster.predict(x_train)

    #print('lightgbm train_MSE '+label+' :',round(mean_squared_error(y_train, train_pred),3))
    print('lightgbm test_MSE '+label+' :',round(mean_squared_error(y_test, test_pred),3))
    
    
#Catboost    
#This will give indexes of the categorical features
def categorical_index(df,cols):
    """Return the positional indexes of the columns of ``df`` named in ``cols``.

    Names in ``cols`` that are not columns of ``df`` are skipped silently, so
    the result may be shorter than ``cols``. Positions are returned in the
    order the names appear in ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column positions are looked up.
    cols : iterable of str
        Candidate column names.

    Returns
    -------
    list
        One ``df.columns.get_loc`` result per name that exists.
    """
    cat=[]
    for c in cols:
        try:
            cat.append(df.columns.get_loc(c))
        except KeyError:
            # Missing column: skipping is the intended best-effort behaviour.
            # Any other error (e.g. df is not a DataFrame) now propagates
            # instead of being hidden by a bare except.
            continue
    return cat

def catboost(df4,label):
    """Fit a CatBoost regressor for ``label`` and print its hold-out MSE.

    Rows of ``df4`` with a null ``label`` are dropped, the columns
    ``measurement_id``, ``on_off``, ``dyskinesia`` and ``tremor`` are removed
    from the features, and the rest is split 70/30 (stratified on the label,
    ``random_state=1``). ``subject_id`` and ``Gender`` are declared
    categorical when they are present. Nothing is returned.
    """
    labelled = df4[~df4[label].isnull()]
    features = labelled.drop(['measurement_id','on_off','dyskinesia','tremor'], axis = 1)
    target = labelled[label]

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        train_size=0.7,
        stratify=target,
        random_state=1,
    )

    # Train and test share the same column layout, so one lookup serves both pools.
    cat_positions = categorical_index(X_train,['subject_id','Gender'])
    train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_positions)
    eval_dataset = Pool(data=X_test, label=y_test, cat_features=cat_positions)

    model = CatBoostRegressor(iterations=50, learning_rate=0.1, depth=6,verbose=False)
    model.fit(train_dataset)

    preds = model.predict(eval_dataset)
    # Only consumed by the disabled train-MSE report below.
    preds2 = model.predict(train_dataset)

    #print('catboost train_MSE '+label+' :',round(mean_squared_error(y_train, preds2),3))
    print('catboost test_MSE '+label+' :',round(mean_squared_error(y_test, preds),3))

## ID and label tables

In [3]:
# Load the measurement-ID tables for the CIS-PD and REAL-PD cohorts.
# All paths are relative to the notebook's working directory.

# Test data: ID files only (read from test_data_Id/)
cis_pd_testing_id=pd.read_csv('test_data_Id/cis-pd.CIS-PD_Test_Data_IDs.csv')
real_pd_testing_id=pd.read_csv('test_data_Id/real-pd.REAL-PD_Test_Data_IDs.csv')

# Training data: IDs together with their labels (read from data_labels/)
cis_pd_training_id=pd.read_csv('data_labels/CIS-PD_Training_Data_IDs_Labels.csv')
real_pd_training_id=pd.read_csv('data_labels/REAL-PD_Training_Data_IDs_Labels.csv')

# Ancillary data: IDs together with their labels (read from data_labels/)
cis_pd_ancillary_id=pd.read_csv('data_labels/CIS-PD_Ancillary_Data_IDs_Labels.csv')
real_pd_ancillary_id=pd.read_csv('data_labels/REAL-PD_Ancillary_Data_IDs_Labels.csv')

In [4]:

# glob.glob() returns matches in arbitrary, filesystem-dependent order; sort
# so that positional lookups such as a[14] select the same file on every run.
a=sorted(glob.glob("training_data/*.csv"))

In [5]:
# Resample one training recording (chosen by its position in the glob result)
# onto the uniform time grid, then inspect its size and first rows.
data=preprocess(pd.read_csv(a[14]))
print(data.shape)
data.head()

(60000, 4)


Unnamed: 0,Timestamp,acc_X,acc_Y,acc_Z
0,0.0,0.186035,0.170654,-0.878174
1,0.02,0.148438,0.209961,-0.894287
2,0.04,0.166016,0.219727,-0.925049
3,0.06,0.206299,0.194092,-0.926758
4,0.08,0.173584,0.204102,-0.983154


In [6]:
def time_domain_signal(data):
    """Derive body, gravity, jerk and magnitude signals from a resampled recording.

    For each of ``acc_X``, ``acc_Y`` and ``acc_Z`` the signal is split with
    ``components_selection_one_signal`` into a gravity (DC) part and a body
    part, and the jerk of the body part is computed with ``jerk_one_signal``.
    Body and gravity lose their last sample so that they line up with the
    jerk, which is one sample shorter. That gives 9 axial columns; one
    Euclidean-magnitude column per group of three is appended, for 12 columns
    in total.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``acc_X``, ``acc_Y`` and ``acc_Z``.

    Returns
    -------
    pandas.DataFrame
        Columns ``t_body_acc_{X,Y,Z}``, ``t_grav_acc_{X,Y,Z}``,
        ``t_body_acc_jerk_{X,Y,Z}`` followed by ``t_body_acc_mag``,
        ``t_grav_acc_mag`` and ``t_body_acc_jerk_mag``.
    """
    group_prefixes = ('t_body_acc_', 't_grav_acc_', 't_body_acc_jerk_')

    time_sig_df = pd.DataFrame()
    for column in ['acc_X','acc_Y','acc_Z']:
        # Median filtering (median(...)) is intentionally disabled here: the
        # raw samples are handed straight to the band splitter.
        raw_samples = np.array(data[column])
        _, grav_acc, body_acc, _ = components_selection_one_signal(raw_samples)

        time_sig_df['t_body_'+column] = body_acc[:-1]
        time_sig_df['t_grav_'+column] = grav_acc[:-1]
        time_sig_df['t_body_acc_jerk_'+column[-1]] = jerk_one_signal(body_acc)

    # Regroup the 9 axial columns so that each X/Y/Z triple is contiguous.
    axial_columns = [prefix + axis for prefix in group_prefixes for axis in 'XYZ']
    time_sig_df = time_sig_df[axial_columns]

    # One magnitude column per triple, named "<prefix>mag".
    for prefix in group_prefixes:
        triple = [prefix + axis for axis in 'XYZ']
        time_sig_df[prefix + 'mag'] = mag_3_signals(time_sig_df[triple])

    return time_sig_df

In [7]:
# Derive the time-domain signals for the sample recording. Later cells reuse
# time_sig_df.columns to name the wavelet feature columns.
time_sig_df=time_domain_signal(data)
time_sig_df.shape

(59999, 12)

In [8]:
# Preview the first rows of the derived signals.
time_sig_df.head()

Unnamed: 0,t_body_acc_X,t_body_acc_Y,t_body_acc_Z,t_grav_acc_X,t_grav_acc_Y,t_grav_acc_Z,t_body_acc_jerk_X,t_body_acc_jerk_Y,t_body_acc_jerk_Z,t_body_acc_mag,t_grav_acc_mag,t_body_acc_jerk_mag
0,0.051701,-0.309492,-0.430242,0.131715,0.55875,-0.319212,-1.716811,-4.916696,-12.082357,0.53251,0.656846,13.156925
1,0.017365,-0.407826,-0.671889,0.131556,0.54942,-0.335519,0.975826,6.859124,9.164233,0.786167,0.657071,11.488384
2,0.036881,-0.270644,-0.488604,0.131486,0.5401,-0.351798,1.638469,-4.648829,-5.982242,0.55977,0.657844,7.751349
3,0.069651,-0.36362,-0.608249,0.131505,0.530795,-0.36804,-1.029729,2.558821,1.223206,0.712066,0.659159,3.017307
4,0.049056,-0.312444,-0.583785,0.131613,0.52151,-0.384237,2.517675,1.077161,-0.436842,0.663952,0.661009,2.773048


In [9]:
def wavelet(dfl,ls,random_state=0):
    """Build the wavelet feature vector of one recording file.

    The CSV at ``dfl`` is resampled with ``preprocess`` and expanded with
    ``time_domain_signal``. Every resulting column goes through a continuous
    wavelet transform at scales 1..16, and the coefficient matrix of each
    column is reduced with a 1-component PCA to one value per scale.

    Parameters
    ----------
    dfl : str
        Path of the recording CSV.
    ls : str
        Wavelet name forwarded to ``pywt.cwt`` (the notebook uses ``'morl'``).
    random_state : int or None, optional
        Seed forwarded to ``PCA``. For a matrix with few rows and very many
        columns PCA's default solver selection can pick the randomized SVD,
        which is stochastic; a fixed seed makes the features reproducible.
        Pass ``None`` to restore unseeded behaviour.

    Returns
    -------
    list
        Flat list of ``16 * number_of_signal_columns`` values, ordered signal
        by signal and, within a signal, by increasing scale.
    """
    num=16
    scales=np.arange(1,num+1)
    pca=PCA(n_components=1, random_state=random_state)

    time_sig_df=time_domain_signal(preprocess(pd.read_csv(dfl)))

    wavelet_coeff=[]
    for column in time_sig_df.columns:
        # The second return value of pywt.cwt (the frequencies) is not needed.
        coeff, _ = pywt.cwt(time_sig_df[column],scales,ls)
        # fit_transform yields one value per scale; flatten to 1-D.
        wavelet_coeff.append(pca.fit_transform(coeff).ravel())
    return list(np.concatenate(wavelet_coeff))


In [10]:
# Training data: compute the wavelet feature vector of every recording in
# training_data/, one joblib worker per CPU core, and report the wall time.
a=glob.glob("training_data/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Timing noted from an earlier run: --- 18.011835567156474 Mins ---

--- 27.400479185581208 Mins ---


In [11]:
# One row per training recording; columns are named
# wavelet_coeff_<signal>_<scale> with scales 1..16 cycling fastest.
df_train=pd.DataFrame(result)
df_train.columns=[
    'wavelet_coeff_'+signal_name+'_'+str(scale)
    for signal_name in time_sig_df.columns
    for scale in range(1,17)
]
# measurement_id = file name without the directory prefix and the ".csv" suffix.
df_train['measurement_id']=[path[len('training_data/'):-4] for path in a]
print(df_train.shape)
df_train.head()


(1858, 193)


Unnamed: 0,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,wavelet_coeff_t_body_acc_X_10,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,-8.728989,-9.077652,-9.905225,-10.968815,-11.551044,-11.773246,-10.786782,-7.915179,-3.465994,2.228902,...,-315.521235,-85.702886,175.010258,397.551148,571.209333,639.744727,630.210794,553.187207,439.750086,4d9e9934-8c16-439b-934c-c0f46c0e043a
1,-6.148706,-6.232344,-6.496734,-6.838421,-7.173369,-7.397604,-7.034429,-5.744748,-3.656207,-0.605927,...,-203.852726,-41.455854,147.540362,312.078053,430.06715,475.464942,470.493753,423.124356,354.793381,26f5e58c-ab76-42e1-a252-43ed459a4ad7
2,-1.87272,-2.407007,-3.143185,-3.985342,-4.824523,-5.682523,-5.792414,-4.686366,-2.594112,0.080823,...,-185.967375,-115.813897,-16.610263,88.705109,194.610759,280.322863,340.952769,374.17696,376.401031,33928414-f322-43ca-b2b1-9152c2c72906
3,-12.343531,-12.319537,-12.687153,-13.251718,-13.829986,-14.270227,-13.511195,-10.56584,-5.645731,1.163982,...,-291.657735,-167.524946,2.570003,176.315041,342.387062,468.054549,550.507256,588.16875,579.24971,2a72fb3e-89a1-41cc-a2bd-c8d110fd4b2d
4,-4.527629,-4.564506,-4.75596,-5.060684,-5.345859,-5.576273,-5.399056,-4.501617,-3.010904,-0.808577,...,-87.353188,-47.526797,3.818354,54.143788,102.562496,138.027696,161.322663,171.664031,168.780129,dcb79a9e-e481-4d8c-9ea6-d5bd450386e7


In [12]:
# Ancillary data: same feature extraction as for the training recordings,
# applied to every CSV in ancillary_data/.
a=glob.glob("ancillary_data/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# NOTE(review): the timing below looks copied from the training cell and does
# not match the output recorded for this cell.
#--- 18.011835567156474 Mins ---

--- 5.0709392110506695 Mins ---


In [13]:
# One row per ancillary recording; columns are named
# wavelet_coeff_<signal>_<scale> with scales 1..16 cycling fastest.
df_train2=pd.DataFrame(result)
df_train2.columns=[
    'wavelet_coeff_'+signal_name+'_'+str(scale)
    for signal_name in time_sig_df.columns
    for scale in range(1,17)
]
# measurement_id = file name without the directory prefix and the ".csv" suffix.
df_train2['measurement_id']=[path[len('ancillary_data/'):-4] for path in a]
print(df_train2.shape)
df_train2.head()


(352, 193)


Unnamed: 0,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,wavelet_coeff_t_body_acc_X_10,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,-9.057671,-9.411483,-10.07308,-11.193359,-12.200219,-12.685992,-11.916457,-9.421976,-5.511764,-0.0967,...,-325.628426,-184.093854,6.782731,199.301562,383.790071,520.668275,610.472493,653.212625,645.640361,05b3f5ec-24aa-4558-b19f-7ab82cd3f1d3
1,-5.169666,-5.243607,-5.413162,-5.769564,-6.239277,-6.601242,-6.352178,-5.115594,-3.034811,-0.069207,...,-171.456097,-93.591725,9.523574,112.243162,209.888266,280.758347,325.496113,343.898075,334.558389,248e1db4-0919-4721-b11a-2b979a60ff11
2,-6.091947,-6.371627,-6.856136,-7.692263,-8.610445,-9.236879,-8.796686,-6.927577,-3.975383,0.014738,...,-263.407066,-150.450492,0.832161,151.69557,293.826057,398.474079,463.331727,489.934881,480.118426,eb95eefd-b52c-45dd-a206-7122eeac085f
3,-5.00811,-5.271228,-5.618915,-6.296472,-6.9878,-7.292373,-6.84284,-5.443403,-3.257027,-0.201745,...,-205.789501,-89.99941,52.351533,185.215122,301.82888,374.436443,411.003798,414.146201,386.849864,fae9e929-899a-4fe6-a3de-4fc96364611c
4,-3.12823,-3.366341,-3.589745,-3.862244,-4.172581,-4.426649,-4.305647,-3.551227,-2.193599,-0.232437,...,-216.313684,-128.549121,-6.207456,123.889187,250.25817,349.453703,412.495926,437.124202,421.549216,dcca711d-e105-400a-8389-de6c10bbd209


In [14]:
# Stack training and ancillary features into one table with a fresh 0..n-1
# index. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported equivalent.
Frame = pd.concat([df_train, df_train2], ignore_index=True)

In [16]:
# Sanity check: the row count should equal training rows + ancillary rows.
Frame.shape

(2210, 193)

In [30]:
# Persist the combined training + ancillary features (index column omitted).
Frame.to_csv('cispd_wavelet_training_features.csv',index=False)

### Test data

In [15]:
# Test data: compute the wavelet feature vector of every recording in
# testing_data/, one joblib worker per CPU core.
a=glob.glob("testing_data/*.csv")
# NOTE(review): start_time is set but never read in this cell.
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(i,'morl') for i in a)


# One row per test recording; columns are named wavelet_coeff_<signal>_<scale>
# (scales 1..16), using the signal names held in time_sig_df.columns.
df_test=pd.DataFrame(result)
df_test.columns='wavelet_coeff_'+np.repeat(time_sig_df.columns,16)+'_'+list(np.arange(1,17).astype('str'))*len(time_sig_df.columns)
# measurement_id = file name without the directory prefix and the ".csv" suffix.
df_test['measurement_id']=[item[len('testing_data/'):-4] for item in a]
print(df_test.shape)
df_test.head()


(618, 193)


Unnamed: 0,wavelet_coeff_t_body_acc_X_1,wavelet_coeff_t_body_acc_X_2,wavelet_coeff_t_body_acc_X_3,wavelet_coeff_t_body_acc_X_4,wavelet_coeff_t_body_acc_X_5,wavelet_coeff_t_body_acc_X_6,wavelet_coeff_t_body_acc_X_7,wavelet_coeff_t_body_acc_X_8,wavelet_coeff_t_body_acc_X_9,wavelet_coeff_t_body_acc_X_10,...,wavelet_coeff_t_body_acc_jerk_mag_8,wavelet_coeff_t_body_acc_jerk_mag_9,wavelet_coeff_t_body_acc_jerk_mag_10,wavelet_coeff_t_body_acc_jerk_mag_11,wavelet_coeff_t_body_acc_jerk_mag_12,wavelet_coeff_t_body_acc_jerk_mag_13,wavelet_coeff_t_body_acc_jerk_mag_14,wavelet_coeff_t_body_acc_jerk_mag_15,wavelet_coeff_t_body_acc_jerk_mag_16,measurement_id
0,-10.350184,-10.404952,-10.731128,-11.099379,-11.483486,-11.869614,-11.503612,-9.646242,-6.43708,-1.574983,...,-357.023521,-199.705458,14.697686,234.683648,440.760393,593.919622,690.706637,728.91811,710.391191,4bcb3a76-9009-4afd-ad05-1f472e142dbe
1,-12.462294,-12.365734,-12.705732,-13.160676,-13.620218,-14.277896,-14.408686,-13.014295,-10.960797,-7.130076,...,-247.479128,-173.54911,-62.780933,56.973397,186.801988,309.73227,422.452387,529.682483,622.49429,c5958651-8695-4d6b-b8c5-9eb6c6866078
2,-22.315683,-22.66003,-23.813566,-25.186162,-26.169698,-26.990466,-26.189024,-22.310149,-15.833186,-5.766334,...,-908.621019,-540.572223,-20.810379,532.46193,1064.769352,1480.172684,1755.916215,1883.353551,1862.466849,7598e18a-79c2-4936-a951-b9aedae10d31
3,-3.545312,-4.292199,-5.071036,-5.238753,-5.243854,-4.997,-4.423231,-3.485693,-2.077015,-0.052203,...,-196.539844,-36.200898,146.737925,301.002133,415.101339,464.538271,469.004597,434.959311,371.42682,b31841bf-dce6-4cb1-9389-2b3d2810a9ea
4,-2.715656,-2.770242,-2.922428,-3.144839,-3.370979,-3.458592,-3.046677,-1.950381,-0.314584,1.634765,...,-77.340202,-44.53089,0.854659,48.120785,93.260253,127.702423,151.015885,164.180488,166.357062,3f2382de-6aba-49e1-b4c2-fc4f0498e9ca


In [17]:
# Persist the test-set features (index column omitted).
df_test.to_csv('cispd_wavelet_testing_features.csv',index=False)