In [None]:
import pandas as pd
import numpy as np
import pywt
import matplotlib.pyplot as plt
from scipy import interpolate
import glob
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

from sklearn.decomposition import PCA
import time
from joblib import Parallel, delayed
import multiprocessing

from sklearn.model_selection import train_test_split
import lightgbm as lgb
from catboost import CatBoostRegressor, FeaturesData, Pool
from sklearn.metrics import mean_squared_error

In [None]:
#==========================================================================================
#making time stamp uniform by Interpolation

def preprocess(data):
    """Resample a raw tri-axial recording onto a uniform 50 Hz time grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Timestamp', 'X', 'Y' and 'Z'. Timestamps are
        presumably in seconds (the 1/50 step only means 50 Hz in that unit --
        TODO confirm). Cubic interpolation needs at least four samples.

    Returns
    -------
    pandas.DataFrame
        Columns 'Timestamp', 'acc_X', 'acc_Y', 'acc_Z', sampled every 1/50 from
        the first timestamp up to (but excluding) the last one.
    """
    freq=50
    step=1/float(freq)  # 0.02, derived from the sampling rate instead of hard-coded
    ls=['X','Y','Z']
    # Work on plain arrays so the first/last sample is taken by position;
    # label lookups such as data.Timestamp[0] fail on a non-default index.
    timestamps=data['Timestamp'].to_numpy()
    t1=np.arange(timestamps[0],timestamps[-1],step)
    df=pd.DataFrame({'Timestamp':t1})
    for i in ls:
        fcubic = interpolate.interp1d(timestamps, data[i].to_numpy(), kind='cubic')
        df['acc_'+i]=fcubic(t1)
    return df

#==========================================================================================
#median filter
from scipy.signal import medfilt # import the median filter function
def median(signal):  # input: 1D numpy array (one column)
    """Smooth a 1D signal with a median filter of kernel size 3.

    The input is first converted to a numpy array; the filtered array has the
    same length as the input.
    """
    samples = np.array(signal)
    filtered = medfilt(samples, kernel_size=3)
    return filtered


#==========================================================================================
#components_selection_one_signal
import math # import math library


def components_selection_one_signal(t_signal, sampling_freq=50, freq1=0.3, freq2=20):
    """Split a 1D time signal into frequency bands with an FFT mask.

    The signal is transformed with ``np.fft.fft``, its spectrum is partitioned
    by absolute frequency into three disjoint bands, and each band is
    transformed back with ``np.fft.ifft`` (real part kept):

    * DC / low band : ``|f| <= freq1``
    * body band     : ``freq1 < |f| <= freq2``
    * noise band    : ``|f| > freq2``

    Parameters
    ----------
    t_signal : array-like, 1D
        Samples of the time-domain signal.
    sampling_freq : float, optional
        Sampling rate in Hz used to label the FFT bins (default 50).
    freq1 : float, optional
        Upper edge of the DC band in Hz (default 0.3).
    freq2 : float, optional
        Upper edge of the body band in Hz (default 20).

    Returns
    -------
    tuple of numpy.ndarray
        ``(total_component, t_DC_component, t_body_component, t_noise)`` where
        ``total_component`` is the input minus the noise band.
    """
    t_signal=np.asarray(t_signal)

    # Spectrum and the absolute frequency (Hz) attached to every FFT bin.
    f_signal=np.fft.fft(t_signal)
    freqs=np.abs(np.fft.fftfreq(len(t_signal), d=1/float(sampling_freq)))

    # Disjoint boolean masks; every bin belongs to exactly one band.
    dc_band=freqs<=freq1
    noise_band=freqs>freq2
    body_band=~(dc_band | noise_band)

    # Zero everything outside a band, then go back to the time domain.
    t_DC_component=np.fft.ifft(np.where(dc_band,f_signal,0)).real
    t_body_component=np.fft.ifft(np.where(body_band,f_signal,0)).real
    t_noise=np.fft.ifft(np.where(noise_band,f_signal,0)).real

    # Noise-free signal: original minus the high-frequency band.
    total_component=t_signal-t_noise

    return (total_component,t_DC_component,t_body_component,t_noise)


#=================================================================================================================
#Define verify gravity function
def mag_3_signals(df):  # Euclidean magnitude
    """Return the row-wise Euclidean norm of ``df`` as a 1D numpy array.

    ``df`` is a 2D table (DataFrame or array) whose columns are the axial
    components; each output element is sqrt(sum of squares) of one row.
    """
    sum_of_squares = np.square(df).sum(axis=1)
    magnitude = np.sqrt(sum_of_squares)
    return np.array(magnitude)

def verify_gravity(data):
    """Print the mean magnitude of the low-frequency (gravity) acceleration.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'acc_X', 'acc_Y' and 'acc_Z'.

    For each axis the DC component (second element returned by
    ``components_selection_one_signal``) is extracted, the three components are
    combined into a per-sample Euclidean magnitude, and its mean is printed
    rounded to three decimals. Nothing is returned.
    """
    # Gravity (DC) component of every axis, in X, Y, Z order.
    grav_components=[components_selection_one_signal(np.array(data[axis]))[1]
                     for axis in ('acc_X','acc_Y','acc_Z')]

    # mag_3_signals takes ONE 2D table with the axes as columns; passing the
    # three arrays as separate positional arguments raises a TypeError.
    grav_acc_mag=mag_3_signals(np.column_stack(grav_components))
    print('mean value = ',round((sum(grav_acc_mag) / len(grav_acc_mag)),3),' g')
    
#=================================================================================================================    
#Define jerking and magnitude functions
def jerk_one_signal(signal):
    """Forward-difference derivative of a 1D signal with a 0.02 time step.

    Element ``k`` of the result is ``(signal[k+1] - signal[k]) / 0.02``. Rows
    that come out as NaN (always the last one, which has no successor) are
    dropped, so the output is one sample shorter than a NaN-free input.
    """
    frame = pd.DataFrame(signal)
    forward_diff = frame.shift(-1) - frame
    jerk = forward_diff / 0.02
    valid_rows = np.array(jerk.dropna())
    return valid_rows.T[0]


## Id label

In [None]:
# Load the ID / label tables shipped with the data set, one DataFrame per
# cohort (CIS-PD, REAL-PD) and per split. Paths are relative to the notebook's
# working directory. None of these frames is referenced by the cells visible
# below -- presumably they are consumed elsewhere; verify before removing.

# Test Data (IDs only, judging by the file names)
cis_pd_testing_id=pd.read_csv('test_data_Id/cis-pd.CIS-PD_Test_Data_IDs.csv')
real_pd_testing_id=pd.read_csv('test_data_Id/real-pd.REAL-PD_Test_Data_IDs.csv')

# Training Data (IDs with labels, judging by the file names)
cis_pd_training_id=pd.read_csv('data_labels/CIS-PD_Training_Data_IDs_Labels.csv')
real_pd_training_id=pd.read_csv('data_labels/REAL-PD_Training_Data_IDs_Labels.csv')

# Ancillary Data (IDs with labels, judging by the file names)
cis_pd_ancillary_id=pd.read_csv('data_labels/CIS-PD_Ancillary_Data_IDs_Labels.csv')
real_pd_ancillary_id=pd.read_csv('data_labels/REAL-PD_Ancillary_Data_IDs_Labels.csv')

In [None]:
def time_domain_signal(data):
    """Derive the time-domain body / gravity / jerk signals and their magnitudes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'acc_X', 'acc_Y' and 'acc_Z'.

    Returns
    -------
    pandas.DataFrame
        Twelve columns, in this order:
        't_body_acc_{X,Y,Z}', 't_grav_acc_{X,Y,Z}', 't_body_acc_jerk_{X,Y,Z}',
        then 't_body_acc_mag', 't_grav_acc_mag', 't_body_acc_jerk_mag'.
        The body and gravity signals drop their last sample so they line up
        with the jerk signal, which is one sample shorter (forward difference).
    """
    # One dict per signal family so the 9 axial columns come out already
    # grouped (body, gravity, jerk). The frame is built once, in final order,
    # instead of being reordered and then assigned into, which triggers
    # SettingWithCopyWarning on older pandas.
    body_cols, grav_cols, jerk_cols = {}, {}, {}
    for column in ['acc_X','acc_Y','acc_Z']:
        t_signal=np.array(data[column])
        # NOTE: the median pre-filter is intentionally not applied here; the
        # raw (resampled) signal goes straight into the band split.
        _,grav_acc,body_acc,_=components_selection_one_signal(t_signal)
        body_acc_jerk=jerk_one_signal(body_acc)
        body_cols['t_body_'+column]=body_acc[:-1]
        grav_cols['t_grav_'+column]=grav_acc[:-1]
        jerk_cols['t_body_acc_jerk_'+column[-1]]=body_acc_jerk

    time_sig_df=pd.DataFrame({**body_cols, **grav_cols, **jerk_cols})

    # Magnitude features: one Euclidean norm per 3-axis family. The column name
    # is the family's X column with the trailing axis letter replaced by 'mag'.
    for family in (body_cols, grav_cols, jerk_cols):
        axis_names=list(family)
        mag_col_name=axis_names[0][:-1]+'mag'
        time_sig_df[mag_col_name]=mag_3_signals(time_sig_df[axis_names])

    return time_sig_df

In [None]:
# Run the pipeline once on a single training recording. In the cells below,
# only `time_sig_df.columns` is used (to name the wavelet feature columns), so
# this cell must be executed before the feature frames are built.
# NOTE(review): a[14] is the 15th path in whatever order glob returns, so it
# needs at least 15 files and the chosen file is not guaranteed to be stable.
a=glob.glob("training_data/*.csv")
data=preprocess(pd.read_csv(a[14]))
time_sig_df=time_domain_signal(data)

In [None]:
def wavelet(dfl,ls):
    """Compute PCA-compressed continuous-wavelet features for one recording.

    Parameters
    ----------
    dfl : str
        Path of the CSV file to read; it is resampled with ``preprocess`` and
        expanded with ``time_domain_signal``.
    ls : str
        Wavelet name handed to ``pywt.cwt`` (the notebook passes 'morl').

    Returns
    -------
    list
        Flat list of ``len(time_sig_df.columns) * 16`` values: for every signal
        column, in column order, one first-principal-component score for each
        of the 16 scales (1..16).
    """
    num=16
    scales=np.arange(1,num+1)
    # Fixed seed: with svd_solver='auto' scikit-learn may choose the randomized
    # solver for wide inputs, which would make the features vary between runs.
    pca=PCA(n_components=1, random_state=0)
    time_sig_df=time_domain_signal(preprocess(pd.read_csv(dfl)))
    wavelet_coeff=[]
    for column in time_sig_df.columns:
        # coeff has one row per scale; the frequencies returned by cwt are not needed.
        coeff,_=pywt.cwt(time_sig_df[column],scales,ls)
        # Each scale row is treated as a sample and reduced to a single score.
        wavelet_coeff.append(pca.fit_transform(coeff).flatten())
    return list(np.concatenate(wavelet_coeff))


In [None]:
# Training data: compute the wavelet feature vector of every training CSV in
# parallel, one joblib task per file, using all available CPU cores.
# `a` (file list) and `result` (one feature list per file, same order as `a`)
# are read by the next cell, so the two cells must be run back to back.
a=glob.glob("training_data/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))
# Elapsed time recorded from an earlier run:
#--- 18.011835567156474 Mins ---

In [None]:
# One row per training file, one column per (signal, scale) pair.
df_train=pd.DataFrame(result)
# Column names follow the layout returned by wavelet(): every signal column of
# time_sig_df is repeated 16 times and paired with the scale index 1..16,
# giving 'wavelet_coeff_<signal>_<scale>'. Requires `time_sig_df` from the
# sample cell above; the literal 16 must match `num` inside wavelet().
df_train.columns='wavelet_coeff_'+np.repeat(time_sig_df.columns,16)+'_'+list(np.arange(1,17).astype('str'))*len(time_sig_df.columns)
# measurement_id = file name without the 'training_data/' prefix and the
# 4-character '.csv' suffix; `a` must still be the training file list.
df_train['measurement_id']=[item[len('training_data/'):-4] for item in a]
print(df_train.shape)
df_train.head()

In [None]:
# Ancillary data: same parallel wavelet feature extraction as for the training
# files. This overwrites `a` and `result`, so the training frame must already
# have been built from the previous values.
a=glob.glob("ancillary_data/*.csv")
start_time = time.time()
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(i,'morl') for i in a)
print("--- %s Mins ---" % ((time.time() - start_time)/60))

In [None]:
# One row per ancillary file, one column per (signal, scale) pair.
df_train2=pd.DataFrame(result)
# Same 'wavelet_coeff_<signal>_<scale>' naming as the training frame: each
# signal column of time_sig_df repeated 16 times, paired with scales 1..16.
df_train2.columns='wavelet_coeff_'+np.repeat(time_sig_df.columns,16)+'_'+list(np.arange(1,17).astype('str'))*len(time_sig_df.columns)
# measurement_id = file name without the 'ancillary_data/' prefix and '.csv';
# `a` must still be the ancillary file list.
df_train2['measurement_id']=[item[len('ancillary_data/'):-4] for item in a]
print(df_train2.shape)
df_train2.head()

In [None]:
# Stack the training and ancillary feature frames into one table with a fresh
# 0..n-1 index. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
Frame = pd.concat([df_train, df_train2], ignore_index=True)

In [None]:
# Export the combined (training + ancillary) CIS-PD wavelet features
# ("part 2" features) to CSV, without the DataFrame index.
Frame.to_csv('cispd_wavelet_training_features.csv',index=False)

### test data

In [None]:
# Test data: compute the wavelet feature vector of every testing CSV in
# parallel, then build the feature frame in the same cell.
a=glob.glob("testing_data/*.csv")
start_time = time.time()  # NOTE(review): set but never reported in this cell
num_cores = multiprocessing.cpu_count()
result=Parallel(n_jobs=num_cores)(delayed(wavelet)(i,'morl') for i in a)


# One row per testing file; same 'wavelet_coeff_<signal>_<scale>' naming as the
# training frame (requires `time_sig_df` from the sample cell above).
df_test=pd.DataFrame(result)
df_test.columns='wavelet_coeff_'+np.repeat(time_sig_df.columns,16)+'_'+list(np.arange(1,17).astype('str'))*len(time_sig_df.columns)
# measurement_id = file name without the 'testing_data/' prefix and '.csv'.
df_test['measurement_id']=[item[len('testing_data/'):-4] for item in a]
print(df_test.shape)
df_test.head()

In [None]:
# Export the CIS-PD wavelet features ("part 2" features) of the testing files
# to CSV, without the DataFrame index.
df_test.to_csv('cispd_wavelet_testing_features.csv',index=False)