In [1]:
# Import Libraries:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
#import random
#import csv
from scipy import stats
import matplotlib.pyplot as plt
import glob
#from sklearn.linear_model import LogisticRegression
#from sklearn.feature_selection import mutual_info_classif
#from sklearn.metrics import average_precision_score
#from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
#import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans

pd.set_option('use_inf_as_na', True)

#TEMPORARY!!!! RM!
np.seterr(divide='ignore', invalid='ignore')
#TEMPORARY!!!! RM!


####################################################################
########              PARAMETERS & CONSTANTS:               ########
####################################################################
gg = 9.806

sampleRateTDCSFOG = 128     #per second
sampleRateDEFOG   = 100     #per second

batchesTDCS = 100
batchesDEFOG = 1000

deadRelTimeStart = 0.05
deadRelTimeEnd   = 0.95

dummyVariable = 9 #ignore



In [2]:
#A low pass filter to remove high frequency noise.
def lowPassFilter(kArr, freqArr, cutOffFreq):
    for i in range(0,len(freqArr)):
        if freqArr[i] > cutOffFreq:
            kArr.real[i] = 0; 
            kArr.imag[i] = 0;
    return kArr


#A high pass filter to analyze only high frequencies.  
def highPassFilter(kArr, freqArr, cutOffFreq):
    for i in range(0,len(freqArr)):
        if freqArr[i] < cutOffFreq:
            kArr.real[i] = 0;
            kArr.imag[i] = 0;
    return kArr


#A quick FFT where W can be x, y, z accelerations etc.
def quickFFT(inputT, inputW, sampleRate, filterType, cutOff):
    kspaceData = np.fft.rfft(inputW)
    freq = np.fft.rfftfreq(inputT.shape[-1], d=1.0/sampleRate)
    if filterType == "low":
        filteredData = lowPassFilter(kspaceData, freq, cutOff)
    elif filterType == "high":
        filteredData = highPassFilter(kspaceData, freq, cutOff)
    else:
        filteredData = kspaceData
    outputW = np.fft.irfft(filteredData, len(inputW))
    return outputW


#A quick FFT where W can be x, y, z accelerations etc. (returns k-space)
def quickFFT_k(inputT, inputW, sampleRate, filterType, cutOff):
    kspaceData = np.fft.rfft(inputW)
    freq = np.fft.rfftfreq(inputT.shape[-1], d=1.0/sampleRate)
    if filterType == "low":
        filteredData = lowPassFilter(kspaceData, freq, cutOff)
    elif filterType == "high":
        filteredData = highPassFilter(kspaceData, freq, cutOff)
    else:
        filteredData = kspaceData
    return freq, filteredData


In [3]:
def gen1featuresTDCS(time, aML, aAP, aVert, sampleRate, cutOff, numBatches):
    numPoints = len(time)
    windowSize = numPoints // numBatches
    #ex: 4133 // 100 = 41
    f1 = [] #note if statement and numpointstoadd
    f2 = []
    f3 = []
    for i in range(0, numBatches):
        t_Clip = time.loc[i*windowSize:(i+1)*windowSize]
        aML_Clip = aML.loc[i*windowSize:(i+1)*windowSize]
        aAP_Clip = aAP.loc[i*windowSize:(i+1)*windowSize]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        angles = np.angle(ampsAP)
        meanWeightedAngle = np.mean(angles*abs(ampsAP))
        f1_tmp = np.std(angles) #phaseSpread
        freqML, ampsML = quickFFT_k(t_Clip, aML_Clip, sampleRate, "none", cutOff)
        lowfampsAP = ampsAP[(freqAP > 0.5)*(freqAP < 3)]
        highfampsML = ampsML[(freqML > 3)*(freqML < 8)]
        lowf_ampWeightedFreqAP_tmp = np.mean(abs(lowfampsAP)*freqAP[(freqAP > 0.5)*(freqAP < 3)])
        highf_ampWeightedFreqML_tmp = np.mean(abs(highfampsML)*freqML[(freqML > 3)*(freqML < 8)])
        f2_tmp = highf_ampWeightedFreqML_tmp/(1+lowf_ampWeightedFreqAP_tmp)
        f1.extend([f1_tmp]*(windowSize))
        f2.extend([f2_tmp]*(windowSize))
        f3.extend([meanWeightedAngle]*(windowSize)) #meanAmpWeightedPhase
    if len(f1) < numPoints:
        numPointsToAdd = numPoints - len(f1)
        t_Clip = time.loc[numPoints - numPointsToAdd:numPoints-1]
        aML_Clip = aML.loc[numPoints - numPointsToAdd:numPoints-1]
        aAP_Clip = aAP.loc[numPoints - numPointsToAdd:numPoints-1]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        angles = np.angle(ampsAP)
        meanWeightedAngle = np.mean(angles*abs(ampsAP))
        f1_tmp = np.std(angles) #phaseSpread
        freqML, ampsML = quickFFT_k(t_Clip, aML_Clip, sampleRate, "none", cutOff)
        lowfampsAP = ampsAP[(freqAP > 0.5)*(freqAP < 3)]
        highfampsML = ampsML[(freqML > 3)*(freqML < 8)]
        lowf_ampWeightedFreqAP_tmp = np.mean(abs(lowfampsAP)*freqAP[(freqAP > 0.5)*(freqAP < 3)])
        highf_ampWeightedFreqML_tmp = np.mean(abs(highfampsML)*freqML[(freqML > 3)*(freqML < 8)])
        f2_tmp = highf_ampWeightedFreqML_tmp/(1+lowf_ampWeightedFreqAP_tmp)
        f1.extend([f1_tmp]*(numPointsToAdd))
        f2.extend([f2_tmp]*(numPointsToAdd))
        f3.extend([meanWeightedAngle]*(numPointsToAdd)) #meanAmpWeightedPhase
    return f1, f2, f3


def gen1featuresDEFOG(time, aML, aAP, aVert, sampleRate, cutOff, numBatches):
    numPoints = len(time)
    windowSize = numPoints // numBatches
    #ex: 4133 // 100 = 41
    f1 = [] #note if statement and numpointstoadd #phase spread
    f2 = []     #phaseExcursion
    f3 = []     #meanAmpWeightedPhase
    f4 = []     #highfMLbylowfAP
    f5 = []     #max_aAP 
    f6 = []     #max_aVert
    f7 = []     #iqrRatio_ML_Vert
    for i in range(0, numBatches):
        t_Clip = time.loc[i*windowSize:(i+1)*windowSize]
        aML_Clip = aML.loc[i*windowSize:(i+1)*windowSize]
        aAP_Clip = aAP.loc[i*windowSize:(i+1)*windowSize]
        aVert_Clip = aVert.loc[i*windowSize:(i+1)*windowSize]
        #aTOT_Clip = aTOT.loc[i*windowSize:(i+1)*windowSize]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        angles = np.angle(ampsAP)
        meanWeightedAngle = np.nanmean(angles*abs(ampsAP))
        f1_tmp = np.std(angles) #phaseStd
        f2_tmp = np.sum(abs(angles - meanWeightedAngle)*abs(ampsAP)) #weightedPhase
        freqML, ampsML = quickFFT_k(t_Clip, aML_Clip, sampleRate, "none", cutOff)
        lowfampsAP = ampsAP[(freqAP > 0.5)*(freqAP < 3)]
        highfampsML = ampsML[(freqML > 3)*(freqML < 8)]
        lowf_ampWeightedFreqAP_tmp = np.mean(abs(lowfampsAP)*freqAP[(freqAP > 0.5)*(freqAP < 3)])
        highf_ampWeightedFreqML_tmp = np.mean(abs(highfampsML)*freqML[(freqML > 3)*(freqML < 8)])
        f4_tmp = highf_ampWeightedFreqML_tmp/(1+lowf_ampWeightedFreqAP_tmp)
        f5_tmp = np.amax(aAP_Clip)
        f6_tmp = np.amax(aVert_Clip)
        #iqr ratio metric
        q75ML, q25ML = np.percentile(aML_Clip, [75 ,25])
        iqr_ML = q75ML - q25ML
        q75Vert, q25Vert = np.percentile(aVert_Clip, [75 ,25])
        iqr_Vert = q75Vert - q25Vert
        f7_tmp = iqr_ML/(1+iqr_Vert)
        f1.extend([f1_tmp]*(windowSize))
        f2.extend([f2_tmp]*(windowSize))
        f3.extend([meanWeightedAngle]*(windowSize))
        f4.extend([f4_tmp]*(windowSize))
        f5.extend([f5_tmp]*(windowSize))
        f6.extend([f6_tmp]*(windowSize))
        f7.extend([f7_tmp]*(windowSize))
    if len(f1) < numPoints:
        numPointsToAdd = numPoints - len(f1)
        t_Clip = time.loc[numPoints - numPointsToAdd:numPoints-1]
        aML_Clip = aML.loc[numPoints - numPointsToAdd:numPoints-1]
        aAP_Clip = aAP.loc[numPoints - numPointsToAdd:numPoints-1]
        aVert_Clip = aVert.loc[numPoints - numPointsToAdd:numPoints-1]
        #aTOT_Clip = aTOT.loc[numPoints - numPointsToAdd:numPoints-1]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        angles = np.angle(ampsAP)
        meanWeightedAngle = np.nanmean(angles*abs(ampsAP))
        f1_tmp = np.std(angles) #phaseStd
        f2_tmp = np.sum(abs(angles - meanWeightedAngle)*abs(ampsAP)) #weightedPhase
        freqML, ampsML = quickFFT_k(t_Clip, aML_Clip, sampleRate, "none", cutOff)
        lowfampsAP = ampsAP[(freqAP > 0.5)*(freqAP < 3)]
        highfampsML = ampsML[(freqML > 3)*(freqML < 8)]
        lowf_ampWeightedFreqAP_tmp = np.mean(abs(lowfampsAP)*freqAP[(freqAP > 0.5)*(freqAP < 3)])
        highf_ampWeightedFreqML_tmp = np.mean(abs(highfampsML)*freqML[(freqML > 3)*(freqML < 8)])
        f4_tmp = highf_ampWeightedFreqML_tmp/(1+lowf_ampWeightedFreqAP_tmp)
        f5_tmp = np.amax(aAP_Clip)
        f6_tmp = np.amax(aVert_Clip)
        #iqr ratio metric
        q75ML, q25ML = np.percentile(aML_Clip, [75 ,25])
        iqr_ML = q75ML - q25ML
        q75Vert, q25Vert = np.percentile(aVert_Clip, [75 ,25])
        iqr_Vert = q75Vert - q25Vert
        f7_tmp = iqr_ML/(1+iqr_Vert)
        f1.extend([f1_tmp]*(numPointsToAdd))
        f2.extend([f2_tmp]*(numPointsToAdd))
        f3.extend([meanWeightedAngle]*(numPointsToAdd))
        f4.extend([f4_tmp]*(numPointsToAdd))
        f5.extend([f5_tmp]*(numPointsToAdd))
        f6.extend([f6_tmp]*(numPointsToAdd))
        f7.extend([f7_tmp]*(numPointsToAdd))
    return f1, f2, f3, f4, f5, f6, f7
    



def engineerFeatures5(time, aML, aAP, aVert, sampleRate, cutOff, numBatches):
    numPoints = len(time)
    windowSize = numPoints // numBatches
    #ex: 4133 // 100 = 41
    f1 = [] #note if statement and numpointstoadd
    f2 = []
    f3 = []
    for i in range(0, numBatches):
        t_Clip = time.loc[i*windowSize:(i+1)*windowSize]
        aAP_Clip = aAP.loc[i*windowSize:(i+1)*windowSize]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        meanWeightedFreq = np.mean(freqAP*abs(ampsAP))
        f1_tmp = np.mean(abs(ampsAP[freqAP>3])*abs(ampsAP[freqAP>3]))
        f2_tmp = np.sum(abs(freqAP - meanWeightedFreq)*abs(ampsAP)) #weightedFreqExcursionAP
        f1.extend([f1_tmp]*(windowSize))
        f2.extend([f2_tmp]*(windowSize))
        f3.extend([meanWeightedFreq]*(windowSize))
    if len(f1) < numPoints:
        numPointsToAdd = numPoints - len(f1)
        t_Clip = time.loc[numPoints - numPointsToAdd:numPoints-1]
        aAP_Clip = aAP.loc[numPoints - numPointsToAdd:numPoints-1]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        meanWeightedFreq = np.mean(freqAP*abs(ampsAP))
        f1_tmp = np.mean(abs(ampsAP[freqAP>3])*abs(ampsAP[freqAP>3]))
        f2_tmp = np.sum(abs(freqAP - meanWeightedFreq)*abs(ampsAP)) #weightedFreqExcursionAP
        f1.extend([f1_tmp]*(numPointsToAdd))
        f2.extend([f2_tmp]*(numPointsToAdd))
        f3.extend([meanWeightedFreq]*(numPointsToAdd))
    return f1, f2, f3

In [4]:
#seglearn fxns
def seglearn_fxns(whichFeature, time, inputW, sampleRate, cutOff, numBatches):
    numPoints = len(time)
    windowSize = numPoints // numBatches
    ret = []
    for i in range(0, numBatches):
        t_Clip = time.loc[i*windowSize:(i+1)*windowSize]
        inputW_Clip = inputW.loc[i*windowSize:(i+1)*windowSize]
        if whichFeature == 0:
            ret.extend([np.mean(inputW_Clip)]*(windowSize))
        elif whichFeature == 1:
            ret.extend([np.median(inputW_Clip)]*(windowSize))
        elif whichFeature == 2:
            ret.extend([stats.gmean(np.abs(inputW_Clip))]*(windowSize))
        elif whichFeature == 3:
            ret.extend([stats.hmean(np.abs(inputW_Clip))]*(windowSize))
        elif whichFeature == 4:
            ret.extend([np.sum(inputW_Clip)]*(windowSize))
        elif whichFeature == 5:
            ret.extend([np.sum(np.abs(inputW_Clip))]*(windowSize))
        elif whichFeature == 6:
            ret.extend([np.sum(inputW_Clip*inputW_Clip)]*(windowSize))
        elif whichFeature == 7:
            ret.extend([stats.skew(inputW_Clip)]*(windowSize))
        elif whichFeature == 8:
            ret.extend([stats.kurtosis(inputW_Clip)]*(windowSize))
        elif whichFeature == 9:
            ret.extend([np.mean(np.diff(inputW_Clip))]*(windowSize))
        elif whichFeature == 10:
            ret.extend([np.mean(np.abs(np.diff(inputW_Clip)))]*(windowSize))
        elif whichFeature == 11:
            ret.extend([np.mean(np.square(np.abs(np.fft.fft(inputW_Clip))))]*(windowSize))
        elif whichFeature == 12:
            #zero crossings about mean
            meanW = np.mean(inputW)
            signW = np.heaviside((inputW - meanW), 1) - 0.5
            ret.extend([np.sum(np.abs(np.diff(signW)))]*(windowSize))
        elif whichFeature == 13:
            difArr = np.diff(inputW_Clip)
            signDifW = np.heaviside(difArr, 1) - 0.5
            ret.extend([np.sum(np.abs(np.diff(signDifW)))]*(windowSize))
        elif whichFeature == 14:
            ret.extend([np.sum(np.abs(np.diff(inputW_Clip)))]*(windowSize))
    if len(ret) < numPoints:
        numPointsToAdd = numPoints - len(ret)
        t_Clip = time.loc[numPoints - numPointsToAdd:numPoints-1]
        inputW_Clip = inputW.loc[numPoints - numPointsToAdd:numPoints-1]
        if whichFeature == 0:
            ret.extend([np.mean(inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 1:
            ret.extend([np.median(inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 2:
            ret.extend([stats.gmean(np.abs(inputW_Clip))]*(numPointsToAdd))
        elif whichFeature == 3:
            ret.extend([stats.hmean(np.abs(inputW_Clip))]*(numPointsToAdd))
        elif whichFeature == 4:
            ret.extend([np.sum(inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 5:
            ret.extend([np.sum(np.abs(inputW_Clip))]*(numPointsToAdd))
        elif whichFeature == 6:
            ret.extend([np.sum(inputW_Clip*inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 7:
            ret.extend([stats.skew(inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 8:
            ret.extend([stats.kurtosis(inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 9:
            ret.extend([np.mean(np.diff(inputW_Clip))]*(numPointsToAdd))
        elif whichFeature == 10:
            ret.extend([np.mean(np.abs(np.diff(inputW_Clip)))]*(numPointsToAdd))
        elif whichFeature == 11:
            ret.extend([np.mean(np.square(np.abs(np.fft.fft(inputW_Clip))))]*(numPointsToAdd))
        elif whichFeature == 12:
            #zero crossings about mean
            meanW = np.mean(inputW)
            signW = np.heaviside((inputW - meanW), 1) - 0.5
            ret.extend([np.sum(np.abs(np.diff(signW)))]*(numPointsToAdd))
        elif whichFeature == 13:
            difArr = np.diff(inputW_Clip)
            signDifW = np.heaviside(difArr, 1) - 0.5
            ret.extend([np.sum(np.abs(np.diff(signDifW)))]*(numPointsToAdd))
        elif whichFeature == 14:
            ret.extend([np.sum(np.abs(np.diff(inputW_Clip)))]*(numPointsToAdd))
    return ret

In [5]:
metadataTDCSFOG_path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/tdcsfog_metadata.csv'
metadataDEFOG_path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/defog_metadata.csv'
subjects_path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/subjects.csv'



df_TDCSmetadata = pd.read_csv(metadataTDCSFOG_path)
#df_TDCSmetadata = df_TDCSmetadata_tmp.drop(columns=['Test'])
one_hot = pd.get_dummies(df_TDCSmetadata['Medication']).drop(columns=['off'])
df_TDCSmetadata['Medication'] = one_hot #one if medication is 'on'
#print(df_TDCSmetadata.head())



df_DEFOGmetadata = pd.read_csv(metadataDEFOG_path)
one_hot = pd.get_dummies(df_DEFOGmetadata['Medication']).drop(columns=['off'])
df_DEFOGmetadata['Medication'] = one_hot #one if medication is 'on'
#print(df_DEFOGmetadata.head())

df_subjects = pd.read_csv(subjects_path)
one_hot = pd.get_dummies(df_subjects['Sex']).drop(columns=['F'])
df_subjects['Sex'] = one_hot #one if medication is 'on'
#print(df_subjects.head())



In [6]:
TDCS_Train_Path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/'
DEFOG_Train_Path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog/'

###################TDCS####################
TDCS_Files = glob.glob(os.path.join(TDCS_Train_Path,"*.csv"))

dfList_TDCS = []
for f in TDCS_Files:
    data = pd.read_csv(f)
    data['Id'] = f[-14:-4]
    data['meanHighFreqPowerAP'], data['weightedFreqExcursionAP'], data['meanWeightedFreqAP'] = engineerFeatures5(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['MLmAP'] = data.AccML - data.AccAP
    data['phaseSpread'], data['highfMLbylowfAP'], data['meanAmpWeightedPhase'] = gen1featuresTDCS(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['APmVert'] = data.AccAP - data.AccV
    data['t_rel'] = data.Time/np.amax(data.Time)
    data['slopeSignChangeAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['slopeSignChangeML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['slopeSignChangeVert'] = seglearn_fxns(13, data.Time, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumML'] = seglearn_fxns(4, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumAbsDiffAP'] = seglearn_fxns(14, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumAbsDiffVert'] = seglearn_fxns(14, data.Time, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['meanML'] = seglearn_fxns(0, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['meanAP'] = seglearn_fxns(0, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['kurtAP'] = seglearn_fxns(8, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['skewML'] = seglearn_fxns(7, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['zeroXML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['zeroXAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    dfList_TDCS.append(data)


df_TDCS = pd.concat(dfList_TDCS, ignore_index=True)
df_TDCS.drop(columns=['Time', 'AccV', 'AccML', 'AccAP'], inplace=True)

###################DEFOG####################
DEFOG_Files = glob.glob(os.path.join(DEFOG_Train_Path,"*.csv"))

dfList_DEFOG = []
for f in DEFOG_Files:
    data = pd.read_csv(f)
    data['Id'] = f[-14:-4]
    data['phaseSpread'], data['phaseExcursion'], data['meanAmpWeightedPhase'], data['highfMLbylowfAP'], data['max_aAP'], data['max_aVert'], data['iqrRatio_ML_Vert'] = gen1featuresDEFOG(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['t_rel'] = data.Time/np.amax(data.Time)
    data['slopeSignChangeAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['slopeSignChangeML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['sumAbsDiffAP'] = seglearn_fxns(14, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['sumAbsDiffVert'] = seglearn_fxns(14, data.Time, data.AccV, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['meanML'] = seglearn_fxns(0, data.Time, data.AccML, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['meanAP'] = seglearn_fxns(0, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['kurtAP'] = seglearn_fxns(8, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['skewML'] = seglearn_fxns(7, data.Time, data.AccML, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['zeroXML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['zeroXAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    dfList_DEFOG.append(data)


df_DEFOG = pd.concat(dfList_DEFOG, ignore_index=True)
df_DEFOG.drop(columns =['Time', 'AccV', 'AccML', 'AccAP'], inplace=True)

  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [7]:
###################TDCS####################
left_TDCS = df_TDCS.set_index('Id')
right_TDCS = df_TDCSmetadata.set_index('Id')
df_new_TDCS = left_TDCS.join(right_TDCS)
df_new_TDCS.reset_index(inplace=True)

nextLeft_TDCS = df_new_TDCS.set_index('Subject')
nextRight_TDCS = df_subjects.set_index(['Subject']) #Visit is NaN for TDCS
df_final_TDCS = nextLeft_TDCS.join(nextRight_TDCS, on=['Subject'], lsuffix='_meta', rsuffix='_subjects')
df_final_TDCS.reset_index(inplace=True)
df_final_TDCS['Visit'] = df_final_TDCS['Visit_meta'] 
df_final_TDCS = df_final_TDCS.drop(columns=['Visit_subjects'])
df_final_TDCS = df_final_TDCS.drop(columns=['Visit_meta'])


###################DEFOG####################
left_DEFOG = df_DEFOG.set_index('Id')
right_DEFOG = df_DEFOGmetadata.set_index('Id')
df_new_DEFOG = left_DEFOG.join(right_DEFOG)
df_new_DEFOG.reset_index(inplace=True)

nextLeft_DEFOG = df_new_DEFOG.set_index('Subject')
nextRight_DEFOG = df_subjects.set_index(['Subject', 'Visit'])
df_final_DEFOG = nextLeft_DEFOG.join(nextRight_DEFOG, on=['Subject', 'Visit'], lsuffix='_meta', rsuffix='_subjects')
df_final_DEFOG.reset_index(inplace=True)
df_final_DEFOG = df_final_DEFOG.drop(columns=['Task', 'Valid'])


del left_TDCS
del right_TDCS
del df_new_TDCS
del nextLeft_TDCS
del nextRight_TDCS

del left_DEFOG
del right_DEFOG
del df_new_DEFOG
del nextLeft_DEFOG
del nextRight_DEFOG

df_final_TDCS.fillna(0, inplace=True)
df_final_DEFOG.fillna(0, inplace=True)

#print(df_final_TDCS.isna().sum())
df_final_TDCS['relevantScore'] = df_final_TDCS.Medication*df_final_TDCS.UPDRSIII_On + (1-df_final_TDCS.Medication)*df_final_TDCS.UPDRSIII_Off
df_final_DEFOG['relevantScore'] = df_final_DEFOG.Medication*df_final_DEFOG.UPDRSIII_On + (1-df_final_DEFOG.Medication)*df_final_DEFOG.UPDRSIII_Off


'Visit', 'Age', 'YearsSinceDx', 'UPDRSIII_On', 
            'UPDRSIII_Off', 'NFOGQ', 'highfMLbylowfAP', 
            't_rel', 'meanAmpWeightedPhase', 'phaseSpread', 'Sex', 
            'slopeSignChangeAP', 'slopeSignChangeML', 'sumAbsDiffAP',
            'sumAbsDiffVert', 'meanML', 'meanAP', 'kurtAP', 'skewML',
            'zeroXML', 'zeroXAP']


In [8]:
featuresToNorm = ['highfMLbylowfAP', 
            'meanAmpWeightedPhase', 'phaseSpread', 
            'slopeSignChangeAP', 'slopeSignChangeML', 'sumAbsDiffAP',
            'sumAbsDiffVert', 'Visit', 'Age', 'YearsSinceDx', 'UPDRSIII_On', 
            'UPDRSIII_Off', 'NFOGQ', 'relevantScore', 'meanML', 'meanAP', 'kurtAP', 'skewML',
            'zeroXML', 'zeroXAP']

TDCS_FeaturesToNorm = ['MLmAP', 'APmVert', 'meanWeightedFreqAP', 
                       'sumML', 'slopeSignChangeVert',  'Test']

DEFOG_FeaturesToNorm = ['max_aAP', 'max_aVert', 'iqrRatio_ML_Vert', 'phaseExcursion']




featuresTDCS2norm = []
featuresTDCS2norm.extend(featuresToNorm)
featuresTDCS2norm.extend(TDCS_FeaturesToNorm)

featuresDEFOG2norm = []
featuresDEFOG2norm.extend(featuresToNorm)
featuresDEFOG2norm.extend(DEFOG_FeaturesToNorm)

#Normalization:

#Train:
for col in featuresTDCS2norm:
    currCol = df_final_TDCS[col]
    currMean = np.mean(currCol)
    currStdDev = np.std(currCol)
    transformedCol = (currCol - currMean)/currStdDev
    df_final_TDCS[col] = transformedCol


for col in featuresDEFOG2norm:
    currCol = df_final_DEFOG[col]
    currMean = np.mean(currCol)
    currStdDev = np.std(currCol)
    transformedCol = (currCol - currMean)/currStdDev
    df_final_DEFOG[col] = transformedCol



In [9]:
features = ['highfMLbylowfAP', 
            't_rel', 'meanAmpWeightedPhase', 'phaseSpread',  
            'slopeSignChangeAP', 'slopeSignChangeML', 'sumAbsDiffAP',
            'sumAbsDiffVert',  'meanML', 'meanAP', 'kurtAP', 'skewML',
            'zeroXML', 'zeroXAP']

TDCS_Only_Features = ['MLmAP', 'APmVert',
                      'meanWeightedFreqAP', 'sumML', 'slopeSignChangeVert',
                      'te_cluster_SH', 'te_cluster_Tu', 'te_cluster_Wa']

DEFOG_Only_Features = ['max_aAP', 'max_aVert', 'iqrRatio_ML_Vert', 
                       'phaseExcursion']

featuresTDCS = []
featuresTDCS.extend(features)
featuresTDCS.extend(TDCS_Only_Features)

featuresDEFOG = []
featuresDEFOG.extend(features)
featuresDEFOG.extend(DEFOG_Only_Features)




#X_train_TDCS = df_final_TDCS.loc[:, featuresTDCS]
#y_train_TDCS = df_final_TDCS.loc[:, outputCols]

#del df_final_TDCS

#X_train_DEFOG = df_final_DEFOG.loc[:, featuresDEFOG]
#y_train_DEFOG = df_final_DEFOG.loc[:, outputCols]

#del df_final_DEFOG

"The present finding that sex is not a predictor of FOG could aid clinicians in counselling persons with PD about FOG." https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7815550/#:~:text=The%20pooled%20estimate%20of%20FOG,%25%20CI%2031%E2%80%9352%25


In [10]:
#SHUFFLE:
df_final_TDCS = shuffle(df_final_TDCS, random_state=0)

print("Target encoding TDCS data...")

#Target Encode Clusters:
clusterFeatures_TDCS = ['Medication', 'Test', 'NFOGQ', 'relevantScore', 'Age', 'YearsSinceDx']

kmeansTDCS = KMeans(n_clusters=6, n_init=10, random_state=42)

df_final_TDCS['cluster'] = kmeansTDCS.fit_predict(df_final_TDCS[clusterFeatures_TDCS])


"""print(df_final_TDCS.loc[df_final_TDCS.cluster == 0, 'cluster'].count())
print(df_final_TDCS.loc[df_final_TDCS.cluster == 1, 'cluster'].count())
print(df_final_TDCS.loc[df_final_TDCS.cluster == 2, 'cluster'].count())
print(df_final_TDCS.loc[df_final_TDCS.cluster == 3, 'cluster'].count())
print(df_final_TDCS.loc[df_final_TDCS.cluster == 4, 'cluster'].count())
print(df_final_TDCS.loc[df_final_TDCS.cluster == 5, 'cluster'].count())"""

print(df_final_TDCS.loc[df_final_TDCS.cluster == 0, ['Turn', 'Walking', 'StartHesitation']].mean())
print(df_final_TDCS.loc[df_final_TDCS.cluster == 1, ['Turn', 'Walking', 'StartHesitation']].mean())
print(df_final_TDCS.loc[df_final_TDCS.cluster == 2, ['Turn', 'Walking', 'StartHesitation']].mean())
print(df_final_TDCS.loc[df_final_TDCS.cluster == 3, ['Turn', 'Walking', 'StartHesitation']].mean())
print(df_final_TDCS.loc[df_final_TDCS.cluster == 4, ['Turn', 'Walking', 'StartHesitation']].mean())
print(df_final_TDCS.loc[df_final_TDCS.cluster == 5, ['Turn', 'Walking', 'StartHesitation']].mean())
#print(df_final_TDCS.loc[df_final_TDCS.cluster == 6, ['Turn', 'Walking', 'StartHesitation']].mean())
#print(df_final_TDCS.loc[df_final_TDCS.cluster == 7, ['Turn', 'Walking', 'StartHesitation']].mean())

cluster0_SH_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 0, 'StartHesitation'].mean()
cluster1_SH_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 1, 'StartHesitation'].mean()
cluster2_SH_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 2, 'StartHesitation'].mean()
cluster3_SH_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 3, 'StartHesitation'].mean()
cluster4_SH_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 4, 'StartHesitation'].mean()
cluster5_SH_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 5, 'StartHesitation'].mean()

cluster0_Tu_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 0, 'Turn'].mean()
cluster1_Tu_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 1, 'Turn'].mean()
cluster2_Tu_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 2, 'Turn'].mean()
cluster3_Tu_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 3, 'Turn'].mean()
cluster4_Tu_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 4, 'Turn'].mean()
cluster5_Tu_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 5, 'Turn'].mean()

cluster0_Wa_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 0, 'Walking'].mean()
cluster1_Wa_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 1, 'Walking'].mean()
cluster2_Wa_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 2, 'Walking'].mean()
cluster3_Wa_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 3, 'Walking'].mean()
cluster4_Wa_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 4, 'Walking'].mean()
cluster5_Wa_mean = df_final_TDCS.loc[df_final_TDCS.cluster == 5, 'Walking'].mean()


df_final_TDCS['te_cluster_SH'] = [0] * len(df_final_TDCS)
df_final_TDCS['te_cluster_Tu'] = [0] * len(df_final_TDCS)
df_final_TDCS['te_cluster_Wa'] = [0] * len(df_final_TDCS)


df_final_TDCS.loc[df_final_TDCS['cluster'] == 0, 'te_cluster_SH'] = cluster0_SH_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 1, 'te_cluster_SH'] = cluster1_SH_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 2, 'te_cluster_SH'] = cluster2_SH_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 3, 'te_cluster_SH'] = cluster3_SH_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 4, 'te_cluster_SH'] = cluster4_SH_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 5, 'te_cluster_SH'] = cluster5_SH_mean

df_final_TDCS.loc[df_final_TDCS['cluster'] == 0, 'te_cluster_Tu'] = cluster0_Tu_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 1, 'te_cluster_Tu'] = cluster1_Tu_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 2, 'te_cluster_Tu'] = cluster2_Tu_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 3, 'te_cluster_Tu'] = cluster3_Tu_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 4, 'te_cluster_Tu'] = cluster4_Tu_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 5, 'te_cluster_Tu'] = cluster5_Tu_mean

df_final_TDCS.loc[df_final_TDCS['cluster'] == 0, 'te_cluster_Wa'] = cluster0_Wa_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 1, 'te_cluster_Wa'] = cluster1_Wa_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 2, 'te_cluster_Wa'] = cluster2_Wa_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 3, 'te_cluster_Wa'] = cluster3_Wa_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 4, 'te_cluster_Wa'] = cluster4_Wa_mean
df_final_TDCS.loc[df_final_TDCS['cluster'] == 5, 'te_cluster_Wa'] = cluster5_Wa_mean



print(df_final_TDCS.loc[:, ['cluster', 'Medication', 'Test', 'NFOGQ', 'relevantScore', 'Age', 'YearsSinceDx']].tail(25))

#plt.scatter(X_train_TDCS.relevantScore, X_train_TDCS.NFOGQ, c=X_train_TDCS.cluster, s=0.01)
#plt.show()

Target encoding TDCS data...
Turn               0.394063
Walking            0.045811
StartHesitation    0.095080
dtype: float64
Turn               0.168161
Walking            0.004322
StartHesitation    0.002758
dtype: float64
Turn               0.340377
Walking            0.060995
StartHesitation    0.074072
dtype: float64
Turn               0.222548
Walking            0.018501
StartHesitation    0.024313
dtype: float64
Turn               0.041528
Walking            0.003537
StartHesitation    0.001916
dtype: float64
Turn               0.066589
Walking            0.000862
StartHesitation    0.001371
dtype: float64
         cluster  Medication      Test     NFOGQ  relevantScore       Age  \
3107661        0           1  1.091884  0.116074       0.228307  0.683378   
606745         0           1  1.091884  1.099884       0.456028  0.553795   
887633         0           1  1.091884  0.116074       0.076492  0.812962   
7036504        2           1 -0.112665  1.099884       0.456028  0.55

In [11]:
outputCols = ['StartHesitation', 'Turn', 'Walking']

X_train_TDCS = df_final_TDCS.loc[:, featuresTDCS]
y_train_TDCS = df_final_TDCS.loc[:, outputCols]

X_train_DEFOG = df_final_DEFOG.loc[:, featuresDEFOG]
y_train_DEFOG = df_final_DEFOG.loc[:, outputCols]


del df_final_TDCS
del df_final_DEFOG

In [12]:
#NN models:

neuralModel_TDCS_SH = MLPClassifier(solver='sgd', batch_size=400, max_iter=250, early_stopping=True, validation_fraction=0.10, alpha=1e-4, hidden_layer_sizes=(16), random_state=42)
neuralModel_TDCS_Tu = MLPClassifier(solver='sgd', batch_size=400, max_iter=250, early_stopping=True, validation_fraction=0.10, alpha=1e-4, hidden_layer_sizes=(16), random_state=42)
neuralModel_TDCS_Wa = MLPClassifier(solver='sgd', batch_size=400, max_iter=250, early_stopping=True, validation_fraction=0.10, alpha=1e-4, hidden_layer_sizes=(16), random_state=42)

neuralModel_DEFOG_SH = MLPClassifier(solver='sgd', batch_size=400, max_iter=250, early_stopping=True, validation_fraction=0.40, alpha=1e-4, hidden_layer_sizes=(16), random_state=42)
neuralModel_DEFOG_Tu = MLPClassifier(solver='sgd', batch_size=400, max_iter=250, early_stopping=True, validation_fraction=0.40, alpha=1e-4, hidden_layer_sizes=(16), random_state=42)
neuralModel_DEFOG_Wa = MLPClassifier(solver='sgd', batch_size=400, max_iter=250, early_stopping=True, validation_fraction=0.40, alpha=1e-4, hidden_layer_sizes=(16), random_state=42)


In [13]:
print("Training the TDCS models...")
print("Start Hesitation")
neuralModel_TDCS_SH.fit(X_train_TDCS, y_train_TDCS.StartHesitation) 
print("Turn")
neuralModel_TDCS_Tu.fit(X_train_TDCS, y_train_TDCS.Turn) 
print("Walking")
neuralModel_TDCS_Wa.fit(X_train_TDCS, y_train_TDCS.Walking) 


print("Training the DEFOG models...")
print("Start Hesitation")
neuralModel_DEFOG_SH.fit(X_train_DEFOG, y_train_DEFOG.StartHesitation)
print("Turn")
neuralModel_DEFOG_Tu.fit(X_train_DEFOG, y_train_DEFOG.Turn) 
print("Walking")
neuralModel_DEFOG_Wa.fit(X_train_DEFOG, y_train_DEFOG.Walking) 


#Free memory
del X_train_TDCS
del y_train_TDCS
del X_train_DEFOG
del y_train_DEFOG

Training the TDCS models...
Start Hesitation
Turn
Walking
Training the DEFOG models...
Start Hesitation
Turn
Walking


In [14]:
#READ IN TEST FILES

TDCS_Test_Path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog/'
DEFOG_Test_Path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/'

print("Reading in Test Files...")

###################TDCS####################
TDCS_Files_Test = glob.glob(os.path.join(TDCS_Test_Path,"*.csv"))

dfList_TDCS_Test = []
for f in TDCS_Files_Test:
    data = pd.read_csv(f)
    data['Id'] = f[-14:-4]
    data['meanHighFreqPowerAP'], data['weightedFreqExcursionAP'], data['meanWeightedFreqAP'] = engineerFeatures5(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['MLmAP'] = data.AccML - data.AccAP
    data['phaseSpread'], data['highfMLbylowfAP'], data['meanAmpWeightedPhase'] = gen1featuresTDCS(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['APmVert'] = data.AccAP - data.AccV
    data['t_rel'] = data.Time/np.amax(data.Time)
    data['slopeSignChangeAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['slopeSignChangeML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['slopeSignChangeVert'] = seglearn_fxns(13, data.Time, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumML'] = seglearn_fxns(4, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumAbsDiffAP'] = seglearn_fxns(14, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumAbsDiffVert'] = seglearn_fxns(14, data.Time, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['meanML'] = seglearn_fxns(0, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['meanAP'] = seglearn_fxns(0, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['kurtAP'] = seglearn_fxns(8, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['skewML'] = seglearn_fxns(7, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['zeroXML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['zeroXAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['Id_t'] = data.Id.astype('string') + "_" + data.Time.astype('string')
    dfList_TDCS_Test.append(data)


df_TDCS_Test = pd.concat(dfList_TDCS_Test, ignore_index=True)


###################DEFOG####################
DEFOG_Files_Test = glob.glob(os.path.join(DEFOG_Test_Path,"*.csv"))

dfList_DEFOG_Test = []
for f in DEFOG_Files_Test:
    data = pd.read_csv(f)
    data['Id'] = f[-14:-4]
    data['phaseSpread'], data['phaseExcursion'], data['meanAmpWeightedPhase'], data['highfMLbylowfAP'], data['max_aAP'], data['max_aVert'], data['iqrRatio_ML_Vert'] = gen1featuresDEFOG(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['t_rel'] = data.Time/np.amax(data.Time)
    data['slopeSignChangeAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['slopeSignChangeML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['sumAbsDiffAP'] = seglearn_fxns(14, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['sumAbsDiffVert'] = seglearn_fxns(14, data.Time, data.AccV, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['meanML'] = seglearn_fxns(0, data.Time, data.AccML, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['meanAP'] = seglearn_fxns(0, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['kurtAP'] = seglearn_fxns(8, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['skewML'] = seglearn_fxns(7, data.Time, data.AccML, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['zeroXML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['zeroXAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['Id_t'] = data.Id.astype('string') + "_" + data.Time.astype('string')
    dfList_DEFOG_Test.append(data)


df_DEFOG_Test = pd.concat(dfList_DEFOG_Test, ignore_index=True)

Reading in Test Files...


In [15]:
###################TDCS####################
left_TDCS_Test = df_TDCS_Test.set_index('Id')
right_TDCS_Test = df_TDCSmetadata.set_index('Id')
df_new_TDCS_Test = left_TDCS_Test.join(right_TDCS_Test)
df_new_TDCS_Test.reset_index(inplace=True)

nextLeft_TDCS_Test = df_new_TDCS_Test.set_index('Subject')
nextRight_TDCS_Test = df_subjects.set_index(['Subject'])
df_final_TDCS_Test = nextLeft_TDCS_Test.join(nextRight_TDCS_Test, on=['Subject'], lsuffix='_meta', rsuffix='_subjects')
df_final_TDCS_Test.reset_index(inplace=True)
df_final_TDCS_Test['Visit'] = df_final_TDCS_Test['Visit_meta'] 
df_final_TDCS_Test = df_final_TDCS_Test.drop(columns=['Visit_subjects'])

###################DEFOG####################
left_DEFOG_Test = df_DEFOG_Test.set_index('Id')
right_DEFOG_Test = df_DEFOGmetadata.set_index('Id')
df_new_DEFOG_Test = left_DEFOG_Test.join(right_DEFOG_Test)
df_new_DEFOG_Test.reset_index(inplace=True)

nextLeft_DEFOG_Test = df_new_DEFOG_Test.set_index('Subject')
nextRight_DEFOG_Test = df_subjects.set_index(['Subject', 'Visit'])
df_final_DEFOG_Test = nextLeft_DEFOG_Test.join(nextRight_DEFOG_Test, on=['Subject', 'Visit'], lsuffix='_meta', rsuffix='_subjects')
df_final_DEFOG_Test.reset_index(inplace=True)
#df_final_DEFOG_Test = df_final_DEFOG_Test.drop(columns=['Task', 'Valid'])

#del some unused df's
del left_TDCS_Test
del right_TDCS_Test
del df_new_TDCS_Test
del nextLeft_TDCS_Test
del nextRight_TDCS_Test

del left_DEFOG_Test
del right_DEFOG_Test
del df_new_DEFOG_Test
del nextLeft_DEFOG_Test
del nextRight_DEFOG_Test

del df_subjects


df_final_TDCS_Test.fillna(0, inplace=True)
df_final_DEFOG_Test.fillna(0, inplace=True)

df_final_TDCS_Test['relevantScore'] = df_final_TDCS_Test.Medication*df_final_TDCS_Test.UPDRSIII_On + (1-df_final_TDCS_Test.Medication)*df_final_TDCS_Test.UPDRSIII_Off
df_final_DEFOG_Test['relevantScore'] = df_final_DEFOG_Test.Medication*df_final_DEFOG_Test.UPDRSIII_On + (1-df_final_DEFOG_Test.Medication)*df_final_DEFOG_Test.UPDRSIII_Off


In [16]:
#Normalize Test Data:
for col in featuresTDCS2norm:
    currCol = df_final_TDCS_Test[col]
    currMean = np.mean(currCol)
    currStdDev = np.std(currCol)
    transformedCol = (currCol - currMean)/currStdDev
    df_final_TDCS_Test[col] = transformedCol


for col in featuresDEFOG2norm:
    currCol = df_final_DEFOG_Test[col]
    currMean = np.mean(currCol)
    currStdDev = np.std(currCol)
    transformedCol = (currCol - currMean)/currStdDev
    df_final_DEFOG_Test[col] = transformedCol

In [17]:
#Target Encoding Cluster ID's
df_final_TDCS_Test['cluster'] = kmeansTDCS.predict(df_final_TDCS_Test[clusterFeatures_TDCS])



df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 0, 'te_cluster_SH'] = cluster0_SH_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 1, 'te_cluster_SH'] = cluster1_SH_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 2, 'te_cluster_SH'] = cluster2_SH_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 3, 'te_cluster_SH'] = cluster3_SH_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 4, 'te_cluster_SH'] = cluster4_SH_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 5, 'te_cluster_SH'] = cluster5_SH_mean

df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 0, 'te_cluster_Tu'] = cluster0_Tu_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 1, 'te_cluster_Tu'] = cluster1_Tu_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 2, 'te_cluster_Tu'] = cluster2_Tu_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 3, 'te_cluster_Tu'] = cluster3_Tu_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 4, 'te_cluster_Tu'] = cluster4_Tu_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 5, 'te_cluster_Tu'] = cluster5_Tu_mean

df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 0, 'te_cluster_Wa'] = cluster0_Wa_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 1, 'te_cluster_Wa'] = cluster1_Wa_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 2, 'te_cluster_Wa'] = cluster2_Wa_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 3, 'te_cluster_Wa'] = cluster3_Wa_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 4, 'te_cluster_Wa'] = cluster4_Wa_mean
df_final_TDCS_Test.loc[df_final_TDCS_Test['cluster'] == 5, 'te_cluster_Wa'] = cluster5_Wa_mean

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
X_test_TDCS = df_final_TDCS_Test.loc[:, featuresTDCS]

X_test_DEFOG = df_final_DEFOG_Test.loc[:, featuresDEFOG]


In [None]:
print("Making TDCS predictions...")
predsSH_TDCS = np.array(neuralModel_TDCS_SH.predict_proba(X_test_TDCS))
predsTu_TDCS = np.array(neuralModel_TDCS_Tu.predict_proba(X_test_TDCS))
predsWa_TDCS = np.array(neuralModel_TDCS_Wa.predict_proba(X_test_TDCS))

print(predsTu_TDCS)

df_Predictions_TDCS = pd.DataFrame()
df_Predictions_TDCS['StartHesitation'] = predsSH_TDCS[:, 1]/2.0 #probability of (positive) class 1
df_Predictions_TDCS['Turn'] = predsTu_TDCS[:, 1] #probability of class 1
df_Predictions_TDCS['Walking'] = predsWa_TDCS[:, 1] #probability of class 1



print("Making DEFOG predictions...")
predsSH_DEFOG = np.array(neuralModel_DEFOG_SH.predict_proba(X_test_DEFOG))
predsTu_DEFOG = np.array(neuralModel_DEFOG_Tu.predict_proba(X_test_DEFOG))
predsWa_DEFOG = np.array(neuralModel_DEFOG_Wa.predict_proba(X_test_DEFOG))

df_Predictions_DEFOG = pd.DataFrame()
df_Predictions_DEFOG['StartHesitation'] = predsSH_DEFOG[:, 1]*0 #probability of class 1
df_Predictions_DEFOG['Turn'] = predsTu_DEFOG[:, 1] #probability of class 1
df_Predictions_DEFOG['Walking'] = predsWa_DEFOG[:, 1] #probability of class 1

In [None]:
#store df of submission results.
print('Creating submission dataframe...')
finalCols = ['Id', 'StartHesitation', 'Turn', 'Walking']

id_t_TDCS = df_final_TDCS_Test.loc[:, 'Id_t']
id_t_DEFOG = df_final_DEFOG_Test.loc[:, 'Id_t']

df_Predictions_TDCS['Id'] =  id_t_TDCS
df_Predictions_DEFOG['Id'] = id_t_DEFOG

df_TDCS = df_Predictions_TDCS[finalCols]
df_DEFOG = df_Predictions_DEFOG[finalCols]

output_df = pd.concat([df_TDCS, df_DEFOG], ignore_index=True)

print('Writing submission file...')
output_df.to_csv("submission.csv", index=False)