In [1]:
# Import Libraries:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
#import random
#import csv
from scipy import stats
import matplotlib.pyplot as plt
import glob
#from sklearn.linear_model import LogisticRegression
#from sklearn.feature_selection import mutual_info_classif
#from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split
#from sklearn.utils import shuffle
import lightgbm as lgb

pd.set_option('use_inf_as_na', True)

#TEMPORARY!!!! RM!
np.seterr(divide='ignore', invalid='ignore')
#TEMPORARY!!!! RM!


####################################################################
########              PARAMETERS & CONSTANTS:               ########
####################################################################
gg = 9.806

sampleRateTDCSFOG = 128     #per second
sampleRateDEFOG   = 100     #per second

batchesTDCS = 100
batchesDEFOG = 1000

deadRelTimeStart = 0.05
deadRelTimeEnd   = 0.95

dummyVariable = 9 #ignore



In [2]:
#A low pass filter to remove high frequency noise.
def lowPassFilter(kArr, freqArr, cutOffFreq):
    for i in range(0,len(freqArr)):
        if freqArr[i] > cutOffFreq:
            kArr.real[i] = 0; 
            kArr.imag[i] = 0;
    return kArr


#A high pass filter to analyze only high frequencies.  
def highPassFilter(kArr, freqArr, cutOffFreq):
    for i in range(0,len(freqArr)):
        if freqArr[i] < cutOffFreq:
            kArr.real[i] = 0;
            kArr.imag[i] = 0;
    return kArr


#A quick FFT where W can be x, y, z accelerations etc.
def quickFFT(inputT, inputW, sampleRate, filterType, cutOff):
    kspaceData = np.fft.rfft(inputW)
    freq = np.fft.rfftfreq(inputT.shape[-1], d=1.0/sampleRate)
    if filterType == "low":
        filteredData = lowPassFilter(kspaceData, freq, cutOff)
    elif filterType == "high":
        filteredData = highPassFilter(kspaceData, freq, cutOff)
    else:
        filteredData = kspaceData
    outputW = np.fft.irfft(filteredData, len(inputW))
    return outputW


#A quick FFT where W can be x, y, z accelerations etc. (returns k-space)
def quickFFT_k(inputT, inputW, sampleRate, filterType, cutOff):
    kspaceData = np.fft.rfft(inputW)
    freq = np.fft.rfftfreq(inputT.shape[-1], d=1.0/sampleRate)
    if filterType == "low":
        filteredData = lowPassFilter(kspaceData, freq, cutOff)
    elif filterType == "high":
        filteredData = highPassFilter(kspaceData, freq, cutOff)
    else:
        filteredData = kspaceData
    return freq, filteredData


In [3]:
def gen1featuresTDCS(time, aML, aAP, aVert, sampleRate, cutOff, numBatches):
    numPoints = len(time)
    windowSize = numPoints // numBatches
    #ex: 4133 // 100 = 41
    f1 = [] #note if statement and numpointstoadd
    f2 = []
    f3 = []
    for i in range(0, numBatches):
        t_Clip = time.loc[i*windowSize:(i+1)*windowSize]
        aML_Clip = aML.loc[i*windowSize:(i+1)*windowSize]
        aAP_Clip = aAP.loc[i*windowSize:(i+1)*windowSize]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        angles = np.angle(ampsAP)
        meanWeightedAngle = np.mean(angles*abs(ampsAP))
        f1_tmp = np.std(angles) #phaseSpread
        freqML, ampsML = quickFFT_k(t_Clip, aML_Clip, sampleRate, "none", cutOff)
        lowfampsAP = ampsAP[(freqAP > 0.5)*(freqAP < 3)]
        highfampsML = ampsML[(freqML > 3)*(freqML < 8)]
        lowf_ampWeightedFreqAP_tmp = np.mean(abs(lowfampsAP)*freqAP[(freqAP > 0.5)*(freqAP < 3)])
        highf_ampWeightedFreqML_tmp = np.mean(abs(highfampsML)*freqML[(freqML > 3)*(freqML < 8)])
        f2_tmp = highf_ampWeightedFreqML_tmp/(1+lowf_ampWeightedFreqAP_tmp)
        f1.extend([f1_tmp]*(windowSize))
        f2.extend([f2_tmp]*(windowSize))
        f3.extend([meanWeightedAngle]*(windowSize)) #meanAmpWeightedPhase
    if len(f1) < numPoints:
        numPointsToAdd = numPoints - len(f1)
        t_Clip = time.loc[numPoints - numPointsToAdd:numPoints-1]
        aML_Clip = aML.loc[numPoints - numPointsToAdd:numPoints-1]
        aAP_Clip = aAP.loc[numPoints - numPointsToAdd:numPoints-1]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        angles = np.angle(ampsAP)
        meanWeightedAngle = np.mean(angles*abs(ampsAP))
        f1_tmp = np.std(angles) #phaseSpread
        freqML, ampsML = quickFFT_k(t_Clip, aML_Clip, sampleRate, "none", cutOff)
        lowfampsAP = ampsAP[(freqAP > 0.5)*(freqAP < 3)]
        highfampsML = ampsML[(freqML > 3)*(freqML < 8)]
        lowf_ampWeightedFreqAP_tmp = np.mean(abs(lowfampsAP)*freqAP[(freqAP > 0.5)*(freqAP < 3)])
        highf_ampWeightedFreqML_tmp = np.mean(abs(highfampsML)*freqML[(freqML > 3)*(freqML < 8)])
        f2_tmp = highf_ampWeightedFreqML_tmp/(1+lowf_ampWeightedFreqAP_tmp)
        f1.extend([f1_tmp]*(numPointsToAdd))
        f2.extend([f2_tmp]*(numPointsToAdd))
        f3.extend([meanWeightedAngle]*(numPointsToAdd)) #meanAmpWeightedPhase
    return f1, f2, f3


def gen1featuresDEFOG(time, aML, aAP, aVert, sampleRate, cutOff, numBatches):
    numPoints = len(time)
    windowSize = numPoints // numBatches
    #ex: 4133 // 100 = 41
    f1 = [] #note if statement and numpointstoadd #phase spread
    f2 = []     #phaseExcursion
    f3 = []     #meanAmpWeightedPhase
    f4 = []     #highfMLbylowfAP
    f5 = []     #max_aAP 
    f6 = []     #max_aVert
    f7 = []     #iqrRatio_ML_Vert
    for i in range(0, numBatches):
        t_Clip = time.loc[i*windowSize:(i+1)*windowSize]
        aML_Clip = aML.loc[i*windowSize:(i+1)*windowSize]
        aAP_Clip = aAP.loc[i*windowSize:(i+1)*windowSize]
        aVert_Clip = aVert.loc[i*windowSize:(i+1)*windowSize]
        #aTOT_Clip = aTOT.loc[i*windowSize:(i+1)*windowSize]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        angles = np.angle(ampsAP)
        meanWeightedAngle = np.nanmean(angles*abs(ampsAP))
        f1_tmp = np.std(angles) #phaseStd
        f2_tmp = np.sum(abs(angles - meanWeightedAngle)*abs(ampsAP)) #weightedPhase
        freqML, ampsML = quickFFT_k(t_Clip, aML_Clip, sampleRate, "none", cutOff)
        lowfampsAP = ampsAP[(freqAP > 0.5)*(freqAP < 3)]
        highfampsML = ampsML[(freqML > 3)*(freqML < 8)]
        lowf_ampWeightedFreqAP_tmp = np.mean(abs(lowfampsAP)*freqAP[(freqAP > 0.5)*(freqAP < 3)])
        highf_ampWeightedFreqML_tmp = np.mean(abs(highfampsML)*freqML[(freqML > 3)*(freqML < 8)])
        f4_tmp = highf_ampWeightedFreqML_tmp/(1+lowf_ampWeightedFreqAP_tmp)
        f5_tmp = np.amax(aAP_Clip)
        f6_tmp = np.amax(aVert_Clip)
        #iqr ratio metric
        q75ML, q25ML = np.percentile(aML_Clip, [75 ,25])
        iqr_ML = q75ML - q25ML
        q75Vert, q25Vert = np.percentile(aVert_Clip, [75 ,25])
        iqr_Vert = q75Vert - q25Vert
        f7_tmp = iqr_ML/(1+iqr_Vert)
        f1.extend([f1_tmp]*(windowSize))
        f2.extend([f2_tmp]*(windowSize))
        f3.extend([meanWeightedAngle]*(windowSize))
        f4.extend([f4_tmp]*(windowSize))
        f5.extend([f5_tmp]*(windowSize))
        f6.extend([f6_tmp]*(windowSize))
        f7.extend([f7_tmp]*(windowSize))
    if len(f1) < numPoints:
        numPointsToAdd = numPoints - len(f1)
        t_Clip = time.loc[numPoints - numPointsToAdd:numPoints-1]
        aML_Clip = aML.loc[numPoints - numPointsToAdd:numPoints-1]
        aAP_Clip = aAP.loc[numPoints - numPointsToAdd:numPoints-1]
        aVert_Clip = aVert.loc[numPoints - numPointsToAdd:numPoints-1]
        #aTOT_Clip = aTOT.loc[numPoints - numPointsToAdd:numPoints-1]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        angles = np.angle(ampsAP)
        meanWeightedAngle = np.nanmean(angles*abs(ampsAP))
        f1_tmp = np.std(angles) #phaseStd
        f2_tmp = np.sum(abs(angles - meanWeightedAngle)*abs(ampsAP)) #weightedPhase
        freqML, ampsML = quickFFT_k(t_Clip, aML_Clip, sampleRate, "none", cutOff)
        lowfampsAP = ampsAP[(freqAP > 0.5)*(freqAP < 3)]
        highfampsML = ampsML[(freqML > 3)*(freqML < 8)]
        lowf_ampWeightedFreqAP_tmp = np.mean(abs(lowfampsAP)*freqAP[(freqAP > 0.5)*(freqAP < 3)])
        highf_ampWeightedFreqML_tmp = np.mean(abs(highfampsML)*freqML[(freqML > 3)*(freqML < 8)])
        f4_tmp = highf_ampWeightedFreqML_tmp/(1+lowf_ampWeightedFreqAP_tmp)
        f5_tmp = np.amax(aAP_Clip)
        f6_tmp = np.amax(aVert_Clip)
        #iqr ratio metric
        q75ML, q25ML = np.percentile(aML_Clip, [75 ,25])
        iqr_ML = q75ML - q25ML
        q75Vert, q25Vert = np.percentile(aVert_Clip, [75 ,25])
        iqr_Vert = q75Vert - q25Vert
        f7_tmp = iqr_ML/(1+iqr_Vert)
        f1.extend([f1_tmp]*(numPointsToAdd))
        f2.extend([f2_tmp]*(numPointsToAdd))
        f3.extend([meanWeightedAngle]*(numPointsToAdd))
        f4.extend([f4_tmp]*(numPointsToAdd))
        f5.extend([f5_tmp]*(numPointsToAdd))
        f6.extend([f6_tmp]*(numPointsToAdd))
        f7.extend([f7_tmp]*(numPointsToAdd))
    return f1, f2, f3, f4, f5, f6, f7
    



def engineerFeatures5(time, aML, aAP, aVert, sampleRate, cutOff, numBatches):
    numPoints = len(time)
    windowSize = numPoints // numBatches
    #ex: 4133 // 100 = 41
    f1 = [] #note if statement and numpointstoadd
    f2 = []
    f3 = []
    for i in range(0, numBatches):
        t_Clip = time.loc[i*windowSize:(i+1)*windowSize]
        aAP_Clip = aAP.loc[i*windowSize:(i+1)*windowSize]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        meanWeightedFreq = np.mean(freqAP*abs(ampsAP))
        f1_tmp = np.mean(abs(ampsAP[freqAP>3])*abs(ampsAP[freqAP>3]))
        f2_tmp = np.sum(abs(freqAP - meanWeightedFreq)*abs(ampsAP)) #weightedFreqExcursionAP
        f1.extend([f1_tmp]*(windowSize))
        f2.extend([f2_tmp]*(windowSize))
        f3.extend([meanWeightedFreq]*(windowSize))
    if len(f1) < numPoints:
        numPointsToAdd = numPoints - len(f1)
        t_Clip = time.loc[numPoints - numPointsToAdd:numPoints-1]
        aAP_Clip = aAP.loc[numPoints - numPointsToAdd:numPoints-1]
        freqAP, ampsAP = quickFFT_k(t_Clip, aAP_Clip, sampleRate, "none", cutOff)
        meanWeightedFreq = np.mean(freqAP*abs(ampsAP))
        f1_tmp = np.mean(abs(ampsAP[freqAP>3])*abs(ampsAP[freqAP>3]))
        f2_tmp = np.sum(abs(freqAP - meanWeightedFreq)*abs(ampsAP)) #weightedFreqExcursionAP
        f1.extend([f1_tmp]*(numPointsToAdd))
        f2.extend([f2_tmp]*(numPointsToAdd))
        f3.extend([meanWeightedFreq]*(numPointsToAdd))
    return f1, f2, f3

In [4]:
#seglearn fxns
def seglearn_fxns(whichFeature, time, inputW, sampleRate, cutOff, numBatches):
    numPoints = len(time)
    windowSize = numPoints // numBatches
    ret = []
    for i in range(0, numBatches):
        t_Clip = time.loc[i*windowSize:(i+1)*windowSize]
        inputW_Clip = inputW.loc[i*windowSize:(i+1)*windowSize]
        if whichFeature == 0:
            ret.extend([np.mean(inputW_Clip)]*(windowSize))
        elif whichFeature == 1:
            ret.extend([np.median(inputW_Clip)]*(windowSize))
        elif whichFeature == 2:
            ret.extend([stats.gmean(np.abs(inputW_Clip))]*(windowSize))
        elif whichFeature == 3:
            ret.extend([stats.hmean(np.abs(inputW_Clip))]*(windowSize))
        elif whichFeature == 4:
            ret.extend([np.sum(inputW_Clip)]*(windowSize))
        elif whichFeature == 5:
            ret.extend([np.sum(np.abs(inputW_Clip))]*(windowSize))
        elif whichFeature == 6:
            ret.extend([np.sum(inputW_Clip*inputW_Clip)]*(windowSize))
        elif whichFeature == 7:
            ret.extend([stats.skew(inputW_Clip)]*(windowSize))
        elif whichFeature == 8:
            ret.extend([stats.kurtosis(inputW_Clip)]*(windowSize))
        elif whichFeature == 9:
            ret.extend([np.mean(np.diff(inputW_Clip))]*(windowSize))
        elif whichFeature == 10:
            ret.extend([np.mean(np.abs(np.diff(inputW_Clip)))]*(windowSize))
        elif whichFeature == 11:
            ret.extend([np.mean(np.square(np.abs(np.fft.fft(inputW_Clip))))]*(windowSize))
        elif whichFeature == 12:
            #zero crossings about mean
            meanW = np.mean(inputW)
            signW = np.heaviside((inputW - meanW), 1) - 0.5
            ret.extend([np.sum(np.abs(np.diff(signW)))]*(windowSize))
        elif whichFeature == 13:
            difArr = np.diff(inputW_Clip)
            signDifW = np.heaviside(difArr, 1) - 0.5
            ret.extend([np.sum(np.abs(np.diff(signDifW)))]*(windowSize))
        elif whichFeature == 14:
            ret.extend([np.sum(np.abs(np.diff(inputW_Clip)))]*(windowSize))
    if len(ret) < numPoints:
        numPointsToAdd = numPoints - len(ret)
        t_Clip = time.loc[numPoints - numPointsToAdd:numPoints-1]
        inputW_Clip = inputW.loc[numPoints - numPointsToAdd:numPoints-1]
        if whichFeature == 0:
            ret.extend([np.mean(inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 1:
            ret.extend([np.median(inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 2:
            ret.extend([stats.gmean(np.abs(inputW_Clip))]*(numPointsToAdd))
        elif whichFeature == 3:
            ret.extend([stats.hmean(np.abs(inputW_Clip))]*(numPointsToAdd))
        elif whichFeature == 4:
            ret.extend([np.sum(inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 5:
            ret.extend([np.sum(np.abs(inputW_Clip))]*(numPointsToAdd))
        elif whichFeature == 6:
            ret.extend([np.sum(inputW_Clip*inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 7:
            ret.extend([stats.skew(inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 8:
            ret.extend([stats.kurtosis(inputW_Clip)]*(numPointsToAdd))
        elif whichFeature == 9:
            ret.extend([np.mean(np.diff(inputW_Clip))]*(numPointsToAdd))
        elif whichFeature == 10:
            ret.extend([np.mean(np.abs(np.diff(inputW_Clip)))]*(numPointsToAdd))
        elif whichFeature == 11:
            ret.extend([np.mean(np.square(np.abs(np.fft.fft(inputW_Clip))))]*(numPointsToAdd))
        elif whichFeature == 12:
            #zero crossings about mean
            meanW = np.mean(inputW)
            signW = np.heaviside((inputW - meanW), 1) - 0.5
            ret.extend([np.sum(np.abs(np.diff(signW)))]*(numPointsToAdd))
        elif whichFeature == 13:
            difArr = np.diff(inputW_Clip)
            signDifW = np.heaviside(difArr, 1) - 0.5
            ret.extend([np.sum(np.abs(np.diff(signDifW)))]*(numPointsToAdd))
        elif whichFeature == 14:
            ret.extend([np.sum(np.abs(np.diff(inputW_Clip)))]*(numPointsToAdd))
    return ret

In [5]:
metadataTDCSFOG_path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/tdcsfog_metadata.csv'
metadataDEFOG_path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/defog_metadata.csv'
subjects_path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/subjects.csv'



df_TDCSmetadata = pd.read_csv(metadataTDCSFOG_path)
#df_TDCSmetadata = df_TDCSmetadata_tmp.drop(columns=['Test'])
one_hot = pd.get_dummies(df_TDCSmetadata['Medication']).drop(columns=['off'])
df_TDCSmetadata['Medication'] = one_hot #one if medication is 'on'
#print(df_TDCSmetadata.head())



df_DEFOGmetadata = pd.read_csv(metadataDEFOG_path)
one_hot = pd.get_dummies(df_DEFOGmetadata['Medication']).drop(columns=['off'])
df_DEFOGmetadata['Medication'] = one_hot #one if medication is 'on'
#print(df_DEFOGmetadata.head())

df_subjects = pd.read_csv(subjects_path)
one_hot = pd.get_dummies(df_subjects['Sex']).drop(columns=['F'])
df_subjects['Sex'] = one_hot #one if medication is 'on'
#print(df_subjects.head())



In [6]:
TDCS_Train_Path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/'
DEFOG_Train_Path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog/'

###################TDCS####################
TDCS_Files = glob.glob(os.path.join(TDCS_Train_Path,"*.csv"))

dfList_TDCS = []
for f in TDCS_Files:
    data = pd.read_csv(f)
    data['Id'] = f[-14:-4]
    data['meanHighFreqPowerAP'], data['weightedFreqExcursionAP'], data['meanWeightedFreqAP'] = engineerFeatures5(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['MLmAP'] = data.AccML - data.AccAP
    data['phaseSpread'], data['highfMLbylowfAP'], data['meanAmpWeightedPhase'] = gen1featuresTDCS(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['APmVert'] = data.AccAP - data.AccV
    data['t_rel'] = data.Time/np.amax(data.Time)
    data['slopeSignChangeAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['slopeSignChangeML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['slopeSignChangeVert'] = seglearn_fxns(13, data.Time, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumML'] = seglearn_fxns(4, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumAbsDiffAP'] = seglearn_fxns(14, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumAbsDiffVert'] = seglearn_fxns(14, data.Time, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    dfList_TDCS.append(data)


df_TDCS = pd.concat(dfList_TDCS, ignore_index=True)
df_TDCS.drop(columns=['Time', 'AccV', 'AccML', 'AccAP'], inplace=True)

###################DEFOG####################
DEFOG_Files = glob.glob(os.path.join(DEFOG_Train_Path,"*.csv"))

dfList_DEFOG = []
for f in DEFOG_Files:
    data = pd.read_csv(f)
    data['Id'] = f[-14:-4]
    data['phaseSpread'], data['phaseExcursion'], data['meanAmpWeightedPhase'], data['highfMLbylowfAP'], data['max_aAP'], data['max_aVert'], data['iqrRatio_ML_Vert'] = gen1featuresDEFOG(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['t_rel'] = data.Time/np.amax(data.Time)
    data['slopeSignChangeAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['slopeSignChangeML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['sumAbsDiffAP'] = seglearn_fxns(14, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['sumAbsDiffVert'] = seglearn_fxns(14, data.Time, data.AccV, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    dfList_DEFOG.append(data)


df_DEFOG = pd.concat(dfList_DEFOG, ignore_index=True)
df_DEFOG.drop(columns =['Time', 'AccV', 'AccML', 'AccAP'], inplace=True)

  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis

In [7]:
###################TDCS####################
left_TDCS = df_TDCS.set_index('Id')
right_TDCS = df_TDCSmetadata.set_index('Id')
df_new_TDCS = left_TDCS.join(right_TDCS)
df_new_TDCS.reset_index(inplace=True)

nextLeft_TDCS = df_new_TDCS.set_index('Subject')
nextRight_TDCS = df_subjects.set_index(['Subject']) #Visit is NaN for TDCS
df_final_TDCS = nextLeft_TDCS.join(nextRight_TDCS, on=['Subject'], lsuffix='_meta', rsuffix='_subjects')
df_final_TDCS.reset_index(inplace=True)
df_final_TDCS['Visit'] = df_final_TDCS['Visit_meta'] 
df_final_TDCS = df_final_TDCS.drop(columns=['Visit_subjects'])
df_final_TDCS = df_final_TDCS.drop(columns=['Visit_meta'])


###################DEFOG####################
left_DEFOG = df_DEFOG.set_index('Id')
right_DEFOG = df_DEFOGmetadata.set_index('Id')
df_new_DEFOG = left_DEFOG.join(right_DEFOG)
df_new_DEFOG.reset_index(inplace=True)

nextLeft_DEFOG = df_new_DEFOG.set_index('Subject')
nextRight_DEFOG = df_subjects.set_index(['Subject', 'Visit'])
df_final_DEFOG = nextLeft_DEFOG.join(nextRight_DEFOG, on=['Subject', 'Visit'], lsuffix='_meta', rsuffix='_subjects')
df_final_DEFOG.reset_index(inplace=True)
df_final_DEFOG = df_final_DEFOG.drop(columns=['Task', 'Valid'])


del left_TDCS
del right_TDCS
del df_new_TDCS
del nextLeft_TDCS
del nextRight_TDCS

del left_DEFOG
del right_DEFOG
del df_new_DEFOG
del nextLeft_DEFOG
del nextRight_DEFOG

df_final_TDCS.fillna(0, inplace=True)
df_final_DEFOG.fillna(0, inplace=True)

#print(df_final_TDCS.isna().sum())

In [8]:
features = ['Visit', 'Age', 'YearsSinceDx', 'UPDRSIII_On', 
            'UPDRSIII_Off', 'NFOGQ', 'highfMLbylowfAP', 
            't_rel', 'meanAmpWeightedPhase', 'phaseSpread', 'Sex', 
            'slopeSignChangeAP', 'slopeSignChangeML', 'sumAbsDiffAP',
            'sumAbsDiffVert']

TDCS_Only_Features = ['Test', 'Medication', 'MLmAP', 'APmVert',
                      'meanWeightedFreqAP', 'sumML', 'slopeSignChangeVert']

DEFOG_Only_Features = ['max_aAP', 'max_aVert', 'iqrRatio_ML_Vert', 
                       'phaseExcursion']

featuresTDCS = []
featuresTDCS.extend(features)
featuresTDCS.extend(TDCS_Only_Features)

featuresDEFOG = []
featuresDEFOG.extend(features)
featuresDEFOG.extend(DEFOG_Only_Features)


outputCols = ['StartHesitation', 'Turn', 'Walking']

X_train_TDCS, X_val_TDCS, y_train_TDCS, y_val_TDCS = train_test_split(df_final_TDCS.loc[:, featuresTDCS], df_final_TDCS.loc[:, outputCols], test_size=0.3, random_state=42)
X_train_DEFOG, X_val_DEFOG, y_train_DEFOG, y_val_DEFOG = train_test_split(df_final_DEFOG.loc[:, featuresDEFOG], df_final_DEFOG.loc[:, outputCols], test_size=0.3, random_state=42)


#X_train_TDCS = df_final_TDCS.loc[:, featuresTDCS]
#y_train_TDCS = df_final_TDCS.loc[:, outputCols]

del df_final_TDCS

#X_train_DEFOG = df_final_DEFOG.loc[:, featuresDEFOG]
#y_train_DEFOG = df_final_DEFOG.loc[:, outputCols]

del df_final_DEFOG

In [9]:
#Credit to https://neptune.ai/blog/lightgbm-parameters-guide
SEARCH_PARAMS = {'learning_rate': 0.05,
                'max_depth': 8,
                'num_leaves': 50,
                 'min_data_in_leaf': 2500,
                'feature_fraction': 1.0}

FIXED_PARAMS={'objective': 'binary',
             'metric': 'auc',
             'is_unbalance': True,
             'bagging_freq': 5,
             'boosting': 'dart',
             'num_boost_round': 300,
             'early_stopping_rounds': 10}

def train_evaluate(search_params, X_train, y_train, X_valid, y_valid):
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)
    params = {'metric':FIXED_PARAMS['metric'],
             'objective':FIXED_PARAMS['objective'],
             **search_params}
    model = lgb.train(params, train_data,
                     valid_sets=[valid_data],
                     num_boost_round=FIXED_PARAMS['num_boost_round'],
                     early_stopping_rounds=FIXED_PARAMS['early_stopping_rounds'],
                     valid_names=['valid'])
    score = model.best_score['valid']['auc']
    return score, model

In [10]:
print("Training the models...")
score1, myStartHesitationModelTDCS = train_evaluate(SEARCH_PARAMS, X_train_TDCS, y_train_TDCS.StartHesitation, X_val_TDCS, y_val_TDCS.StartHesitation)
score2, myTurnModelTDCS = train_evaluate(SEARCH_PARAMS, X_train_TDCS, y_train_TDCS.Turn, X_val_TDCS, y_val_TDCS.Turn)
score3, myWalkingModelTDCS = train_evaluate(SEARCH_PARAMS, X_train_TDCS, y_train_TDCS.Walking, X_val_TDCS, y_val_TDCS.Walking)

"""predsSH_TDCS = myStartHesitationModelTDCS.predict(X_test_TDCS)
predsT_TDCS = myTurnModelTDCS.predict(X_test_TDCS)
predsW_TDCS = myWalkingModelTDCS.predict(X_test_TDCS)

df_pred_TDCS = pd.DataFrame()
df_pred_TDCS['StartHesitation'] = predsSH_TDCS
df_pred_TDCS['Turn'] = predsT_TDCS
df_pred_TDCS['Walking'] = predsW_TDCS"""

score1, myStartHesitationModelDEFOG = train_evaluate(SEARCH_PARAMS, X_train_DEFOG, y_train_DEFOG.StartHesitation, X_val_DEFOG, y_val_DEFOG.StartHesitation)
score2, myTurnModelDEFOG = train_evaluate(SEARCH_PARAMS, X_train_DEFOG, y_train_DEFOG.Turn, X_val_DEFOG, y_val_DEFOG.Turn)
score3, myWalkingModelDEFOG = train_evaluate(SEARCH_PARAMS, X_train_DEFOG, y_train_DEFOG.Walking, X_val_DEFOG, y_val_DEFOG.Walking)

"""predsSH_DEFOG = myStartHesitationModelDEFOG.predict(X_test_DEFOG)
predsT_DEFOG = myTurnModelDEFOG.predict(X_test_DEFOG)
predsW_DEFOG = myWalkingModelDEFOG.predict(X_test_DEFOG)

df_pred_DEFOG = pd.DataFrame()
df_pred_DEFOG['StartHesitation'] = predsSH_DEFOG
df_pred_DEFOG['Turn'] = predsT_DEFOG
df_pred_DEFOG['Walking'] = predsW_DEFOG"""


#Free memory
del X_train_TDCS
del y_train_TDCS
del X_train_DEFOG
del y_train_DEFOG

Training the models...




[LightGBM] [Info] Number of positive: 213256, number of negative: 4730614
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3411
[LightGBM] [Info] Number of data points in the train set: 4943870, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.043135 -> initscore=-3.099317
[LightGBM] [Info] Start training from score -3.099317
[1]	valid's auc: 0.948804
Training until validation scores don't improve for 10 rounds
[2]	valid's auc: 0.977204
[3]	valid's auc: 0.978036
[4]	valid's auc: 0.978357
[5]	valid's auc: 0.979669
[6]	valid's auc: 0.979922
[7]	valid's auc: 0.980019
[8]	valid's auc: 0.981815
[9]	valid's auc: 0.983477
[10]	valid's auc: 0.98421
[11]	valid's auc: 0.984236
[12]	valid's auc: 0.985005
[13]	valid's auc: 0.985175
[14]	valid's auc: 0.985455
[15]	valid's auc: 0.985524
[16]	valid's auc: 0.98591
[17]	valid's auc: 0.986889
[18]	valid's auc: 0.989923
[19]	



[LightGBM] [Info] Number of positive: 1174563, number of negative: 3769307
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3411
[LightGBM] [Info] Number of data points in the train set: 4943870, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.237580 -> initscore=-1.165995
[LightGBM] [Info] Start training from score -1.165995
[1]	valid's auc: 0.91892
Training until validation scores don't improve for 10 rounds
[2]	valid's auc: 0.925048
[3]	valid's auc: 0.9275
[4]	valid's auc: 0.929968
[5]	valid's auc: 0.933217
[6]	valid's auc: 0.935985
[7]	valid's auc: 0.93782
[8]	valid's auc: 0.941946
[9]	valid's auc: 0.943557
[10]	valid's auc: 0.944664
[11]	valid's auc: 0.945871
[12]	valid's auc: 0.946535
[13]	valid's auc: 0.948825
[14]	valid's auc: 0.949887
[15]	valid's auc: 0.950686
[16]	valid's auc: 0.951287
[17]	valid's auc: 0.952063
[18]	valid's auc: 0.953083
[19]	valid's auc: 0.953797
[20]	valid's auc: 0.95418
[21]	valid's auc:



[LightGBM] [Info] Number of positive: 145428, number of negative: 4798442
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3411
[LightGBM] [Info] Number of data points in the train set: 4943870, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.029416 -> initscore=-3.496365
[LightGBM] [Info] Start training from score -3.496365
[1]	valid's auc: 0.946338
Training until validation scores don't improve for 10 rounds
[2]	valid's auc: 0.951923
[3]	valid's auc: 0.954779
[4]	valid's auc: 0.955856
[5]	valid's auc: 0.958317
[6]	valid's auc: 0.959516
[7]	valid's auc: 0.96006
[8]	valid's auc: 0.960973
[9]	valid's auc: 0.961247
[10]	valid's auc: 0.969463
[11]	valid's auc: 0.969558
[12]	valid's auc: 0.969931
[13]	valid's auc: 0.9704
[14]	valid's auc: 0.978566
[15]	valid's auc: 0.982166
[16]	valid's auc: 0.982432
[17]	valid's auc: 0.983095
[18]	valid's auc: 0.986537
[19]	v



[LightGBM] [Info] Number of positive: 345, number of negative: 9467646
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3169
[LightGBM] [Info] Number of data points in the train set: 9467991, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000036 -> initscore=-10.219846
[LightGBM] [Info] Start training from score -10.219846
[1]	valid's auc: 0.999122
Training until validation scores don't improve for 10 rounds
[2]	valid's auc: 0.365069
[3]	valid's auc: 0.490794
[4]	valid's auc: 0.362994
[5]	valid's auc: 0.363209
[6]	valid's auc: 0.363268
[7]	valid's auc: 0.36407
[8]	valid's auc: 0.364049
[9]	valid's auc: 0.364772
[10]	valid's auc: 0.365013
[11]	valid's auc: 0.366667
Early stopping, best iteration is:
[1]	valid's auc: 0.999122




[LightGBM] [Info] Number of positive: 411944, number of negative: 9056047
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3169
[LightGBM] [Info] Number of data points in the train set: 9467991, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.043509 -> initscore=-3.090301
[LightGBM] [Info] Start training from score -3.090301
[1]	valid's auc: 0.911104
Training until validation scores don't improve for 10 rounds
[2]	valid's auc: 0.923138
[3]	valid's auc: 0.93785
[4]	valid's auc: 0.942903
[5]	valid's auc: 0.947979
[6]	valid's auc: 0.949112
[7]	valid's auc: 0.95002
[8]	valid's auc: 0.951105
[9]	valid's auc: 0.952435
[10]	valid's auc: 0.953317
[11]	valid's auc: 0.954068
[12]	valid's auc: 0.954881
[13]	valid's auc: 0.955825
[14]	valid's auc: 0.956312
[15]	valid's auc: 0.963385
[16]	valid's auc: 0.964898
[17]	valid's auc: 0.965854
[18]	valid's auc: 0.966638
[19]	



[LightGBM] [Info] Number of positive: 68949, number of negative: 9399042
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3169
[LightGBM] [Info] Number of data points in the train set: 9467991, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.007282 -> initscore=-4.914996
[LightGBM] [Info] Start training from score -4.914996
[1]	valid's auc: 0.887275
Training until validation scores don't improve for 10 rounds
[2]	valid's auc: 0.939066
[3]	valid's auc: 0.940716
[4]	valid's auc: 0.942107
[5]	valid's auc: 0.942323
[6]	valid's auc: 0.953177
[7]	valid's auc: 0.954533
[8]	valid's auc: 0.977175
[9]	valid's auc: 0.977534
[10]	valid's auc: 0.979627
[11]	valid's auc: 0.982343
[12]	valid's auc: 0.983048
[13]	valid's auc: 0.983892
[14]	valid's auc: 0.985113
[15]	valid's auc: 0.985431
[16]	valid's auc: 0.985812
[17]	valid's auc: 0.987058
[18]	valid's auc: 0.987573
[19]

In [11]:
#READ IN TEST FILES

TDCS_Test_Path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog/'
DEFOG_Test_Path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/'

print("Reading in Test Files...")

###################TDCS####################
TDCS_Files_Test = glob.glob(os.path.join(TDCS_Test_Path,"*.csv"))

dfList_TDCS_Test = []
for f in TDCS_Files_Test:
    data = pd.read_csv(f)
    data['Id'] = f[-14:-4]
    data['meanHighFreqPowerAP'], data['weightedFreqExcursionAP'], data['meanWeightedFreqAP'] = engineerFeatures5(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['MLmAP'] = data.AccML - data.AccAP
    data['phaseSpread'], data['highfMLbylowfAP'], data['meanAmpWeightedPhase'] = gen1featuresTDCS(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['APmVert'] = data.AccAP - data.AccV
    data['t_rel'] = data.Time/np.amax(data.Time)
    data['slopeSignChangeAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['slopeSignChangeML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['slopeSignChangeVert'] = seglearn_fxns(13, data.Time, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumML'] = seglearn_fxns(4, data.Time, data.AccML, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumAbsDiffAP'] = seglearn_fxns(14, data.Time, data.AccAP, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['sumAbsDiffVert'] = seglearn_fxns(14, data.Time, data.AccV, sampleRateTDCSFOG, dummyVariable, batchesTDCS)
    data['Id_t'] = data.Id.astype('string') + "_" + data.Time.astype('string')
    dfList_TDCS_Test.append(data)


df_TDCS_Test = pd.concat(dfList_TDCS_Test, ignore_index=True)


###################DEFOG####################
DEFOG_Files_Test = glob.glob(os.path.join(DEFOG_Test_Path,"*.csv"))

dfList_DEFOG_Test = []
for f in DEFOG_Files_Test:
    data = pd.read_csv(f)
    data['Id'] = f[-14:-4]
    data['phaseSpread'], data['phaseExcursion'], data['meanAmpWeightedPhase'], data['highfMLbylowfAP'], data['max_aAP'], data['max_aVert'], data['iqrRatio_ML_Vert'] = gen1featuresDEFOG(data.Time, data.AccML, data.AccAP, data.AccV, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['t_rel'] = data.Time/np.amax(data.Time)
    data['slopeSignChangeAP'] = seglearn_fxns(13, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['slopeSignChangeML'] = seglearn_fxns(13, data.Time, data.AccML, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['sumAbsDiffAP'] = seglearn_fxns(14, data.Time, data.AccAP, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['sumAbsDiffVert'] = seglearn_fxns(14, data.Time, data.AccV, sampleRateDEFOG, dummyVariable, batchesDEFOG)
    data['Id_t'] = data.Id.astype('string') + "_" + data.Time.astype('string')
    dfList_DEFOG_Test.append(data)


df_DEFOG_Test = pd.concat(dfList_DEFOG_Test, ignore_index=True)

Reading in Test Files...


In [12]:
###################TDCS####################
left_TDCS_Test = df_TDCS_Test.set_index('Id')
right_TDCS_Test = df_TDCSmetadata.set_index('Id')
df_new_TDCS_Test = left_TDCS_Test.join(right_TDCS_Test)
df_new_TDCS_Test.reset_index(inplace=True)

nextLeft_TDCS_Test = df_new_TDCS_Test.set_index('Subject')
nextRight_TDCS_Test = df_subjects.set_index(['Subject'])
df_final_TDCS_Test = nextLeft_TDCS_Test.join(nextRight_TDCS_Test, on=['Subject'], lsuffix='_meta', rsuffix='_subjects')
df_final_TDCS_Test.reset_index(inplace=True)
df_final_TDCS_Test['Visit'] = df_final_TDCS_Test['Visit_meta'] 
df_final_TDCS_Test = df_final_TDCS_Test.drop(columns=['Visit_subjects'])

###################DEFOG####################
left_DEFOG_Test = df_DEFOG_Test.set_index('Id')
right_DEFOG_Test = df_DEFOGmetadata.set_index('Id')
df_new_DEFOG_Test = left_DEFOG_Test.join(right_DEFOG_Test)
df_new_DEFOG_Test.reset_index(inplace=True)

nextLeft_DEFOG_Test = df_new_DEFOG_Test.set_index('Subject')
nextRight_DEFOG_Test = df_subjects.set_index(['Subject', 'Visit'])
df_final_DEFOG_Test = nextLeft_DEFOG_Test.join(nextRight_DEFOG_Test, on=['Subject', 'Visit'], lsuffix='_meta', rsuffix='_subjects')
df_final_DEFOG_Test.reset_index(inplace=True)
#df_final_DEFOG_Test = df_final_DEFOG_Test.drop(columns=['Task', 'Valid'])

#del some unused df's
del left_TDCS_Test
del right_TDCS_Test
del df_new_TDCS_Test
del nextLeft_TDCS_Test
del nextRight_TDCS_Test

del left_DEFOG_Test
del right_DEFOG_Test
del df_new_DEFOG_Test
del nextLeft_DEFOG_Test
del nextRight_DEFOG_Test

del df_subjects


df_final_TDCS_Test.fillna(0, inplace=True)
df_final_DEFOG_Test.fillna(0, inplace=True)

In [13]:
X_test_TDCS = df_final_TDCS_Test.loc[:, featuresTDCS]

X_test_DEFOG = df_final_DEFOG_Test.loc[:, featuresDEFOG]


In [14]:
print("Making TDCS predictions...")
predsSH_TDCS = myStartHesitationModelTDCS.predict(X_test_TDCS)
predsT_TDCS = myTurnModelTDCS.predict(X_test_TDCS)
predsW_TDCS = myWalkingModelTDCS.predict(X_test_TDCS)

df_Predictions_TDCS = pd.DataFrame()
df_Predictions_TDCS['StartHesitation'] = predsSH_TDCS
df_Predictions_TDCS['Turn'] = predsT_TDCS
df_Predictions_TDCS['Walking'] = predsW_TDCS


print("Making DEFOG predictions...")
predsSH_DEFOG = myStartHesitationModelDEFOG.predict(X_test_DEFOG)
predsT_DEFOG = myTurnModelDEFOG.predict(X_test_DEFOG)
predsW_DEFOG = myWalkingModelDEFOG.predict(X_test_DEFOG)

df_Predictions_DEFOG = pd.DataFrame()
df_Predictions_DEFOG['StartHesitation'] = predsSH_DEFOG
df_Predictions_DEFOG['Turn'] = predsT_DEFOG
df_Predictions_DEFOG['Walking'] = predsW_DEFOG

Making TDCS predictions...
Making DEFOG predictions...


In [15]:
#store df of submission results.
print('Creating submission dataframe...')
finalCols = ['Id', 'StartHesitation', 'Turn', 'Walking']

id_t_TDCS = df_final_TDCS_Test.loc[:, 'Id_t']
id_t_DEFOG = df_final_DEFOG_Test.loc[:, 'Id_t']

df_Predictions_TDCS['Id'] =  id_t_TDCS
df_Predictions_DEFOG['Id'] = id_t_DEFOG

df_TDCS = df_Predictions_TDCS[finalCols]
df_DEFOG = df_Predictions_DEFOG[finalCols]

output_df = pd.concat([df_TDCS, df_DEFOG], ignore_index=True)

print('Writing submission file...')
output_df.to_csv("submission.csv", index=False)

Creating submission dataframe...
Writing submission file...
