In [None]:
import os
import tqdm
import glob
import numpy as np
import pandas as pd

Load and process the data

In [None]:
def read_metadata(file_path, id_col='Id', subject_col='Subject'):
    """Load a metadata CSV file into a DataFrame.

    Parameters
    ----------
    file_path : str or file-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.
    id_col, subject_col : str
        Accepted for interface compatibility; this function does not use
        them.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(file_path)

def load_data(file_paths, meta_df, limit=None):
    """Read per-series CSV files and attach metadata columns to every row.

    The base name of each file (without its extension) is treated as the
    series Id. The first ``meta_df`` row with that Id supplies ``Medication``
    and the subject; the first ``meta_df`` row for that subject supplies
    ``Age``, ``Sex``, ``YearsSinceDx`` and ``NFOGQ``. Each value is broadcast
    onto all rows read from the file.

    Parameters
    ----------
    file_paths : sequence of str
        Paths of the CSV files to load.
    meta_df : pandas.DataFrame
        Must contain the columns ``Id``, ``Subject``, ``Medication``,
        ``Age``, ``Sex``, ``YearsSinceDx`` and ``NFOGQ``.
        NOTE(review): the caller loads a separate subjects table that is
        never passed in here -- confirm the subject-level columns really
        live in the metadata frame.
    limit : int, optional
        Sub-sampling stride. When truthy, only every ``limit``-th path is
        loaded (``file_paths[::limit]``).

    Returns
    -------
    pandas.DataFrame
        Rows of all loaded files stacked in load order with a fresh
        0..n-1 index. An empty DataFrame when no file is loaded.

    Raises
    ------
    IndexError
        If a file's Id, or the subject of that Id, has no row in ``meta_df``.
    KeyError
        If ``meta_df`` lacks one of the required columns.
    """
    subject_cols = ('Age', 'Sex', 'YearsSinceDx', 'NFOGQ')

    if limit:
        # NOTE(review): this is a stride (every limit-th file), not a cap on
        # the number of files, despite the parameter name -- confirm intent.
        file_paths = file_paths[::limit]

    frames = []
    for fp in tqdm.tqdm(file_paths):
        series_df = pd.read_csv(fp)
        file_id = os.path.splitext(os.path.basename(fp))[0]

        # Resolve the metadata rows once per file instead of re-scanning
        # meta_df for every attached column.
        id_rows = meta_df.loc[meta_df['Id'] == file_id]
        if id_rows.empty:
            raise IndexError(
                "no metadata row with Id %r (file: %s)" % (file_id, fp)
            )
        subject = id_rows['Subject'].iloc[0]
        subject_rows = meta_df.loc[meta_df['Subject'] == subject]

        series_df['Medication'] = id_rows['Medication'].iloc[0]
        for col in subject_cols:
            # Column-wise lookup keeps each column's own dtype.
            series_df[col] = subject_rows[col].iloc[0]

        frames.append(series_df)

    if not frames:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # A single concat at the end avoids re-copying the accumulated rows on
    # every iteration.
    return pd.concat(frames, ignore_index=True)

def prepare_training_data(df_combined):
    """Keep only rows flagged both ``Valid`` and ``Task``, then drop the flags.

    Parameters
    ----------
    df_combined : pandas.DataFrame
        Frame containing the columns ``Valid`` and ``Task``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows where both flags equal ``True``, without
        the ``Valid`` and ``Task`` columns. The surviving rows keep their
        original index labels; the input frame is not modified.
    """
    flag_cols = ['Valid', 'Task']
    keep = df_combined['Valid'].eq(True) & df_combined['Task'].eq(True)
    return df_combined.loc[keep].drop(columns=flag_cols)

def encode_columns(df):
    """Binary-encode the ``Medication`` and ``Sex`` columns in place.

    ``Medication`` becomes 1 where the value equals ``'on'`` and ``Sex``
    becomes 1 where the value equals ``'M'``; every other value (including
    missing ones) becomes 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Medication`` and ``Sex``. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Column name -> the single value that is encoded as 1.
    positive_label = {'Medication': 'on', 'Sex': 'M'}
    for column, label in positive_label.items():
        df[column] = np.where(df[column] == label, 1, 0)
    return df

In [None]:
# Root directory of the competition data.
pdir = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction'

# Per-series metadata for each dataset, plus the subjects table.
df_tdcs_meta = read_metadata(os.path.join(pdir, 'tdcsfog_metadata.csv'))
df_defog_meta = read_metadata(os.path.join(pdir, 'defog_metadata.csv'))
# NOTE(review): df_subjects is loaded but not used by any cell shown here.
df_subjects = read_metadata(os.path.join(pdir, 'subjects.csv'))

# load tdcsfog data
# glob returns paths in a file-system-dependent order; sorting makes the
# every-Nth-file subsample taken by load_data(limit=N) reproducible.
# (recursive=True was dropped: it has no effect without a '**' pattern.)
tdcs_file_path = sorted(glob.glob(os.path.join(pdir, 'train', 'tdcsfog', '*.csv')))
df_tdcs = load_data(tdcs_file_path, df_tdcs_meta, limit=100)

# load defog data
defog_file_path = sorted(glob.glob(os.path.join(pdir, 'train', 'defog', '*.csv')))
df_defog = load_data(defog_file_path, df_defog_meta, limit=50)
# Keep only the defog rows flagged Valid and Task, and drop those flag columns.
df_defog = prepare_training_data(df_defog)

In [None]:
# Stack both datasets into one training frame with a fresh index, then
# binary-encode the Medication and Sex columns.
df_train = encode_columns(
    pd.concat([df_tdcs, df_defog], ignore_index=True)
)

# Separate the three event-label columns (target) from the remaining
# columns (features); 'Time' is excluded from the features.
target_cols = ['StartHesitation', 'Turn', 'Walking']
y = df_train[target_cols]                          # target
X = df_train.drop(columns=target_cols + ['Time'])  # feature


Train the model

In [None]:
from xgboost import XGBClassifier


# Fit an XGBoost classifier (n_estimators=100) on the training features.
# NOTE(review): y has three label columns (StartHesitation, Turn, Walking);
# confirm the installed xgboost version accepts a multi-column target.
xgb = XGBClassifier(n_estimators=100)
xgb.fit(X, y)

Get the test data

In [None]:
# Collect the test series of both datasets. glob returns paths in a
# file-system-dependent order, so sort them to keep the row order of df_test
# (and therefore of the submission) the same on every run.
# (recursive=True was dropped: it has no effect without a '**' pattern.)
# NOTE(review): load_data looks every test file up in the same metadata
# frames used for training and raises if an Id is absent -- confirm that the
# metadata files cover the test series.
tdcs_test_file_path = sorted(glob.glob(os.path.join(pdir, 'test', 'tdcsfog', '*.csv')))
df_tdcs_test = load_data(tdcs_test_file_path, df_tdcs_meta)

defog_test_file_path = sorted(glob.glob(os.path.join(pdir, 'test', 'defog', '*.csv')))
df_defog_test = load_data(defog_test_file_path, df_defog_meta)

# Stack both test sets with a fresh index and apply the same binary encoding
# as for the training data.
df_test = pd.concat([df_tdcs_test, df_defog_test]).reset_index(drop=True)
df_test = encode_columns(df_test)

In [None]:
# Keep the row identifiers for the submission and build the test feature
# matrix by dropping the non-feature columns.
# NOTE(review): load_data does not create an 'Id' column, so this relies on
# the test CSV files providing one -- confirm.
non_feature_cols = ['Time', 'Id']
Id = df_test['Id']                               # Id for submission data
X_test = df_test.drop(columns=non_feature_cols)  # feature of test data
X_test.head()

In [None]:
# Predict the target columns for the test features with the trained
# XGBClassifier model (`xgb`).
prediction = xgb.predict(X_test)

In [None]:
# Assemble the submission frame: the Id column followed by one column per
# target, taken from the matching column position of the prediction array.
submission = pd.DataFrame(Id, columns=['Id'])
for position, label in enumerate(['StartHesitation', 'Turn', 'Walking']):
    submission[label] = prediction[:, position]

In [None]:
# Write the submission frame to 'submission.csv' in the current working
# directory, leaving out the DataFrame index.
submission.to_csv('submission.csv', index=False)