Credit to 此般浅薄 for the initial inspiration

In [1]:
# Install tsflex and seglearn offline: --no-index disables package-index lookups, so pip
# resolves both packages from the wheel files under /kaggle/input/time-series-tools.
!pip install tsflex --no-index --find-links=file:///kaggle/input/time-series-tools
!pip install seglearn --no-index --find-links=file:///kaggle/input/time-series-tools

Looking in links: file:///kaggle/input/time-series-tools
Processing /kaggle/input/time-series-tools/tsflex-0.3.0-py3-none-any.whl
Installing collected packages: tsflex
Successfully installed tsflex-0.3.0
[0mLooking in links: file:///kaggle/input/time-series-tools
Processing /kaggle/input/time-series-tools/seglearn-1.2.5-py3-none-any.whl
Installing collected packages: seglearn
Successfully installed seglearn-1.2.5
[0m

In [2]:
import numpy as np
import pandas as pd
from sklearn import *
import glob
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from os import path
from pathlib import Path
from seglearn.feature_functions import base_features, emg_features
from tsflex.features import FeatureCollection, MultipleFeatureDescriptors
from tsflex.features.integrations import seglearn_feature_dict_wrapper
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.base import clone
from sklearn.metrics import average_precision_score

# Grab important files

## Open Files

In [3]:
root = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/'

# Collect the recording CSVs that sit exactly one folder below train/ and test/
# (e.g. train/defog/<id>.csv). glob only recurses when recursive=True is passed; without
# it '**' matches like '*', so the one-level wildcard is spelled out explicitly.
# Restricting the pattern to *.csv keeps stray non-CSV entries out of the lists, and
# sorting makes the file order (and everything derived from it) reproducible.
train = sorted(glob.glob(path.join(root, 'train', '*', '*.csv')))
test = sorted(glob.glob(path.join(root, 'test', '*', '*.csv')))

# Reads the competition's CSV tables into pandas DataFrames.
subjects = pd.read_csv(path.join(root, 'subjects.csv'))
tasks = pd.read_csv(path.join(root, 'tasks.csv'))
events = pd.read_csv(path.join(root, 'events.csv'))
tdcsfog_metadata = pd.read_csv(path.join(root, 'tdcsfog_metadata.csv'))
defog_metadata = pd.read_csv(path.join(root, 'defog_metadata.csv'))

# Tags each metadata frame with its source so rows stay distinguishable once stacked.
tdcsfog_metadata['Module'] = 'tdcsfog'
defog_metadata['Module'] = 'defog'

# Stacks the two metadata frames into one (each frame keeps its own row labels).
full_metadata = pd.concat([tdcsfog_metadata, defog_metadata])


In [4]:
# Number of K-Means clusters used when grouping subjects and when grouping tasks.
cluster_size = 8
# Random state handed to both K-Means fits so their cluster labels are repeatable.
seed = 42

In [5]:
subjects

Unnamed: 0,Subject,Visit,Age,Sex,YearsSinceDx,UPDRSIII_On,UPDRSIII_Off,NFOGQ
0,00f674,2.0,63,M,27.0,43.0,49.0,24
1,00f674,1.0,63,M,27.0,31.0,30.0,26
2,02bc69,,69,M,4.0,21.0,,22
3,040587,2.0,75,M,26.0,52.0,69.0,21
4,040587,1.0,75,M,26.0,47.0,75.0,24
...,...,...,...,...,...,...,...,...
168,f80507,1.0,57,M,2.0,12.0,,0
169,fa8764,,60,F,7.0,30.0,,19
170,fba3a3,1.0,65,F,8.0,28.0,,0
171,fcb9f5,1.0,69,M,3.5,27.0,49.0,23


## Clean the Data

In [6]:
# Subject 'fe5d84' gets its sex set to 'F' before the column is encoded.
is_fe5d84 = subjects['Subject'] == 'fe5d84'
subjects.loc[is_fe5d84, 'Sex'] = 'F'

# Replace the 'Sex' labels with integer codes.
sex_codes, _sex_labels = subjects['Sex'].factorize()
subjects['Sex'] = sex_codes

# Zero-fill what is still missing, then collapse to one row per subject by taking the
# per-column median; 'Subject' becomes the index from here on.
subjects = subjects.fillna(0).groupby('Subject').median()

# Cluster the subjects on every column except the first one.
# NOTE(review): with 'Subject' now the index, columns[1:] leaves out 'Visit' rather than
# an id column — confirm that excluding 'Visit' from the clustering is intended.
cluster_inputs = subjects[subjects.columns[1:]]
subject_kmeans = cluster.KMeans(n_clusters=cluster_size, random_state=seed)
subjects['s_group'] = subject_kmeans.fit_predict(cluster_inputs)

# Prefix the subject-level columns with 's_' so they are recognisable after merging.
subjects = subjects.rename(columns={
    'Visit': 's_visit',
    'Age': 's_age',
    'YearsSinceDx': 's_years',
    'UPDRSIII_On': 's_on',
    'UPDRSIII_Off': 's_off',
    'NFOGQ': 's_NFOGQ',
    'Sex': 's_sex',
})


## Process Tasks

In [7]:
# Length of every task interval: 'End' minus 'Begin'.
tasks['Duration'] = tasks['End'].sub(tasks['Begin'])

# Long -> wide: one row per 'Id', one column per 'Task', holding the summed duration
# (0 where an Id has no rows for that task).
tasks = tasks.pivot_table(
    values=['Duration'],
    index=['Id'],
    columns=['Task'],
    aggfunc='sum',
    fill_value=0,
)
# The pivot produces two-level column labels; keep only the second level (the task).
tasks.columns = [task_name for _, task_name in tasks.columns]
tasks = tasks.reset_index()

# Cluster on the per-task durations, i.e. every column after the leading 'Id'.
duration_cols = tasks.columns[1:]
task_kmeans = cluster.KMeans(n_clusters=cluster_size, random_state=seed)
tasks['t_group'] = task_kmeans.fit_predict(tasks[duration_cols])


## Merge metadata and process

In [8]:
# Attach the per-subject columns to every metadata row (left join on 'Subject').
metadata_w_subjects = pd.merge(full_metadata, subjects, how='left', on='Subject').copy()
# Column labels of the merged frame, kept for later reference.
features = metadata_w_subjects.columns

# Replace the 'Medication' labels with integer codes.
medication_codes, _medication_labels = metadata_w_subjects['Medication'].factorize()
metadata_w_subjects['Medication'] = medication_codes


# Using Kaggle User Model LGBM to regress baseline

Use tsflex (`FeatureCollection`, `MultipleFeatureDescriptors`) together with seglearn's feature functions (wrapped via `seglearn_feature_dict_wrapper`) to extract features from the time series data itself.

In [9]:
def _acc_feature_descriptors(feature_funcs):
    """Wrap a seglearn feature dict as tsflex descriptors for the AccV/AccML/AccAP series.

    Window and stride are both 5000, so consecutive windows do not overlap.
    """
    return MultipleFeatureDescriptors(
        functions=seglearn_feature_dict_wrapper(feature_funcs),
        series_names=['AccV', 'AccML', 'AccAP'],
        windows=[5000],
        strides=[5000],
    )


basic_feats = _acc_feature_descriptors(base_features())

# Per the original author, 'simple square integral' is the same as abs_energy
# (which is in base_features), so it is dropped from the EMG feature set.
emg_funcs = emg_features()
emg_funcs.pop('simple square integral')
emg_feats = _acc_feature_descriptors(emg_funcs)

# Single collection that computes both feature families in one pass.
fc = FeatureCollection([basic_feats, emg_feats])

### Here I optimized the original code to be less memory intensive


In [10]:
import gc

In [11]:
metadata_w_subjects

Unnamed: 0,Id,Subject,Visit,Test,Medication,Module,s_visit,s_age,s_sex,s_years,s_on,s_off,s_NFOGQ,s_group
0,003f117e14,4dc2f8,3,2.0,0,tdcsfog,0.0,68.0,1.0,9.0,17.0,15.0,15.0,3
1,009ee11563,f62eec,4,2.0,0,tdcsfog,0.0,71.0,0.0,10.0,42.0,0.0,24.0,0
2,011322847a,231c3b,2,2.0,0,tdcsfog,0.0,67.0,0.0,12.0,27.0,28.0,19.0,3
3,01d0fe7266,231c3b,2,1.0,1,tdcsfog,0.0,67.0,0.0,12.0,27.0,28.0,19.0,3
4,024418ba39,fa8764,19,3.0,0,tdcsfog,0.0,60.0,1.0,7.0,30.0,0.0,19.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,f3a921edee,1a778d,1,,1,defog,1.5,65.0,0.0,7.0,50.0,59.5,24.5,1
966,f40e8c6ebe,575c60,1,,1,defog,1.0,28.0,0.0,4.0,54.0,50.0,25.0,1
967,f8ddbdd98d,107712,1,,0,defog,1.0,82.0,1.0,11.0,38.0,42.0,21.0,6
968,f9efef91fb,5d9cae,2,,1,defog,1.5,72.0,0.5,14.0,22.5,39.0,16.0,7


In [13]:
def reader(file, chunksize=100000):
    """Load one recording CSV chunk by chunk and attach metadata and window features.

    Each chunk gets the recording 'Id' (file name up to its first dot), the 'Module'
    (name of the parent folder), the task cluster from ``tasks``, the subject/visit
    columns from ``metadata_w_subjects`` and the tsflex features from ``fc``; feature
    values are forward-filled from the row where their window begins.

    Parameters
    ----------
    file : str or path-like
        Path of the CSV. It must provide the columns 'Time', 'AccV', 'AccML', 'AccAP',
        'StartHesitation', 'Turn' and 'Walking'.
    chunksize : int, default 100000
        Rows read per chunk. Features are computed per chunk, so keep this a multiple of
        the feature window configured in ``fc`` if windows must line up across chunks.

    Returns
    -------
    pandas.DataFrame
        All chunks stacked, indexed by the file's 'Time' values, with 'Time_frac' equal
        to Time divided by the largest Time in the file. An empty DataFrame is returned
        when the file yields no rows or when anything fails; in the latter case the
        error is printed. Files lacking one of the required columns are expected to end
        up on that path, because pandas rejects unknown ``usecols`` names.
    """
    try:
        # Per-file constants, resolved once instead of once per chunk.
        file_id = Path(file).name.split('.')[0]
        dataset = Path(file).parts[-2]
        task_groups = tasks[['Id', 't_group']]
        subject_info = metadata_w_subjects[['Id', 'Subject', 'Visit', 'Test', 'Medication', 's_group']]

        chunks = []
        # The context manager closes the underlying file even if a chunk fails midway.
        with pd.read_csv(file, index_col='Time', chunksize=chunksize,
                         usecols=['Time', 'AccV', 'AccML', 'AccAP', 'StartHesitation', 'Turn', 'Walking']) as csv_chunks:
            for chunk in csv_chunks:
                # Remember the Time labels: the column merges below replace the index
                # with a fresh 0..n-1 range.
                time_index = chunk.index

                chunk['Id'] = file_id
                chunk['Module'] = dataset
                # Raw Time for now; it is normalised once the file-wide maximum is known
                # (dividing by the chunk's own maximum would restart the scale per chunk).
                chunk['Time_frac'] = time_index.to_numpy()

                chunk = pd.merge(chunk, task_groups, how='left', on='Id').fillna(-1)
                chunk = pd.merge(chunk, subject_info, how='left', on='Id').fillna(-1)

                chunk_feats = fc.calculate(chunk, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin").astype(np.float32)
                chunk = chunk.merge(chunk_feats, how="left", left_index=True, right_index=True)

                # Features exist only on the first row of each window; carry them forward.
                chunk = chunk.ffill()

                # Restore the Time labels so stacked chunks do not all restart at 0.
                # Assumes the left merges kept one output row per input row (at most one
                # match per Id); otherwise this raises and the file is reported below.
                chunk.index = time_index

                chunks.append(chunk)

        if not chunks:
            return pd.DataFrame()

        df = pd.concat(chunks)

        # Clear the memory from chunks list
        del chunks
        gc.collect()

        df['Time_frac'] = df['Time_frac'] / df['Time_frac'].max()

        return df
    except Exception as e:
        # Best effort by design: report the file and let the caller continue.
        print(f"Error processing file {file}: {e}")
        return pd.DataFrame()

# Run reader over every training file, stack the per-file frames and zero-fill whatever
# is still missing, then report the resulting shape.
train = pd.concat(
    [reader(csv_file) for csv_file in tqdm(train)]
).fillna(0)
print(train.shape)


  0%|          | 0/970 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [23]:
# Continue with a single defog recording (the full-dataset load above was interrupted).
single_recording = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog/be9d33541d.csv'
train = reader(single_recording)

In [24]:

# Prediction targets, and the columns written to the submission file.
pcols = ['StartHesitation', 'Turn', 'Walking']
scols = ['Id'] + pcols

# Model inputs: every column of train that is neither an identifier nor a label/annotation.
non_feature_cols = {
    'Id', 'Subject', 'Module', 'Time',
    'StartHesitation', 'Turn', 'Walking',
    'Valid', 'Task', 'Event',
}
cols = [name for name in train.columns if name not in non_feature_cols]


In [None]:
# Keyword arguments passed to every lgb.LGBMRegressor built in the cross-validation loop.
# NOTE(review): the gpu_* ids presumably only matter when LightGBM is asked to train on
# a GPU device, which this dict does not request — confirm.
# NOTE(review): LightGBM row subsampling is reportedly only active when subsample_freq
# is > 0 — confirm whether subsample=0.8 has any effect as configured.
best_params_ = dict(
    gpu_platform_id=0,
    gpu_device_id=0,
    num_leaves=31,  # keep in step with max_depth when tuning
    max_depth=30,  # lowered from 50
    min_child_weight=20,  # larger values are worth trying against overfitting
    subsample=0.8,  # lowered from 0.996 to curb overfitting
    colsample_bytree=0.8,  # lowered from 0.9 to curb overfitting
    n_estimators=500,  # raised from 291; costs more compute
    learning_rate=0.05,  # raised from 0.01 for faster learning; lower it again if accuracy suffers
)
def custom_average_precision(y_true, y_pred):
    """Evaluation-metric callback that scores predictions by average precision.

    Returns the tuple (metric name, score, True); the trailing True marks the metric as
    one where larger values are better.
    """
    return ('average_precision', average_precision_score(y_true, y_pred), True)

class LGBMMultiOutputRegressor(MultiOutputRegressor):
    """MultiOutputRegressor whose fit() gives each target its own slice of the eval set."""

    def fit(self, X, y, eval_set=None, **fit_params):
        """Fit one clone of ``self.estimator`` per column of ``y``.

        Parameters
        ----------
        X : training features, passed unchanged to every estimator.
        y : 2-D array of targets; column ``i`` trains estimator ``i``.
        eval_set : optional pair ``(X_val, y_val)``; when given, estimator ``i`` is
            fitted with ``eval_set=[(X_val, y_val[:, i])]``.
        **fit_params : forwarded to every estimator's ``fit``.

        Returns
        -------
        self
        """
        n_targets = y.shape[1]
        self.estimators_ = [clone(self.estimator) for _ in range(n_targets)]

        for target_idx in range(n_targets):
            if eval_set:
                # Pair the shared validation features with this target's column only.
                val_features, val_targets = eval_set[0], eval_set[1]
                fit_params['eval_set'] = [(val_features, val_targets[:, target_idx])]
            self.estimators_[target_idx].fit(X, y[:, target_idx], **fit_params)

        return self

In [None]:
n_splits = 5
max_train_rows = 2000000  # upper bound on training rows sampled per fold

# Folds are grouped by subject, so no subject appears on both sides of a split.
# NOTE(review): GroupKFold needs at least n_splits distinct subjects in train — confirm
# that train holds more than a single recording before running this cell.
kfold = GroupKFold(n_splits)
groups = kfold.split(train, groups=train.Subject)

# split() yields positional row numbers, so rows and columns are selected by position
# (iloc). Label-based .loc would pick wrong or repeated rows whenever train's index is
# not a unique 0..n-1 range, e.g. after stacking several recordings.
feature_pos = [train.columns.get_loc(c) for c in cols]
target_pos = [train.columns.get_loc(c) for c in pcols]

regs = []
cvs = []

for tr_idx, te_idx in tqdm(groups, total=n_splits, desc="Folds"):

    # Subsample the training rows; the cap keeps sample() from raising when a fold has
    # fewer rows than max_train_rows.
    n_sampled = min(max_train_rows, len(tr_idx))
    tr_idx = pd.Series(tr_idx).sample(n=n_sampled, random_state=42).values

    multioutput_regressor = LGBMMultiOutputRegressor(lgb.LGBMRegressor(**best_params_))

    x_train = train.iloc[tr_idx, feature_pos].to_numpy()
    y_train = train.iloc[tr_idx, target_pos].to_numpy()

    x_test = train.iloc[te_idx, feature_pos].to_numpy()
    y_test = train.iloc[te_idx, target_pos].to_numpy()

    # NOTE(review): early_stopping_rounds / verbose as fit() keywords are reportedly gone
    # in LightGBM 4.x (replaced by callbacks) — confirm against the installed version.
    multioutput_regressor.fit(
        x_train, y_train,
        eval_set=(x_test, y_test),
        eval_metric=custom_average_precision,
        early_stopping_rounds=15,
        verbose = 0,
    )

    regs.append(multioutput_regressor)

    # Held-out score of this fold; predictions are clipped into [0, 1] first.
    fold_pred = multioutput_regressor.predict(x_test).clip(0.0, 1.0)
    cv = metrics.average_precision_score(y_test, fold_pred)

    cvs.append(cv)

print(cvs)
print(np.mean(cvs))

In [None]:
import pickle

# Persist the list of per-fold regressors so inference can reload them from model.pkl.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(regs, model_file)


In [None]:
# sub = pd.read_csv(path.join(root, 'sample_submission.csv'))
# submission = []

# # Load the saved model
# with open('model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

# for f in test:
#     df = pd.read_csv(f)
#     df.set_index('Time', drop=True, inplace=True)

#     df['Id'] = f.split('/')[-1].split('.')[0]

#     dataset = Path(f).parts[-2]
        
# #     if dataset == 'tdcsfog':
# #         df.AccV = df.AccV / 9.80665
# #         df.AccML = df.AccML / 9.80665
# #         df.AccAP = df.AccAP / 9.80665
            
#     df['Time_frac']=(df.index/df.index.max()).values
#     df = pd.merge(df, tasks[['Id','t_group']], how='left', on='Id').fillna(-1)

#     df = pd.merge(df, metadata_w_subjects[['Id','Subject', 'Visit','Test','Medication','s_group']], how='left', on='Id').fillna(-1)
#     df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin")
#     df = df.merge(df_feats, how="left", left_index=True, right_index=True)
#     df.fillna(method="ffill", inplace=True)

# #     # stride
# #     df["Stride"] = df["AccV"] + df["AccML"] + df["AccAP"]

# #     # step
# #     df["Step"] = np.sqrt(abs(df["Stride"]))
        
#     res_vals = []
    
#     for i_fold in range(5):
        
#         pred = loaded_model[i_fold].predict(df[cols]).clip(0.0,1.0)
#         res_vals.append(np.expand_dims(np.round(pred, 3), axis = 2))
        
#     res_vals = np.mean(np.concatenate(res_vals, axis = 2), axis = 2)
#     res = pd.DataFrame(res_vals, columns=pcols)
    
#     df = pd.concat([df,res], axis=1)
#     df['Id'] = df['Id'].astype(str) + '_' + df.index.astype(str)
#     submission.append(df[scols])
    
# submission = pd.concat(submission)
# submission = pd.merge(sub[['Id']], submission, how='left', on='Id').fillna(0.0)
# submission[scols].to_csv('submission.csv', index=False)

In [None]:
# submission