In [None]:
import os
import gc
import random
import time

import json
from tqdm import tqdm
import glob
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from sklearn.model_selection import train_test_split, StratifiedGroupKFold
from sklearn.metrics import accuracy_score, average_precision_score

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Per-dataset training folders; note that each path already ends in the dataset name.
train_dir1 = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog"
train_dir2 = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog"

batch_size = 1024
# Number of samples in one model input window, split into a future part and a past part.
window_size = 32
window_future = 8
window_past = window_size - window_future

# Row stride used by FOGDataset: a window spans wx*window_size raw rows and keeps every wx-th one.
wx = 8

# Default constructor arguments of FOGModel.
model_dropout = 0.2
model_hidden = 512
model_nblocks = 3

lr = 0.00015
num_epochs = 10
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): neither list is referenced by the code visible in this notebook.
feature_list = ['AccV', 'AccML', 'AccAP']
label_list = ['StartHesitation', 'Turn', 'Walking']
    
    

Preprocess the metadata and build the grouped k-fold train/validation split

In [None]:
def process_metadata(metadata, directory, prefix):
    """Add per-recording label totals and row counts to ``metadata`` in place.

    For every value in ``metadata['Id']`` the matching ``<Id>.csv`` is read and
    the sums of its 'StartHesitation', 'Turn' and 'Walking' columns plus its
    number of rows are stored in the new columns 'n1_sum', 'n2_sum', 'n3_sum'
    and 'count'.

    Args:
        metadata: DataFrame with an 'Id' column; modified in place.
        directory: either the folder that contains ``prefix`` as a
            sub-folder, or the ``prefix`` folder itself.
        prefix: dataset folder name (the notebook uses "tdcsfog" / "defog").

    Returns:
        None.
    """
    # The notebook passes train_dir1/train_dir2, which already end in the
    # prefix; blindly appending it again produced paths that never exist.
    if os.path.basename(os.path.normpath(directory)) == prefix:
        base_dir = directory
    else:
        base_dir = os.path.join(directory, prefix)

    n1_sum = []
    n2_sum = []
    n3_sum = []
    count = []
    missing = 0

    for f in tqdm(metadata['Id']):
        fpath = os.path.join(base_dir, f"{f}.csv")
        if os.path.exists(fpath):
            df = pd.read_csv(fpath)
            n1_sum.append(np.sum(df['StartHesitation']))
            n2_sum.append(np.sum(df['Turn']))
            n3_sum.append(np.sum(df['Walking']))
            count.append(len(df))
        else:
            # Keep the lists aligned with the metadata rows; skipping the row
            # made the column assignment below fail with a length mismatch.
            missing += 1
            n1_sum.append(np.nan)
            n2_sum.append(np.nan)
            n3_sum.append(np.nan)
            count.append(0)

    # The previous message claimed a property of the labels that was never checked.
    print(f"{len(metadata) - missing} of {len(metadata)} metadata rows have a CSV in {base_dir} ({missing} missing)")

    metadata['n1_sum'] = n1_sum
    metadata['n2_sum'] = n2_sum
    metadata['n3_sum'] = n3_sum
    metadata['count'] = count

def stratified_group_kfold(metadata, prefix, split_index, directory=None):
    """Return the train/validation CSV paths of one subject-grouped fold.

    The recordings are split into 5 shuffled folds grouped by
    ``metadata['Subject']``. The stratification target is a constant, so only
    the grouping actually constrains the split.

    Args:
        metadata: DataFrame with 'Id' and 'Subject' columns.
        prefix: dataset folder name (the notebook uses "tdcsfog" / "defog").
        split_index: which fold (0-4) to use as the validation fold.
        directory: either the folder that contains ``prefix`` as a
            sub-folder, or the ``prefix`` folder itself. Defaults to the
            parent folder of the module-level ``train_dir1``.

    Returns:
        (train_fpaths, valid_fpaths): two lists of CSV paths.

    Raises:
        ValueError: if ``split_index`` is not a valid fold number.
    """
    n_splits = 5
    if not 0 <= split_index < n_splits:
        # Previously an out-of-range index silently returned the last fold.
        raise ValueError(f"split_index must be in [0, {n_splits - 1}], got {split_index}")

    if directory is None:
        # The body used to read an undefined global named `directory`;
        # fall back to the train root implied by the configured folders.
        directory = os.path.dirname(os.path.normpath(train_dir1))

    # Do not append the prefix twice when the folder already ends with it.
    if os.path.basename(os.path.normpath(directory)) == prefix:
        base_dir = directory
    else:
        base_dir = os.path.join(directory, prefix)

    sgkf = StratifiedGroupKFold(n_splits=n_splits, random_state=42, shuffle=True)
    splits = sgkf.split(X=metadata['Id'], y=[1]*len(metadata), groups=metadata['Subject'])
    for i, (train_index, valid_index) in enumerate(splits):
        print(f"Fold = {i}")
        # split() yields positional indices, so select with iloc rather than by label.
        train_ids = metadata['Id'].iloc[train_index]
        valid_ids = metadata['Id'].iloc[valid_index]
        print(f"Length of Train = {len(train_ids)}, Length of Valid = {len(valid_ids)}")
        if i == split_index:
            break

    train_fpaths = [os.path.join(base_dir, f"{_id}.csv") for _id in train_ids]
    valid_fpaths = [os.path.join(base_dir, f"{_id}.csv") for _id in valid_ids]

    return train_fpaths, valid_fpaths


# Dataset

For each time step we build a window of past and future accelerometer (Acc) readings around it. Where part of the window falls outside the available data, the missing portion is padded with zeros.

In [None]:
class FOGDataset(Dataset):
    """Windowed accelerometer dataset built from a list of recording CSVs.

    All recordings are stacked row-wise into one array that is zero-padded at
    both ends, so a window can be cut around any row. An item is the window
    of ``wx*window_past`` rows before and ``wx*window_future`` rows from the
    centre row onwards, sub-sampled every ``wx`` rows and reversed along time.

    Columns are addressed by position: 1:4 are the inputs, 4:7 the targets,
    and the third- and second-to-last columns are multiplied into the mask
    ``t``. NOTE(review): this assumes the CSV layout
    Time, AccV, AccML, AccAP, StartHesitation, Turn, Walking[, Valid, Task];
    confirm against the data description.

    Args:
        fpaths: list of CSV paths, or of ``(path, kind)`` pairs where
            ``kind`` is 'tdcs' or 'de' (the form the notebook passes).
        scale: stored on the instance; not used by this class.
        split: "train" (random rows, fixed nominal length), "test"
            (items carry a row id) or anything else for sequential access.

    Raises:
        ValueError: if ``fpaths`` is empty.
    """

    def __init__(self, fpaths, scale=9.806, split="train"):
        super(FOGDataset, self).__init__()
        tm = time.time()
        self.split = split
        self.scale = scale

        self.fpaths = fpaths
        # Callers pass (path, kind) tuples; the raw tuples used to be handed
        # straight to pd.read_csv / os.path.basename, which cannot take them.
        entries = [self._unpack(f) for f in fpaths]
        if not entries:
            raise ValueError("FOGDataset needs at least one CSV path")

        self.dfs = [self.read(path, kind) for path, kind in entries]
        self.f_ids = [os.path.basename(path)[:-4] for path, _ in entries]

        # Cumulative row counts: end_indices[i] is one past the last global
        # row index that belongs to file i.
        self.end_indices = []
        self.shapes = []
        _length = 0
        for df in self.dfs:
            self.shapes.append(df.shape[0])
            _length += df.shape[0]
            self.end_indices.append(_length)

        self.dfs = np.concatenate(self.dfs, axis=0).astype(np.float16)
        self.length = self.dfs.shape[0]

        shape1 = self.dfs.shape[1]

        # Zero padding so windows around the first/last rows stay in bounds.
        self.dfs = np.concatenate([np.zeros((wx*window_past, shape1)), self.dfs, np.zeros((wx*window_future, shape1))], axis=0)
        print(f"Dataset initialized in {time.time() - tm} secs!")
        gc.collect()

    @staticmethod
    def _unpack(entry):
        """Split an ``fpaths`` entry into (path, kind); kind is None for a bare path."""
        if isinstance(entry, (tuple, list)):
            return entry[0], entry[1]
        return entry, None

    def read(self, f, kind=None):
        """Load one CSV as a 2-D array.

        For non-test splits the array always ends with the columns
        Valid, Task, tdcs so that every file has the same width and the mask
        in ``__getitem__`` reads the intended columns.

        Args:
            f: CSV path, or a ``(path, kind)`` pair.
            kind: 'tdcs' or 'de'; inferred from the path when omitted.
        """
        if isinstance(f, (tuple, list)):
            f, kind = self._unpack(f)

        df = pd.read_csv(f)
        if self.split == "test":
            return np.array(df)

        # Files without Valid/Task columns are treated as fully valid;
        # without this the mask would be computed from label columns.
        for col in ('Valid', 'Task'):
            if col not in df.columns:
                df[col] = 1

        if kind is None:
            is_tdcs = 'tdcsfog' in f
        else:
            is_tdcs = str(kind).startswith('tdcs')
        df['tdcs'] = 1 if is_tdcs else 0

        return np.array(df)

    def __getitem__(self, index):
        if self.split == "train":
            # Training ignores `index` and samples a random row instead.
            row_idx = random.randint(0, self.length-1) + wx*window_past
        elif self.split == "test":
            # First file whose cumulative end lies beyond `index`.
            df_idx = next(i for i, e in enumerate(self.end_indices) if index < e)

            row_idx_true = self.shapes[df_idx] - (self.end_indices[df_idx] - index)
            _id = self.f_ids[df_idx] + "_" + str(row_idx_true)
            row_idx = index + wx*window_past
        else:
            row_idx = index + wx*window_past

        x = self.dfs[row_idx - wx*window_past : row_idx + wx*window_future, 1:4]
        # Keep every wx-th row, then reverse the time axis.
        x = x[::wx, :][::-1, :]
        x = torch.tensor(x.astype('float'))

        # Mask: product of the third- and second-to-last columns. For the
        # test split these are plain input columns and the value is unused
        # by the inference loop in this notebook.
        t = self.dfs[row_idx, -3]*self.dfs[row_idx, -2]

        if self.split == "test":
            return _id, x, t

        y = self.dfs[row_idx, 4:7].astype('float')
        y = torch.tensor(y)

        return x, y, t

    def __len__(self):
        if self.split == "train":
            # Nominal epoch size; training items are drawn at random.
            return 5_000_000
        return self.length


In [None]:
# Force a garbage-collection pass; as the cell's last expression, its return value is displayed.
gc.collect()

# Model

We create an LSTM model with one hidden layer followed by a fully connected output layer.

In [None]:
class FOGModel(nn.Module):
    """Feed-forward network over a flattened window of ``window_size*3`` values.

    Layout: one input linear layer, ``nblocks`` hidden blocks chained in an
    ``nn.Sequential``, and a linear head with 3 outputs.

    NOTE(review): ``_block`` is not defined in the code visible here; it must
    exist before this class is instantiated.

    Args:
        p: dropout probability, also forwarded to every block.
        dim: width of the input layer's output and of each block.
        nblocks: number of hidden blocks.
    """

    def __init__(self, p=model_dropout, dim=model_hidden, nblocks=model_nblocks):
        super().__init__()
        # Registered on the module but never called in forward().
        self.dropout = nn.Dropout(p)
        self.in_layer = nn.Linear(window_size*3, dim)
        hidden_blocks = [_block(dim, dim, p) for _ in range(nblocks)]
        self.blocks = nn.Sequential(*hidden_blocks)
        self.out_layer = nn.Linear(dim, 3)

    def forward(self, x):
        """Flatten each window to one row and return 3 raw outputs per row."""
        flat = x.view(-1, window_size*3)
        hidden = self.blocks(self.in_layer(flat))
        return self.out_layer(hidden)

class LSTMNet(nn.Module):
    """LSTM over a batch-first sequence followed by a linear head.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden width.
        num_layers: number of stacked LSTM layers.
        num_classes: number of outputs of the linear head.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return one ``num_classes``-sized row per sequence in ``x``.

        ``x`` is indexed as (batch, time, features) because the LSTM is
        built with ``batch_first=True``.
        """
        # nn.LSTM starts from zero hidden/cell states when none are given,
        # created to match the input. The explicit zero tensors used before
        # were pinned to the module-level `device` and to float32, which
        # broke as soon as the model lived on a different device.
        out, _ = self.lstm(x)
        # Only the output at the last sequence position feeds the head.
        out = out[:, -1, :]
        return self.fc1(out)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# Training

In [None]:
def train_one_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and print the mean batch loss.

    Args:
        model: network mapping a batch ``x`` to raw outputs shaped like ``y``.
        loader: iterable yielding ``(x, y, t)`` batches; ``t`` is a
            per-sample weight/mask.
        optimizer: optimiser over ``model``'s parameters.
        criterion: element-wise loss (no reduction), called as
            ``criterion(y_pred, y)``.

    Returns:
        None.
    """
    loss_sum = 0.
    # `GradScaler` was used without ever being imported (NameError); reach it
    # through the already-imported torch package instead. The forward pass
    # below is not wrapped in autocast, so the scaler only scales the loss.
    scaler = torch.cuda.amp.GradScaler()

    model.train()
    for x,y,t in tqdm(loader):
        x = x.to(device).float()
        y = y.to(device).float()
        t = t.to(device).float()

        # Clear gradients before the backward pass so nothing left over from
        # earlier calls can leak into the first step.
        optimizer.zero_grad()

        y_pred = model(x)
        loss = criterion(y_pred, y)
        # Weight every sample's loss by its mask, then average over outputs.
        loss = torch.mean(loss*t.unsqueeze(-1), dim=1)

        # Normalise by the mask total; a fully masked batch contributes a
        # zero loss that still keeps the graph connected for backward().
        t_sum = torch.sum(t)
        if t_sum > 0:
            loss = torch.sum(loss)/t_sum
        else:
            loss = torch.sum(loss)*0.

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        loss_sum += loss.item()

    print(f"Train Loss: {(loss_sum/len(loader)):.04f}")

def validation_one_epoch(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` and return the mean average precision.

    The masked loss is accumulated per batch. Average precision is computed
    per output column over the samples whose mask ``t`` is positive, using
    the model's raw outputs as scores, and the three values are averaged.

    Args:
        model: network mapping a batch ``x`` to 3 raw outputs per sample.
        loader: iterable yielding ``(x, y, t)`` batches.
        criterion: element-wise loss (no reduction).

    Returns:
        Mean of the three per-column average-precision scores.
    """
    loss_sum = 0.
    targets, outputs, masks = [], [], []

    model.eval()
    for x, y, t in tqdm(loader):
        x, y, t = (tensor.to(device).float() for tensor in (x, y, t))

        with torch.no_grad():
            y_pred = model(x)
            per_sample = torch.mean(criterion(y_pred, y)*t.unsqueeze(-1), dim=1)

            # Normalise by the mask total; a fully masked batch counts as zero.
            t_sum = torch.sum(t)
            if t_sum > 0:
                batch_loss = torch.sum(per_sample)/t_sum
            else:
                batch_loss = torch.sum(per_sample)*0.

        loss_sum += batch_loss.item()
        targets.append(y.cpu().numpy())
        outputs.append(y_pred.cpu().numpy())
        masks.append(t.cpu().numpy())

    # Score only the rows whose mask is positive.
    keep = np.concatenate(masks, axis=0) > 0
    y_true_all = np.concatenate(targets, axis=0)[keep, :]
    y_pred_all = np.concatenate(outputs, axis=0)[keep, :]

    scores = [average_precision_score(y_true_all[:, col], y_pred_all[:, col]) for col in range(3)]
    mean_score = np.mean(scores)
    print(f"Validation Loss: {(loss_sum/len(loader)):.04f}, Validation Score: {mean_score:.03f}, ClassWise: {scores[0]:.03f},{scores[1]:.03f},{scores[2]:.03f}")

    return mean_score


In [None]:
# LSTM hyper-parameters: 3 input features per time step, 3 outputs.
INPUT_SIZE = 3
HIDDEN_SIZE = 10
NUM_LAYERS = 1
NUM_CLASSES = 3
PARAMS = {
    "input_size" : INPUT_SIZE,
    "hidden_size" : HIDDEN_SIZE,
    "num_layers" : NUM_LAYERS,
    "num_classes" : NUM_CLASSES
}

model = LSTMNet(**PARAMS).to(device)
print(f"Number of parameters in model - {count_parameters(model):,}")

train_metadata_tdcs = pd.read_csv("/kaggle/input/copy-train-metadata/tdcsfog_metadata.csv")
train_metadata_de = pd.read_csv("/kaggle/input/copy-train-metadata/defog_metadata.csv")

# Adds label totals and row counts to both metadata frames in place.
process_metadata(train_metadata_tdcs, train_dir2, "tdcsfog")
process_metadata(train_metadata_de, train_dir1, "defog")

# Validation fold 2 for tdcsfog and fold 1 for defog.
train_fpaths_tdcs, valid_fpaths_tdcs = stratified_group_kfold(train_metadata_tdcs, "tdcsfog", 2)
train_fpaths_de, valid_fpaths_de = stratified_group_kfold(train_metadata_de, "defog", 1)

# Every path is tagged with the dataset it came from.
train_fpaths = [(f, 'de') for f in train_fpaths_de] + [(f, 'tdcs') for f in train_fpaths_tdcs]
valid_fpaths = [(f, 'de') for f in valid_fpaths_de] + [(f, 'tdcs') for f in valid_fpaths_tdcs]

train_dataset = FOGDataset(train_fpaths, split="train")
valid_dataset = FOGDataset(valid_fpaths, split="valid")
print(f"lengths of datasets: train - {len(train_dataset)}, valid - {len(valid_dataset)}")

train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=5, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=5)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# reduction='none' keeps the element-wise loss so it can be masked per sample.
criterion = torch.nn.BCEWithLogitsLoss(reduction='none').to(device)

max_score = 0.0
best_state = None

for epoch in range(num_epochs):
    print(f"Epoch: {epoch}")
    train_one_epoch(model, train_loader, optimizer, criterion)
    score = validation_one_epoch(model, valid_loader, criterion)

    # `max_score` and `score` used to be dead variables, so the final weights
    # were simply those of the last epoch. Keep a copy of the best epoch.
    if score > max_score:
        max_score = score
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        print(f"New best validation score: {max_score:.03f}")

# Continue with the best-scoring weights rather than the last epoch's.
if best_state is not None:
    model.load_state_dict(best_state)
print(f"Best validation score: {max_score:.03f}")

gc.collect()

# Submission

In [None]:
model.eval()

test_defog_paths = glob.glob("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/*.csv")
test_tdcsfog_paths = glob.glob("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog/*.csv")
# Tag every path with the dataset it came from, as done for the training files.
test_fpaths = [(f, 'de') for f in test_defog_paths] + [(f, 'tdcs') for f in test_tdcsfog_paths]

test_dataset = FOGDataset(test_fpaths, split="test")
# `cfg` is never defined in this notebook; the settings are the plain
# module-level `batch_size` and `device` from the configuration cell.
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=5)

ids = []
prediction = []

# The test split yields (row id, window, mask); the mask is not needed here.
for _id, x, _ in tqdm(test_loader):
    x = x.to(device).float()
    with torch.no_grad():
        # Raw model outputs scaled by 0.1; no sigmoid is applied.
        y_pred = model(x)*0.1

    ids.extend(_id)
    # Replace any NaN/inf outputs with finite numbers before collecting.
    prediction.extend(list(np.nan_to_num(y_pred.cpu().numpy())))

In [None]:
# Load the competition's sample submission file.
sample_submission = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/sample_submission.csv")
# Last expression of the cell, so the notebook displays the (rows, columns) tuple.
sample_submission.shape

In [None]:
# Stack the per-row outputs into one 2-D array with a column per label.
prediction = np.array(prediction)
submission = pd.DataFrame({
    'Id': ids,
    **{
        label: np.round(prediction[:, col], 5)
        for col, label in enumerate(('StartHesitation', 'Turn', 'Walking'))
    },
})

# Left-join onto the sample submission's Ids so the output has exactly those
# rows in that order; Ids without a prediction get 0.0.
submission = sample_submission[['Id']].merge(submission, how='left', on='Id').fillna(0.0)
submission.to_csv("submission.csv", index=False)