In [None]:
!pip install torch torchvision torchaudio
!pip install lightning
!pip install kaggle

Collecting lightning
  Downloading lightning-2.2.0.post0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.8.0 (from lightning)
  Downloading lightning_utilities-0.10.1-py3-none-any.whl (24 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.3.1-py3-none-any.whl (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.4/840.4 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.2.0.post0-py3-none-any.whl (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.9/800.9 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightning-utilities, torchmetrics, pytorch-lightning, lightning
Successfully installed lightning-2.2.0.post0 lightning-utilities-0.10.1 pytorch-lightning-2.2.0.post0 t

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import torch
import torchvision
import lightning as L
import torch.nn as nn
import torch.nn.functional as F
from torch import optim, nn
from torch.utils.data import Dataset, DataLoader,random_split,Subset, SubsetRandomSampler
from torchvision import models, transforms
from torchvision.datasets import ImageFolder
from torchvision import datasets
from torchsummary import summary
from tqdm import tqdm
from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint, EarlyStopping, Callback
from lightning.pytorch.loggers import TensorBoardLogger
from sklearn.preprocessing import LabelEncoder
import pickle
from tqdm import tqdm

device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c bgu-i-know-what-you-did-last-measurement-time

# Unzip the downloaded files and then remove the zip files
!unzip bgu-i-know-what-you-did-last-measurement-time.zip

In [None]:
unlabeled_dir = '/content/unlabeled/unlabeled'

unlabeled_files = os.listdir(unlabeled_dir)

In [None]:
train_data = pd.read_csv('train.csv')

In [None]:
# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the label encoder and transform the 'activity' column to get encoded labels
train_data['activity_encoded'] = label_encoder.fit_transform(train_data['activity'])

## **Part 2 - Neural Network**

**a. validation strategy - train_test_split**

In [None]:
from sklearn.model_selection import train_test_split

# Identify unique users
unique_subjects = train_data['userid'].unique()

# Split user IDs into training and validation sets
train_subjects, val_subjects = train_test_split(unique_subjects, test_size=0.2, random_state=42)

# Create training and validation sets
train_set = train_data[train_data['userid'].isin(train_subjects)]
val_set = train_data[train_data['userid'].isin(val_subjects)]

# Summary of the split
print(f"Training set includes {len(train_set)} samples from {len(train_subjects)} subjects.")
print(f"Validation set includes {len(val_set)} samples from {len(val_subjects)} subjects.")

Training set includes 38001 samples from 6 subjects.
Validation set includes 12247 samples from 2 subjects.


**c. Classical ML model**

In [None]:
import os
import pandas as pd
from scipy.stats import skew, kurtosis

def extract_statistical_features(df, data_dir):
    """
    Extracts statistical features from time-series data corresponding to each row in the DataFrame.

    Parameters:
    - df: DataFrame with at least an 'id' column to locate the data files.
    - data_dir: The directory where the sequence data files are stored.

    Returns:
    - DataFrame with original data augmented with statistical features.
    """
    df = df.copy()

    means, stds, mins, maxs, medians, skews, kurtoses = [], [], [], [], [], [], []

    for _, row in df.iterrows():
        seq_id = row['id']
        file_path = os.path.join(data_dir, f"{seq_id}.csv")
        seq_df = pd.read_csv(file_path)

        # Filter by measurement type if necessary
        if "measurement type" in seq_df.columns:
            seq_df = seq_df[seq_df["measurement type"] == "acceleration [m/s/s]"]

        # Calculate statistical features for each axis
        for axis in ['x', 'y', 'z']:
            axis_data = seq_df[axis] if axis in seq_df.columns else seq_df[f"{axis} [m]"]
            means.append(axis_data.mean())
            stds.append(axis_data.std())
            mins.append(axis_data.min())
            maxs.append(axis_data.max())
            medians.append(axis_data.median())
            skews.append(skew(axis_data))  # Corrected
            kurtoses.append(kurtosis(axis_data))  # Corrected

    # Create a DataFrame with the calculated features
    features_df = pd.DataFrame({
        'x_mean': means[0::3], 'y_mean': means[1::3], 'z_mean': means[2::3],
        'x_std': stds[0::3], 'y_std': stds[1::3], 'z_std': stds[2::3],
        'x_min': mins[0::3], 'y_min': mins[1::3], 'z_min': mins[2::3],
        'x_max': maxs[0::3], 'y_max': maxs[1::3], 'z_max': maxs[2::3],
        'x_median': medians[0::3], 'y_median': medians[1::3], 'z_median': medians[2::3],
        'x_skew': skews[0::3], 'y_skew': skews[1::3], 'z_skew': skews[2::3],
        'x_kurtosis': kurtoses[0::3], 'y_kurtosis': kurtoses[1::3], 'z_kurtosis': kurtoses[2::3],
    })

    # Return the original DataFrame augmented with the new features
    return pd.concat([df.reset_index(drop=True), features_df], axis=1)

In [None]:
# Load the train_features DataFrame from the saved CSV file
train_features_loaded = pd.read_csv('/content/train_features_1.csv')

# Load the val_features DataFrame from the saved CSV file
val_features_loaded = pd.read_csv('/content/val_features_1.csv')

In [None]:
features_columns = ['x_mean', 'y_mean', 'z_mean', 'x_std', 'y_std', 'z_std','x_min', 'y_min', 'z_min','x_max', 'y_max', 'z_max','x_median', 'y_median', 'z_median', 'x_skew', 'y_skew', 'z_skew','x_kurtosis', 'y_kurtosis', 'z_kurtosis']

In [None]:
# Assuming 'features_columns' is a list of your feature column names
X_train_loaded = train_features_loaded[features_columns].values
X_val_loaded = val_features_loaded[features_columns].values

y_train_loaded = train_features_loaded['activity_encoded'].values
y_val_loaded = val_features_loaded['activity_encoded'].values

In [None]:
class HARDataset(Dataset):
    def __init__(self, data, data_dir, transform=None, labeled=True):
        """
        Initializes the dataset.
        :param data: DataFrame for labeled data or list of filenames for unlabeled data.
        :param data_dir: Base directory where the data files are stored.
        :param transform: Optional transform to be applied on a sample.
        :param labeled: Flag indicating if the dataset is for labeled or unlabeled data.
        """
        self.data = data
        self.data_dir = data_dir
        self.transform = transform
        self.labeled = labeled

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.labeled:
            # Handling labeled data
            seq_id, activity_label = self.data.iloc[idx]['id'], self.data.iloc[idx]['activity_encoded']
            seq_file_path = os.path.join(self.data_dir, f"{seq_id}.csv")
        else:# Handling unlabeled data
            seq_file_path = os.path.join(self.data_dir, self.data[idx])

        # Load sequence data
        seq_frame = pd.read_csv(seq_file_path)
        if self.labeled and "measurement type" in seq_frame.columns:
            # Filter by acceleration if it's labeled data and contains a 'measurement type' column
            seq_frame = seq_frame[seq_frame["measurement type"] == "acceleration [m/s/s]"]
            sequence_data = seq_frame.iloc[:, 1:4].values
        else:
            sequence_data = seq_frame.values  # Use all data if no filtering criteria

        # Apply transformations
        if self.transform:
            sequence_data = self.transform(sequence_data)
        sequence_data_tensor = torch.tensor(sequence_data, dtype=torch.float32)

        if self.labeled:
            activity_label = self.data.iloc[idx]['activity_encoded']
            activity_label_tensor = torch.tensor(activity_label, dtype=torch.long)
            return sequence_data_tensor, activity_label_tensor
        else:
            return sequence_data_tensor  # No label for unlabeled data

In [None]:
#Padding from start for LSTM

def pad_sequences_from_start(batch, max_length=3000):
    # Initialize padded sequences and labels
    # Assuming each sequence has the same number of features
    num_features = batch[0][0].shape[1]
    padded_sequences = torch.zeros(len(batch), max_length, num_features)
    # labels = []
    labels = torch.zeros(len(batch), dtype=torch.long)

    for i, (sequence, label) in enumerate(batch):
        length = sequence.shape[0]
        start = max(max_length - length, 0)  # Calculate start index for sequence
        if length > max_length:
            # If sequence is longer than max_length, truncate it from the end
            padded_sequence = sequence[-max_length:]
        else:
            padded_sequences[i, start:] = sequence

        labels[i] = label

    return padded_sequences, labels

In [None]:
def pad_sequences_from_start(batch, max_length=3000):
    # Initialize padded sequences and labels
    num_features = batch[0].shape[1] if type(batch[0]) == torch.Tensor else batch[0][0].shape[1]
    padded_sequences = torch.zeros(len(batch), max_length, num_features)
    labels = None  # Initialize labels as None

    for i, data in enumerate(batch):
        if isinstance(data, tuple):  # Check if data has labels
            sequence, label = data
            if labels is None:  # Initialize labels if they exist
                labels = torch.zeros(len(batch), dtype=torch.long)
            labels[i] = label
        else:
            sequence = data  # No label present

        length = sequence.shape[0]
        start = max(max_length - length, 0)  # Calculate start index for sequence
        if length > max_length:
            padded_sequence = sequence[-max_length:]
        else:
            padded_sequences[i, start:] = sequence

    return (padded_sequences, labels) if labels is not None else padded_sequences


In [None]:
# Create dataset instances
train_dataset = HARDataset(train_features_loaded, unlabeled_dir, labeled=True)
val_dataset = HARDataset(val_features_loaded, unlabeled_dir, labeled=True)

In [None]:
# Create data loaders
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=pad_sequences_from_start)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=pad_sequences_from_start)

In [None]:
for i, (x, y) in enumerate(train_dataloader):
    if i == 5:
        break
    print(f"Batch {i+1}:")
    print(f"X shape: {x.shape}")
    print(f"y shape: {y.shape}")
    print(f"y: {y}")
    print()


## **Autoencoder**

### **#Unlabeled**

In [27]:
import os
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from torch.utils.data import Dataset
import torch

def extract_statistical_features(data_dir, file_list):
    features = []

    for file_name in file_list:
        file_path = os.path.join(data_dir, file_name)
        seq_df = pd.read_csv(file_path)

        if "measurement type" in seq_df.columns:
            seq_df_filtered = seq_df[seq_df["measurement type"].str.contains("acceleration")].copy()
            seq_df_filtered.drop(columns=['measurement type'], inplace=True)
        else:
            seq_df_filtered = seq_df.copy()

        if 'x [m]' in seq_df_filtered.columns:
            seq_df_filtered.rename(columns={'x [m]': 'x', 'y [m]': 'y', 'z [m]': 'z'}, inplace=True)

        stats = []
        for axis in ['x', 'y', 'z']:
            axis_data = seq_df_filtered[axis]
            stats.extend([axis_data.mean(), axis_data.std()])

        features.append(stats)

    columns = ['x_mean', 'y_mean', 'z_mean', 'x_std', 'y_std', 'z_std']

    features_df = pd.DataFrame(features, columns=columns)
    return features_df


In [28]:
def pad_sequences_from_start_unlabeled(batch, max_length=4000):
    num_features = batch[0].shape[1]  # Adjusted for sequences only, no labels
    padded_sequences = torch.zeros(len(batch), max_length, num_features)

    for i, sequence in enumerate(batch):  # No labels to unpack
        length = sequence.shape[0]
        if length > max_length:
            padded_sequences[i] = sequence[-max_length:]  # Truncate from the start
        else:
            start = max_length - length  # Calculate start index for sequence
            padded_sequences[i, start:] = sequence  # Pad at the beginning

    return padded_sequences

In [None]:
class UnlabeledSequenceDataset(Dataset):
    def __init__(self, sequence_dir, features_df, file_list):
        self.sequence_dir = sequence_dir
        self.features_df = features_df  # DataFrame of features
        self.file_list = file_list

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_name = self.file_list[idx].split('.')[0]  # Assuming file_list contains file names with '.csv' extension
        seq_file_path = os.path.join(self.sequence_dir, self.file_list[idx])
        sequence = pd.read_csv(seq_file_path)

        if 'measurement type' in sequence.columns:
            sequence_filtered = sequence[sequence['measurement type'] == 'acceleration [m/s/s]'].copy()
            sequence_filtered.drop(columns=['measurement type'], inplace=True)
            sequence_filtered.rename(columns=lambda x: x.strip().split(' ')[0], inplace=True)
        else:
            sequence_filtered = sequence.copy()

        # Check and convert axis names if they are in 'x [m]', 'y [m]', 'z [m]' format
        if set(['x [m]', 'y [m]', 'z [m]']).issubset(sequence_filtered.columns):
            sequence_filtered.rename(columns={'x [m]': 'x', 'y [m]': 'y', 'z [m]': 'z'}, inplace=True)

        sequence_tensor = torch.tensor(sequence_filtered[['x', 'y', 'z']].values, dtype=torch.float32)

        features = self.features_df.loc[self.features_df.index == file_name]
        if features.empty:
            features_tensor = torch.zeros(self.features_df.shape[1])
        else:
            features_tensor = torch.tensor(features.values[0], dtype=torch.float32)  # Assuming features are in the first row if present

        return sequence_tensor, features_tensor

In [None]:
# import random

# # Example usage with a subset of data
# data_dir = unlabeled_dir  # Directory where the sequence data files are stored
# all_file_list = os.listdir(data_dir)  # Get all file names in the directory

# # Select a random 5% of the files
# sample_size = int(len(all_file_list) * 0.05)  # Calculate 5% of the total number of files
# sampled_file_list = random.sample(all_file_list, sample_size)  # Randomly select files


In [None]:
# Now, use only this subset for your feature extraction
# unlabeled_features_sample = extract_statistical_features(data_dir, sampled_file_list)

In [None]:
# Ensure this list corresponds to the actual files used to create 'unlabeled_features_sample'
# sampled_files = random.sample(unlabeled_files, len(unlabeled_features_sample))  # Or however you've obtained the sample
# unlabeled_features_sample.index = [name.split('.')[0] for name in sampled_file_list]


In [None]:
def custom_collate_fn(batch):
    # Extract sequences and features from the batch
    sequences = [item[0] for item in batch]  # Extract sequences
    features = [item[1] for item in batch]  # Extract features

    # Pad the sequences
    max_length = 4000  # Adjust as necessary
    padded_sequences = pad_sequences_from_start_unlabeled(sequences, max_length)

    # Convert features to a tensor (since they should all be the same length already)
    features_tensor = torch.stack(features)

    return padded_sequences, features_tensor

In [None]:
# # Create your dataset using the sampled files and their corresponding features
# unlabeled_dataset_sample = UnlabeledSequenceDataset(data_dir, unlabeled_features_sample, sampled_file_list)

# # Now, create your DataLoader from this dataset
# unlabeled_dataloader_sample = DataLoader(unlabeled_dataset_sample, batch_size=64, shuffle=True, collate_fn=custom_collate_fn)


In [None]:
# for i, (sequences, features) in enumerate(unlabeled_dataloader_sample):
#     # sequences: padded sequence tensors
#     # features: corresponding statistical feature tensors
#     # Add your training code here
#     print(f"Batch {i + 1}: Sequences shape = {sequences.shape}, Features shape = {features.shape}")
#     if i == 5:  # Just an example to limit the loop for testing
#         break


Batch 1: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 21])
Batch 2: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 21])
Batch 3: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 21])
Batch 4: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 21])
Batch 5: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 21])
Batch 6: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 21])


In [None]:
# Extract features
unlabeled_features = extract_statistical_features(unlabeled_dir, unlabeled_files)

In [None]:
unlabeled_features.index = [name.split('.')[0] for name in unlabeled_files]
# unlabeled_features.index = [name.replace('.csv', '') for name in unlabeled_files]  # Set file names (without '.csv') as index

In [None]:
# Assuming unlabeled_features is a pandas DataFrame
unlabeled_features.to_csv('unlabeled_features.csv', index=False)

In [None]:
# Instantiate the dataset
unlabeled_dataset = UnlabeledSequenceDataset(
    sequence_dir=unlabeled_dir,
    features_df=unlabeled_features,
    file_list=unlabeled_files
)

# Create the DataLoader
unlabeled_dataloader = DataLoader(
    dataset=unlabeled_dataset,
    batch_size=64,  # You can adjust the batch size according to your needs
    shuffle=True,   # Shuffle the data to ensure random sampling
    collate_fn=custom_collate_fn,  # Use your custom collate function
    drop_last=True  # Drop the last batch if it's smaller than the specified batch size
)

In [None]:
# After extracting features, ensure the index matches the modified file names exactly.
unlabeled_features.index = [name.split('.')[0] for name in unlabeled_files]  # Assuming the file names are like '1641.csv'


In [None]:
for i, (sequences, features) in enumerate(unlabeled_dataloader):
    # sequences: padded sequence tensors
    # features: corresponding statistical feature tensors
    # Add your training code here
    print(f"Batch {i + 1}: Sequences shape = {sequences.shape}, Features shape = {features.shape}")
    if i == 5:  # Just an example to limit the loop for testing
        break

Batch 1: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 6])
Batch 2: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 6])
Batch 3: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 6])
Batch 4: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 6])
Batch 5: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 6])
Batch 6: Sequences shape = torch.Size([64, 4000, 3]), Features shape = torch.Size([64, 6])


### **#Define the Autoencoder for Pretraining**

**#Try**

In [29]:
import os
import pandas as pd
from scipy.stats import skew, kurtosis

def extract_statistical_features(df, data_dir):
    """
    Extracts statistical features from time-series data corresponding to each row in the DataFrame.

    Parameters:
    - df: DataFrame with at least an 'id' column to locate the data files.
    - data_dir: The directory where the sequence data files are stored.

    Returns:
    - DataFrame with original data augmented with statistical features.
    """
    df = df.copy()

    means, stds, mins, maxs, medians, skews, kurtoses = [], [], [], [], [], [], []

    for _, row in df.iterrows():
        seq_id = row['id']
        file_path = os.path.join(data_dir, f"{seq_id}.csv")
        seq_df = pd.read_csv(file_path)

        # Filter by measurement type if necessary
        if "measurement type" in seq_df.columns:
            seq_df = seq_df[seq_df["measurement type"] == "acceleration [m/s/s]"]

        # Calculate statistical features for each axis
        for axis in ['x', 'y', 'z']:
            axis_data = seq_df[axis] if axis in seq_df.columns else seq_df[f"{axis} [m]"]
            means.append(axis_data.mean())
            stds.append(axis_data.std())
            mins.append(axis_data.min())
            maxs.append(axis_data.max())
            medians.append(axis_data.median())
            skews.append(skew(axis_data))  # Corrected
            kurtoses.append(kurtosis(axis_data))  # Corrected

    # Create a DataFrame with the calculated features
    features_df = pd.DataFrame({
        'x_mean': means[0::3], 'y_mean': means[1::3], 'z_mean': means[2::3],
        'x_std': stds[0::3], 'y_std': stds[1::3], 'z_std': stds[2::3],
        'x_min': mins[0::3], 'y_min': mins[1::3], 'z_min': mins[2::3],
        'x_max': maxs[0::3], 'y_max': maxs[1::3], 'z_max': maxs[2::3],
        'x_median': medians[0::3], 'y_median': medians[1::3], 'z_median': medians[2::3],
        'x_skew': skews[0::3], 'y_skew': skews[1::3], 'z_skew': skews[2::3],
        'x_kurtosis': kurtoses[0::3], 'y_kurtosis': kurtoses[1::3], 'z_kurtosis': kurtoses[2::3],
    })

    # Return the original DataFrame augmented with the new features
    return pd.concat([df.reset_index(drop=True), features_df], axis=1)

In [30]:
# Load the train_features DataFrame from the saved CSV file
train_features_loaded = pd.read_csv('/content/train_features_1.csv')

# Load the val_features DataFrame from the saved CSV file
val_features_loaded = pd.read_csv('/content/val_features_1.csv')

In [None]:
features_columns = ['x_mean', 'y_mean', 'z_mean', 'x_std', 'y_std', 'z_std','x_min', 'y_min', 'z_min','x_max', 'y_max', 'z_max','x_median', 'y_median', 'z_median', 'x_skew', 'y_skew', 'z_skew','x_kurtosis', 'y_kurtosis', 'z_kurtosis']

In [56]:
import os
import pandas as pd
import torch

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data_dir, file_list, transform=None):
        """
        Args:
            data_dir (string): Directory with all the sequences.
            file_list (DataFrame): DataFrame containing file names and labels.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.data_dir = data_dir
        self.data = file_list
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if isinstance(self.data, pd.DataFrame):
            seq_id = self.data.iloc[idx]['id']  # Assuming 'id' contains file names without '.csv' extension
            seq_file_path = os.path.join(self.data_dir, f"{seq_id}.csv")
        else:
            # Handle the case where self.data is not a DataFrame but a list or similar
            seq_file_path = os.path.join(self.data_dir, self.data[idx])

        # Load sequence
        sequence = pd.read_csv(seq_file_path)

        # Preprocess and filter sequence data if necessary
        if 'measurement type' in sequence.columns:
            sequence_filtered = sequence[sequence['measurement type'] == 'acceleration [m/s/s]'].copy()
            sequence_filtered.drop(columns=['measurement type'], inplace=True)
            sequence_filtered.rename(columns=lambda x: x.strip().split(' ')[0], inplace=True)
        else:
            sequence_filtered = sequence.copy()

        # Convert axis names if in specific format
        if set(['x [m]', 'y [m]', 'z [m]']).issubset(sequence_filtered.columns):
            sequence_filtered.rename(columns={'x [m]': 'x', 'y [m]': 'y', 'z [m]': 'z'}, inplace=True)

        # Apply transformations
        if self.transform:
            sequence_filtered = self.transform(sequence_filtered)  # Ensure your transform can handle a DataFrame or a numpy array

        # Convert filtered sequence to tensor
        sequence_data_tensor = torch.tensor(sequence_filtered[['x', 'y', 'z']].values, dtype=torch.float32)

        # For autoencoder, return the same data as both input and output
        return sequence_data_tensor, sequence_data_tensor



In [57]:
def pad_sequences_from_start(batch, max_length=4000):
    # Initialize padded sequences
    num_features = batch[0].shape[1]  # Adjusted for batch without labels
    padded_sequences = torch.zeros(len(batch), max_length, num_features)

    for i, sequence in enumerate(batch):
        length = sequence.shape[0]
        start = max(max_length - length, 0)  # Calculate start index for sequence
        if length > max_length:
            # If sequence is longer than max_length, truncate it from the end
            padded_sequence = sequence[-max_length:]
        else:
            padded_sequences[i, start:] = sequence

    return padded_sequences  # Return only the padded sequences for unlabeled data

In [54]:
# Create dataset instances
train_dataset = CustomDataset(train_features_loaded, unlabeled_dir, labeled=False)
val_dataset = CustomDataset(val_features_loaded, unlabeled_dir, labeled=False)

# Create data loaders
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=pad_sequences_from_start)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=True, collate_fn=pad_sequences_from_start)

In [55]:
for i, (sequences, targets) in enumerate(train_dataloader):
    print(f"Batch {i + 1}:")
    print(f"Sequences shape: {sequences.shape}")
    print(f"Targets shape: {targets.shape}")  # If you're using targets, otherwise ignore
    if i == 2:  # Check first three batches
        break


KeyError: 3675

In [None]:
# Check the configuration for train_dataloader
print("Checking configuration for train_dataloader:")
for i, x in enumerate(train_dataloader):
    print(f"Batch {i+1}:")
    print(f"X shape: {x.shape}")  # x is the batch of sequences
    if i >= 2:  # Print the first 3 batches to check their shapes
        break

# Add a separator for clarity
print("\n" + "="*50 + "\n")

# Check the configuration for val_dataloader
print("Checking configuration for val_dataloader:")
for i, x in enumerate(val_dataloader):
    print(f"Batch {i+1}:")
    print(f"X shape: {x.shape}")  # x is the batch of sequences
    if i >= 2:  # Print the first 3 batches to check their shapes
        break


########################3

In [59]:
import os
import pandas as pd
from scipy.stats import skew, kurtosis

def extract_statistical_features(df, data_dir):
    """
    Extracts statistical features from time-series data corresponding to each row in the DataFrame.

    Parameters:
    - df: DataFrame with at least an 'id' column to locate the data files.
    - data_dir: The directory where the sequence data files are stored.

    Returns:
    - DataFrame with original data augmented with statistical features.
    """
    df = df.copy()

    means, stds, mins, maxs, medians, skews, kurtoses = [], [], [], [], [], [], []

    for _, row in df.iterrows():
        seq_id = row['id']
        file_path = os.path.join(data_dir, f"{seq_id}.csv")
        seq_df = pd.read_csv(file_path)

        # Filter by measurement type if necessary
        if "measurement type" in seq_df.columns:
            seq_df = seq_df[seq_df["measurement type"] == "acceleration [m/s/s]"]

        # Calculate statistical features for each axis
        for axis in ['x', 'y', 'z']:
            axis_data = seq_df[axis] if axis in seq_df.columns else seq_df[f"{axis} [m]"]
            means.append(axis_data.mean())
            stds.append(axis_data.std())
            mins.append(axis_data.min())
            maxs.append(axis_data.max())
            medians.append(axis_data.median())
            skews.append(skew(axis_data))  # Corrected
            kurtoses.append(kurtosis(axis_data))  # Corrected

    # Create a DataFrame with the calculated features
    features_df = pd.DataFrame({
        'x_mean': means[0::3], 'y_mean': means[1::3], 'z_mean': means[2::3],
        'x_std': stds[0::3], 'y_std': stds[1::3], 'z_std': stds[2::3],
        'x_min': mins[0::3], 'y_min': mins[1::3], 'z_min': mins[2::3],
        'x_max': maxs[0::3], 'y_max': maxs[1::3], 'z_max': maxs[2::3],
        'x_median': medians[0::3], 'y_median': medians[1::3], 'z_median': medians[2::3],
        'x_skew': skews[0::3], 'y_skew': skews[1::3], 'z_skew': skews[2::3],
        'x_kurtosis': kurtoses[0::3], 'y_kurtosis': kurtoses[1::3], 'z_kurtosis': kurtoses[2::3],
    })

    # Return the original DataFrame augmented with the new features
    return pd.concat([df.reset_index(drop=True), features_df], axis=1)

In [60]:
# Load the train_features DataFrame from the saved CSV file
train_features_loaded = pd.read_csv('/content/train_features_1.csv')

# Load the val_features DataFrame from the saved CSV file
val_features_loaded = pd.read_csv('/content/val_features_1.csv')

In [None]:
features_columns = ['x_mean', 'y_mean', 'z_mean', 'x_std', 'y_std', 'z_std','x_min', 'y_min', 'z_min','x_max', 'y_max', 'z_max','x_median', 'y_median', 'z_median', 'x_skew', 'y_skew', 'z_skew','x_kurtosis', 'y_kurtosis', 'z_kurtosis']

In [75]:
class HARDataset(Dataset):
    def __init__(self, data, data_dir, transform=None, labeled=True):
        """
        Initializes the dataset.
        :param data: DataFrame for labeled data or list of filenames for unlabeled data.
        :param data_dir: Base directory where the data files are stored.
        :param transform: Optional transform to be applied on a sample.
        :param labeled: Flag indicating if the dataset is for labeled or unlabeled data.
        """
        self.data = data
        self.data_dir = data_dir
        self.transform = transform
        self.labeled = labeled

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.labeled:
            # Handling labeled data
            seq_id, activity_label = self.data.iloc[idx]['id'], self.data.iloc[idx]['activity_encoded']
            seq_file_path = os.path.join(self.data_dir, f"{seq_id}.csv")
        else:# Handling unlabeled data
            seq_file_path = os.path.join(self.data_dir, self.data[idx])

        # Load sequence data
        seq_frame = pd.read_csv(seq_file_path)
        if  "measurement type" in seq_frame.columns:
            # Filter by acceleration if it's labeled data and contains a 'measurement type' column
            seq_frame = seq_frame[seq_frame["measurement type"] == "acceleration [m/s/s]"]
            sequence_data = seq_frame.iloc[:, 1:4].values
        else:
            sequence_data = seq_frame.values  # Use all data if no filtering criteria

        # Apply transformations
        if self.transform:
            sequence_data = self.transform(sequence_data)
        sequence_data_tensor = torch.tensor(sequence_data, dtype=torch.float32)

        if self.labeled:
            activity_label = self.data.iloc[idx]['activity_encoded']
            activity_label_tensor = torch.tensor(activity_label, dtype=torch.long)
            return sequence_data_tensor, activity_label_tensor
        else:
            return sequence_data_tensor  # No label for unlabeled data

In [76]:
#Padding from start for LSTM

def pad_sequences_from_start(batch, max_length=3000):
    # Initialize padded sequences and labels
    # Assuming each sequence has the same number of features
    num_features = batch[0][0].shape[1]
    padded_sequences = torch.zeros(len(batch), max_length, num_features)
    # labels = []
    labels = torch.zeros(len(batch), dtype=torch.long)

    for i, (sequence, label) in enumerate(batch):
        length = sequence.shape[0]
        start = max(max_length - length, 0)  # Calculate start index for sequence
        if length > max_length:
            # If sequence is longer than max_length, truncate it from the end
            padded_sequence = sequence[-max_length:]
        else:
            padded_sequences[i, start:] = sequence

        labels[i] = label

    return padded_sequences, labels

In [82]:
# Create dataset instances
train_dataset = HARDataset(train_features_loaded, unlabeled_dir, labeled=True)
val_dataset = HARDataset(val_features_loaded, unlabeled_dir, labeled=True)

In [83]:
# Create data loaders
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=pad_sequences_from_start)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=True, collate_fn=pad_sequences_from_start)

In [86]:
for i, (x,y) in enumerate(train_dataloader):
    if i == 5:
        break
    print(f"Batch {i+1}:")
    print(f"X shape: {x.shape}")
    print(f"y shape: {y.shape}")
    print(f"y: {y}")
    print()

Batch 1:
X shape: torch.Size([64, 3000, 3])
y shape: torch.Size([64])
y: tensor([ 7, 10,  5, 16,  6,  9, 10, 16,  9,  9, 13,  7,  8,  9,  9, 12,  9,  3,
         0, 11,  1,  5, 11, 10, 10, 16,  7,  7,  7, 12,  2,  7, 13,  0, 10, 11,
        10,  9,  4,  7, 16,  9,  9,  8, 15,  7,  1, 16,  3,  2, 14,  4, 14,  7,
         1,  8, 11,  9,  9, 13,  2,  7, 15,  9])

Batch 2:
X shape: torch.Size([64, 3000, 3])
y shape: torch.Size([64])
y: tensor([ 3, 14, 15, 14,  1, 15,  8, 10,  1, 14, 10,  1,  8,  2, 11,  7,  8,  2,
         9, 13, 17, 12,  2,  8, 11,  6,  2, 15,  7, 11, 10,  2, 10,  8,  9, 12,
        12,  3, 15,  8,  3,  3, 12, 10,  7,  2, 12,  9,  6, 13,  7,  3,  4, 12,
         1, 14, 12, 10,  5,  7, 13, 11, 10,  1])

Batch 3:
X shape: torch.Size([64, 3000, 3])
y shape: torch.Size([64])
y: tensor([ 7, 15, 10, 13, 13, 15, 16, 10,  5, 14, 14,  9,  7,  3,  9,  9, 14,  7,
         7, 10, 12,  0, 13,  7,  3,  2,  5,  7, 11,  7,  0, 10, 12, 13,  2,  2,
        12, 14,  9, 17,  9, 17,  6, 13,  

####################################3

In [87]:
class LSTMAutoencoder(L.LightningModule):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTMAutoencoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Encoder
        self.encoder = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        # Decoder
        self.decoder = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        # Map the output of the decoder to the input space
        self.output_layer = nn.Linear(hidden_size, input_size)

        self.TrainLossEs = []

    def forward(self, x):
        # Encoding
        _, (hidden, cell) = self.encoder(x)  # We only use the hidden states; ignore the outputs

        # Prepare decoder initial input (typically zeros, but you can modify this)
        decoder_input = torch.zeros(x.size(0), x.size(1), self.hidden_size).to(x.device)  # Size: (batch_size, seq_len, hidden_size)

        # Decoding
        decoded, _ = self.decoder(decoder_input, (hidden, cell))  # Pass hidden states as initial state

        # Mapping decoded states to input space
        decoded = self.output_layer(decoded)  # Size: (batch_size, seq_len, input_size)
        return decoded

    def training_step(self, batch, batch_idx):
      sequences, _ = batch  # Assuming the batch returns a tuple (sequences, features)
      sequences = sequences.float()  # Ensure sequences are of type float for LSTM

      # Ensure sequences are correctly shaped [batch_size, seq_length, num_features]
      reconstructed = self(sequences)

      # Compute the loss
      loss = F.mse_loss(reconstructed, sequences)
      self.TrainLossEs.append(loss.item())
      self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
      return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.0001)

    def get_train_loss(self):
        return self.TrainLossEs


**#Pretrain the Autoencoder**

In [89]:
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import TensorBoardLogger


# Define your autoencoder model
autoencoder = LSTMAutoencoder(input_size=3, hidden_size=64, num_layers=5)

model_checkpoint = ModelCheckpoint(
    monitor='train_loss',
    filename='Autoencoder-{epoch:02d}-{train_loss:.2f}',
    save_top_k=1,
    mode='min',
)

early_stopping = EarlyStopping(
    monitor='train_loss',
    patience=3,
    mode='min',
)

trainer = L.Trainer(
      max_epochs=15,
      accelerator='auto',
      devices='auto',
      callbacks=[model_checkpoint, early_stopping],
      accumulate_grad_batches=2,
      gradient_clip_val=0.5,  # Adjust gradient clipping as needed
      logger=TensorBoardLogger('lightning_logs', name='Autoencoder'),
      check_val_every_n_epoch=1,  # Run validation every epoch
      log_every_n_steps=10,  # Adjust according to your preference
  )

# Pretrain the model
trainer.fit(autoencoder, train_dataloader, val_dataloader)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/configuration_validator.py:72: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name         | Type   | Params
----------------------------------------
0 | encoder      | LSTM   | 150 K 
1 | decoder      | LSTM   | 166 K 
2 | output_layer | Linear | 195   
-

Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=15` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=15` reached.


In [90]:
import torch
from google.colab import files

# Define the filename
model_file_name = 'autoencoder_final.pth'

# Save the autoencoder model's state dictionary
torch.save(autoencoder.state_dict(), model_file_name)

In [91]:
# Download the file to your local machine
files.download(model_file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [92]:
# Define path for saving model's state dictionary
model_state_dict_path = 'autoencoder_state_dict.pth'

# Save the autoencoder model's state dictionary
torch.save(autoencoder.state_dict(), model_state_dict_path)


In [93]:
# Initialize the model structure
autoencoder_model = LSTMAutoencoder(input_size=3, hidden_size=64, num_layers=5)

# Load the state dict back into the model
autoencoder_model.load_state_dict(torch.load(model_state_dict_path))


<All keys matched successfully>

**#Define and Train the Classification Model**

In [154]:
# class LSTMClassifier(L.LightningModule):
#     def _init_(self, input_size, dropout_rate):
#         super(LSTMClassifier, self)._init_()
#         self.input_size = input_size
#         self.dropout_rate = dropout_rate

#         # Use the pretrained LSTMAutoencoder's encoder
#         self.encoder = autoencoder.encoder  # Use only the encoder part

#         # Assuming the hidden_size and num_layers match those of the pretrained encoder
#         hidden_size = autoencoder.hidden_size

#         # Additional linear layer for classification
#         self.fc = nn.Linear(hidden_size, input_size)

#         # Optionally add a dropout layer
#         self.dropout = nn.Dropout(dropout_rate)


#         # save all the metrics
#         self.TrainLossEs = []
#         self.ValLossEs = []
#         self.TrainAcc = []
#         self.ValAcc = []

#     def forward(self, x):
#         # Use the encoder to get the hidden state
#         (hidden) = self.encoder(x)  # Get the last hidden state

#         # Applying dropout
#         hidden = self.dropout(hidden[-1])  # Taking the last layer's hidden state

#         # Pass through the classification layer
#         out = self.fc(hidden)
#         return out


# # class LSTMClassifier(L.LightningModule):
# #     def __init__(self, pretrained_encoder, num_classes, dropout_rate=0.4):
# #         super(LSTMClassifier, self).__init__()
# #         self.encoder = pretrained_encoder  # Use the pretrained encoder
# #         self.fc = nn.Linear(pretrained_encoder.hidden_size, num_classes)  # Classification head
# #         self.dropout = nn.Dropout(dropout_rate)



#     # def forward(self, x):
#     #   # Pass input through the encoder
#     #   (hidden) = self.encoder(x)
#     #   x = hidden[-1]  # Take the last hidden state
#     #   # Pass through the classifier
#     #   x = self.dropout(x)
#     #   x = self.fc(x)
#     #   return x


#     def training_step(self, batch, batch_idx):
#         sequences, labels = batch
#         predictions = self(sequences)
#         loss = F.cross_entropy(predictions, labels)
#         acc = (predictions.argmax(dim=1) == labels).float().mean()
#         self.TrainLossEs.append(loss.item())
#         self.TrainAcc.append(acc.item())
#         self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
#         self.log('train_accuracy', acc, on_step=False, on_epoch=True, prog_bar=True)
#         return loss

#     def validation_step(self, batch, batch_idx):
#         sequences, labels = batch
#         predictions = self(sequences)
#         loss = F.cross_entropy(predictions, labels)
#         acc = (predictions.argmax(dim=1) == labels).float().mean()
#         self.ValLossEs.append(loss.item())
#         self.ValAcc.append(acc.item())
#         self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
#         self.log('val_accuracy', acc, on_step=False, on_epoch=True, prog_bar=True)

#     def configure_optimizers(self):
#         optimizer = torch.optim.Adam(self.parameters(), lr=0.0005)
#         scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
#         return [optimizer], [scheduler]

#     def get_train_loss(self):
#         return self.TrainLossEs

#     def get_val_loss(self):
#         return self.ValLossEs

#     def get_train_acc(self):
#         return self.TrainAcc

#     def get_val_acc(self):
#         return self.ValAcc

In [210]:
class LSTMClassifier(L.LightningModule):
    def __init__(self, input_size=3, hidden_size=64, num_layers=5, num_classes=18, dropout_rate=0.4):
        super(LSTMClassifier, self).__init__()
        self.save_hyperparameters()

        # Assuming 'autoencoder_model' is a pretrained instance of your LSTMAutoencoder class
        self.encoder = autoencoder.encoder  # Make sure this is defined or passed correctly

        # Classifier layers
        self.fc = nn.Linear(hidden_size, num_classes)  # Maps from hidden state space to label space
        self.dropout = nn.Dropout(dropout_rate)

        # Metrics storage
        self.TrainLosses = []
        self.ValLosses = []
        self.TrainAcc = []
        self.ValAcc = []

    def forward(self, x):
        # Encoder
        _, (hidden, _) = self.encoder(x)  # We only need the hidden state

        # We take the last hidden state for the last layer of the LSTM
        # The hidden state shape is (num_layers, batch_size, hidden_size), we take the last layer's output
        last_hidden = hidden[-1]  # Shape: (batch_size, hidden_size)

        # Dropout and classification
        last_hidden = self.dropout(last_hidden)  # Apply dropout
        out = self.fc(last_hidden)  # Final classification
        return out

    def training_step(self, batch, batch_idx):
        self.train()
        sequences, labels = batch  # Unpack batch
        predictions = self(sequences)  # Forward pass
        loss = F.cross_entropy(predictions, labels)  # Loss calculation
        acc = (predictions.argmax(dim=1) == labels).float().mean()  # Accuracy calculation

        # Logging
        self.TrainLosses.append(loss.item())
        self.TrainAcc.append(acc.item())
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('train_accuracy', acc, on_step=False, on_epoch=True, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        sequences, labels = batch
        predictions = self(sequences)
        loss = F.cross_entropy(predictions, labels)
        # Calculate accuracy
        acc = (predictions.argmax(dim=1) == labels).float().mean()
        self.ValLossEs.append(loss.item())
        self.ValAcc.append(acc.item())
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_accuracy', acc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0005)
        return optimizer

    def get_train_loss(self):
        return self.TrainLossEs

    def get_val_loss(self):
        return self.ValLossEs

    def get_train_acc(self):
        return self.TrainAcc

    def get_val_acc(self):
        return self.ValAcc

In [211]:
# Create dataset instances
train_dataset = HARDataset(train_features_loaded, unlabeled_dir, labeled=True)
val_dataset = HARDataset(val_features_loaded, unlabeled_dir, labeled=True)

In [212]:
# Create data loaders
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=pad_sequences_from_start)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=pad_sequences_from_start)

In [213]:
for i, (x, y) in enumerate(train_dataloader):
    if i == 5:
        break
    print(f"Batch {i+1}:")
    print(f"X shape: {x.shape}")
    print(f"y shape: {y.shape}")
    print(f"y: {y}")
    print()

Batch 1:
X shape: torch.Size([64, 3000, 3])
y shape: torch.Size([64])
y: tensor([ 8, 11, 14, 12, 12,  1, 11,  0, 10,  5, 12, 13, 11,  7,  3, 15, 15,  8,
         7,  9, 12,  9, 12,  2, 12, 11, 13,  5, 12, 10, 10,  7, 15, 10,  2, 17,
         8,  9, 15,  9, 11,  9, 13,  2,  4, 12, 16, 15,  1, 14,  2,  1,  7,  2,
        13,  3,  1, 10,  6, 17, 10,  8,  2,  1])

Batch 2:
X shape: torch.Size([64, 3000, 3])
y shape: torch.Size([64])
y: tensor([17,  5,  5,  3, 10, 13, 10,  9,  0, 14,  9, 10, 15,  7, 14,  4,  6,  9,
        10,  7,  5,  1,  3, 11, 14,  8, 10,  2, 15, 15,  9,  8,  8,  3,  9,  2,
         8, 13, 10,  9, 10,  5,  6, 10, 10,  2,  1, 12, 11,  9,  9,  3, 11,  9,
        10,  5,  9, 15, 16, 10,  7,  6, 15, 15])

Batch 3:
X shape: torch.Size([64, 3000, 3])
y shape: torch.Size([64])
y: tensor([13, 10,  2,  9, 13,  7,  9,  6, 12, 11, 11, 11,  5, 13,  3,  7, 12, 13,
         8, 12, 12,  1,  6, 10,  4, 12,  4,  9, 10, 11, 14,  3,  4,  2,  9, 12,
        14,  5, 11, 13,  4,  3, 10,  2,  

In [214]:
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import TensorBoardLogger

def train_and_evaluate(model, train_dataloader, val_dataloader, max_epochs=10, patience=5):

    # Logging
    logger = TensorBoardLogger("lightning_logs", name="LSTMClassifier")

    # Callbacks
    model_checkpoint = ModelCheckpoint(
        monitor='val_loss',  # Save the model based on the validation loss
        filename='LSTMClassifier-{epoch:02d}-{val_loss:.2f}',
        save_top_k=1,  # Save the best model only
        mode='min',  # Minimize validation loss
    )

    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=5,  # Number of epochs with no improvement after which training will be stopped
        mode='max',
    )

    # Trainer
    trainer = L.Trainer(
        default_root_dir='checkpoints/',
        log_every_n_steps=5,
        max_epochs=max_epochs,
        accelerator="auto",
        devices="auto",
        callbacks=[model_checkpoint, early_stopping],
        logger=logger
    )

    # Assuming 'trainer' is already defined and initialized
    if hasattr(trainer, 'logger') and trainer.logger:
        trainer.logger._log_graph = True  # This will log the model graph for visualization in TensorBoard
        trainer.logger._default_hp_metric = None  # This will disable the default hyperparameter metric logging

    # Start training
    trainer.fit(model, train_dataloader, val_dataloader)

    # Retrieve metrics from the trainer's logged metrics
    train_loss = trainer.logged_metrics.get('train_loss')
    val_loss = trainer.logged_metrics.get('val_loss')
    train_acc = trainer.logged_metrics.get('train_accuracy')
    val_acc = trainer.logged_metrics.get('val_accuracy')

    metrics = {
    'train_loss': train_loss,
    'val_loss': val_loss,
    'train_acc': train_acc,
    'val_acc': val_acc
    }

    return model, metrics

In [215]:
def epoch_array(matric_batch_array, num_epoch):
  """
  The  function calculates the mean matric value fo each epoch
  """
  res = []
  for i in np.array_split(matric_batch_array,num_epoch):
    res.append(float(np.mean(i)))
  return res

In [216]:
def plot_acc_and_loss(model, num_epoch):
  """
  Helper function to plot the graphs
  """

  train_losses = epoch_array(model.get_train_loss(), num_epoch)
  val_losses = epoch_array(model.get_val_loss(), num_epoch)
  train_accs = epoch_array(model.get_train_acc(), num_epoch)
  val_accs = epoch_array(model.get_val_acc(), num_epoch)

  epochs = range(1, num_epoch + 1)

  # Plot training and validation loss
  plt.figure(figsize=(10, 5))
  plt.plot(epochs, train_losses, '.-', label='Training Loss', color = '#609d9e')
  plt.plot(epochs, val_losses, '.-', label='Validation Loss', color = '#075052')
  plt.title('Training and Validation Loss')
  plt.xlabel('Epochs')
  plt.ylabel('Loss')
  plt.legend()
  plt.show()

  # Plot training and validation accuracy
  plt.figure(figsize=(10, 5))
  plt.plot(epochs, train_accs, '.-', label='Training Accuracy', color = '#609d9e')
  plt.plot(epochs, val_accs, '.-', label='Validation Accuracy', color = '#075052')
  plt.title('Training and Validation Accuracy')
  plt.xlabel('Epochs')
  plt.ylabel('Accuracy')
  plt.legend()
  plt.show()

In [217]:
# Initialize your classifier
classifier = LSTMClassifier(input_size=3, hidden_size=64, num_layers=5, num_classes=18, dropout_rate=0.4)

# Train your classifier (make sure to define 'train_and_evaluate' properly or use your training routine)
classifier, metrics = train_and_evaluate(model=classifier, train_dataloader=train_dataloader, val_dataloader=val_dataloader)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/configuration_validator.py:72: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name    | Type    | Params
------------------------------------
0 | encoder | LSTM    | 150 K 
1 | fc      | Linear  | 1.2 K 
2 | dropout | Dropout | 0     
---------------------

Training: |          | 0/? [00:00<?, ?it/s]

RuntimeError: Early stopping conditioned on metric `val_accuracy` which is not available. Pass in or modify your `EarlyStopping` callback to use any of the following: `train_loss`, `train_accuracy`

In [None]:
epochs = 10
plot_acc_and_loss(classifier, epochs)

In [None]:
def extract_predictions(model, dataloader):
    model.eval()
    model.freeze()
    predictions, labels, probabilities = [], [], []
    with torch.no_grad():
        for sequences, labels_batch in dataloader:
            outputs = model(sequences)
            probs = F.softmax(outputs, dim=1)
            preds = torch.argmax(probs, dim=1)
            predictions.extend(preds.cpu().numpy())
            labels.extend(labels_batch.cpu().numpy())
            probabilities.extend(probs.cpu().numpy())
    return predictions, labels, probabilities

# Extract predictions and probabilities from the validation set
predictions, true_labels, probabilities = extract_predictions(classifier, val_dataloader)

In [None]:
def visualize_classifications_df(predictions, true_labels, probabilities, label_encoder, num_examples=5):
    # Convert numerical labels back to original activity names
    predicted_activities = label_encoder.inverse_transform(predictions)
    true_activities = label_encoder.inverse_transform(true_labels)

    # Prepare data for DataFrame
    data = []
    good_indices = [i for i, (pred, true) in enumerate(zip(predictions, true_labels)) if pred == true][:num_examples]
    for i in good_indices:
        data.append({
            'Index': i,
            'Type': 'Good',
            'Predicted': predicted_activities[i],
            'True': true_activities[i],
            'Probability': max(probabilities[i])
        })

    bad_indices = [i for i, (pred, true) in enumerate(zip(predictions, true_labels)) if pred != true][:num_examples]
    for i in bad_indices:
        data.append({
            'Index': i,
            'Type': 'Bad',
            'Predicted': predicted_activities[i],
            'True': true_activities[i],
            'Probability': max(probabilities[i])
        })

    uncertain_indices = sorted(range(len(probabilities)), key=lambda i: max(probabilities[i]))[:num_examples]
    for i in uncertain_indices:
        data.append({
            'Index': i,
            'Type': 'Uncertain',
            'Predicted': predicted_activities[i],
            'True': true_activities[i],
            'Probability': max(probabilities[i])
        })

    # Create and return DataFrame
    results_df = pd.DataFrame(data)

    # Round the 'Probability' column to 4 decimal places
    results_df['Probability'] = results_df['Probability'].round(4)

    return results_df

In [None]:
visualize_classifications_df(predictions, true_labels, probabilities, label_encoder, num_examples=5)