# Human Activity Classification Dataset
https://www.kaggle.com/datasets/rabieelkharoua/human-activity-classification-dataset

In [None]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

# Relative path that is added to the module search path; presumably the
# repository root where the project-local packages imported later live.
path_append = "../../"
sys.path.append(path_append)  # Go up two directories from where you are.

In [None]:
import pandas as pd
from tqdm import tqdm

file_ids = range(1600, 1651)  # Subject_id: 1600 ~ 1650

# Column layout expected for every raw record.
column_names = ['Subject_id', 'Activity_code', 'Timestamp', 'x', 'y', 'z']

# One merged DataFrame per device/sensor combination, keyed "<device>_<sensor>".
dfs = {}

for device in ['watch', 'phone']:
    for sensor in ['accel', 'gyro']:
        key = f'{device}_{sensor}'
        # Collect the per-file frames and concatenate once per combination;
        # concatenating inside the loop re-copies all accumulated rows each time.
        frames = []

        for file_id in tqdm(file_ids, desc=f'Processing {device}/{sensor}'):
            file_path = f'../../data/Human Activity Classification Dataset/wisdm-dataset/wisdm-dataset/raw/{device}/{sensor}/data_{file_id}_{sensor}_{device}.txt'
            try:
                # Records are terminated by ';' rather than by a newline.
                data = pd.read_csv(file_path, sep=",", lineterminator=";", header=None, on_bad_lines='skip')

                # Pad files that parsed with fewer columns than expected.
                # reindex adds the missing positional columns filled with NaN;
                # the earlier `assign(**{int: None})` form raised TypeError
                # because keyword names must be strings.
                if len(data.columns) < len(column_names):
                    data = data.reindex(columns=range(len(column_names)))
                # Raises ValueError when a file parsed with too many columns.
                data.columns = column_names
            except (OSError, ValueError) as e:
                # Missing/unreadable files (OSError) and parse or shape
                # problems (ValueError and its pandas subclasses) skip the
                # file instead of aborting the whole load.
                print(f"Error processing file {file_path}: {e}")
                continue

            data['device'] = device
            data['sensor'] = sensor
            frames.append(data)

        dfs[key] = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# concat all dataframes
df = pd.concat(list(dfs.values()), ignore_index=True)

print(f"Final merged dataframe has {len(df)} records:")
print(df.head())


In [None]:
# Strip newline characters from the subject id strings (literal match, not a
# regular expression). Presumably they come from parsing with ';' as the
# record terminator, which leaves the line break on the next record's first field.
df['Subject_id'] = df['Subject_id'].str.replace('\n', '', regex=False)

In [None]:
# Print a summary of the merged frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Remove incomplete records, then renumber the rows so that index labels are
# contiguous and equal to row positions. The `start + min_trial_length`
# arithmetic below assumes this; without the reset, dropped rows leave gaps
# and a computed `end` can point at a label that no longer exists.
df = df.dropna(subset=['Subject_id', 'Activity_code', 'Timestamp', 'x', 'y', 'z']).reset_index(drop=True)

# Length (in rows) of every emitted segment.
min_trial_length = 3

# List of (start, end) index pairs, one per accepted start row.
# NOTE(review): each pair spans only `min_trial_length` rows, while
# generate_indices iterates over range(start, end - max_window_size) with
# max_window_size = 128, so these segments cannot yield any window start.
# Confirm whether whole-group spans were intended here.
segment_pairs = []

grouped = df.groupby(['Subject_id', 'device', 'sensor'])

for name, group in grouped:
    # Number of candidate start rows. Computed from the group length because
    # the slice form `[0:-min_trial_length+1]` collapses to `[0:0]` (no
    # candidates at all) when min_trial_length == 1.
    num_starts = max(len(group) - min_trial_length + 1, 0)
    last_label = group.index[-1]
    for start in group.index[:num_starts]:
        end = start + min_trial_length
        # Keep only segments whose end label does not run past the group.
        if end <= last_label:
            segment_pairs.append((start, end))

print("Segment Pairs:", segment_pairs)

In [None]:
# Echo the list of (start, end) pairs as the cell output; it holds one entry
# per accepted start row, so the output can be very long.
segment_pairs

In [None]:
from tools.preprocessing.data_frame import auto_preprocess_dataframe

# Column roles passed to the project preprocessing helper.
target_columns = ['Activity_code']  # label column(s)
drop_columns = ['Subject_id']  # column(s) handed over as "drop"
encode_columns = ['device', 'sensor']  # column(s) handed over as "encode"

# Returns the processed frame plus a description mapping; later cells read
# description['num_features'] and description['num_classes'].
df, description = auto_preprocess_dataframe(
    df,
    target_columns,
    drop_columns,
    encode_columns,
)

In [None]:
from random import shuffle
from tools.preprocessing.template_dataset import TemplateDataset

# Bounds on the window length; max_window_size is also the window length
# used when checking label uniformity in generate_indices.
min_window_size = 64
max_window_size = 128

# Randomise the segment order in place (unseeded, so it differs between runs).
shuffle(segment_pairs)
len_segment_pairs = len(segment_pairs)

def generate_indices(input_df, input_pairs, max_window_size, label_column=None):
    """Return window start positions whose window carries a single label.

    For every ``(start, end)`` pair, each position ``i`` in
    ``range(start, end - max_window_size)`` is accepted when the rows
    ``i`` to ``i + max_window_size - 1`` of the label column all hold the
    same value. Rejected positions are reported on stdout.

    Args:
        input_df: DataFrame that contains the label column.
        input_pairs: Iterable of ``(start, end)`` row positions.
        max_window_size: Number of rows in each checked window.
        label_column: Name of the label column. When omitted, ``'event'`` is
            used if the frame has such a column; otherwise the last column
            is used as the label.

    Returns:
        List of accepted start positions, in the order they were visited.
    """
    # Resolve the label series once instead of on every window.
    if label_column is not None:
        labels = input_df[label_column]
    elif 'event' in input_df.columns:
        labels = input_df['event']
    else:
        labels = input_df.iloc[:, -1]

    indices = []
    for start, end in input_pairs:
        max_index = end - max_window_size  # Calculate the maximum starting index for this segment
        for i in range(start, max_index):
            # Check if all labels in the window are the same (positional slice)
            if len(labels.iloc[i:i + max_window_size].unique()) == 1:
                indices.append(i)
            else:
                print(f"Skipping index {i} due to multiple labels in window.")
    return indices

# 80/20 split of the segment list: the first part feeds training, the rest testing.
split_at = int(0.8 * len_segment_pairs)
train_indices = generate_indices(df, segment_pairs[:split_at], max_window_size)
test_indices = generate_indices(df, segment_pairs[split_at:], max_window_size)

# Randomise the order of the window start positions within each split.
shuffle(train_indices)
shuffle(test_indices)

# Inputs are every column except the last; the last column is the label.
df_x = df.iloc[:, :-1]
df_y = df.iloc[:, -1:]

# Training set: window length bounded by min_window_size and max_window_size.
trainset = TemplateDataset(
    df_x,
    df_y,
    indices=train_indices,
    min_window_size=min_window_size,
    max_window_size=max_window_size,
)
# Test set: both bounds are max_window_size.
testset = TemplateDataset(
    df_x,
    df_y,
    indices=test_indices,
    min_window_size=max_window_size,
    max_window_size=max_window_size,
)

In [None]:
import torch

from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub

# Feature and class counts reported by the preprocessing step.
num_features = description['num_features']
num_classes = description['num_classes']
data_config = DataConfig(dataset_name = 'Human_Activity_Classification', task_type='multi_class_classification', obs_shape=[num_features], label_size=num_classes)

# Build the training configuration with MLParameters, then override selected fields.
ml_params = MLParameters(ccnet_network = 'gpt', encoder_network = 'none')
ml_params.algorithm.error_function = 'mae'
ml_params.model.ccnet_config.num_layers = 3
ml_params.training.batch_size = 64
ml_params.training.num_epoch = 5

# Set the device to GPU if available, else CPU
# (torch is imported in this cell; no earlier cell brings it into scope).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb= False) 

In [None]:
# Start training, passing the training and test datasets to the trainer.
trainer_hub.train(trainset, testset)