# Data Engineering

In [25]:
import pandas as pd
import numpy as np

from collections import Counter

import lightgbm as lgb
from lightgbm import LGBMClassifier

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [26]:
df = pd.read_csv("data/data_cleaned_raw.csv")

Aggregating the columns and creating new features:

In [27]:
# Columns to aggregate
columns_to_aggregate = ['acc_x', 'acc_y', 'acc_z', 'gy_x', 'gy_y', 'gy_z', 'mag_x', 'mag_y', 'mag_z']

# Aggregation functions
agg_funcs = ['max', 'min', 'std', 'mean', 'median']

# Create a dictionary with columns and aggregation functions
agg_dict = {col: agg_funcs for col in columns_to_aggregate}

# Grouping and aggregation
grouped_df = df.groupby(['subject_id', 'label']).agg(agg_dict).reset_index()

# Flatten MultiIndex columns
grouped_df.columns = ['_'.join(filter(None, col)).strip() for col in grouped_df.columns]

In [28]:
df = grouped_df.copy()

## Train-Test Split

In [29]:
# Split subject IDs for train, validation, and test sets
subject_ids = df['subject_id'].unique()
np.random.shuffle(subject_ids)

train_ids = subject_ids[:4]
val_id = subject_ids[4]
test_ids = subject_ids[5:]

# Create train, validation, and test sets
train_df = df[df['subject_id'].isin(train_ids)]
val_df = df[df['subject_id'] == val_id]
test_df = df[df['subject_id'].isin(test_ids)]

# Separate features and target
X_train = train_df.drop(['subject_id', 'label'], axis=1)
y_train = train_df['label']
X_val = val_df.drop(['subject_id', 'label'], axis=1)
y_val = val_df['label']
X_test = test_df.drop(['subject_id', 'label'], axis=1)
y_test = test_df['label']

# Modelling

In [12]:
# Encode target labels
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

In [13]:
# Calculate class weights
class_counts = Counter(y_train)
total_samples = len(y_train)
class_weights = {cls: total_samples / count for cls, count in class_counts.items()}

In [42]:
# Initialize the LGBMClassifier
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.1,
    n_estimators=100,
    # max_depth=-1,
    # subsample_for_bin=200000,
    # objective='multiclass',
    # class_weight=class_weights,  # Use calculated class weights
    # min_split_gain=0.0,
    # min_child_weight=0.001,
    # min_child_samples=20,
    # subsample=1.0,
    # subsample_freq=0,
    # colsample_bytree=1.0,
    # reg_alpha=0.0,
    # reg_lambda=0.0,
    # random_state=None,
    # n_jobs=-1,
    # importance_type='split'
)

In [43]:
# Fit the model
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# Make predictions on the test set
y_pred = model.predict(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 777
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 45
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -2.484907


In [44]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {accuracy:.2f}')

Test accuracy: 0.25
