In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Root folder of the UCI HAR dataset. Set the UCI_HAR_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset, the
# original machine-specific location is used as the fallback.
DATA_DIR = os.environ.get(
    "UCI_HAR_DATA_DIR",
    "/Users/mr.youssef/Dropbox/lab1/lab2/dataset/UCI_HAR_Dataset",
)

In [13]:
def load_inertial_signals(subset, data_dir=None):
    """Load the nine raw inertial-signal files of one split into a single array.

    Parameters
    ----------
    subset : str
        Split name. It is used both as the sub-folder name and as the
        file-name suffix (``<signal>_<subset>.txt``).
    data_dir : str or path-like, optional
        Dataset root folder. When omitted, the module-level ``DATA_DIR`` is
        used, so existing ``load_inertial_signals(subset)`` calls are unchanged.

    Returns
    -------
    numpy.ndarray
        Array of shape (N, T, 9): N rows (windows) and T columns (sequential
        time steps of a window) per file, with the nine signals stacked along
        the last axis in the order listed in ``signals``.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    signals = ['body_acc_x', 'body_acc_y', 'body_acc_z', 'body_gyro_x', 'body_gyro_y', 'body_gyro_z','total_acc_x', 'total_acc_y', 'total_acc_z']
    signal_dir = os.path.join(data_dir, subset, 'Inertial Signals')
    # Each file is a whitespace-separated table of shape (N, T): one row per
    # window, one column per time step within that window.
    loaded_signals = [
        pd.read_csv(os.path.join(signal_dir, f'{sig}_{subset}.txt'), sep=r'\s+', header=None).values
        for sig in signals
    ]
    # Go from 9 arrays of (N, T) to 1 array of (N, T, 9)
    return np.dstack(loaded_signals)

In [None]:
def load_y_and_subjects(subset, data_dir=None):
    """Load the activity labels and subject IDs of one split.

    Parameters
    ----------
    subset : str
        Split name. It is used both as the sub-folder name and as the
        file-name suffix (``y_<subset>.txt`` / ``subject_<subset>.txt``).
    data_dir : str or path-like, optional
        Dataset root folder. When omitted, the module-level ``DATA_DIR`` is
        used, so existing ``load_y_and_subjects(subset)`` calls are unchanged.

    Returns
    -------
    tuple of numpy.ndarray
        ``(y, subjects)``. ``y`` holds the activity label of each window,
        shifted by -1 so that labels are 0-indexed. ``subjects`` holds, for
        each window, the ID of the volunteer who was wearing the phone.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    def _read_vector(path):
        # squeeze() alone turns a one-row file into a 0-d array, which breaks
        # len(), indexing and np.bincount; atleast_1d keeps it a vector.
        values = pd.read_csv(path, sep=r'\s+', header=None).values.squeeze()
        return np.atleast_1d(values)

    split_dir = os.path.join(data_dir, subset)
    y = _read_vector(os.path.join(split_dir, f'y_{subset}.txt')) - 1  # 0-indexed
    subjects = _read_vector(os.path.join(split_dir, f'subject_{subset}.txt'))
    return y, subjects

In [15]:
# Raw inertial-signal windows for the train and test splits.
X_train, X_test = (load_inertial_signals(split) for split in ('train', 'test'))
# Activity labels and subject IDs for the train and test splits.
(y_train, subjects_train), (y_test, subjects_test) = (
    load_y_and_subjects(split) for split in ('train', 'test')
)

In [16]:
# Shape of the training array: (windows, time steps per window, signals).
X_train.shape

(7352, 128, 9)

In [17]:
# Shape of the test array: (windows, time steps per window, signals).
X_test.shape

(2947, 128, 9)

In [24]:
# Labels and subject IDs should each have one entry per training window.
print(y_train.shape, subjects_train.shape)

(7352,) (7352,)


In [25]:
# Labels and subject IDs should each have one entry per test window.
print(y_test.shape, subjects_test.shape)

(2947,) (2947,)


In [30]:
# Sanity check: count NaN and Inf entries in the training array
# (only X_train is inspected here; X_test is not checked by this cell).
print(f"NaNs: {np.isnan(X_train).sum()} and Infs: {np.isinf(X_train).sum()}\n")

NaNs: 0 and Infs: 0



In [31]:
# Number of training windows per activity label (position i = count of label i).
np.bincount(y_train)

array([1226, 1073,  986, 1286, 1374, 1407])

In [32]:
# Distinct (0-indexed) activity labels present in the training split.
np.unique(y_train)

array([0, 1, 2, 3, 4, 5])

In [33]:
# Distinct subject IDs present in the training split.
np.unique(subjects_train)


array([ 1,  3,  5,  6,  7,  8, 11, 14, 15, 16, 17, 19, 21, 22, 23, 25, 26,
       27, 28, 29, 30])

In [36]:
# Leakage check: no subject may appear in more than one of train / val / test.
np.random.seed(42)
unique_train_subs = np.unique(subjects_train)
# Hold out 4 subjects from the training pool for validation
val_subs = np.random.choice(unique_train_subs, size=4, replace=False)
# Everyone in the training pool except the 4 held-out subjects above
train_subs = np.setdiff1d(unique_train_subs, val_subs)
# Compute every pairwise overlap between the three subject groups
train_val_overlap = set(train_subs).intersection(set(val_subs))
train_test_overlap = set(train_subs).intersection(set(subjects_test))
val_test_overlap = set(val_subs).intersection(set(subjects_test))
# Enforce the result instead of only printing it, so a leaking split stops the run
assert not train_val_overlap, f"subjects shared by train and val: {sorted(train_val_overlap)}"
assert not train_test_overlap, f"subjects shared by train and test: {sorted(train_test_overlap)}"
assert not val_test_overlap, f"subjects shared by val and test: {sorted(val_test_overlap)}"

In [37]:
# Both counts should be 0 when no subject is shared between the splits.
print(len(train_val_overlap), len(train_test_overlap))

0 0
