In [1]:
# autoreload: load IPython's autoreload extension and select mode 2, which
# re-imports all modules (except any excluded via %aimport) before each cell
# runs, so edits to project .py files take effect without a kernel restart
%load_ext autoreload
%autoreload 2

# change current working directory to the root of the project
# (taken to be the parent of the directory the kernel starts in).
# The root is resolved only once per kernel session: re-running this cell
# re-applies the same directory instead of climbing one more level each time,
# which would silently break the relative 'data/...' paths used below.
import os

try:
    PROJECT_ROOT
except NameError:
    # first execution in this kernel: cwd is still the notebook's own folder
    PROJECT_ROOT = os.path.dirname(os.getcwd())
os.chdir(PROJECT_ROOT)

# Purpose
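- The purpose of this notebook is to create the train and validation sets
- The test set has already been created (note: the cells below nevertheless derive and save a test split alongside the train and validation splits)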

In [2]:
from IPython.display import display

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Load Data

In [3]:
# Optional row cap for quick experiments; None reads the whole file
nrows = None

# Read the transformed data and put each user's rows in chronological order,
# discarding the pre-sort index
data = (
    pd.read_csv('data/transformed/transformed_data.csv', nrows=nrows)
    .sort_values(['user_id', 'timestamp'])
    .reset_index(drop=True)
)

# Target column on its own, and everything else as the feature frame
y = data['activity']
X = data.drop(columns='activity')

# Split the data into train, validation and test sets
- Split based on user ID, so that all rows from a given user end up in exactly one set

In [4]:

# Share of *users* (not rows) held out at each stage, and the seed that keeps
# both splits reproducible.
TEST_USER_FRACTION = 0.2
VAL_USER_FRACTION = 0.2  # applied to the users left after the test hold-out
SPLIT_SEED = 42


def _features_and_target(frame, target='activity'):
    """Separate a split into its feature frame and its label frame.

    Args:
        frame: DataFrame that contains the ``target`` column.
        target: name of the label column to pull out.

    Returns:
        Tuple ``(features, labels)``: ``frame`` without the ``target`` column,
        and a one-column DataFrame holding only ``target``.
    """
    return frame.drop(target, axis=1), pd.DataFrame(frame[target])


# unique user id
user_ids = X['user_id'].unique()

# hold out a share of the users for the test set
train_ids, test_ids = train_test_split(
    user_ids, test_size=TEST_USER_FRACTION, random_state=SPLIT_SEED
)

# split the remaining users into train and validation
train_ids, val_ids = train_test_split(
    train_ids, test_size=VAL_USER_FRACTION, random_state=SPLIT_SEED
)

# select the rows of each split by user membership
train = data[data['user_id'].isin(train_ids)]
val = data[data['user_id'].isin(val_ids)]
test = data[data['user_id'].isin(test_ids)]

# every row must land in exactly one split, otherwise users leaked or were lost
assert len(train) + len(val) + len(test) == len(data), \
    "train/val/test splits do not partition the data"

X_train, y_train = _features_and_target(train)
X_val, y_val = _features_and_target(val)
X_test, y_test = _features_and_target(test)

In [5]:
# Training split: header and both shapes in one print, then a preview of each frame
print(
    'Training Set',
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    sep='\n',
)
display(X_train.head(), y_train.head())

Training Set
X_train shape: (632284, 5)
y_train shape: (632284, 1)


Unnamed: 0,user_id,timestamp,x,y,z
0,1,4991922345000,0.69,10.8,10.8
1,1,4991972333000,6.85,7.44,7.44
2,1,4992022351000,0.93,5.63,5.63
3,1,4992072339000,-2.11,5.01,5.01
4,1,4992122358000,-4.59,4.29,4.29


Unnamed: 0,activity
0,Walking
1,Walking
2,Walking
3,Walking
4,Walking


In [6]:
# Validation split: header and both shapes in one print, then a preview of each frame
print(
    'Validation Set',
    f"X_val shape: {X_val.shape}",
    f"y_val shape: {y_val.shape}",
    sep='\n',
)
display(X_val.head(), y_val.head())

Validation Set
X_val shape: (199298, 5)
y_val shape: (199298, 1)


Unnamed: 0,user_id,timestamp,x,y,z
128544,6,0,0.0,0.0,0.0
128545,6,0,0.0,0.0,0.0
128546,6,0,0.0,0.0,0.0
128547,6,0,0.0,0.0,0.0
128548,6,0,0.0,0.0,0.0


Unnamed: 0,activity
128544,Jogging
128545,Jogging
128546,Jogging
128547,Jogging
128548,Jogging


In [7]:
# Test split: header and both shapes in one print, then a preview of each frame
print(
    'Test Set',
    f"X_test shape: {X_test.shape}",
    f"y_test shape: {y_test.shape}",
    sep='\n',
)
display(X_test.head(), y_test.head())

Test Set
X_test shape: (266622, 5)
y_test shape: (266622, 1)


Unnamed: 0,user_id,timestamp,x,y,z
342509,13,0,0.0,0.0,0.0
342510,13,0,0.0,0.0,0.0
342511,13,0,0.0,0.0,0.0
342512,13,0,0.0,0.0,0.0
342513,13,0,0.0,0.0,0.0


Unnamed: 0,activity
342509,Jogging
342510,Jogging
342511,Jogging
342512,Jogging
342513,Jogging


In [8]:
# Save data: each split goes to its own CSV, features first and then labels,
# with the DataFrame index left out of the file
destinations = {
    "data/transformed/X_train.csv": X_train,
    "data/transformed/X_val.csv": X_val,
    "data/transformed/X_test.csv": X_test,
    "data/transformed/y_train.csv": y_train,
    "data/transformed/y_val.csv": y_val,
    "data/transformed/y_test.csv": y_test,
}
for csv_path, split_frame in destinations.items():
    split_frame.to_csv(csv_path, index=False)