In [3]:
%matplotlib inline
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import torch
from torch.utils.data import Dataset, DataLoader

### Load all data into single dataframe
If we look at the data closely, we find that it is already divided into segments of 3 seconds with an overlap of 1.5 seconds, and that an empty row separates consecutive segments. These separator rows would be deleted if we simply removed every row where accelerometer data is missing. So, we need to treat rows where both the timestamp and the accelerometer data are missing as a special case: these are the separators, and they should not be deleted because they will be needed later.

Further, since the sampling rate of the accelerometer is not constant, the number of samples differs from segment to segment. This needs to be handled in some way.

In [108]:
# Folder holding the annotated recordings (one csv per recording).
ANNOTATED_DIR = '../data/annotated_csv/'
# Column layout applied to every annotated csv (the first line of each file is skipped).
RAW_COLUMNS = ['time','Control','High_level','Expectation','Activity','linear_acc.x','linear_acc.y','linear_acc.z','gyro.z','gyro.x','gyro.y','ci','distance','proximity']
# Columns that are dropped right after loading.
UNUSED_COLUMNS = ['ci', 'distance', 'Control', 'Expectation', 'Activity', 'proximity', 'gyro.x', 'gyro.y', 'gyro.z']

all_data_list = list()
# os.listdir() returns entries in arbitrary order; sorting makes the combined
# dataframe (and therefore cleaned2.csv) identical from run to run.
for file in sorted(os.listdir(ANNOTATED_DIR)):
    # only csv files are parsed; anything else in the folder is ignored
    if file.endswith('.csv'):
        df = pd.read_csv(os.path.join(ANNOTATED_DIR, file), names = RAW_COLUMNS, skiprows = [0])
        all_data_list.append(df)

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when the folder has no csv files.
if not all_data_list:
    raise ValueError('No .csv files found in ' + ANNOTATED_DIR)

# Combine all the dataframes. ignore_index=True gives the combined frame a
# fresh 0..N-1 index instead of repeating each file's own row numbers.
all_data_df = pd.concat(all_data_list, ignore_index = True)
# Drop unnecessary data
all_data_df.drop(UNUSED_COLUMNS, axis = 1, inplace = True)
print(len(all_data_df))

# Materialise the row numbers as a column called 'index'. The next cell uses
# it to locate the rows to eliminate (rows that contain CI and proximity data
# only and no accelerometer data).
all_data_df.reset_index(inplace = True)
print(all_data_df.head())
all_data_df.to_csv('../data/cleaned2.csv', index = None)

212755
   index      time High_level  linear_acc.x  linear_acc.y  linear_acc.z
0      0  0.000000    Dodging         345.0       -2701.0        -466.0
1      1  0.000000    Dodging         345.0       -2701.0        -466.0
2      2  0.021687    Dodging         442.0       -2287.0        -965.0
3      3  0.040944    Dodging         676.0       -1968.0       -1323.0
4      4  0.061339    Dodging        1406.0       -1561.0       -1228.0


### Discard rows that don't have accelerometer data (but not rows which indicate change of segment)

In [109]:
# A row is discarded when it carries a timestamp but no accelerometer reading.
# Rows where the timestamp is missing as well are the separators between
# segments and are deliberately kept.
has_timestamp = all_data_df['time'].notna()
missing_acc = all_data_df['linear_acc.x'].isna()
drop_indices = all_data_df.loc[has_timestamp & missing_acc, 'index'].tolist()

# Remove the selected rows, then the helper 'index' column created earlier,
# which has served its purpose.
all_data_df.drop(index = all_data_df.index[drop_indices], inplace = True)
all_data_df.drop(columns = 'index', inplace = True)
# saving to file for further use
all_data_df.to_csv('../data/cleaned3.csv', index = None)

### Drop unnecessary columns and normalize the data.

In [21]:
# Scale factor from the MPU6050 datasheet: dividing the raw readings by it
# expresses the acceleration in multiples of 'g' (9.8 m/s^2).
ACC_SCALE = 16384.0

df = pd.read_csv(
    '../data/cleaned3.csv',
    names = ['time', 'High_level', 'linear_acc.x', 'linear_acc.y', 'linear_acc.z'],
    header = 0,
)

# The timestamps are no longer needed: the separation between examples is
# already known.
df = df.drop('time', axis = 1)

# Normalize each acceleration axis in turn.
for acc_col in ['linear_acc.x', 'linear_acc.y', 'linear_acc.z']:
    df[acc_col] = df[acc_col] / ACC_SCALE

df.to_csv('../data/acc_only.csv', index = None, header = None)

# Important : Read this before executing next cell !
The saved data (`acc_only.csv`) is then passed through `preprocess.py` (in `../code`), which equalizes the number of samples in each example to 150 and discards examples recorded at too low or too high a sampling frequency (here, any example with more than 150 or fewer than 140 samples is simply discarded). Running the `preprocess` script produces `padded_data.csv`.

This is done because every example must contain the same number of samples to be usable for training neural networks. Note that the choice of 140 to 150 samples is arbitrary and may affect the performance of the learning model.

### Making the labels one_hot encoded, concatenating the labels with the data and saving to csv

In [29]:
# padded_data.csv is read without a header row: three acceleration columns
# followed by the activity label.
df = pd.read_csv('../data/padded_data.csv', names = ['x_acc', 'y_acc', 'z_acc', 'label'])

# One-hot encode the label column (columns come out as 'label__<name>').
# The explicit cast to int keeps the dummies as 0/1: newer pandas versions
# return boolean dummy columns, which would be written to the csv as
# True/False and turn the array read back later into object dtype.
labels = pd.get_dummies(df['label'], prefix = 'label_').astype(int)

# Replace the textual label with its one-hot columns and save the result.
df.drop(['label'], axis = 1, inplace = True)
dataframe = pd.concat([df, labels], axis = 1).reset_index(drop = True)
print(dataframe.head())
dataframe.to_csv('../data/final_data.csv', index = None)

      x_acc     y_acc     z_acc  label__Blocking  label__Dodging  \
0 -0.018940 -0.071108  0.059729                0               1   
1  0.021057 -0.164856 -0.028442                0               1   
2  0.021057 -0.164856 -0.028442                0               1   
3  0.026978 -0.139587 -0.058899                0               1   
4  0.041260 -0.120117 -0.080750                0               1   

   label__Inactive  label__Moving  label__Sprinting  
0                0              0                 0  
1                0              0                 0  
2                0              0                 0  
3                0              0                 0  
4                0              0                 0  


### Shuffling the dataset
The examples need to be shuffled before the dataset is split into the different sets. However, it is the examples that must be shuffled, not the individual samples. So, we reshape the data into a 3D array and shuffle along the first axis, so that the samples within each example keep their order while the examples themselves get shuffled.

In [4]:
# Number of consecutive rows (samples) that make up one example.
SAMPLES_PER_EXAMPLE = 150
# Fixed seed so that the shuffle, and therefore the train/val/test split
# derived from it, is the same on every run.
SHUFFLE_SEED = 42

final = pd.read_csv('../data/final_data.csv', header = 0)
# shuffling the dataset before splitting into train, val and test sets:
# reshape to (examples, samples per example, columns) so that shuffling along
# the first axis moves whole examples and leaves the rows inside each example
# in their original order. reshape raises if the row count is not a multiple
# of SAMPLES_PER_EXAMPLE.
three_d = final.values.reshape(-1, SAMPLES_PER_EXAMPLE, final.shape[1])
np.random.seed(SHUFFLE_SEED)
np.random.shuffle(three_d)
# flatten back to one row per sample
two_d = three_d.reshape(-1, final.shape[1])
print(two_d.shape)

(160200, 8)


### Determining samples for 80 : 10 : 10 split

In [8]:
# Each example is `reqd_len` consecutive rows, so every split size is rounded
# down to a whole number of examples before being converted back to rows.
reqd_len = 150
total_rows = two_d.shape[0]

# Training share (80 % of the rows)
# 128100 for 80 %
# 144150 for 90 %
train_examples = 0.8 * total_rows // reqd_len
train_samples = int(train_examples * reqd_len)
print(train_samples)

# Validation / test share (10 % of the rows each)
# 15900 for 10 %
# 7950 for 5 %
test_val_examples = 0.1 * total_rows // reqd_len
test_val_samples = int(test_val_examples * reqd_len)
print(test_val_samples)

128100
15900


### Splitting into training, validation and testing sets, and saving into csv file

In [9]:
# Split boundaries are derived from the data instead of being typed in as
# literals, so they stay correct if the number of rows changes.
# The training set takes the first `train_samples` rows (computed in the
# previous cell, always a whole number of examples).
n_train = train_samples
# The remaining whole examples are divided evenly: validation gets half
# (rounded down) and the test set receives everything that is left.
n_val = ((two_d.shape[0] - n_train) // reqd_len // 2) * reqd_len

train_df = pd.DataFrame(two_d[ : n_train])
val_df = pd.DataFrame(two_d[n_train : n_train + n_val])
test_df = pd.DataFrame(two_d[n_train + n_val : ])

# every row must land in exactly one of the three sets
assert len(train_df) + len(val_df) + len(test_df) == len(two_d)

# number of examples in each set
print(len(train_df) // reqd_len)
print(len(val_df) // reqd_len)
print(len(test_df) // reqd_len)
train_df.to_csv('../data/train.csv', index = False, header = False)
val_df.to_csv('../data/val.csv', index = False, header = False)
test_df.to_csv('../data/test.csv', index = False, header = False)

854
107
107
