In [3]:
%matplotlib inline
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
import sys

### Loading all data into a single dataframe
If we look at the newly annotated data, the data is annotated continuously (not in segments, as was done previously). So, if we need to take examples of (say) 1 second each, we need to ensure that, at any particular instant of time, at least 1 second's worth of data exists after that instant. If the data is not long enough, it should be discarded.

Since the sampling frequency is not exactly constant at 50 Hz, an example may contain fewer or more samples than expected. This can be handled later.

In [4]:
# Folder holding the newly annotated recordings; only its .csv files are loaded.
annotated_dir = '../data/new_annotated_data/'
# Column names assigned to every file (the first line of each file is skipped below).
raw_columns = ['time','Control','High_level','Expectation','Activity','linear_acc.x','linear_acc.y','linear_acc.z','gyro.z','gyro.x','gyro.y','ci','distance','proximity']
# Columns that are discarded right after the files are combined.
unused_columns = ['ci', 'distance', 'Control', 'Expectation', 'Activity', 'proximity', 'gyro.x', 'gyro.y', 'gyro.z']

# os.listdir() returns names in arbitrary order, so iterate in sorted order;
# otherwise the row order of the combined dataframe (and of the saved csv)
# can differ from one run or machine to the next.
all_data_list = list()
for file in sorted(os.listdir(annotated_dir)):
    # check whether file to be loaded is csv
    # and also ensure no other files are attempted to be parsed.
    if file.endswith('.csv'):
        # skiprows=[0] skips the first line of the file (presumably its own header row),
        # since the column names are supplied explicitly.
        df = pd.read_csv(os.path.join(annotated_dir, file), names = raw_columns, skiprows = [0])
        all_data_list.append(df)

# Combine all the dataframes and drop the unnecessary columns
all_data_df = pd.concat(all_data_list).drop(columns = unused_columns)
print(len(all_data_df))

# Due to the combination of multiple dataframes, each row kept the index it had in its original
# dataframe. So, the indices need to be reset. Upon resetting, the old indices get converted into
# a column called 'index', which needs to be removed as it is unnecessary.
all_data_df = all_data_df.reset_index()
print(all_data_df.head())
all_data_df = all_data_df.drop(columns = 'index')

# Indices are reset again so that there is an 'index' column holding the correct (0..n-1) row
# positions, which are required to eliminate the unnecessary rows
# (which contained CI and proximity data only and no accelerometer data)
all_data_df = all_data_df.reset_index()
all_data_df.to_csv('../data/cleaned_new.csv', index = None)

26473
   index      time High_level  linear_acc.x  linear_acc.y  linear_acc.z
0      0  0.000000   Inactive         382.0        -382.0        1972.0
1      1  0.000000   Inactive         382.0        -382.0        1972.0
2      2  0.020986   Inactive         420.0        -213.0        2028.0
3      3  0.041613   Inactive         416.0        -119.0        2060.0
4      4  0.061474   Inactive         333.0        -127.0        2081.0


### Drop rows that don't have accelerometer data
Some rows don't have accelerometer data because, at those timestamps, data from other sensors (Kinect) was received instead.

In [5]:
# A row has no accelerometer data when its 'linear_acc.x' value is NaN.
# Build the boolean mask in one vectorized step (instead of looping over the rows)
# and address the column by name rather than by position.
has_acc_data = all_data_df['linear_acc.x'].notna()

# Keep only the rows with accelerometer data (their original row labels are retained),
# then drop the helper 'index' column since it is not needed any more.
# errors='ignore' lets this cell be re-run after the column has already been removed.
all_data_df = all_data_df.loc[has_acc_data].drop(columns = 'index', errors = 'ignore')

# saving to file for further use
all_data_df.to_csv('../data/cleaned_new2.csv', index = None)

### Normalize the accelerometer data (the timestamps are kept for now).

In [7]:
# Reload the cleaned data; header = 0 replaces the header row of the file with the names given here.
df = pd.read_csv(
    '../data/cleaned_new2.csv',
    names = ['time', 'High_level', 'linear_acc.x', 'linear_acc.y', 'linear_acc.z'],
    header = 0,
)

# The 'time' column is deliberately kept: the timestamps are still required
# to separate the examples later.

# Divide every acceleration axis by 16384, the factor mentioned in the MPU6050 datasheet,
# so that the acceleration is expressed in multiples of 'g' (9.8 m/s^2).
for acc_column in ('linear_acc.x', 'linear_acc.y', 'linear_acc.z'):
    df[acc_column] = df[acc_column] / 16384.0

# Saved without index and without a header row.
df.to_csv('../data/new_data.csv', index = None, header = None)

# Important: Read this before executing the next cell!
#### Delete any previously generated output files before running the scripts that generate them again. The code appends to these files, so re-running the scripts without deleting the earlier output will append the new data to the old data.

This saved data `new_data.csv` will be passed through `preprocess.py` (in `../code/`) (TODO) to get `new_padded_data.csv` for further processing below.