In [1]:
%matplotlib inline
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
import sys
from sklearn import preprocessing

### Loading all data into a single dataframe
If we look at the newly annotated data, the data is continuously annotated (not in segments as done previously). So, if we need to take examples of (say) 1 second each, we need to ensure that, at any particular instant of time, there exists 1 second's worth of data after that instant. If the data is not long enough, it should be discarded.

Since the sampling frequency is not constant at 50 Hz, there may be fewer or more samples than expected. This can be handled later.

In [31]:
# Load every annotated recording and stack them into one dataframe.
data_dir = '../data/new_annotated_data/'
column_names = ['time','Control','High_level','Expectation','Activity','linear_acc.x','linear_acc.y','linear_acc.z','gyro.z','gyro.x','gyro.y','ci','distance','proximity']

all_data_list = list()
# os.listdir() returns entries in arbitrary order; sorting keeps the order in
# which the recordings are concatenated (and hence every file saved from the
# combined frame) the same on every run and machine.
for file in sorted(os.listdir(data_dir)):
    # Only csv files are parsed; anything else in the directory is skipped.
    if file.endswith('.csv'):
        # skiprows = [0] discards the file's own first row so that column_names is used instead.
        df = pd.read_csv(os.path.join(data_dir, file), names = column_names, skiprows = [0])
        all_data_list.append(df)

if not all_data_list:
    raise ValueError('No .csv files found in ' + data_dir)

# Combine all the dataframes
all_data_df = pd.concat(all_data_list)
# Keep only the timestamp, the 'High_level' label and the three accelerometer axes.
all_data_df = all_data_df.drop(columns = ['ci', 'distance', 'Control', 'Expectation', 'Activity', 'proximity', 'gyro.x', 'gyro.y', 'gyro.z'])
print(len(all_data_df))

# After the concatenation every row still carries the index it had in its own
# file. Resetting the index moves those per-file indices into a column called
# 'index', which is shown once and then discarded.
all_data_df = all_data_df.reset_index()
print(all_data_df.head())

# Resetting a second time gives an 'index' column holding the row number within
# the combined frame; the next cell uses it to eliminate the unnecessary rows
# (which contained CI and proximity data only and no accelerometer data).
all_data_df = all_data_df.drop(columns = 'index').reset_index()
all_data_df.to_csv('../data/cleaned_new.csv', index = None)

109420
   index      time High_level  linear_acc.x  linear_acc.y  linear_acc.z
0      0  0.000000    Walking         345.0       -2701.0        -466.0
1      1  0.000000    Walking         345.0       -2701.0        -466.0
2      2  0.021687    Walking         442.0       -2287.0        -965.0
3      3  0.040944    Walking         676.0       -1968.0       -1323.0
4      4  0.061339    Walking        1406.0       -1561.0       -1228.0


### Drop rows that don't have accelerometer data or are labelled 'Transient'
Some rows don't have accelerometer data since at those timestamps, other sensors' (Kinect) data was received. And some readings are transient and don't represent any of the other classes.

In [32]:
# Mark the rows to discard: rows with no accelerometer reading (NaN in
# 'linear_acc.x') and rows labelled 'Transient'.
# Columns are addressed by name and the test is vectorised, so the cell neither
# depends on the column order nor loops over the rows in Python.
unwanted_rows = all_data_df['linear_acc.x'].isna() | (all_data_df['High_level'] == 'Transient')

# Keep the remaining rows and drop the helper 'index' column created earlier.
# errors = 'ignore' lets the cell be re-run: on a second run the column is
# already gone and no row matches the mask any more.
all_data_df = all_data_df.loc[~unwanted_rows].drop(columns = 'index', errors = 'ignore')
# saving to file for further use
all_data_df.to_csv('../data/cleaned_new2.csv', index = None)

### Drop unnecessary columns and normalize the data.

In [40]:
# Reload the cleaned data; header = 0 replaces the stored header row with the names given here.
df = pd.read_csv('../data/cleaned_new2.csv', names = ['time', 'High_level', 'linear_acc.x', 'linear_acc.y', 'linear_acc.z'], header = 0)

# The 'time' column is kept on purpose: the timestamps are still required to
# separate the examples later.
#
# Alternative that is NOT applied here: dividing each axis by 16384.0 (the factor
# quoted from the MPU6050 datasheet) would express the acceleration in multiples
# of 'g' (9.8 m/s^2).

# Min-max scale each acceleration axis on its own, with a fresh sklearn scaler per axis.
for axis_column in ('linear_acc.x', 'linear_acc.y', 'linear_acc.z'):
    min_max_scaler = preprocessing.MinMaxScaler()
    # The scaler expects a 2-D array, hence the single-column reshape.
    x = df[axis_column].values.reshape(-1, 1)
    x_scaled = min_max_scaler.fit_transform(x)
    df[axis_column] = x_scaled

# Written without index and without header row.
df.to_csv('../data/new_data.csv', index = None, header = None)

# Important : Read this before executing next cell !
#### Delete any previously generated files before running the scripts again. The code appends to its output file, so re-running the scripts without deleting the earlier output will append the new data to the old data.

This saved data `new_data.csv` will be passed through `preprocess.py` (in `../code/`) (TODO) to get `new_padded_data.csv` for further processing below.

In [2]:
# The padded file has no header row, so the column names are supplied here.
df = pd.read_csv('../data/new_padded_data.csv', names = ['x_acc', 'y_acc', 'z_acc', 'label'])

# One-hot encode the label column. With prefix 'label_' and the default
# separator the new columns are named 'label__<class>' (double underscore).
# The dtype is pinned: from pandas 2.0 on get_dummies returns booleans by
# default, which would be written to the csv as True/False instead of 1/0.
labels = pd.get_dummies(df['label'], prefix = 'label_', dtype = np.uint8)
assert len(labels) == len(df)

# Replace the text label by its one-hot columns, placed after the three acceleration columns.
df = df.drop(columns = ['label'])
dataframe = pd.concat([df, labels], axis = 1).reset_index(drop = True)
print(dataframe.head())
dataframe.to_csv('../data/new_final_data.csv', index = None)

      x_acc     y_acc     z_acc  label__Blocking  label__Inactive  \
0  0.458528  0.413146  0.653134                1                0   
1  0.444464  0.404258  0.652870                1                0   
2  0.407148  0.392295  0.653038                1                0   
3  0.393435  0.390639  0.649922                1                0   
4  0.385438  0.390301  0.644817                1                0   

   label__Running  label__Walking  
0               0               0  
1               0               0  
2               0               0  
3               0               0  
4               0               0  


### Shuffling the dataset
The examples need to be shuffled before splitting the dataset into different sets. However, the examples should be shuffled and not the samples. So, we reshape the dataframe into 3D and shuffle along one axis such that the samples in each example are maintained but the examples themselves get shuffled.

In [3]:
final = pd.read_csv('../data/new_final_data.csv', header = 0)

# Number of consecutive rows that make up one example.
samples_per_example = 100
# Fixed seed so that the shuffle, and therefore the train / val / test split
# derived from it, is identical on every run.
shuffle_seed = 42

if len(final) % samples_per_example != 0:
    raise ValueError('Row count %d is not a multiple of %d samples per example'
                     % (len(final), samples_per_example))

# shuffling the dataset before splitting into train, val and test sets:
# view the rows as (examples, samples per example, columns) and shuffle along
# the first axis only, so whole examples move while the order of the samples
# inside each example is preserved.
three_d = final.values.reshape(-1, samples_per_example, final.shape[1])
np.random.seed(shuffle_seed)
np.random.shuffle(three_d)
# Flatten back to one row per sample.
two_d = three_d.reshape(-1, final.shape[1])
print(two_d.shape)

(53300, 7)


#### Determining samples for 80 : 10 : 10 split

In [4]:
# Rows per example; the split sizes below are rounded down to a multiple of it.
reqd_len = 100
total_rows = two_d.shape[0]

# 80 % of the rows for training (42600 for the 53300-row run recorded below)
train_examples = 0.8 * total_rows // reqd_len
train_samples = int(train_examples * reqd_len)
print(train_samples)

# 10 % of the rows each for validation and testing (5300 for the same run)
test_val_examples = 0.1 * total_rows // reqd_len
test_val_samples = int(test_val_examples * reqd_len)
print(test_val_samples)

42600
5300


#### Splitting into training, validation and testing sets, and saving into csv file

In [5]:
# Row boundaries of the three sets, taken from the sizes computed in the
# previous cell rather than from hard-coded numbers, so the split follows the
# data if the number of rows changes.
train_end = train_samples
val_end = train_samples + test_val_samples

train_df = pd.DataFrame(two_d[ : train_end])
val_df = pd.DataFrame(two_d[train_end : val_end])
# The test set takes every remaining row, so it can be larger than the validation set.
test_df = pd.DataFrame(two_d[val_end : ])

# Number of examples in each set
print(len(train_df) // reqd_len)
print(len(val_df) // reqd_len)
print(len(test_df) // reqd_len)

# Saved without index and without header row.
train_df.to_csv('../data/new_train.csv', index = False, header = False)
val_df.to_csv('../data/new_val.csv', index = False, header = False)
test_df.to_csv('../data/new_test.csv', index = False, header = False)

426
53
54


#### Looking at the distribution of classes

In [6]:
# Re-read the padded data, which still carries the text label of every row.
df = pd.read_csv('../data/new_padded_data.csv', names = ['x_acc', 'y_acc', 'z_acc', 'label'])

# Rows per label over the whole file (not only the training set) ...
label_counts = pd.crosstab(index = df["label"], columns = "count")
# ... floor-divided by 100, the number of rows used per example when reshaping above.
train_outcome = label_counts // 100
print(train_outcome)

col_0     count
label          
Blocking    157
Inactive    260
Running      67
Walking      49


#### Making 2 dataframes having all the data and labels separately (for use in sklearn)

In [7]:
# Wrap the shuffled sample rows in a dataframe; its columns are simply numbered.
all_data_df = pd.DataFrame(two_d)

# The first three columns are the acceleration values; everything after them
# is the one-hot encoded label.
n_signal_columns = 3
all_data = all_data_df.iloc[:, : n_signal_columns]
all_labels = all_data_df.iloc[:, n_signal_columns : ]

# Both files are written without index and without header row.
all_data.to_csv('../data/new_data_only.csv', index = False, header = False)
all_labels.to_csv('../data/new_labels_only.csv', index = False, header = False)