#### This notebook is for preparing the PAMAP2 dataset for training PyTorch models

In [22]:
%load_ext autotime
%matplotlib inline
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
import sys
from sklearn import preprocessing

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 4.83 ms


#### Loading all data into a single dataframe
Since there is a lot of data, this might take a while (about 1 min 20 secs on a cluster server).

In [23]:
# Directory that holds the original recordings exported as CSV files.
original_data_dir = '../data/original/'

# os.listdir() returns entries in arbitrary order, so the names are sorted to make
# the row order of the combined dataframe reproducible from run to run.
# Only '.csv' files are kept so that no other file in the directory is parsed.
csv_files = sorted(
    file for file in os.listdir(original_data_dir) if file.endswith('.csv')
)
if not csv_files:
    raise FileNotFoundError('No .csv files found in ' + original_data_dir)

# The files are read without a header row; the columns are labelled with the
# integers 0..53.
all_data_list = [
    pd.read_csv(os.path.join(original_data_dir, file), names = range(54))
    for file in csv_files
]

# Combine all the dataframes (each one keeps its own row numbers at this point)
all_data_df = pd.concat(all_data_list)

print(all_data_df.head())

     0   1   2     3        4        5        6        7        8        9   \
0  5.70   0 NaN  34.0  2.22755  9.65418  2.38862  2.31968  9.60752  2.58278   
1  5.71   0 NaN  34.0  2.37550  9.57647  2.31412  2.45657  9.62177  2.71852   
2  5.72   0 NaN  34.0  2.94208  9.53415  2.32275  2.78876  9.64961  2.76342   
3  5.73   0 NaN  34.0  3.47541  9.75837  2.40696  3.33080  9.66073  2.68734   
4  5.74   0 NaN  34.0  3.54617  9.83232  2.25382  3.67642  9.71848  2.50565   

   ...        44        45        46       47       48       49        50  \
0  ... -0.017907  0.009340  0.050097 -32.7091  31.4772  44.2318  0.255373   
1  ... -0.070091  0.002312  0.053833 -33.0782  30.9814  44.5148  0.251163   
2  ... -0.084468  0.034249  0.030462 -32.5619  30.6982  44.5169  0.250643   
3  ... -0.030789  0.058615  0.055252 -32.8212  30.9690  44.6575  0.250917   
4  ... -0.020063  0.020903  0.059653 -33.1869  30.0856  44.5154  0.249631   

         51        52        53  
0  0.783075  0.084602  0.560

#### Drop unnecessary columns 
The dataset contains far more data than is needed here: only the timestamp, the activity label and the 3-axis acceleration of the chest IMU are kept. All other columns (the heart rate, the hand and ankle IMUs, and the remaining chest IMU readings) are dropped in the next block. Refer to the `readme.pdf` in the PAMAP2_Dataset (available online) to verify which columns were dropped.

In [24]:
# Keep only the five columns that are needed (column meaning per the readme.pdf of
# the PAMAP2_Dataset):
#   0       -> timestamp
#   1       -> activity label
#   21 - 23 -> 3-axis acceleration of the chest IMU
# Everything else is discarded: the heart rate reading, the hand IMU, the temperature
# and remaining channels of the chest IMU, and the ankle IMU.
# The columns are selected by label instead of being dropped in place, so this cell
# can be re-run without raising a KeyError for columns that are already gone.
all_data_df = all_data_df[[0, 1, 21, 22, 23]]
print(all_data_df.head())

# Due to the combination of multiple dataframes, every row still carries the row
# number it had in its own file. Replace those with one continuous 0..N-1 range.
all_data_df = all_data_df.reset_index(drop = True)
print(all_data_df.head())

# Reset the index once more, this time keeping it as a column called 'index'.
# That column holds the position of every row and is used by the row-cleaning
# step to identify the rows that have to be removed.
all_data_df = all_data_df.reset_index()
all_data_df.to_csv('../data/cleaned_new.csv', index = False)

     0   1        21       22       23
0  5.70   0  1.45391  5.87561 -7.88825
1  5.71   0  1.56970  5.98847 -7.84780
2  5.72   0  1.52482  6.02469 -8.08106
3  5.73   0  1.43944  6.02286 -8.39237
4  5.74   0  1.24887  6.02345 -8.39582
     0   1        21       22       23
0  5.70   0  1.45391  5.87561 -7.88825
1  5.71   0  1.56970  5.98847 -7.84780
2  5.72   0  1.52482  6.02469 -8.08106
3  5.73   0  1.43944  6.02286 -8.39237
4  5.74   0  1.24887  6.02345 -8.39582
time: 40.6 s


In [25]:
# print(all_data_df.head())

time: 612 µs


#### Drop unnecessary rows 
The creators of the dataset mention the existence of unavailable data due to hardware failure (represented by NaN in the data). These entries need to be removed. Further, there is a 'transient' class (label = 0) that needs to be discarded.

In [None]:
# A row is removed when at least one of the following holds:
#   * its timestamp is NaN,
#   * any of its three acceleration values is NaN (no usable accelerometer data),
#   * its activity label is 0 (the 'transient' class).
# Columns by position at this point: 0 = 'index' helper column, 1 = timestamp,
# 2 = activity label, 3..5 = acceleration x / y / z.
# The checks are vectorised instead of looping over every row in Python, and all
# three acceleration axes are checked rather than only the x axis.
has_timestamp = all_data_df.iloc[:, 1].notna()
has_acceleration = all_data_df.iloc[:, 3:6].notna().all(axis = 1)
is_labelled = all_data_df.iloc[:, 2] != 0
keep_rows = has_timestamp & has_acceleration & is_labelled

# Values of the 'index' helper column for the rows that are removed.
drop_indices = all_data_df.iloc[:, 0][~keep_rows].tolist()

# Keep the valid rows and drop the 'index' column created earlier since it is not
# needed any more.
all_data_df = all_data_df.loc[keep_rows].drop('index', axis = 1)
# saving to file for further use
all_data_df.to_csv('../data/cleaned_new2.csv', index = False)

#### Name the columns and normalize the acceleration data

In [29]:
# Load the cleaned data; the header row stored in the file is replaced by
# descriptive column names.
df = pd.read_csv(
    '../data/cleaned_new2.csv',
    names = ['time', 'High_level', 'linear_acc.x', 'linear_acc.y', 'linear_acc.z'],
    header = 0,
)

# The 'time' column is deliberately kept: the timestamps are still required to
# separate the examples later.

# Rescale every acceleration axis with sklearn's MinMaxScaler.
# A fresh scaler is fitted for each axis, so each axis is scaled independently
# of the other two.
for axis_column in ('linear_acc.x', 'linear_acc.y', 'linear_acc.z'):
    min_max_scaler = preprocessing.MinMaxScaler()
    df[axis_column] = min_max_scaler.fit_transform(
        df[axis_column].values.reshape(-1, 1)
    )

# Save without index and without header row.
df.to_csv('../data/cleaned_new3.csv', index = None, header = None)

time: 28.3 s
