# Exploratory Analysis of Heterogeneous Human Activity Recognition


------------- Accelerometer Samples ------------


All the csv files have the same structure of following columns:
'Index', 'Arrival_Time', 'Creation_Time', 'x', 'y', 'z', 'User', 'Model', 'Device', 'gt'
And the columns have the following values:

Index: 		is the row number.

Arrival_Time:	The time the measurement arrived to the sensing application

Creation_Time	The timestamp the OS attaches to the sample

X,y,z		The values provided by the sensor for the three axes, X,y,z

User:		The user this sample originates from, the users are named a to i.

Model:		The phone/watch model this sample originates from

Device:		The specific device this sample is from. They are prefixed with the model name and then the number, e.g., nexus4_1 or nexus4_2.

Gt:		The activity the user was performing: bike sit, stand, walk, stairsup, stairsdown and null


In [13]:
import pandas as pd 
import os 
import numpy as np 
import datetime as dt
import matplotlib.pyplot as plt 
%pylab inline
import gc
from sklearn.model_selection import train_test_split

Populating the interactive namespace from numpy and matplotlib


In [2]:
def convert_to_decisecond(df): 
    """
    round off timestamp to deci-second 
    to make the sampling frequency to 100 Hz. 
    """
    df['arrivalTimeDttm'] = pd.to_datetime(df['Arrival_Time'], unit ='ms')
    # strip away milliseconds. 
    df['arrivalTimeDttm_rounded'] = df['arrivalTimeDttm'].apply(lambda x: dt.datetime.strftime(x, '%Y-%m-%d %H:%M:%S'))
    df['arrivalTimeDttm_rounded'] = df['arrivalTimeDttm'].apply(lambda x: dt.datetime.strftime(x, '%Y-%m-%d %H:%M:%S'))
    return pd.to_datetime(df['arrivalTimeDttm_rounded']) + pd.to_timedelta(round(df['arrivalTimeDttm'].dt.microsecond /10000000, 2) , 's')  

def aggregate(df, prefix):
    summary = df[['gt', 'arrivalDttm_rounded', 'x', 'y', 'z']].groupby(['gt', 'arrivalDttm_rounded']).agg([np.mean, np.std, min, max])
    summary.columns = summary.columns.to_flat_index()
    summary =summary.reset_index()
    summary.columns = [i  if len(i) == 0 or type(i)==str  else  '_'.join([prefix] + list(i)) for i in list(summary.columns.to_flat_index())]
    return summary

In [3]:
HHAR_DATADIR = '../rawdata/HHAR/'
filesToRead = [os.path.join(HHAR_DATADIR, i) for i in os.listdir(HHAR_DATADIR) if i[-3:] == 'csv']
filesToRead

['../rawdata/HHAR/Phones_accelerometer.csv',
 '../rawdata/HHAR/Phones_gyroscope.csv',
 '../rawdata/HHAR/Watch_accelerometer.csv',
 '../rawdata/HHAR/Watch_gyroscope.csv']

In [7]:
# phone acclerometer. 
phone_acc = pd.read_csv(filesToRead[0])
phone_acc.shape
phone_acc.head()

Unnamed: 0,Index,Arrival_Time,Creation_Time,x,y,z,User,Model,Device,gt
0,0,1424696633908,1424696631913248572,-5.958191,0.688065,8.135345,a,nexus4,nexus4_1,stand
1,1,1424696633909,1424696631918283972,-5.95224,0.670212,8.136536,a,nexus4,nexus4_1,stand
2,2,1424696633918,1424696631923288855,-5.995087,0.653549,8.204376,a,nexus4,nexus4_1,stand
3,3,1424696633919,1424696631928385290,-5.942718,0.676163,8.128204,a,nexus4,nexus4_1,stand
4,4,1424696633929,1424696631933420691,-5.991516,0.641647,8.135345,a,nexus4,nexus4_1,stand


In [10]:
shuffled = phone_acc.sample(frac=1)
result = np.array_split(shuffled,   

In [14]:
train, test = train_test_split(phone_acc, test_size=0.2)

In [15]:
train.shape

(10449980, 10)

In [19]:
train['User'].unique()

array(['a', 'd', 'i', 'g', 'h', 'c', 'f', 'e', 'b'], dtype=object)

In [17]:
train.

Unnamed: 0,Index,Arrival_Time,Creation_Time,x,y,z,User,Model,Device,gt
1070138,95424,1424697720063,15326295621000,1.875857,-0.220267,11.520899,a,s3mini,s3mini_1,stairsdown
4782130,221611,1424782905445,1424784750763620249,-3.326523,0.870605,6.827133,d,nexus4,nexus4_2,stairsup
11617548,112810,1424776961481,1424776964962211314,5.439026,0.614273,8.710205,i,nexus4,nexus4_1,sit
8682620,107880,1424687387499,1424687385507523511,4.611847,1.604507,8.985138,g,nexus4,nexus4_1,sit
4329230,91144,1424782180254,1424782183735729679,4.076263,0.365524,9.492157,d,nexus4,nexus4_1,sit


In [12]:
[i.shape[0] for i in result]

[2612495, 2612495, 2612495, 2612495, 2612495]

In [6]:
phone_acc_user_a = phone_acc[phone_acc['User'] == 'a']
phone_acc_user_a['arrivalDttm_rounded'] = convert_to_decisecond(phone_acc_user_a)
phone_acc_user_a = aggregate(phone_acc_user_a, prefix = 'acc')

In [71]:
phone_acc_user_a

Unnamed: 0,gt,arrivalDttm_rounded,acc_x_mean,acc_x_std,acc_x_min,acc_x_max,acc_y_mean,acc_y_std,acc_y_min,acc_y_max,acc_z_mean,acc_z_std,acc_z_min,acc_z_max
0,bike,2015-02-23 13:30:17.070,-4.381178,6.302072,-10.170568,8.827422,1.480320,1.474127,-2.451662,6.903681,5.831237,1.054258,3.677448,8.657433
1,bike,2015-02-23 13:30:17.080,-4.383019,6.128748,-9.414001,10.460267,1.352832,2.347563,-3.828329,9.868899,6.256333,1.152948,3.677494,8.226477
2,bike,2015-02-23 13:30:17.090,-3.896743,6.891228,-8.495666,10.222044,1.020839,1.516081,-4.167108,2.639607,6.206461,1.006998,3.409343,8.063671
3,bike,2015-02-23 13:30:17.100,-4.492634,6.190975,-7.987057,8.737639,0.989501,1.538600,-2.577358,2.681625,6.755259,1.095302,3.782839,8.063671
4,bike,2015-02-23 13:30:18.000,-3.274386,6.689549,-8.063671,8.274258,1.146561,1.601013,-2.001553,2.723282,6.089364,1.043304,3.451242,7.779480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18387,walk,2015-02-23 13:20:35.090,-2.613742,5.596885,-9.346847,9.959755,0.036995,0.994395,-1.838724,2.059013,8.304558,1.284418,5.908890,9.710882
18388,walk,2015-02-23 13:20:35.100,-3.024304,5.109649,-7.201669,6.588761,0.449715,0.957827,-1.685497,1.953669,7.557148,0.878439,5.765238,8.585236
18389,walk,2015-02-23 13:20:36.000,-4.059792,5.991124,-9.662998,6.588761,-0.125458,0.849789,-1.710159,1.379043,8.066215,0.895646,6.588761,10.136047
18390,walk,2015-02-23 13:20:36.010,-3.715623,5.336997,-9.375694,6.588761,0.374939,1.232080,-1.838724,2.145205,7.638130,0.912907,5.755661,9.958710


In [68]:
# phone gyroscope. 
phone_gyro = pd.read_csv(filesToRead[1])
phone_gyro.shape
phone_gyro.head()

Unnamed: 0,Index,Arrival_Time,Creation_Time,x,y,z,User,Model,Device,gt
0,0,1424696633909,1424696631914042029,0.013748,-0.000626,-0.023376,a,nexus4,nexus4_1,stand
1,1,1424696633909,1424696631919046912,0.014816,-0.001694,-0.022308,a,nexus4,nexus4_1,stand
2,2,1424696633918,1424696631924051794,0.015884,-0.001694,-0.02124,a,nexus4,nexus4_1,stand
3,3,1424696633919,1424696631929117712,0.016953,-0.00383,-0.020172,a,nexus4,nexus4_1,stand
4,4,1424696633928,1424696631934214148,0.015884,-0.007034,-0.020172,a,nexus4,nexus4_1,stand


In [69]:
phone_gyro_user_a = phone_gyro[phone_gyro['User'] == 'a']
phone_gyro_user_a['arrivalDttm_rounded'] = convert_to_decisecond(phone_gyro_user_a)
phone_gyro_user_a = aggregate(phone_gyro_user_a, prefix = 'gyro')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas

In [6]:
phone_gyro_user_a.head()

NameError: name 'phone_gyro_user_a' is not defined

In [73]:
joined = phone_acc_user_a.merge(phone_gyro_user_a)

In [5]:
joined.columns

NameError: name 'joined' is not defined

NameError: name 'joined' is not defined

In [None]:
# # phone gyroscope. 
# phone_gyro = pd.read_csv(filesToRead[1])
# phone_gyro.shape
# phone_gyro.head()

# # watch accelerometer
# watch_acc = pd.read_csv(filesToRead[2])
# watch_acc.shape
# watch_acc.head()

# # watch gyro
# watch_gyro = pd.read_csv(filesToRead[3])
# watch_gyro.shape
# watch_gyro.head()