In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import re
import matplotlib.pyplot as plt
import scipy as sp
#%matplotlib inline
%matplotlib notebook
sns.set()
pd.set_option("display.max_columns", None)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


import arff
import scipy.io.arff as si
import datetime

In [2]:
#data files
os.listdir("../data/home/share/data/public_sets/WISDM_at_v2.0/")

['.WISDM_at_v2.0_raw_about.txt.swp',
 'readme.txt',
 'WISDM_at_v2.0_demographics.txt',
 'WISDM_at_v2.0_demographics_about.txt',
 'WISDM_at_v2.0_raw.txt',
 'WISDM_at_v2.0_raw_about.txt',
 'WISDM_at_v2.0_transformed.arff',
 'WISDM_at_v2.0_transformed_about.txt',
 'WISDM_at_v2.0_unlabeled_raw.txt',
 'WISDM_at_v2.0_unlabeled_raw_about.txt',
 'WISDM_at_v2.0_unlabeled_transformed.arff',
 'WISDM_at_v2.0_unlabeled_transformed_about.txt']

In [3]:
#demographics data
base = "../data/home/share/data/public_sets/WISDM_at_v2.0/"
demog = pd.read_csv(base+"WISDM_at_v2.0_demographics.txt",
                    names=["user", "height", "sex", "age", "weight", "leg_injury"])

#leg_injury - indicator if user has leg injury (0 for none, 1 if has)
demog.head()

Unnamed: 0,user,height,sex,age,weight,leg_injury
0,1,62.5,F,21.0,135.0,0.0
1,2,68.0,F,20.0,160.0,0.0
2,3,79.0,M,19.0,180.0,0.0
3,4,73.0,M,21.0,165.0,1.0
4,5,66.5,M,22.0,125.0,0.0


In [24]:
#raw data 
raw = pd.read_csv(base+'WISDM_at_v2.0_raw.txt', sep=",",
                  names=["user", "activity", "timestamp", "x", "y", "z"],
                  low_memory=False).dropna()
raw.drop([409831, 779331, 779332], inplace=True) #timestamps are indecipherable
raw["timestamp"] = pd.to_datetime(raw["timestamp"], unit="ms")
raw.reset_index(drop=True, inplace=True)
raw["z"] = raw["z"].apply(lambda x: float(x[:-1]))

In [34]:
#separate the time and date
raw["date"] = raw["timestamp"].apply(lambda x: x.date())
raw["time"] = raw["timestamp"].apply(lambda x: x.time())

In [60]:
raw.head()

Unnamed: 0,user,activity,timestamp,x,y,z,date,time
0,1679,Walking,2013-06-06 12:07:49.556,0.294132,-0.635605,-0.226936,2013-06-06,12:07:49.556000
1,1679,Walking,2013-06-06 12:07:49.606,-0.499688,-0.604451,-0.22602,2013-06-06,12:07:49.606000
2,1679,Walking,2013-06-06 12:07:49.656,-2.178345,0.713491,0.372017,2013-06-06,12:07:49.656000
3,1679,Walking,2013-06-06 12:07:49.706,-2.797763,1.354899,-0.277638,2013-06-06,12:07:49.706000
4,1679,Walking,2013-06-06 12:07:49.756,-2.167961,-1.327716,-0.554971,2013-06-06,12:07:49.756000


In [42]:
#count how many sessions for each of the activity
raw.groupby("activity").apply(lambda x: len(x.groupby(["user", "date"]).groups))

activity
Jogging       56
LyingDown     44
Sitting      113
Stairs        22
Standing      87
Walking      211
dtype: int64

In [58]:
#number of users
len(raw["user"].unique())

225

In [56]:
raw_bysession = pd.DataFrame(raw.groupby(["user", "activity", "date"])\
                                .apply(lambda x: x[["x", "y", "z"]].values),
                             columns=["tri-accel"]).reset_index()
fs = 20 #Hz sampling rate of the device
raw_bysession["duration"] = raw_bysession["tri-accel"].apply(lambda x: len(x)*fs/60) #in min

In [59]:
raw_bysession.head()

Unnamed: 0,user,activity,date,tri-accel,duration
0,194,Jogging,2013-06-04,"[[5.0994577, 8.825985, 0.0392266], [8.551398, ...",1266.333333
1,194,LyingDown,2013-06-04,"[[10.120462, 0.69627213, 1.2846711], [10.08123...",133.333333
2,194,Sitting,2013-06-04,"[[2.2359161, 4.903325, 7.3157606], [9.071151, ...",66.0
3,194,Sitting,2013-06-20,"[[-0.0031957934, 0.07243798, 0.058589544], [0....",798.0
4,194,Stairs,2013-06-04,"[[2.0005565, 3.3734875, 0.7649187], [2.3143694...",2066.666667


In [75]:
#plot some accelerometer clips
nsamp_per_act = 3
activities = raw_bysession["activity"].unique()

for act in activities:
    act_df = raw_bysession[raw_bysession["activity"]==act]
    users = act_df[act_df["activity"]==act]["user"]
    
    for j in np.random.choice(users, replace=False, size=nsamp_per_act):
        plt.figure(figsize=(10, 3))
        s = act_df[act_df["user"]==j]["tri-accel"]
        plt.plot(np.sqrt(np.sum(act_df[act_df["user"]==j]["tri-accel"].iloc[0]**2, axis=1)))
        plt.tight_layout()
        plt.title(act)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [161]:
import pandas
import io
import re
relation = r'@relation (?P<relation>[^\n]+)'
attribute = r'@attribute (?P<attribute>[^\n]+)'
data = r'@data\n(?P<data>.+)'
arff_re = re.compile(r'{}|{}|{}'.format(relation, attribute, data), re.DOTALL)

def arff2dataframe(filename):
    with open(filename, 'r') as f:
        text = f.read()
    column_names = []
    for m in arff_re.finditer(text):
        d = m.groupdict()
        if d['attribute']:
            colm = re.match(r'\".+\"', d['attribute'])
            column_names.append(colm.group())
            #column_names.append(colm.group(1) or colm.group(2))
        if d['data']:
            csv_data = d['data']
    return pandas.read_csv(io.StringIO(csv_data),
                           header=None,
                           names=column_names)

In [162]:
trans_arff = arff2dataframe(base+"WISDM_at_v2.0_transformed.arff")

In [163]:
trans_arff

Unnamed: 0,"""user""","""X0""","""X1""","""X2""","""X3""","""X4""","""X5""","""X6""","""X7""","""X8""","""X9""","""Y0""","""Y1""","""Y2""","""Y3""","""Y4""","""Y5""","""Y6""","""Y7""","""Y8""","""Y9""","""Z0""","""Z1""","""Z2""","""Z3""","""Z4""","""Z5""","""Z6""","""Z7""","""Z8""","""Z9""","""XAVG""","""YAVG""","""ZAVG""","""XPEAK""","""YPEAK""","""ZPEAK""","""XABSOLDEV""","""YABSOLDEV""","""ZABSOLDEV""","""XSTANDDEV""","""YSTANDDEV""","""ZSTANDDEV""","""RESULTANT""","""class"""
0,194,0.0,1.000,0.000,0.000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0,1.000,0.0,0.0,0.0,1.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.524263,10.489200,-1.70258,0.000,6.420000e+02,0.000,0.116248,0.047219,0.139718,0.010264,0.004632,0.012177,10.64180,Standing
1,194,0.0,1.000,0.000,0.000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0,1.000,0.0,0.0,0.0,1.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.571335,10.507800,-1.57161,0.000,4.790000e+02,0.000,0.075843,0.032549,0.099133,0.007557,0.003087,0.009263,10.64130,Standing
2,194,0.0,1.000,0.000,0.000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0,1.000,0.0,0.0,0.0,1.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.571237,10.508200,-1.57058,0.000,4.790000e+02,0.000,0.075928,0.032858,0.100204,0.007561,0.003103,0.009316,10.64160,Standing
3,194,0.0,1.000,0.000,0.000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0,1.000,0.0,0.0,0.0,1.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.570845,10.508300,-1.56955,0.000,4.810710e+02,0.000,0.076094,0.032876,0.101254,0.007566,0.003102,0.009368,10.64160,Standing
4,194,0.0,1.000,0.000,0.000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0,1.000,0.0,0.0,0.0,1.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.570453,10.508500,-1.56852,0.000,4.810710e+02,0.000,0.075859,0.032892,0.102304,0.007562,0.003102,0.009419,10.64160,Standing
5,194,0.0,1.000,0.000,0.000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0,1.000,0.0,0.0,0.0,1.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.570845,10.508300,-1.56749,0.000,1.716000e+03,0.000,0.075702,0.032669,0.103354,0.007556,0.003092,0.009469,10.64120,Standing
6,194,0.0,1.000,0.000,0.000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0,1.000,0.0,0.0,0.0,1.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.611641,10.508700,-1.49041,0.000,2.007830e+03,0.000,0.103885,0.035407,0.072930,0.008789,0.003087,0.006660,10.63260,Standing
7,194,0.0,1.000,0.000,0.000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0,1.000,0.0,0.0,0.0,1.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.612768,10.508700,-1.49203,0.000,1.742430e+03,0.000,0.104086,0.035407,0.073772,0.008796,0.003087,0.006713,10.63290,Standing
8,194,0.0,1.000,0.000,0.000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0,1.000,0.0,0.0,0.0,1.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.613847,10.508600,-1.49238,0.000,1.742430e+03,0.000,0.104212,0.035364,0.073518,0.008799,0.003086,0.006691,10.63300,Standing
9,194,0.0,1.000,0.000,0.000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0,1.000,0.0,0.0,0.0,1.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.614975,10.508600,-1.49336,0.000,1.742430e+03,0.000,0.104760,0.035320,0.073550,0.008829,0.003085,0.006675,10.63310,Standing


In [150]:
unl_trans_arff = arff2dataframe(base+"WISDM_at_v2.0_unlabeled_transformed.arff")

In [151]:
unl_trans_arff

Unnamed: 0,"""user"" {""194"", ""925"", ""995"", ""998"", ""1005"", ""1058"", ""1064"", ""1097"", ""1104"", ""1117"", ""1100"", ""1110"", ""1140"", ""1141"", ""1191"", ""1192"", ""1193"", ""1195"", ""1224"", ""1233"", ""1238"", ""1239"", ""1248"", ""1249"", ""1251"", ""1253"", ""1255"", ""1260"", ""1263"", ""1266"", ""1267"", ""1268"", ""1271"", ""1272"", ""1274"", ""1275"", ""1277"", ""1279"", ""1280"", ""1288"", ""1292"", ""1294"", ""1295"", ""1302"", ""1303"", ""1305"", ""1314"", ""1317"", ""1319"", ""1320"", ""1478"", ""1482"", ""1484"", ""1491"", ""1492"", ""1494"", ""1498"", ""1501"", ""1503"", ""1504"", ""1507"", ""1512"", ""1514"", ""1515"", ""1516"", ""1521"", ""1526"", ""1529"", ""1531"", ""1532"", ""1536"", ""1537"", ""1538"", ""1540"", ""1549"", ""1550"", ""1556"", ""1557"", ""1558"", ""1559"", ""1561"", ""1562"", ""1565"", ""1566"", ""1567"", ""1577"", ""1579"", ""1598"", ""1601"", ""1602"", ""1603"", ""1634"", ""1636"", ""1647"", ""1656"", ""1658"", ""1660"", ""1662"", ""1664"", ""1667"", ""1671"", ""1685"", ""1687"", ""1688"", ""1690"", ""1696"", ""1702"", ""1703"", ""1704"", ""1706"", ""1707"", ""1717"", ""1721"", ""1722"", ""1723"", ""1724"", ""1726"", ""1730"", ""1732"", ""1733"", ""1734"", ""1736"", ""1742"", ""1745"", ""1748"", ""1749"", ""1750"", ""1753"", ""1755"", ""1758"", ""1759"", ""1763"", ""1766"", ""1767"", ""1768"", ""1769"", ""1774"", ""1775"", ""1776"", ""1778"", ""1783"", ""1791"", ""1793"", ""1794"", ""1798"", ""1809"", ""1802"", ""1804"", ""1801""","""X0""","""X1""","""X2""","""X3""","""X4""","""X5""","""X6""","""X7""","""X8""","""X9""","""Y0""","""Y1""","""Y2""","""Y3""","""Y4""","""Y5""","""Y6""","""Y7""","""Y8""","""Y9""","""Z0""","""Z1""","""Z2""","""Z3""","""Z4""","""Z5""","""Z6""","""Z7""","""Z8""","""Z9""","""XAVG""","""YAVG""","""ZAVG""","""XPEAK""","""YPEAK""","""ZPEAK""","""XABSOLDEV""","""YABSOLDEV""","""ZABSOLDEV""","""XSTANDDEV""","""YSTANDDEV""","""ZSTANDDEV""","""RESULTANT""","""class""{ ""Walking"" , ""Jogging"" , ""Stairs"" , ""Sitting"" , ""Standing"" , ""LyingDown"""
0,194,0.00,1.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.810,0.190,0.0,0.000,0.000,0,0.00,0.0,0.000,0.000,0.005,0.995,0.000,0.000,0.000,0.0,0.0,0.000,0.0,-0.088465,-1.345960e-02,0.063634,0.000000e+00,2.850330e+03,499.000,0.104367,1.905670e-02,0.008037,0.010375,2.342200e-03,0.000923,1.307830e-01,Sitting
1,194,0.00,0.695,0.305,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.625,0.375,0.0,0.000,0.000,0,0.00,0.0,0.000,0.000,0.005,0.995,0.000,0.000,0.000,0.0,0.0,0.000,0.0,-0.009183,-1.143560e-02,0.067639,3.100000e+03,1.575000e+03,775.500,0.035775,2.819160e-02,0.013007,0.003306,2.839660e-03,0.001643,9.119130e-02,Sitting
2,194,0.00,0.980,0.020,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.850,0.150,0.0,0.000,0.000,0,0.00,0.0,0.000,0.000,0.000,1.000,0.000,0.000,0.000,0.0,0.0,0.000,0.0,-0.023116,-8.474180e-03,0.066888,-1.853710e+10,4.500000e+02,325.000,0.003780,7.944420e-03,0.002779,0.000514,1.034740e-03,0.000339,7.283940e-02,Sitting
3,194,0.00,1.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.875,0.125,0.0,0.000,0.000,0,0.00,0.0,0.000,0.000,0.000,1.000,0.000,0.000,0.000,0.0,0.0,0.000,0.0,-0.023116,-8.596690e-03,0.066861,0.000000e+00,1.719330e+03,333.333,0.002370,5.777460e-03,0.002699,0.000232,5.242850e-04,0.000261,7.170910e-02,Sitting
4,194,0.00,1.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.900,0.100,0.0,0.000,0.000,0,0.00,0.0,0.000,0.000,0.000,1.000,0.000,0.000,0.000,0.0,0.0,0.000,0.0,-0.023303,-8.490160e-03,0.066861,0.000000e+00,3.271000e+03,543.176,0.002182,5.425710e-03,0.001721,0.000236,6.158720e-04,0.000170,7.185890e-02,Sitting
5,194,0.00,0.955,0.045,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.765,0.235,0.0,0.000,0.000,0,0.00,0.0,0.000,0.000,0.000,1.000,0.000,0.000,0.000,0.0,0.0,0.000,0.0,-0.024059,-7.616640e-03,0.066089,3.140500e+03,2.750000e+02,3189.500,0.009518,1.604180e-02,0.005188,0.001459,1.947620e-03,0.000595,7.697930e-02,Sitting
6,194,0.00,1.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.860,0.140,0.0,0.000,0.000,0,0.00,0.0,0.000,0.000,0.000,1.000,0.000,0.000,0.000,0.0,0.0,0.000,0.0,-0.023356,-8.538100e-03,0.067144,0.000000e+00,2.289330e+03,2406.670,0.003389,6.930660e-03,0.003647,0.000386,6.913340e-04,0.000380,7.241380e-02,Sitting
7,194,0.00,1.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.935,0.065,0.0,0.000,0.000,0,0.00,0.0,0.000,0.000,0.000,1.000,0.000,0.000,0.000,0.0,0.0,0.000,0.0,-0.023335,-7.941550e-03,0.066728,0.000000e+00,2.175500e+03,809.083,0.001443,3.966140e-03,0.001548,0.000140,3.880400e-04,0.000155,7.135880e-02,Sitting
8,194,0.00,1.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.995,0.005,0.0,0.000,0.000,0,0.00,0.0,0.000,0.000,0.000,1.000,0.000,0.000,0.000,0.0,0.0,0.000,0.0,-0.023255,-8.229170e-03,0.066840,0.000000e+00,-2.078410e+10,466.667,0.001393,2.797380e-03,0.001767,0.000129,2.534270e-04,0.000201,7.136890e-02,Sitting
9,194,0.00,1.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,1.000,0.000,0.0,0.000,0.000,0,0.00,0.0,0.000,0.000,0.000,1.000,0.000,0.000,0.000,0.0,0.0,0.000,0.0,-0.023265,-8.058730e-03,0.066867,0.000000e+00,0.000000e+00,391.391,0.001148,2.174310e-03,0.001115,0.000100,1.879490e-04,0.000106,7.132000e-02,Sitting
