# Data cleaning

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import json
import matplotlib.pyplot as plt

In [3]:
PATH = Path('data')
list(PATH.iterdir())

[WindowsPath('data/location_history.json')]

In [103]:
j_file = json.load((PATH/'location_history.json').open())

In [104]:
df = pd.DataFrame.from_dict(j_file['locations'])

In [106]:
df.tail(10) # sort newest to oldest by datetime

Unnamed: 0,accuracy,activity,altitude,heading,latitudeE7,longitudeE7,timestampMs,velocity,verticalAccuracy
647887,30,,,,296678426,-952766618,1386536707631,,
647888,30,"[{'timestampMs': '1386536651324', 'activity': ...",,,296678240,-952766606,1386536647690,,
647889,30,,,,296678308,-952766592,1386536587653,,
647890,30,,,,296678200,-952766444,1386536527693,,
647891,11,,,,296678076,-952766669,1386536468150,,
647892,12,,,,296678120,-952766469,1386536423014,,
647893,5,"[{'timestampMs': '1386536388113', 'activity': ...",31.0,320.0,296677802,-952765843,1386536377898,0.0,
647894,6,,31.0,320.0,296677783,-952765824,1386536372859,0.0,
647895,3,,30.0,307.0,296677688,-952765729,1386536352800,0.0,
647896,8,"[{'timestampMs': '1386536307870', 'activity': ...",30.0,161.0,296676529,-952764549,1386536307163,0.0,


In [107]:
df.timestampMs = df.timestampMs.astype(np.int64)

In [108]:
df['date_time'] = pd.to_datetime(df.timestampMs,unit='ms').dt.tz_localize('utc').dt.tz_convert('US/Central')

In [109]:
df.head()

Unnamed: 0,accuracy,activity,altitude,heading,latitudeE7,longitudeE7,timestampMs,velocity,verticalAccuracy,date_time
0,16,,118.0,,390846488,-771527455,1531424546373,,2.0,2018-07-12 14:42:26.373000-05:00
1,16,,118.0,,390846471,-771527444,1531423345272,,2.0,2018-07-12 14:22:25.272000-05:00
2,16,,118.0,,390846486,-771527475,1531422368498,,2.0,2018-07-12 14:06:08.498000-05:00
3,16,,118.0,,390846477,-771527482,1531421692554,,2.0,2018-07-12 13:54:52.554000-05:00
4,26,,,,390845015,-771527064,1531421092427,,,2018-07-12 13:44:52.427000-05:00


In [74]:
# from 2013 to dec 2017 - studying at Houston
# df_houston = df[df.timestampMs <= 1485752399000]

In [119]:
df_houston = df[df.date_time <= '2017-01-30 06:20:00'].reset_index(drop=True)

In [120]:
df_houston.head()

Unnamed: 0,accuracy,activity,altitude,heading,latitudeE7,longitudeE7,timestampMs,velocity,verticalAccuracy,date_time
0,545,"[{'timestampMs': '1485778729815', 'activity': ...",23.0,,299853940,-953481015,1485778729042,,,2017-01-30 06:18:49.042000-06:00
1,585,"[{'timestampMs': '1485778608624', 'activity': ...",19.0,,299853159,-953482313,1485778672000,,,2017-01-30 06:17:52-06:00
2,8,"[{'timestampMs': '1485778474389', 'activity': ...",4.0,224.0,299850980,-953489148,1485778297913,,,2017-01-30 06:11:37.913000-06:00
3,8,,4.0,224.0,299850980,-953489148,1485778233440,,,2017-01-30 06:10:33.440000-06:00
4,8,"[{'timestampMs': '1485778148328', 'activity': ...",4.0,224.0,299850980,-953489148,1485778148040,,,2017-01-30 06:09:08.040000-06:00


In [123]:
df_houston.shape

(270054, 10)

In [122]:
df_houston.to_csv(PATH/'houston.csv')

# Activity

In [124]:
df_act = df_houston[~df_houston.activity.isnull()].copy().reset_index(drop=True)

In [125]:
df_act.tail(10)

Unnamed: 0,accuracy,activity,altitude,heading,latitudeE7,longitudeE7,timestampMs,velocity,verticalAccuracy,date_time
103362,30,"[{'timestampMs': '1386583293458', 'activity': ...",,,296678587,-952764943,1386583319690,,,2013-12-09 04:01:59.690000-06:00
103363,31,"[{'timestampMs': '1386583203478', 'activity': ...",,,296679650,-952764102,1386583199679,,,2013-12-09 03:59:59.679000-06:00
103364,27,"[{'timestampMs': '1386582982885', 'activity': ...",,,296679784,-952761407,1386582960952,,,2013-12-09 03:56:00.952000-06:00
103365,32,"[{'timestampMs': '1386582885373', 'activity': ...",,,296679627,-952764028,1386582899839,,,2013-12-09 03:54:59.839000-06:00
103366,30,"[{'timestampMs': '1386537188419', 'activity': ...",,,296677881,-952766664,1386537187727,,,2013-12-08 15:13:07.727000-06:00
103367,30,"[{'timestampMs': '1386537011366', 'activity': ...",,,296678288,-952766357,1386537007726,,,2013-12-08 15:10:07.726000-06:00
103368,30,"[{'timestampMs': '1386536831342', 'activity': ...",,,296678431,-952766577,1386536827760,,,2013-12-08 15:07:07.760000-06:00
103369,30,"[{'timestampMs': '1386536651324', 'activity': ...",,,296678240,-952766606,1386536647690,,,2013-12-08 15:04:07.690000-06:00
103370,5,"[{'timestampMs': '1386536388113', 'activity': ...",31.0,320.0,296677802,-952765843,1386536377898,0.0,,2013-12-08 14:59:37.898000-06:00
103371,8,"[{'timestampMs': '1386536307870', 'activity': ...",30.0,161.0,296676529,-952764549,1386536307163,0.0,,2013-12-08 14:58:27.163000-06:00


In [126]:
temp = df_act.activity.apply(pd.Series)

In [127]:
temp.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89,90,91,92,93,94,95,96,97,98
0,"{'timestampMs': '1485778729815', 'activity': [...",,,,,,,,,,...,,,,,,,,,,
1,"{'timestampMs': '1485778608624', 'activity': [...",,,,,,,,,,...,,,,,,,,,,
2,"{'timestampMs': '1485778474389', 'activity': [...","{'timestampMs': '1485778410435', 'activity': [...",,,,,,,,,...,,,,,,,,,,


In [128]:
temp.tail(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89,90,91,92,93,94,95,96,97,98
103369,"{'timestampMs': '1386536651324', 'activity': [...",,,,,,,,,,...,,,,,,,,,,
103370,"{'timestampMs': '1386536388113', 'activity': [...",,,,,,,,,,...,,,,,,,,,,
103371,"{'timestampMs': '1386536307870', 'activity': [...",,,,,,,,,,...,,,,,,,,,,


In [129]:
temp.shape

(103372, 99)

In [130]:
temp[~temp.loc[:,98].isnull()]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89,90,91,92,93,94,95,96,97,98
98360,"{'timestampMs': '1449531032147', 'activity': [...","{'timestampMs': '1449531021881', 'activity': [...","{'timestampMs': '1449531016728', 'activity': [...","{'timestampMs': '1449531011574', 'activity': [...","{'timestampMs': '1449531006086', 'activity': [...","{'timestampMs': '1449531000597', 'activity': [...","{'timestampMs': '1449530995519', 'activity': [...","{'timestampMs': '1449530990441', 'activity': [...","{'timestampMs': '1449530979423', 'activity': [...","{'timestampMs': '1449530933432', 'activity': [...",...,"{'timestampMs': '1449528302551', 'activity': [...","{'timestampMs': '1449528258422', 'activity': [...","{'timestampMs': '1449528190150', 'activity': [...","{'timestampMs': '1449528168199', 'activity': [...","{'timestampMs': '1449528113419', 'activity': [...","{'timestampMs': '1449528102692', 'activity': [...","{'timestampMs': '1449528097108', 'activity': [...","{'timestampMs': '1449528081403', 'activity': [...","{'timestampMs': '1449528060477', 'activity': [...","{'timestampMs': '1449528054958', 'activity': [..."


In [131]:
# non null values percentages
((len(temp) - temp.isnull().sum()) / len(temp))*100

0     100.000000
1      31.859691
2      17.933289
3      11.551484
4       7.162481
5       4.777890
6       2.974693
7       1.958944
8       1.290485
9       0.882251
10      0.614286
11      0.442093
12      0.318268
13      0.248617
14      0.179933
15      0.140270
16      0.119955
17      0.102542
18      0.086097
19      0.070619
20      0.059010
21      0.055141
22      0.045467
23      0.039663
24      0.035793
25      0.034826
26      0.030956
27      0.029989
28      0.027087
29      0.026119
         ...    
69      0.001935
70      0.001935
71      0.001935
72      0.001935
73      0.001935
74      0.001935
75      0.001935
76      0.001935
77      0.000967
78      0.000967
79      0.000967
80      0.000967
81      0.000967
82      0.000967
83      0.000967
84      0.000967
85      0.000967
86      0.000967
87      0.000967
88      0.000967
89      0.000967
90      0.000967
91      0.000967
92      0.000967
93      0.000967
94      0.000967
95      0.000967
96      0.0009

In [58]:
df_act.iloc[197263]

accuracy                                                           33
activity            [{'timestampMs': '1449531162230', 'activity': ...
altitude                                                          NaN
heading                                                           NaN
latitudeE7                                                  296893428
longitudeE7                                                -952712267
timestampMs                                             1449531141880
velocity                                                          NaN
verticalAccuracy                                                  NaN
Name: 197263, dtype: object

In [56]:
df_act.iloc[197264]

accuracy                                                          688
activity            [{'timestampMs': '1449531032147', 'activity': ...
altitude                                                          NaN
heading                                                           NaN
latitudeE7                                                  297202088
longitudeE7                                                -953397731
timestampMs                                             1449528136448
velocity                                                          NaN
verticalAccuracy                                                  NaN
Name: 197264, dtype: object

In [57]:
df_act.iloc[197265]

accuracy                                                          688
activity            [{'timestampMs': '1449528037190', 'activity': ...
altitude                                                          NaN
heading                                                           NaN
latitudeE7                                                  297202088
longitudeE7                                                -953397731
timestampMs                                             1449527971658
velocity                                                          NaN
verticalAccuracy                                                  NaN
Name: 197265, dtype: object

In [33]:
temp2 = temp[0].apply(pd.Series)

In [35]:
temp2.iloc[14].activity

[{'type': 'STILL', 'confidence': 50}, {'type': 'UNKNOWN', 'confidence': 50}]