In [5]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from pathlib import Path
import json
import matplotlib.pyplot as plt

PATH = Path('data')
list(PATH.iterdir())

[PosixPath('data/houston.csv'), PosixPath('data/location_history.json')]

In [12]:
def read_json(path=None):
    """Load the location-history JSON file into a DataFrame.

    Parameters
    ----------
    path : str or pathlib.Path, optional
        JSON file to read. When omitted, ``PATH/'location_history.json'``
        is used, which is what the function always read before.

    Returns
    -------
    pandas.DataFrame
        Built from the list stored under the file's top-level
        ``'locations'`` key, with the ``timestampMs`` column cast to int64.

    Raises
    ------
    KeyError
        If the JSON has no ``'locations'`` key or the entries carry no
        ``timestampMs`` field.
    """
    if path is None:
        path = PATH/'location_history.json'
    # Use a context manager so the file handle is closed deterministically;
    # the previous version left it open until garbage collection.
    with Path(path).open() as f:
        j_file = json.load(f)
    df = pd.DataFrame.from_dict(j_file['locations'])
    # Item assignment (rather than attribute assignment) can never silently
    # create a plain attribute instead of updating the column.
    df['timestampMs'] = df['timestampMs'].astype(np.int64)
    return df

def get_data_from_ts(df,ts=1485778729042):
    """Return the rows of ``df`` whose ``timestampMs`` is at or before ``ts``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestampMs`` column comparable to ``ts``.
    ts : int, optional
        Inclusive upper bound. The default, 1485778729042, is the
        "houston time" cutoff used by this notebook.

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh 0..n-1 index; ``df`` itself is left
        untouched.
    """
    on_or_before_cutoff = df.timestampMs.le(ts)
    selected = df.loc[on_or_before_cutoff]
    return selected.reset_index(drop=True)

def remove_long_activity_list_row(df,del_start=18,del_end=99):
    """Drop rows whose ``activity`` list has an item at any position in
    ``range(del_start, del_end)``.

    With the defaults this removes every row whose activity list holds more
    than 18 items. Rows without activity data are always kept.

    The previous implementation expanded every list into columns with
    ``apply(pd.Series)`` and then selected columns ``del_start..del_end-1``.
    That was slow, and on current pandas it raised ``KeyError`` whenever the
    longest list was shorter than ``del_end``. Checking the list length
    directly gives the same rows without either problem.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``activity`` column holding lists (or nulls).
        NOTE: ``df`` is modified in place, as before.
    del_start, del_end : int, optional
        Half-open range of list positions that mark a row for removal.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the long rows removed and its index
        reset to 0..n-1.
    """
    # Negative positions never existed as expanded columns, so clamp at 0.
    first_pos = max(del_start, 0)
    activity = df['activity']
    # A list occupies positions 0..len-1, so it overlaps
    # [first_pos, del_end) exactly when min(len, del_end) > first_pos.
    # Assumes the items inside a list are never null themselves.
    rows_to_drop = [
        label
        for label, acts in activity[activity.notnull()].items()
        if min(len(acts), del_end) > first_pos
    ]
    df.drop(rows_to_drop,inplace=True)
    df.reset_index(inplace=True,drop=True)
    return df

# Columns produced for every flattened activity reading.
# NOTE(review): 'act_cont2' looks like a typo for 'act_conf2'; it is kept
# because code outside this function may already rely on the name.
_ACTIVITY_COLS = ['timestampMs','act_conf1','act_type1','act_cont2','act_type2',
                  'extra_intVal','extra_name','extra_type']


def _activity_to_row(act):
    """Flatten one activity reading (a dict) into values ordered like ``_ACTIVITY_COLS``."""
    guesses = act['activity']
    # The first dict in the list is treated as the primary (max-confidence)
    # activity, as noted by the original author.
    # int() keeps timestampMs numeric, consistent with the int64 cast that
    # read_json applies to the location timestamps.
    row = [int(act['timestampMs']), guesses[0]['confidence'], guesses[0]['type']]
    # Secondary activity, when one is reported.
    if len(guesses) > 1:
        row += [guesses[1]['confidence'], guesses[1]['type']]
    else:
        row += [np.nan, np.nan]
    # Only the first item of the 'extra' list is used.
    if 'extra' in act:
        ex = act['extra'][0]
        row += [ex['intVal'], ex['name'], ex['type']]
    else:
        row += [np.nan, np.nan, np.nan]
    return row


def append_activity(df):
    """Expand the nested ``activity`` column into flat columns.

    Rows without activity data are kept as they are (minus the ``activity``
    column). Each row that has an activity list is replaced by one row per
    list item: the location columns are repeated, ``timestampMs`` becomes the
    item's own timestamp, and the item's details fill the ``act_*`` and
    ``extra_*`` columns. The expanded rows are placed after the plain rows.

    Differences from the previous implementation:

    * ``df`` is no longer modified, so the cell can be re-run safely.
    * ``pd.concat`` replaces ``DataFrame.append`` (removed in pandas 2.0) and
      ``np.nan`` replaces ``np.NaN`` (removed in NumPy 2.0).
    * Rows are collected in lists and turned into frames once, instead of
      growing DataFrames row by row.
    * Activity timestamps are converted to ``int`` so ``timestampMs`` does
      not end up mixing strings and integers.
    * Rows are taken by position, so duplicate index labels are handled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``timestampMs`` and ``activity`` columns; ``activity``
        holds either a null or a list of dicts with ``'timestampMs'``,
        ``'activity'`` and optionally ``'extra'`` keys.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index: the original columns without
        ``activity``, followed by the new ``act_*`` / ``extra_*`` columns.
    """
    col_to_drop = ['timestampMs','activity']

    has_activity = df['activity'].notnull()
    df_act = df[has_activity]
    df_plain = df[~has_activity].drop('activity',axis=1)
    org_cols = [col for col in df.columns.values.tolist() if col not in col_to_drop]

    # One flat row per activity reading, remembering which source row
    # (by position) it came from.
    positions = []
    rows = []
    for pos, acts in enumerate(df_act['activity'].values):
        for act in acts:
            positions.append(pos)
            rows.append(_activity_to_row(act))

    # Repeat the location columns once per reading, then glue the activity
    # details on side by side (both frames share the same 0..n-1 index).
    org_df = df_act[org_cols].iloc[positions].reset_index(drop=True)
    activity_df = pd.DataFrame(rows, columns=_ACTIVITY_COLS)
    combined_df = pd.concat([org_df, activity_df], axis=1)

    # Final column order: original columns first, new activity columns after.
    all_cols = list(df_plain.columns) + [
        col for col in combined_df.columns if col not in df_plain.columns]
    # Empty parts are skipped so they cannot affect the result dtypes.
    parts = [part for part in (df_plain, combined_df) if len(part)]
    if not parts:
        return pd.DataFrame(columns=all_cols)
    return pd.concat(parts, ignore_index=True, sort=False).reindex(columns=all_cols)

In [6]:
df = read_json()

In [8]:
df.shape

(647897, 9)

In [7]:
df.head()

Unnamed: 0,accuracy,activity,altitude,heading,latitudeE7,longitudeE7,timestampMs,velocity,verticalAccuracy
0,16,,118.0,,390846488,-771527455,1531424546373,,2.0
1,16,,118.0,,390846471,-771527444,1531423345272,,2.0
2,16,,118.0,,390846486,-771527475,1531422368498,,2.0
3,16,,118.0,,390846477,-771527482,1531421692554,,2.0
4,26,,,,390845015,-771527064,1531421092427,,


In [9]:
df = get_data_from_ts(df)
df.shape

(270054, 9)

In [13]:
df = remove_long_activity_list_row(df)

In [14]:
df.shape

(269965, 9)

In [15]:
df.index

RangeIndex(start=0, stop=269965, step=1)

In [16]:
# Test fixture, part 1: the first five rows that have no activity data.
# They are combined with activity-bearing rows further down to build the
# small frame used to try out append_activity.
df_non_act = df[df.activity.isnull()].iloc[:5].copy()
df_non_act

Unnamed: 0,accuracy,activity,altitude,heading,latitudeE7,longitudeE7,timestampMs,velocity,verticalAccuracy
3,8,,4.0,224.0,299850980,-953489148,1485778233440,,
13,395,,-18.0,,299857978,-953493753,1485777399000,,
14,53,,-3.0,335.0,299855823,-953490404,1485777360000,,
15,138,,54.0,30.0,299850044,-953490081,1485777320000,,
16,1700,,,,299819691,-953542128,1485777244889,,


In [17]:
df_act = df[~df.activity.isnull()]

In [22]:
df_act_1=df_act[df_act.activity.apply(lambda x: len(x))==1].iloc[:2]

In [24]:
df_act_1

Unnamed: 0,accuracy,activity,altitude,heading,latitudeE7,longitudeE7,timestampMs,velocity,verticalAccuracy
0,545,"[{'timestampMs': '1485778729815', 'activity': ...",23.0,,299853940,-953481015,1485778729042,,
1,585,"[{'timestampMs': '1485778608624', 'activity': ...",19.0,,299853159,-953482313,1485778672000,,


In [35]:
for i in df_act_1.activity:
    print(i)
    print('---')

[{'timestampMs': '1485778729815', 'activity': [{'type': 'STILL', 'confidence': 100}]}]
---
[{'timestampMs': '1485778608624', 'activity': [{'type': 'STILL', 'confidence': 100}]}]
---


In [27]:
df_act_2=df_act[df_act.activity.apply(lambda x: len(x))==2].iloc[:1]

In [29]:
for i in df_act_2.activity:
    print(i)

[{'timestampMs': '1485778474389', 'activity': [{'type': 'STILL', 'confidence': 100}]}, {'timestampMs': '1485778410435', 'activity': [{'type': 'STILL', 'confidence': 100}]}]


In [66]:
# Test fixture: row 19559 is used because its activity list contains readings
# with an 'extra' entry as well as readings with two or three candidate
# activities (see the printed items below).
# Selecting a single label with .loc gives one row, not a DataFrame.
df_act_extra=df_act.loc[19559]
for i in df_act_extra.activity:
    print(i)
    print('---')

{'timestampMs': '1483475391745', 'activity': [{'type': 'IN_VEHICLE', 'confidence': 100}], 'extra': [{'type': 'VALUE', 'name': 'vehicle_personal_confidence', 'intVal': 100}]}
---
{'timestampMs': '1483475391675', 'activity': [{'type': 'TILTING', 'confidence': 100}]}
---
{'timestampMs': '1483475369895', 'activity': [{'type': 'IN_VEHICLE', 'confidence': 89}, {'type': 'STILL', 'confidence': 62}], 'extra': [{'type': 'VALUE', 'name': 'vehicle_personal_confidence', 'intVal': 100}]}
---
{'timestampMs': '1483475366454', 'activity': [{'type': 'IN_VEHICLE', 'confidence': 89}, {'type': 'STILL', 'confidence': 38}, {'type': 'UNKNOWN', 'confidence': 11}], 'extra': [{'type': 'VALUE', 'name': 'vehicle_personal_confidence', 'intVal': 100}]}
---


In [68]:
# Assemble the test fixture with a single concat; DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
# df_act_extra is a single row, so it is turned back into a one-row frame
# first; infer_objects() restores numeric dtypes lost in that round trip.
df_test = pd.concat(
    [df_non_act, df_act_1, df_act_2, df_act_extra.to_frame().T.infer_objects()],
    ignore_index=True, sort=False)

In [69]:
df_test

Unnamed: 0,accuracy,activity,altitude,heading,latitudeE7,longitudeE7,timestampMs,velocity,verticalAccuracy
0,8,,4.0,224.0,299850980,-953489148,1485778233440,,
1,395,,-18.0,,299857978,-953493753,1485777399000,,
2,53,,-3.0,335.0,299855823,-953490404,1485777360000,,
3,138,,54.0,30.0,299850044,-953490081,1485777320000,,
4,1700,,,,299819691,-953542128,1485777244889,,
5,545,"[{'timestampMs': '1485778729815', 'activity': ...",23.0,,299853940,-953481015,1485778729042,,
6,585,"[{'timestampMs': '1485778608624', 'activity': ...",19.0,,299853159,-953482313,1485778672000,,
7,8,"[{'timestampMs': '1485778474389', 'activity': ...",4.0,224.0,299850980,-953489148,1485778297913,,
8,50,"[{'timestampMs': '1483475391745', 'activity': ...",106.0,209.0,302832369,-977272578,1483475370000,,


In [71]:
append_activity(df_test)

Unnamed: 0,accuracy,altitude,heading,latitudeE7,longitudeE7,timestampMs,velocity,verticalAccuracy,act_conf1,act_type1,act_cont2,act_type2,extra_intVal,extra_name,extra_type
0,8,4.0,224.0,299850980,-953489148,1485778233440,,,,,,,,,
1,395,-18.0,,299857978,-953493753,1485777399000,,,,,,,,,
2,53,-3.0,335.0,299855823,-953490404,1485777360000,,,,,,,,,
3,138,54.0,30.0,299850044,-953490081,1485777320000,,,,,,,,,
4,1700,,,299819691,-953542128,1485777244889,,,,,,,,,
5,545,23.0,,299853940,-953481015,1485778729815,,,100.0,STILL,,,,,
6,585,19.0,,299853159,-953482313,1485778608624,,,100.0,STILL,,,,,
7,8,4.0,224.0,299850980,-953489148,1485778474389,,,100.0,STILL,,,,,
8,8,4.0,224.0,299850980,-953489148,1485778410435,,,100.0,STILL,,,,,
9,50,106.0,209.0,302832369,-977272578,1483475391745,,,100.0,IN_VEHICLE,,,100.0,vehicle_personal_confidence,VALUE
