In [17]:
from uuid import uuid4
from datetime import timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from geopy.distance import vincenty

In [18]:
def distance_delta(row):
    lat = row.latitude
    long = row.longitude
    lat_1 = row.latitude_shift_1
    long_1 = row.longitude_shift_1
    
    if any(map(np.isnan, [lat, long, lat_1, long_1])):
        return np.nan
    return vincenty((lat, long), (lat_1, long_1)).miles

In [19]:
date_columns = ['timestamp']
df = pd.read_csv("./gps_dataset.csv", sep=";", parse_dates=date_columns)
df.sort_values('timestamp')
df['week_day'] = df.timestamp.dt.weekday_name
df['timestamp_delta'] = df.timestamp - df.timestamp.shift(1)
df['latitude_shift_1'] = df.latitude.shift(1)
df['longitude_shift_1'] = df.longitude.shift(1)
df['distance_delta'] = df.apply(distance_delta, axis=1)
df['is_new_walk'] = df.apply(lambda x: x.timestamp_delta > timedelta(minutes=15) or pd.isnull(x.timestamp_delta), axis=1)
df

Unnamed: 0,timestamp,latitude,longitude,altitude,horizontalAccuracy,verticalAccuracy,speed,course,week_day,timestamp_delta,latitude_shift_1,longitude_shift_1,distance_delta,is_new_walk
0,2018-03-13 21:15:01,40.702452,-73.984135,25.913151,65.000000,65.000000,-3.600000,-1.000000,Tuesday,NaT,,,,True
1,2018-03-13 21:15:20,40.702438,-73.984151,28.514435,50.000000,50.000000,0.000000,251.718750,Tuesday,00:00:19,40.702452,-73.984135,0.001280,False
2,2018-03-13 21:15:20,40.702425,-73.984172,29.354584,30.000000,30.000000,1.332000,207.421875,Tuesday,00:00:00,40.702438,-73.984151,0.001422,False
3,2018-03-13 21:15:37,40.702373,-73.984281,19.380890,25.000000,25.000000,5.256000,278.789062,Tuesday,00:00:17,40.702425,-73.984172,0.006756,False
4,2018-03-13 21:15:44,40.702355,-73.984412,20.954529,25.000000,25.000000,5.220000,270.703125,Tuesday,00:00:07,40.702373,-73.984281,0.006990,False
5,2018-03-13 21:15:51,40.702327,-73.984551,18.081909,25.000000,25.000000,9.324000,251.015625,Tuesday,00:00:07,40.702355,-73.984412,0.007551,False
6,2018-03-13 21:15:56,40.702300,-73.984664,14.060150,25.000000,25.000000,7.092000,272.812500,Tuesday,00:00:05,40.702327,-73.984551,0.006220,False
7,2018-03-13 21:16:03,40.702337,-73.984773,15.728363,25.000000,25.000000,4.572000,261.562500,Tuesday,00:00:07,40.702300,-73.984664,0.006268,False
8,2018-03-13 21:16:12,40.702385,-73.984881,14.312653,25.000000,25.000000,5.904000,285.117188,Tuesday,00:00:09,40.702337,-73.984773,0.006568,False
9,2018-03-13 21:16:19,40.702404,-73.984992,15.765320,10.000000,10.000000,5.076000,284.414062,Tuesday,00:00:07,40.702385,-73.984881,0.005975,False


To do:
- filter distance delta by horizonal / verical accuracy threshold. Is the distane between these points meaningful enough to suggest that someone has actually moved?
- ask what course, speed, and horizontalAccuracy	verticalAccuracy do / are all about
- identify walk start
- sum delta distance between start and end points for total walk milaige, df.group
- get max distance from walk start and reverese geo code for destination

In [20]:
walk_series = []
walk_slice_pairs = []
walks = [*df[df.is_new_walk].index, len(df)]
walks_count = len(walks)


In [21]:
for index, walk_start in enumerate(walks):
    next_index = index + 1
    
    if next_index >= walks_count:
        break
    
    walk_end = walks[next_index]
    walk_slice_pairs.append((walk_start, walk_end))

for start_index, end_index in walk_slice_pairs:
    walk_record_count = end_index - start_index
    walk_series.append(pd.Series([str(uuid4())], name='walk_id').repeat(walk_record_count))
    

pd.concat(walk_series)
df = df.assign(walk_id=pd.concat(walk_series).values)

In [59]:
rollup = df[['walk_id', 'distance_delta', 'week_day', 'timestamp_delta']].groupby(['walk_id']).agg({
    'distance_delta': 'sum',
    'timestamp_delta': 'sum',
    'week_day': 'first',
})
rollup
# add start time? / location

Unnamed: 0_level_0,distance_delta,timestamp_delta,week_day
walk_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1e80fa18-44f7-4e52-81ff-0fd6b1fc85bc,0.884004,09:10:38,Wednesday
34259282-5c89-4ae5-b43f-93600aa52c62,0.418876,11:45:30,Wednesday
47aa2080-6567-46e2-b388-a639fedb9a53,2.995915,02:45:30,Tuesday
59c0b918-6a00-43a4-9db5-10229f8e9c72,0.360357,00:05:43,Tuesday
daa8163e-3564-4916-b308-0068d1ddbfe5,0.549099,01:03:41,Wednesday
e27f3f33-5c93-4ff1-8372-a307b5aa6fde,0.344408,12:09:04,Thursday
ec16a0c4-f25c-44b7-8256-116538233ab0,0.963904,01:30:01,Wednesday


In [76]:
output = rollup.reset_index().rename(columns={'walk_id': 'id', "distance_delta": "distance", "timestamp_delta": "duration"})
output.to_json(orient='records')

'[{"id":"1e80fa18-44f7-4e52-81ff-0fd6b1fc85bc","distance":0.8840039421,"duration":33038000,"week_day":"Wednesday"},{"id":"34259282-5c89-4ae5-b43f-93600aa52c62","distance":0.4188761454,"duration":42330000,"week_day":"Wednesday"},{"id":"47aa2080-6567-46e2-b388-a639fedb9a53","distance":2.9959148554,"duration":9930000,"week_day":"Tuesday"},{"id":"59c0b918-6a00-43a4-9db5-10229f8e9c72","distance":0.3603569624,"duration":343000,"week_day":"Tuesday"},{"id":"daa8163e-3564-4916-b308-0068d1ddbfe5","distance":0.5490985275,"duration":3821000,"week_day":"Wednesday"},{"id":"e27f3f33-5c93-4ff1-8372-a307b5aa6fde","distance":0.3444081527,"duration":43744000,"week_day":"Thursday"},{"id":"ec16a0c4-f25c-44b7-8256-116538233ab0","distance":0.9639044118,"duration":5401000,"week_day":"Wednesday"}]'

In [78]:
output.to_json('./walks/public/walks.json', orient='records')