In [1]:
import os
import pandas as pd
import numpy as np
import math
from run import Run
import geomodel.haversine as h
import time

# --- Distance thresholds -------------------------------------------------
# NOTE(review): both values are compared against results of the geomodel
# haversine helpers; they are presumably metres -- confirm against that module.
_MINIMUM_DISTANCE = 40000 # 40KM: max distance from a run's mean location for a point to count as "close"
_POINT_RADIUS = 50/3 # 50 ft approx: a track sample closer than this to a point "visits" it

# Column names applied to every per-run CSV (they are passed as `names=` to read_csv).
_columns = ['row','time_stamp','event','event_type','latitude','longitude','altitude','heart_rate','speed','distance','last_event','filename']

# --- Locations -----------------------------------------------------------
# The input directory can be overridden with the GARMIN_DATA_PATH environment
# variable so the notebook is not tied to one machine; the default is the
# original hard-coded location.
_DATA_PATH = os.environ.get("GARMIN_DATA_PATH", "/Users/btb/Documents/Garmin fit Files/data")
_DATA_FILE = "locations.csv"
_DATA_OUT_PATH = "data"

data_dict = {}
# Hand-picked run files (not referenced by the cells visible in this notebook section).
test_keys = ["2020-05-01-16-58-23.csv","2018-09-30-10-21-14.csv","2020-03-25-17-23-37.csv","2020-05-09-10-11-16.csv"]



In [4]:
def generate_label_features(point_sequence,window=2):
    """Turn a sequence into sliding-window (label, features) training pairs.

    For every index ``i >= window`` the label is ``point_sequence[i]`` and the
    matching feature row is the ``window`` elements immediately before it,
    ``point_sequence[i-window:i]``.

    Parameters
    ----------
    point_sequence : array-like
        Ordered sequence of values.
    window : int, default 2
        Number of preceding elements used as features. Must be >= 0.

    Returns
    -------
    labels : np.ndarray
        ``max(len(point_sequence) - window, 0)`` labels.
    features : np.ndarray
        One row of ``window`` elements per label. When the sequence is too
        short to yield any sample the result still has ``window`` columns
        (shape ``(0, window)``) and the input's dtype, so it can be
        concatenated with non-empty results.

    Raises
    ------
    ValueError
        If ``window`` is negative (negative offsets would silently wrap
        around the sequence and produce meaningless slices).
    """
    if window < 0:
        raise ValueError("window must be non-negative, got {}".format(window))

    sequence = np.asarray(point_sequence)
    n_samples = max(len(sequence) - window, 0)

    # Row r of `offsets` holds the indices r, r+1, ..., r+window-1, i.e. the
    # positions of the features that precede label r+window.
    offsets = np.arange(n_samples)[:, None] + np.arange(window)

    labels = sequence[window:].copy()  # copy so callers never alias the input
    features = sequence[offsets]

    return labels, features

In [5]:
# Points of interest. Column names are supplied explicitly, so the first line
# of points.csv is read as data rather than as a header.
points = pd.read_csv("points.csv", names=["number", "lat", "lon"])
# Coordinate columns as plain NumPy arrays for the distance computations below.
close_lats, close_lons = (points[column].to_numpy() for column in ("lat", "lon"))

In [6]:
### Gather all file names
### (sorted, because os.listdir returns entries in arbitrary order and the
### order of `files` determines the order of the samples produced later)

files = sorted(f for f in os.listdir(_DATA_PATH) if f.endswith(".csv"))

In [41]:
t = time.time()

### Read all the data, calculate which points of interest are close to the run
### Calculate the "point sequence": the points visited, in track order, with
### consecutive repeats of the same point collapsed into one entry.
skip = False  # set to True to leave `runs` empty without reading any file
runs = {}

for f in files:
    if skip:
        continue

    run = runs[f] = {}

    raw = pd.read_csv(os.path.join(_DATA_PATH, f), names=_columns)
    # Keep only the rows that carry a latitude value.
    run["data"] = raw[raw["latitude"].notnull()]

    # Scale the stored coordinates by 180 / 2**31.
    # NOTE(review): presumably converts Garmin "semicircle" units to degrees -- confirm.
    run["latitude"] = run["data"]["latitude"].to_numpy() * 180 / math.pow(2,31)
    run["longitude"] = run["data"]["longitude"].to_numpy() * 180 / math.pow(2,31)
    lat = run["latitude"]
    lon = run["longitude"]
    print(f)
    if len(lat) < 1:
        # No positioned samples: the run keeps only its (empty) data/coordinates.
        continue

    # Coarse filter: only points of interest within _MINIMUM_DISTANCE of the
    # run's mean position are candidates for the fine-grained check below.
    mean_location = np.array([np.mean(lat), np.mean(lon)])
    run["mean_location"] = mean_location
    hav_distances = h.haversine(mean_location,(close_lats,close_lons))
    close_points = np.where(hav_distances < _MINIMUM_DISTANCE)[0]
    run["close_points"] = close_points

    if len(close_points) > 0:
        # Loop-invariant: coordinates of the candidate points, selected once
        # instead of once per track sample.
        near_lats = close_lats[close_points]
        near_lons = close_lons[close_points]

        points_visited = []

        # Walk the track in order; every candidate within _POINT_RADIUS of a
        # sample is recorded (by its index into the points table).
        for sample_lat, sample_lon in zip(lat, lon):
            for point_id, point_lat, point_lon in zip(close_points, near_lats, near_lons):
                distance = h.haversine_single_coordinates((sample_lat, sample_lon),(point_lat, point_lon))

                if distance < _POINT_RADIUS:
                    points_visited.append(point_id)

        if len(points_visited) > 0:
            run["points_visited"] = np.array(points_visited)

    if "points_visited" in run:
        pv = run["points_visited"]
        # Keep the first entry and every entry that differs from its
        # predecessor (np.diff == 0 marks a consecutive repeat).
        run["point_sequence"] = pv[np.where(np.insert(np.diff(pv),0,1) != 0)]


print("Finished in: {0:.3f} (s)".format(time.time()-t))

2013-09-08-08-57-11.csv
2013-09-08-09-06-00.csv
2013-09-14-08-58-20.csv
2013-09-14-09-21-39.csv
2013-09-14-09-22-44.csv
2013-10-03-08-46-52.csv
2014-10-05-13-49-26.csv
2014-10-07-18-07-51.csv
2014-10-11-14-34-48.csv
2014-10-29-17-41-54.csv
2014-10-31-17-05-37.csv
2014-11-02-15-07-58.csv
2015-02-01-10-29-25.csv
2015-12-01-19-21-51.csv
2015-12-09-19-29-58.csv
2015-12-12-08-04-36.csv
2015-12-13-12-53-35.csv
2015-12-19-09-11-44.csv
2015-12-20-11-45-12.csv
2015-12-22-17-06-25.csv
2015-12-22-17-11-54.csv
2015-12-22-17-13-12.csv
2015-12-22-17-13-55.csv
2016-01-03-14-32-45.csv
2016-01-06-18-45-01.csv
2016-01-08-18-16-15.csv
2016-01-10-08-45-25.csv
2016-01-12-18-20-16.csv
2016-01-14-19-54-45.csv
2016-01-16-09-54-23.csv
2016-01-17-14-20-43.csv
2016-01-23-16-52-55.csv
2016-01-24-14-32-41.csv
2016-01-26-19-50-33.csv
2016-01-30-14-04-30.csv
2016-01-31-13-03-13.csv
2016-02-02-19-48-19.csv
2016-02-06-14-39-54.csv
2016-02-07-14-58-11.csv
2016-02-09-19-33-57.csv
2016-02-11-18-36-19.csv
2016-02-11-19-45

In [44]:
import pickle
# Write to a temporary file first and swap it into place afterwards, so an
# interrupted dump can never leave an empty or truncated runs.pkl behind
# (loading such a file fails with "EOFError: Ran out of input").
_runs_pickle_tmp = "runs.pkl.tmp"
with open(_runs_pickle_tmp,"wb") as pickle_file:
    pickle.dump(runs,pickle_file)
os.replace(_runs_pickle_tmp,"runs.pkl")

In [4]:
# NOTE: pickle.load can execute arbitrary code -- only load a runs.pkl that
# was written by this notebook.
try:
    with open("runs.pkl","rb") as pickle_file:
        runs = pickle.load(pickle_file)
except EOFError as err:
    # Same exception type as before, but say what is wrong and how to recover.
    raise EOFError("runs.pkl is empty or truncated; re-run the cell that writes it") from err

EOFError: Ran out of input

In [57]:
t = time.time()
# Attach window-3 training samples to every run that has a point sequence;
# runs without one are left untouched.
for run in runs.values():
    if "point_sequence" not in run:
        continue
    run["labels"], run["features"] = generate_label_features(run["point_sequence"], window=3)
print("Finished in: {0:.3f} (s)".format(time.time()-t))

Finished in: 0.060 (s)


In [62]:
t = time.time()
# Only runs that actually produced at least one sample contribute; a run
# without a "labels" entry is treated the same as one with zero labels.
runs_with_samples = [run for run in runs.values() if len(run.get("labels", ())) > 0]
labels_list = [run["labels"] for run in runs_with_samples]
features_list = [run["features"] for run in runs_with_samples]
print("Finished in: {0:.3f} (s)".format(time.time()-t))
        


Finished in: 0.002 (s)


In [64]:
# Join the per-run arrays end to end: one labels array and one features array
# covering every run.
labels, features = (np.concatenate(chunks) for chunks in (labels_list, features_list))

In [65]:
# np.save does not create missing directories, so make sure the output
# directory exists before writing.
os.makedirs(_DATA_OUT_PATH, exist_ok=True)
np.save(os.path.join(_DATA_OUT_PATH, "labels.npy"),labels)
np.save(os.path.join(_DATA_OUT_PATH, "features.npy"),features)

In [68]:
# Sanity check: the two counts should match (one feature row per label).
labels.shape[0], features.shape[0]

(5742, 5742)