In [14]:
import json
import os
from collections import OrderedDict
from os import path
from sys import argv
from pykml import parser
from re import compile as re_compile

import pandas as pd

In [15]:
# Location of the raw KML ride files and the directory cleaned JSON is written to.
RIDES_DIRECTORY = "../../data/raw/rides"
CLEAN_DIRECTORY = "../../data/cleaner/rides"
# Fail fast with a readable message; a bare `path.exists(...)` expression in the
# middle of a cell is evaluated and silently discarded.
assert path.exists(RIDES_DIRECTORY), f"Rides directory not found: {RIDES_DIRECTORY}"
# os.listdir returns entries in arbitrary order; sorting keeps positional lookups
# (e.g. select_ride(1)) stable across kernels and machines.
RIDE_PATHS = [path.join(RIDES_DIRECTORY, entry) for entry in sorted(os.listdir(RIDES_DIRECTORY))]

def select_ride(index, ride_paths=RIDE_PATHS):
    """Return the entry at position ``index`` of ``ride_paths``.

    Parameters
    ----------
    index : int
        Position of the wanted path within ``ride_paths``.
    ride_paths : sequence of str, optional
        Candidate paths; defaults to the module-level ``RIDE_PATHS``.

    Returns
    -------
    str
        The selected path.

    Raises
    ------
    AssertionError
        If the selected path does not exist on disk.
    """
    chosen = ride_paths[index]
    # Guard against stale listings: the file must still be present.
    assert path.exists(chosen)
    return chosen

# Pick the ride at index 1 of RIDE_PATHS as a sample file to work with.
TESTRIDE = select_ride(1)
# Bare name as the last expression so the cell displays the selected path.
TESTRIDE

'../../data/raw/rides/February_1st_ride.kml'

```xml
<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2">
  <Document>
    <description>A Trip recorded by Ben on Feb 01, 24 -- View in full at http://ridewithgps.com/trips/144448061</description>
    <name>February 1st ride</name>
    <Style id="default_style">
      <LineStyle>
        <color>ff0000ff</color>
        <width>2</width>
      </LineStyle>
    </Style>
    <Placemark>
      <description> -- View in full at http://ridewithgps.com/trips/144448061</description>
      <name>February 1st ride</name>
      <visibility>1</visibility>
      <flyToView>1</flyToView>
      <styleUrl>#default_style</styleUrl>
      <LineString>
        <extrude>1</extrude>
        <tessellate>1</tessellate>
        <altitudeMode>clampedToGround</altitudeMode>
        <coordinates>
-85.722417,38.240141,118.9
-85.72237,38.240123,119.0
-85.722363,38.240131,118.6
...
```

In [16]:
def get_kml_root(kml_file_path):
    """Parse a KML file and return the root element of the document.

    Parameters
    ----------
    kml_file_path : str
        Path to the KML file to parse.

    Returns
    -------
    The root element of the tree produced by ``parser.parse``.
    """
    # Open in binary mode so the XML parser decodes the bytes according to the
    # document's own encoding declaration instead of the platform's default
    # text encoding.
    with open(kml_file_path, 'rb') as file:
        parsed = parser.parse(file)
    return parsed.getroot()

In [17]:
def kml_points_generator(root):
    """Yield ``(longitude, latitude)`` float pairs from a KML document root.

    Reads the whitespace-separated coordinate tuples found at
    ``root.Document.Placemark.LineString.coordinates.text``. Each tuple is
    comma-separated with longitude first and latitude second; any further
    components (such as altitude) are ignored, and tuples that carry only
    longitude and latitude are accepted as well.

    Parameters
    ----------
    root
        Object exposing ``Document.Placemark.LineString.coordinates.text``.

    Yields
    ------
    tuple of (float, float)
        Longitude and latitude of each coordinate tuple, in document order.
    """
    for chunk in root.Document.Placemark.LineString.coordinates.text.split():
        # Star-unpacking tolerates both "lon,lat" and "lon,lat,alt" tuples.
        longitude, latitude, *_ = chunk.split(',')
        yield (float(longitude), float(latitude))

In [18]:
def get_box(points):
    """Compute the bounding box of a collection of (longitude, latitude) pairs.

    Parameters
    ----------
    points : iterable of (longitude, latitude)
        Pairs to summarise. The iterable is consumed exactly once, so a
        generator is acceptable.

    Returns
    -------
    dict
        ``{'longitude': {'min': ..., 'max': ...},
           'latitude': {'min': ..., 'max': ...}}``

    Raises
    ------
    ValueError
        If ``points`` is empty (there is no minimum or maximum to report).
    """
    longitudes, latitudes = [], []
    for long, lat in points:
        longitudes.append(long)
        latitudes.append(lat)
    # Reference the collections directly instead of looking local variables up
    # by name with eval().
    return {
        'longitude': {'min': min(longitudes), 'max': max(longitudes)},
        'latitude': {'min': min(latitudes), 'max': max(latitudes)},
    }

In [19]:
# Captures (month, two-digit day, two-digit year, trip id) from a description
# such as "... on Feb 01, 24 -- View in full at http://.../trips/144448061".
# Raw string so that \s, \w and \d reach the regex engine without relying on
# Python passing unknown string escapes through (which emits a warning).
description_regex = re_compile(r".*\s(\w+)\s(\d\d),\s(\d\d) -- .*trips/(\d*)")

def make_data_object(kml_filepath):
    """Build an ordered dictionary describing one ride from its KML file.

    Parameters
    ----------
    kml_filepath : str
        Path to the KML file to read.

    Returns
    -------
    OrderedDict
        Keys, in insertion order:
        'name'   - text of the document's <name> element,
        'source' - base name of ``kml_filepath``,
        'date'   - month, day and year (prefixed with "20") joined by spaces,
        'id'     - trip id captured from the document description,
        'box'    - bounding box returned by ``get_box``,
        'data'   - tuple of the points yielded by ``kml_points_generator``.

    Raises
    ------
    ValueError
        If the document description does not match ``description_regex``.
    """
    # Reuse the shared loader instead of repeating the open/parse steps here.
    root = get_kml_root(kml_filepath)

    # Create dictionary to hold KML data
    data = OrderedDict()

    # Add source and ride name information
    data['name'] = root.Document.name.text
    data['source'] = path.basename(kml_filepath)

    # Parse date and trip id out of the document description
    description = root.Document.description.text
    matched = description_regex.match(description or "")
    if matched is None:
        # Surface a clear error rather than an AttributeError on None.
        raise ValueError(
            f"Could not parse ride date/id from description of {kml_filepath!r}: {description!r}"
        )
    month, day, year, ride_id = matched.groups()
    # The description carries a two-digit year; expand it to four digits.
    year = f"20{year}"
    data['date'] = ' '.join((month, day, year))
    data['id'] = ride_id

    # Materialise the points once so they can feed both the box and the payload.
    points = tuple(kml_points_generator(root))
    data['box'] = get_box(points)
    data['data'] = points
    return data

def make_filename(data_object):
    """Derive the JSON file name for a ride from its 'date' entry.

    Spaces in ``data_object['date']`` become underscores and the suffix
    ``_ride.json`` is appended. The name depends on the date alone, so two
    rides sharing a date produce the same file name.
    """
    date_part = '_'.join(data_object['date'].split(' '))
    return date_part + "_ride.json"

#jfile = make_data_object(TESTRIDE)

def create_json_file(kml_filepath, clean_directory):
    """Read one KML ride file and write its JSON representation.

    Parameters
    ----------
    kml_filepath : str
        Path to the KML file to convert.
    clean_directory : str
        Directory the JSON file is written into. It is created (including
        parents) when it does not exist yet.

    Returns
    -------
    None
        The result is the file written to ``clean_directory``; its name comes
        from ``make_filename``.
    """
    json_data = make_data_object(kml_filepath)
    json_name = make_filename(json_data)
    # A fresh checkout may not contain the output directory yet; create it so
    # the write below does not fail with FileNotFoundError.
    os.makedirs(clean_directory, exist_ok=True)
    json_path = path.join(clean_directory, json_name)
    with open(json_path, 'w') as file:
        json.dump(json_data, file)

In [20]:
# Convert the sample ride to JSON and write it into the clean directory.
create_json_file(TESTRIDE, CLEAN_DIRECTORY)

In [21]:
# Read in json

class Ride(OrderedDict):
    """Ordered mapping holding one ride as read back from its JSON file.

    All constructor arguments are keyword-only and are stored under the keys
    'name', 'source', 'date', 'id', 'box' and 'data', in that order.

    Parameters
    ----------
    name, source, date, id, box
        Stored unchanged under the key of the same name.
    points : sequence of sequences
        Ride points; stored under 'data' as a pandas Series whose elements
        are tuples.
    """

    def __init__(self, *, name, source, date, id, box, points):
        super().__init__()
        self['name'] = name
        self['source'] = source
        self['date'] = date
        self['id'] = id
        self['box'] = box
        # Use the `points` argument; the previous code read an undefined name
        # `data`, which only worked if a global of that name happened to exist
        # in the kernel. JSON round-trips tuples as lists, so convert each
        # point back to a tuple.
        self['data'] = pd.Series(points).apply(tuple)

In [23]:

def read_in_json(json_path):
    """Load a ride JSON file and wrap its contents in a ``Ride``.

    Parameters
    ----------
    json_path : str
        Path of the JSON file to read.

    Returns
    -------
    Ride
        Built from the file's 'name', 'source', 'id', 'date', 'box' and
        'data' entries ('data' is passed as the ``points`` argument).
    """
    with open(json_path, 'r') as handle:
        raw = json.load(handle)
    return Ride(
        name=raw['name'],
        source=raw['source'],
        id=raw['id'],
        date=raw['date'],
        box=raw['box'],
        points=raw['data'],
    )

# Load the JSON file for the Feb 01 2024 ride; the resulting Ride is shown as the cell output.
read_in_json("../../data/cleaner/rides/Feb_01_2024_ride.json")

Ride([('name', 'February 1st ride'),
      ('source', 'February_1st_ride.kml'),
      ('date', 'Feb 01 2024'),
      ('id', '144448061'),
      ('box',
       {'longitude': {'min': -85.752252, 'max': -85.704454},
        'latitude': {'min': 38.239911, 'max': 38.270028}}),
      ('data',
       0       (-85.722417, 38.240141)
       1        (-85.72237, 38.240123)
       2       (-85.722363, 38.240131)
       3        (-85.72235, 38.240149)
       4       (-85.722346, 38.240171)
                        ...           
       1933     (-85.72238, 38.240185)
       1934    (-85.722357, 38.240174)
       1935    (-85.722356, 38.240174)
       1936    (-85.722357, 38.240174)
       1937    (-85.722355, 38.240176)
       Length: 1938, dtype: object)])

In [24]:
# Batch step: convert every path in RIDE_PATHS to a JSON file in the clean directory.
for ride in RIDE_PATHS:
    create_json_file(ride, CLEAN_DIRECTORY)