In [None]:
import pandas as pd
import numpy as np
import os
import json
from datetime import date, datetime

# Root directory of the horse-racing JSON data: meeting files sit directly in
# it and per-race files live in its `races` sub-directory. Set the
# HORSERACES_ROOT environment variable to point the notebook at another
# location; without it the original local path is used.
root_path = os.environ.get('HORSERACES_ROOT', 'd:/data/bftrader/horseraces')

def read_json(file_path):
    """Parse the UTF-8 encoded JSON file at `file_path` and return its content."""
    with open(file_path, encoding='UTF-8') as handle:
        content = json.load(handle)
    return content

In [None]:
# Full paths of the regular files directly under root_path; anything that is
# not a file (e.g. a sub-directory) is filtered out.
meeting_files = list(filter(
    os.path.isfile,
    (os.path.join(root_path, entry_name) for entry_name in os.listdir(root_path)),
))

#race_files = [os.path.join(f'{root_path}/races', file_name) for file_name in os.listdir(f'{root_path}/races')\
#                if os.path.isfile(os.path.join(f'{root_path}/races', file_name))]

In [None]:
# Flatten the entries of every meeting file into a single list.
meetings = []
races = []
for meeting_file in meeting_files:
    meetings.extend(read_json(meeting_file))

In [None]:
def read_trainer(source):
    """Reduce a raw trainer record to its id (from `business_reference`) and name."""
    trainer_id = source['business_reference']['id']
    trainer_name = source['name']
    return dict(id=trainer_id, name=trainer_name)

def read_jockey(source):
    """Reduce a raw jockey record to its id (from `person_reference`) and name."""
    jockey_id = source['person_reference']['id']
    jockey_name = source['name']
    return dict(id=jockey_id, name=jockey_name)

def read_horse(source):
    """Reduce a raw horse record to its id, name, sex type and `foaled` value."""
    horse = {}
    horse['id'] = source['horse_reference']['id']
    horse['name'] = source['name']
    horse['sex'] = source['sex']['type']
    horse['foaled'] = source['foaled']
    return horse

def parse_odds(source):
    """Convert odds text of the form 'a/b' into the float a / b + 1.

    Only the first two '/'-separated pieces are used.
    """
    pieces = source.split('/')
    numerator = float(pieces[0])
    denominator = float(pieces[1])
    return numerator / denominator + 1

def parse_pounds(source):
    """Convert a '-'-separated weight string into a single float.

    The first component is multiplied by 14 (presumably stones to pounds);
    when the string has exactly two components the second is added on top.
    """
    leading, *remainder = source.split('-')
    total = float(leading) * 14
    # Strings with more than two components contribute only the scaled first part.
    if len(remainder) == 1:
        total += float(remainder[0])
    return total

def read_ride(source):
    """Flatten one raw ride record into the fields kept for analysis.

    The nested horse, trainer and jockey records are reduced by their
    `read_*` helpers. `official_rating` falls back to NaN when the key is
    absent; all other fields are read by subscript. The odds text is kept
    as-is under `odds` and converted by `parse_odds` for `odds_decimal`;
    the handicap text is likewise kept and converted by `parse_pounds`.
    """
    ride = dict(
        horse=read_horse(source['horse']),
        trainer=read_trainer(source['trainer']),
        jockey=read_jockey(source['jockey']),
    )
    # Odds are looked up and converted before the remaining fields are read.
    fractional_odds = source['betting']['current_odds']
    decimal_odds = parse_odds(fractional_odds)

    ride['official_rating'] = source.get('official_rating', np.nan)
    ride['finish_position'] = source['finish_position']
    ride['owner'] = source['owner']['name']
    ride['status'] = source['ride_status']
    ride['handicap'] = source['handicap']
    ride['handicap_pounds'] = parse_pounds(source['handicap'])
    ride['odds'] = fractional_odds
    ride['odds_decimal'] = decimal_odds
    return ride

def read_race(source):
    """Flatten a raw race record into summary fields plus its parsed rides.

    Every entry of `source['rides']` is converted with `read_ride`; the
    remaining fields are taken from `source['race_summary']`.
    """
    parsed_rides = [read_ride(raw_ride) for raw_ride in source['rides']]
    summary = source['race_summary']

    race = {}
    race['id'] = summary['race_summary_reference']['id']
    race['name'] = summary['name']
    race['course_name'] = summary['course_name']
    race['course_surface'] = summary['course_surface']['surface']
    race['class'] = summary['race_class']
    race['distance'] = summary['distance']
    race['date'] = summary['date']
    race['time'] = summary['time']
    race['going'] = summary['going']
    race['rides'] = parsed_rides
    return race

In [None]:
# Parse one sample race and show it as a table with one row per ride; the
# race-level id, name and course_name are repeated on every row.
race = read_json(f'{root_path}/races/545785.json')
pd.json_normalize(
    read_race(race),
    record_path='rides',
    meta=['id', 'name', 'course_name'],
    record_prefix='ride_',
)

In [None]:
races_dir = f'{root_path}/races'
races = []
for file_name in os.listdir(races_dir):
    race = read_race(read_json(f'{races_dir}/{file_name}'))
    races.append(race)
    # Stop after the first listed file: only a single race is parsed here.
    break

races

In [None]:
# Display the parsed rides of the first race in `races`.
races[0]['rides']