In [None]:
import requests
from io import BytesIO
from zipfile import ZipFile
import time

In [None]:
def download_url(url, folder, chunk_size=128, timeout=30):
    """Download the zip archive at ``url`` and extract every member into ``folder``.

    Args:
        url: Address of the zip archive to fetch.
        folder: Directory the archive members are extracted into.
        chunk_size: Unused; kept only so existing callers that pass it keep working.
        timeout: Seconds handed to ``requests.get`` so that a stalled server
            cannot block the notebook indefinitely.

    Returns:
        True when the archive was downloaded and extracted, False when the
        server answered with a status other than 200 (nothing is written then).
    """
    resp = requests.get(url, timeout=timeout)
    if resp.status_code != 200:
        # Best effort: report the miss instead of dropping it silently, then
        # let the caller carry on with the next archive.
        print(f'Skipping {url}: HTTP {resp.status_code}')
        return False

    # The response body is already fully in memory, so the archive is opened
    # from a buffer; the context manager releases it once extraction is done.
    with ZipFile(BytesIO(resp.content)) as archive:
        archive.extractall(folder)
    return True

# Common prefix of every archive URL; the date suffix is appended per request.
base_url = 'https://rata.digitraffic.fi/api/v1/trains/dumps//digitraffic-rata-trains-' # e.g. + 2015-01-01.zip

# Every request uses the first day of the month as the date stamp.
day_str = '01'

for year in range(2015, 2022):
    year_str = str(year)
    for month in range(1, 13):
        # Zero-pad the month to two digits (1 -> '01', 12 -> '12').
        month_str = f'{month:02d}'
        url = base_url + f'{year_str}-{month_str}-{day_str}.zip'

        download_url(url, 'files/')
        # Short pause between consecutive requests.
        time.sleep(0.1)


In [None]:
import numpy as np
from dateutil.parser import parse

def process(json_file, attrs, csv_writer):
    """Write one CSV row of timing statistics for each record in ``json_file``.

    Args:
        json_file: Iterable of record dicts. Despite the name this is the
            already-decoded content, not a path or file handle. Each record is
            indexed by every name in ``attrs`` and by ``'timeTableRows'``.
        attrs: Keys whose values are copied, converted with ``str``, into the
            leading columns of each row.
        csv_writer: Object exposing ``writerow(list)``, e.g. a ``csv.writer``.

    For every timetable row that has an ``'actualTime'``, the difference
    ``actualTime - scheduledTime`` is taken in minutes. The mean and standard
    deviation of those differences are appended to the row. Records without a
    single ``'actualTime'`` produce no row.
    """
    # Iterate the argument itself rather than a notebook-level global, so the
    # function works on a fresh kernel and with whatever records it is given.
    for item in json_file:
        values = [str(item[x]) for x in attrs]

        # Minutes between actual and scheduled time; rows that carry no
        # 'actualTime' are left out.
        time_differences = [
            (parse(row['actualTime']) - parse(row['scheduledTime'])).total_seconds() / 60.0
            for row in item['timeTableRows']
            if 'actualTime' in row
        ]

        if not time_differences:
            continue

        mean = np.mean(time_differences)
        std = np.std(time_differences)

        csv_writer.writerow(values + [mean, std])


In [None]:
import json
import pandas
import csv
import os
import time
from collections import deque
from IPython.display import clear_output


# Record fields copied verbatim into the leading CSV columns.
attrs = ['trainNumber', 'departureDate', 'trainType', 'trainCategory']

# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' end up with a blank line after every row.
with open('daily.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(attrs + ['timeDifferenceMean', 'timeDifferenceStd'])

    # Sort once so files are processed in a stable, name-ordered sequence.
    files = sorted(os.listdir('files'))
    for i, json_file in enumerate(files, start=1):
        file_path = os.path.join('files', json_file)

        # Read as UTF-8 explicitly instead of relying on the locale default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Empty files contribute no rows but still count towards progress.
        if data:
            process(data, attrs, csv_writer)

        # Overwrite the previous progress line; the counter is 1-based so the
        # final line reads "N / N".
        clear_output(wait=True)
        print(f'{i} / {len(files)}')

        #os.remove(file_path) # optional: delete each file once processed to free disk space

