# Step 0 - download raw data using API 

This notebook downloads the raw data from the official source: https://infrabel.opendatasoft.com/explore/dataset/stiptheid-gegevens-maandelijksebestanden/information/. 

Set `save_dir` to the folder where the downloaded files should be saved, and `download_range` to the range of month-years (inclusive) to download.

In [1]:
import numpy as np
import sys
import os
from pathlib import Path
from tqdm.notebook import tqdm
from datetime import datetime, timedelta
import time
import requests  # Simple HTTP operations (GET and POST)
import pandas as pd

## Technical function

In [2]:
from itertools import cycle
# GET DATE STRINGS OF DATES FROM START MONTH-YEAR TILL END MONTH-YEAR
def get_date_range(month_start, year_start, month_end, year_end):
    """Return every (month, year) pair from the start month through the end month.

    Both end points are included, and the pairs are in chronological order.

    Parameters
    ----------
    month_start, month_end : int
        Calendar months, 1 (January) through 12 (December).
    year_start, year_end : int
        Calendar years of the first and last month of the range.

    Returns
    -------
    list of tuple of (str, str)
        ``(month, year)`` string pairs; the month is zero-padded to two
        digits, e.g. ``("04", "2019")``.

    Raises
    ------
    ValueError
        If a month is outside 1..12, or if the end month-year lies before
        the start month-year. (Such input would otherwise never terminate.)
    """
    for label, month in (("month_start", month_start), ("month_end", month_end)):
        if month not in range(1, 13):
            raise ValueError(
                "{} must be an integer between 1 and 12, got {!r}".format(label, month)
            )
    if (year_end, month_end) < (year_start, month_start):
        raise ValueError(
            "End {:02d}-{} is earlier than start {:02d}-{}".format(
                month_end, year_end, month_start, year_start
            )
        )

    # Map each month to a single running index (year * 12 + zero-based month)
    # so the whole range is one contiguous run of integers.
    first_index = year_start * 12 + (month_start - 1)
    last_index = year_end * 12 + (month_end - 1)
    return [
        ("{:02d}".format(index % 12 + 1), str(index // 12))
        for index in range(first_index, last_index + 1)
    ]

## Define download range and output dir

In [3]:
# Folder that receives the downloaded CSV files
save_dir = Path("./infrabel_raw_data/")

# Month-years to download; both end points are included
download_range = get_date_range(
    month_start=4,
    year_start=2019,
    month_end=4,
    year_end=2019,
)

# Display the resolved range as the cell output
download_range

[('04', '2019')]

## Main download

In [4]:
# Remote directory holding the monthly punctuality files, and the local output
# folder: both are the same for every iteration, so set them up once.
base_url = "https://fr.ftp.opendatasoft.com/infrabel/PunctualityHistory/"
os.makedirs(save_dir, exist_ok=True)

for month, year in tqdm(download_range, desc = "Downloading from predefined range of month-years"):
    # FILE FORMAT
    file = "Data_raw_punctuality_{}{}.csv".format(year, month)
    # FILE OUTPUT
    save_file = os.path.join(save_dir, file)
    # Download into a temporary name first so an interrupted transfer never
    # leaves a truncated file under the final name.
    partial_file = save_file + ".part"

    # MAIN DOWNLOAD
    file_url = base_url + file
    print("Downloading {}".format(file))

    # The context manager releases the connection even if an error occurs;
    # the timeout keeps the cell from hanging forever on a stalled server.
    with requests.get(file_url, stream=True, timeout=60) as r:
        # Fail loudly on 4xx/5xx instead of saving an error page as a CSV.
        r.raise_for_status()

        total_length = r.headers.get('content-length')
        # A missing or zero Content-Length means progress cannot be computed.
        total_length = int(total_length) if total_length else 0

        dl = 0
        with open(partial_file, "wb") as f:
            for data in r.iter_content(chunk_size=4096):
                dl += len(data)
                f.write(data)
                if total_length > 0:
                    # Clamp at 50: decoded bytes may exceed Content-Length
                    # when the server compresses the response body.
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) )
                    sys.stdout.flush()

    # Only a completely downloaded file is moved to its final location.
    os.replace(partial_file, save_file)
    print("\n")

Downloading from predefined range of month-years:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading Data_raw_punctuality_201904.csv

