## Motorized Traffic Data Download

In [1]:
import os
import time
import requests

In [2]:
# Let's define the base URL
base_url = "https://mdhopendata.blob.core.windows.net/verkehrsdetektion"

# And set the years and months
years = range(2015, 2024)
months = [f"{m:02d}" for m in range(1, 13)]

# Local storage path
data_dir = "Berlin_Traffic_Data"
os.makedirs(data_dir, exist_ok=True)

In [3]:
# Function to download a file
def download_file(url, save_path):
    try:
        response = requests.get(url, stream=True, timeout=10)
        if response.status_code == 200:
            with open(save_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024):
                    file.write(chunk)
            print(f"Downloaded: {save_path}")
        else:
            print(f"Failed ({response.status_code}): {url}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")

In [4]:
# Loop through years and months to download files
for year in years:
    year_dir = os.path.join(data_dir, str(year))
    os.makedirs(year_dir, exist_ok=True)

    for month in months:
        file_name = f"det_val_hr_{year}_{month}.csv.gz"
        file_url = f"{base_url}/{year}/Detektoren%20(einzelne%20Fahrspur)/{file_name}"
        save_path = os.path.join(year_dir, file_name)

        if not os.path.exists(save_path): 
            download_file(file_url, save_path)
            time.sleep(1.5)
        else:
            print(f"Already downloaded: {save_path}")

print("All downloads complete!")

Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_01.csv.gz
Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_02.csv.gz
Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_03.csv.gz
Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_04.csv.gz
Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_05.csv.gz
Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_06.csv.gz
Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_07.csv.gz
Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_08.csv.gz
Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_09.csv.gz
Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_10.csv.gz
Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_11.csv.gz
Downloaded: Berlin_Traffic_Data\2015\det_val_hr_2015_12.csv.gz
Downloaded: Berlin_Traffic_Data\2016\det_val_hr_2016_01.csv.gz
Downloaded: Berlin_Traffic_Data\2016\det_val_hr_2016_02.csv.gz
Downloaded: Berlin_Traffic_Data\2016\det_val_hr_2016_03.csv.gz
Downloaded: Berlin_Traffic_Data\2016\det_val_hr_2016_04

In [5]:
# The data download for 2022 and 2023 failed, I have checked and the error was in the final link
# Only download for 2022 and 2023
years_22_23 = [2022, 2023]
months_22_23 = [f"{m:02d}" for m in range(1, 13)]

# Folder name for 2022-2023 data
folder_name = "Detektordaten%20(einzelne%20Fahrspur)"

In [6]:
# Function to download a file
def download_file(url, save_path):
    try:
        response = requests.get(url, stream=True, timeout=10)
        if response.status_code == 200:
            with open(save_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024):
                    file.write(chunk)
            print(f"Downloaded: {save_path}")
        else:
            print(f"Failed ({response.status_code}): {url}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")

In [8]:
# Loop through 2022 and 2023 months to download files
for year in years_22_23:
    year_dir = os.path.join(data_dir, str(year))
    os.makedirs(year_dir, exist_ok=True)

    for month in months_22_23:
        file_name = f"det_val_hr_{year}_{month}.csv.gz"
        file_url = f"{base_url}/{year}/{folder_name}/{file_name}" 
        save_path = os.path.join(year_dir, file_name)

        if not os.path.exists(save_path): 
            download_file(file_url, save_path)
            time.sleep(1.5)
        else:
            print(f"Already downloaded: {save_path}")

print("Download for 2022 & 2023 complete!")

Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_01.csv.gz
Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_02.csv.gz
Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_03.csv.gz
Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_04.csv.gz
Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_05.csv.gz
Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_06.csv.gz
Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_07.csv.gz
Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_08.csv.gz
Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_09.csv.gz
Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_10.csv.gz
Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_11.csv.gz
Downloaded: Berlin_Traffic_Data\2022\det_val_hr_2022_12.csv.gz
Downloaded: Berlin_Traffic_Data\2023\det_val_hr_2023_01.csv.gz
Downloaded: Berlin_Traffic_Data\2023\det_val_hr_2023_02.csv.gz
Downloaded: Berlin_Traffic_Data\2023\det_val_hr_2023_03.csv.gz
Downloaded: Berlin_Traffic_Data\2023\det_val_hr_2023_04

## .gz to .csv

In [10]:
import os
import gzip
import shutil
from glob import glob

In [11]:
# Define base directory where the `.gz` files are stored
DATA_DIR = "Berlin_Traffic_Data"

# Find all `.gz` files recursively in all year folders
gz_files = glob(os.path.join(DATA_DIR, "**", "*.csv.gz"), recursive=True)

In [12]:
# Process each `.gz` file
for gz_file in gz_files:
    csv_file = gz_file.replace(".csv.gz", ".csv")  # Define the new `.csv` filename

    # Extract and save as .csv
    with gzip.open(gz_file, "rb") as f_in:
        with open(csv_file, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f"✔ Converted: {gz_file} → {csv_file}")

    # Delete the original .gz file
    os.remove(gz_file)
    print(f"🗑 Deleted: {gz_file}")

print("All .gz files have been converted to .csv and deleted!")


✔ Converted: Berlin_Traffic_Data\2015\det_val_hr_2015_01.csv.gz → Berlin_Traffic_Data\2015\det_val_hr_2015_01.csv
🗑 Deleted: Berlin_Traffic_Data\2015\det_val_hr_2015_01.csv.gz
✔ Converted: Berlin_Traffic_Data\2015\det_val_hr_2015_02.csv.gz → Berlin_Traffic_Data\2015\det_val_hr_2015_02.csv
🗑 Deleted: Berlin_Traffic_Data\2015\det_val_hr_2015_02.csv.gz
✔ Converted: Berlin_Traffic_Data\2015\det_val_hr_2015_03.csv.gz → Berlin_Traffic_Data\2015\det_val_hr_2015_03.csv
🗑 Deleted: Berlin_Traffic_Data\2015\det_val_hr_2015_03.csv.gz
✔ Converted: Berlin_Traffic_Data\2015\det_val_hr_2015_04.csv.gz → Berlin_Traffic_Data\2015\det_val_hr_2015_04.csv
🗑 Deleted: Berlin_Traffic_Data\2015\det_val_hr_2015_04.csv.gz
✔ Converted: Berlin_Traffic_Data\2015\det_val_hr_2015_05.csv.gz → Berlin_Traffic_Data\2015\det_val_hr_2015_05.csv
🗑 Deleted: Berlin_Traffic_Data\2015\det_val_hr_2015_05.csv.gz
✔ Converted: Berlin_Traffic_Data\2015\det_val_hr_2015_06.csv.gz → Berlin_Traffic_Data\2015\det_val_hr_2015_06.csv
🗑 Dele

## Datasets Merging

In [14]:
import pandas as pd
import os
from glob import glob

In [15]:
# Let's define base directory
DATA_DIR = "Berlin_Traffic_Data"
output_file = "Berlin_Traffic_Data/merged_traffic_data.csv"

In [17]:
# Let's also define the metadata file path
metadata_path = os.path.join("Stammdaten_Verkehrsdetektion_2022_07_20.xlsx")

# And, load metadata and rename relevant columns
metadata_df = pd.read_excel(metadata_path, sheet_name=0)
metadata_df.rename(columns={
    "DET_ID15": "detid",
    "LÄNGE (WGS84)": "longitude",
    "BREITE (WGS84)": "latitude"
}, inplace=True)

# And, keep only necessary columns from metadata
metadata_df = metadata_df[["detid", "longitude", "latitude"]]

In [18]:
# Now, let's find all `.csv` files
csv_files = glob(os.path.join(DATA_DIR, "**", "det_val_hr_*.csv"), recursive=True)

# And, store the processed data
all_data = []

for file_path in csv_files:
    print(f"Processing: {file_path}")
    
    # Load traffic data
    df = pd.read_csv(file_path, sep=";", low_memory=False)
    
    # Rename columns
    df.rename(columns={
        "detid_15": "detid",
        "tag": "date",
        "stunde": "hour",
        "qualitaet": "data_quality",
        "q_kfz_det_hr": "num_motor_vehicles_per_hour",
        "v_kfz_det_hr": "avg_speed_motor_vehicles_kmh",
        "q_pkw_det_hr": "num_cars_per_hour",
        "v_pkw_det_hr": "avg_speed_cars_kmh",
        "q_lkw_det_hr": "num_trucks_per_hour",
        "v_lkw_det_hr": "avg_speed_trucks_kmh"
    }, inplace=True)
    
    # Convert date format
    df["date"] = pd.to_datetime(df["date"], format="%d.%m.%Y")
    
    # Merge with metadata
    df = df.merge(metadata_df, on="detid", how="left")
    
    # Append processed data
    all_data.append(df)


Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_01.csv
Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_02.csv
Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_03.csv
Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_04.csv
Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_05.csv
Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_06.csv
Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_07.csv
Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_08.csv
Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_09.csv
Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_10.csv
Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_11.csv
Processing: Berlin_Traffic_Data\2015\det_val_hr_2015_12.csv
Processing: Berlin_Traffic_Data\2016\det_val_hr_2016_01.csv
Processing: Berlin_Traffic_Data\2016\det_val_hr_2016_02.csv
Processing: Berlin_Traffic_Data\2016\det_val_hr_2016_03.csv
Processing: Berlin_Traffic_Data\2016\det_val_hr_2016_04.csv
Processing: Berlin_Traffic_Data\2016\det

In [19]:
# Now, let's combine all processed data into one dataset
final_df = pd.concat(all_data, ignore_index=True)

In [20]:
# And, save final merged dataset
final_df.to_csv(output_file, index=False)

print(f"All files processed! Merged dataset saved as '{output_file}'.")

All files processed! Merged dataset saved as 'Berlin_Traffic_Data/merged_traffic_data.csv'.


## Note

We began by understanding the structure and availability of the data. The traffic data was available for 2015 to 2023 in monthly .csv.gz files on an open data platform. However, downloading each file manually was inefficient, so we created an automated script to download all the files programmatically. Initially, the download process worked for 2015 to 2021, but we later discovered that the folder structure changed in 2022 and 2023. To resolve this, we modified the script to handle these structural differences and successfully downloaded the entire dataset.

Once the .gz files were downloaded, we needed to extract them into .csv format. We wrote a script that iterated through all .gz files, extracted them, and saved them as .csv files while also deleting the original compressed files. This ensured that our working directory contained only .csv files, making further processing easier.

After extracting the files, we moved on to renaming the columns, as the original dataset contained German column names. We referred to the dataset’s README file to accurately map these columns to English equivalents. This step was crucial for consistency and easier understanding. Some key variables included detid_15 (renamed to detid for the detector ID), tag (renamed to date), stunde (renamed to hour), and various vehicle count and speed variables. Additionally, we processed the metadata file (Stammdaten_Verkehrsdetektion_2022_07_20.xlsx), which contained important information about the location of detectors (latitude, longitude) and merged it with the traffic data to ensure each record had spatial information.

With all files properly formatted and renamed, we proceeded with batch processing to handle all datasets at once. Instead of applying renaming and merging manually for each month, we wrote a script that iterated through all .csv files in the dataset folder, applied renaming, merged them with metadata, and concatenated them into a single cleaned dataset (merged_traffic_data.csv) containing all relevant information from 2015 to 2023. This ensured that all monthly datasets were standardized and merged into a single file for analysis.