In [7]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import folium
import matplotlib.pyplot as plt

## Load the Motorized Hourly Traffic Data, Clean, and Convert to Daily Aggregates

In [2]:
# Read the merged hourly motorized-traffic records into a DataFrame
motor_data = pd.read_csv(
    filepath_or_buffer="Berlin_Traffic_Data/merged_traffic_data.csv"
)

In [4]:
# Quick structural check of the hourly motorized traffic data:
# prints the index range, column names, dtypes and memory usage
motor_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39600736 entries, 0 to 39600735
Data columns (total 12 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   detid                         int64  
 1   date                          object 
 2   hour                          int64  
 3   data_quality                  float64
 4   num_motor_vehicles_per_hour   int64  
 5   avg_speed_motor_vehicles_kmh  float64
 6   num_cars_per_hour             int64  
 7   avg_speed_cars_kmh            float64
 8   num_trucks_per_hour           int64  
 9   avg_speed_trucks_kmh          float64
 10  longitude                     float64
 11  latitude                      float64
dtypes: float64(6), int64(5), object(1)
memory usage: 3.5+ GB


In [5]:
# Preview the first rows of the hourly motorized traffic data
motor_data.head()

Unnamed: 0,detid,date,hour,data_quality,num_motor_vehicles_per_hour,avg_speed_motor_vehicles_kmh,num_cars_per_hour,avg_speed_cars_kmh,num_trucks_per_hour,avg_speed_trucks_kmh,longitude,latitude
0,100101010000167,2015-01-01,0,1.0,116,98.2,115,99.0,1,0.0,13.192578,52.433868
1,100101010000167,2015-01-01,1,1.0,393,76.3,392,76.3,1,72.0,13.192578,52.433868
2,100101010000167,2015-01-01,2,1.0,335,76.6,327,77.0,8,61.9,13.192578,52.433868
3,100101010000167,2015-01-01,3,1.0,208,81.9,202,82.3,6,69.5,13.192578,52.433868
4,100101010000167,2015-01-01,4,1.0,141,89.4,137,89.2,4,90.0,13.192578,52.433868


In [8]:
# Collapse the hourly records into one row per detector (detid) and day.
#   - vehicle counts: summed over the hours of the day
#   - average speeds: plain (unweighted) mean of the hourly averages
#   - coordinates: taken with 'first' within each (detid, date) group
# Note: the count columns keep their "*_per_hour" names even though they
# now hold daily totals.
daily_motor = (
    motor_data
    .groupby(['detid', 'date'])
    .agg(
        num_motor_vehicles_per_hour=('num_motor_vehicles_per_hour', 'sum'),
        avg_speed_motor_vehicles_kmh=('avg_speed_motor_vehicles_kmh', 'mean'),
        num_cars_per_hour=('num_cars_per_hour', 'sum'),
        avg_speed_cars_kmh=('avg_speed_cars_kmh', 'mean'),
        num_trucks_per_hour=('num_trucks_per_hour', 'sum'),
        avg_speed_trucks_kmh=('avg_speed_trucks_kmh', 'mean'),
        latitude=('latitude', 'first'),
        longitude=('longitude', 'first'),
    )
    .reset_index()
)

In [9]:
# Inspect the aggregated daily frame (row count, dtypes, non-null counts)
daily_motor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615426 entries, 0 to 1615425
Data columns (total 10 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   detid                         1615426 non-null  int64  
 1   date                          1615426 non-null  object 
 2   num_motor_vehicles_per_hour   1615426 non-null  int64  
 3   avg_speed_motor_vehicles_kmh  1615426 non-null  float64
 4   num_cars_per_hour             1615426 non-null  int64  
 5   avg_speed_cars_kmh            1615426 non-null  float64
 6   num_trucks_per_hour           1615426 non-null  int64  
 7   avg_speed_trucks_kmh          1615426 non-null  float64
 8   latitude                      1595853 non-null  float64
 9   longitude                     1595853 non-null  float64
dtypes: float64(5), int64(4), object(1)
memory usage: 123.2+ MB


In [10]:
# Find the daily rows that lack a latitude or a longitude, so we can see
# which specific detector ids are affected
missing_coords = daily_motor[
    daily_motor[['latitude', 'longitude']].isna().any(axis=1)
]

# Distinct detector ids among those rows, in order of first appearance
missing_detids = pd.unique(missing_coords['detid'])

# Report how many detectors are affected and show a small sample
print(f"Number of detids with missing coordinates: {len(missing_detids)}")
print("Sample missing detids:", missing_detids[:10])

Number of detids with missing coordinates: 43
Sample missing detids: [100101010025934 100101010026035 100101010026540 100101010026641
 100101010072515 100101010083023 100101010083124 100101010094238
 100101010094339 100101010094440]


In [11]:
# Keep only rows where both coordinates are present; the detids without
# coordinates have no information in the metadata either, so they are dropped
daily_motor = daily_motor.loc[
    daily_motor[['latitude', 'longitude']].notna().all(axis=1)
]

In [13]:
# Persist the daily motorized traffic dataset to CSV (without the row index)
daily_motor.to_csv(
    path_or_buf="daily_motor_traffic_berlin_12032025.csv",
    index=False,
)

## Exploring the Daily Motorized Traffic Data

In [2]:
# Read the previously saved daily motorized traffic dataset back in
data = pd.read_csv(
    filepath_or_buffer="daily_motor_traffic_berlin_12032025.csv"
)

In [3]:
# Initial data check: column names, dtypes, non-null counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1595853 entries, 0 to 1595852
Data columns (total 10 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   detid                         1595853 non-null  int64  
 1   date                          1595853 non-null  object 
 2   num_motor_vehicles_per_hour   1595853 non-null  int64  
 3   avg_speed_motor_vehicles_kmh  1595853 non-null  float64
 4   num_cars_per_hour             1595853 non-null  int64  
 5   avg_speed_cars_kmh            1595853 non-null  float64
 6   num_trucks_per_hour           1595853 non-null  int64  
 7   avg_speed_trucks_kmh          1595853 non-null  float64
 8   latitude                      1595853 non-null  float64
 9   longitude                     1595853 non-null  float64
dtypes: float64(5), int64(4), object(1)
memory usage: 121.8+ MB


In [4]:
# Preview the first rows of the daily dataset
data.head()

Unnamed: 0,detid,date,num_motor_vehicles_per_hour,avg_speed_motor_vehicles_kmh,num_cars_per_hour,avg_speed_cars_kmh,num_trucks_per_hour,avg_speed_trucks_kmh,latitude,longitude
0,100101010000167,2015-01-01,11216,80.875,10828,81.566667,388,66.3,52.433868,13.192578
1,100101010000167,2015-01-02,15797,80.241667,14862,80.7625,935,73.141667,52.433868,13.192578
2,100101010000167,2015-01-03,12786,81.625,12230,82.279167,556,71.629167,52.433868,13.192578
3,100101010000167,2015-01-04,12139,80.25,11718,80.8125,421,71.291667,52.433868,13.192578
4,100101010000167,2015-01-05,15333,80.320833,13637,81.354167,1696,75.5125,52.433868,13.192578


In [5]:
# Count the distinct detector ids, i.e. the number of counting stations
num_stations = data.loc[:, 'detid'].nunique()
print(
    f"Number of unique motorized traffic counting stations: {num_stations}"
)

Number of unique motorized traffic counting stations: 548


In [6]:
# Show how the counting stations are distributed across Berlin.
#
# Reduce the daily records to one row per station first. The map centre is
# then the mean of the station locations, instead of a mean over every daily
# record (which would pull the centre towards stations with more records).
station_locations = data.drop_duplicates(subset=['detid'])[
    ['detid', 'latitude', 'longitude']
]

map_center = [
    station_locations['latitude'].mean(),
    station_locations['longitude'].mean(),
]
motor_map = folium.Map(location=map_center, zoom_start=12)

# itertuples yields the per-column values directly and avoids building a
# Series for every row, as iterrows does
for station in station_locations.itertuples(index=False):
    folium.Marker(
        location=[station.latitude, station.longitude],
        popup=f"Name: {station.detid}",
        # Pass the id as text so the tooltip label is explicit
        tooltip=str(station.detid)
    ).add_to(motor_map)

# Keep the map as the cell's last expression so the notebook renders it
motor_map