In [23]:
import pandas as pd
import numpy as np
import psycopg2
from utils.db_connect import create_db_connection
from sqlalchemy import create_engine

In [24]:
conn = create_db_connection(host='localhost')

count sensors

tickets

tickets correct

In [25]:
# count tickets for each trip
conn.rollback()
query = """
SELECT 
    COUNT(fare) as count_fare,
    trip_id,
    DATE_TRUNC('day', timestamp) as day,
    MAX(timestamp) as timestamp
FROM 
    raw_tickets 
GROUP BY 
    trip_id, 
    DATE_TRUNC('day', timestamp),
    DATE_TRUNC('month', timestamp),
    DATE_TRUNC('year', timestamp)
ORDER BY 
    day
"""

tickets =pd.read_sql(query, conn)
tickets.head()

Unnamed: 0,count_fare,trip_id,day,timestamp
0,107,0004221452024090920250612,2025-05-16,2025-05-16 17:37:00
1,133,0004206392024090920250612,2025-05-16,2025-05-16 10:04:00
2,6,0004188862024090920250612,2025-05-16,2025-05-16 11:35:00
3,153,0004229762024090920250612,2025-05-16,2025-05-16 10:04:00
4,12,0004199522024090920250612,2025-05-16,2025-05-16 13:00:00


In [26]:
conn.rollback()

query = """
select trip_id, timestamp, traffic_level, normal, traffic
from traffic
"""
traffic = pd.read_sql(query, conn)
traffic.head()

Unnamed: 0,trip_id,timestamp,traffic_level,normal,traffic
0,0004187712024090920250612,2025-05-16 06:33:00,no traffic/low,909,975
1,0004187802024090920250612,2025-05-16 07:26:00,no traffic/low,909,924
2,0004187842024090920250612,2025-05-16 07:12:00,no traffic/low,778,825
3,0004188682024090920250612,2025-05-16 12:26:00,no traffic/low,777,802
4,0004199502024090920250612,2025-05-16 12:31:00,medium,660,749


In [27]:
traffic_tickets = traffic.merge(tickets, on='trip_id', how='inner')
traffic_tickets.head()

Unnamed: 0,trip_id,timestamp_x,traffic_level,normal,traffic,count_fare,day,timestamp_y
0,0004187712024090920250612,2025-05-16 06:33:00,no traffic/low,909,975,185,2025-05-16,2025-05-16 06:50:00
1,0004187712024090920250612,2025-05-16 06:33:00,no traffic/low,909,975,77,2025-05-17,2025-05-17 06:50:00
2,0004187712024090920250612,2025-05-16 06:33:00,no traffic/low,909,975,77,2025-05-18,2025-05-18 06:50:00
3,0004187712024090920250612,2025-05-16 06:33:00,no traffic/low,909,975,77,2025-05-19,2025-05-19 06:50:00
4,0004187802024090920250612,2025-05-16 07:26:00,no traffic/low,909,924,158,2025-05-16,2025-05-16 07:43:00


In [28]:
conn.rollback()

query = """
select *
from weather
"""
weather = pd.read_sql(query, conn)
weather = weather.rename(columns={'timestamp':'time'})
weather.head()

Unnamed: 0,measurement_id,time,latitude,longitude,weather_code,precipitation_probability,temperature,hour
0,865,2025-05-19 18:31:00,46.04,11.119999,3,0.0,12.7,2025-05-15 00:00:00
1,866,2025-05-19 18:31:00,46.04,11.119999,2,0.0,11.5,2025-05-15 01:00:00
2,867,2025-05-19 18:31:00,46.04,11.119999,1,0.0,11.8,2025-05-15 02:00:00
3,868,2025-05-19 18:31:00,46.04,11.119999,2,0.0,10.1,2025-05-15 03:00:00
4,869,2025-05-19 18:31:00,46.04,11.119999,2,0.0,10.8,2025-05-15 04:00:00


In [29]:
traffic_tickets['timestamp'] = pd.to_datetime(traffic_tickets['timestamp_x'])
weather['hour'] = pd.to_datetime(weather['hour'])

traffic_tickets = traffic_tickets.sort_values('timestamp')
weather = weather.sort_values('hour')

In [30]:
merged = pd.merge_asof(
    traffic_tickets,
    weather,
    left_on='timestamp',
    right_on='hour',          # match on timestamp column
    direction='backward',     # can also use 'backward' or 'forward'
    tolerance=pd.Timedelta('1h')  # optional: max time difference allowed
)

In [31]:
merged

Unnamed: 0,trip_id,timestamp_x,traffic_level,normal,traffic,count_fare,day,timestamp_y,timestamp,measurement_id,time,latitude,longitude,weather_code,precipitation_probability,temperature,hour
0,0004229912024090920250612,2025-05-16 06:11:00,no traffic/low,460,498,35,2025-05-18,2025-05-18 06:23:00,2025-05-16 06:11:00,895,2025-05-19 18:31:00,46.04,11.119999,0,0.0,12.4,2025-05-16 06:00:00
1,0004229912024090920250612,2025-05-16 06:11:00,no traffic/low,460,498,35,2025-05-16,2025-05-16 06:23:00,2025-05-16 06:11:00,895,2025-05-19 18:31:00,46.04,11.119999,0,0.0,12.4,2025-05-16 06:00:00
2,0004229912024090920250612,2025-05-16 06:11:00,no traffic/low,460,498,35,2025-05-17,2025-05-17 06:23:00,2025-05-16 06:11:00,895,2025-05-19 18:31:00,46.04,11.119999,0,0.0,12.4,2025-05-16 06:00:00
3,0004187712024090920250612,2025-05-16 06:33:00,no traffic/low,909,975,185,2025-05-16,2025-05-16 06:50:00,2025-05-16 06:33:00,895,2025-05-19 18:31:00,46.04,11.119999,0,0.0,12.4,2025-05-16 06:00:00
4,0004187712024090920250612,2025-05-16 06:33:00,no traffic/low,909,975,77,2025-05-17,2025-05-17 06:50:00,2025-05-16 06:33:00,895,2025-05-19 18:31:00,46.04,11.119999,0,0.0,12.4,2025-05-16 06:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,0004229672024090920250612,2025-05-16 17:42:00,no traffic/low,673,618,180,2025-05-16,2025-05-16 18:16:00,2025-05-16 17:42:00,906,2025-05-19 18:31:00,46.04,11.119999,1,0.0,19.2,2025-05-16 17:00:00
66,0004229672024090920250612,2025-05-16 17:42:00,no traffic/low,673,618,180,2025-05-18,2025-05-18 18:16:00,2025-05-16 17:42:00,906,2025-05-19 18:31:00,46.04,11.119999,1,0.0,19.2,2025-05-16 17:00:00
67,0004210772024090920250612,2025-05-16 21:35:00,no traffic/low,909,734,17,2025-05-16,2025-05-16 22:06:00,2025-05-16 21:35:00,910,2025-05-19 18:31:00,46.04,11.119999,1,0.0,11.8,2025-05-16 21:00:00
68,0004210772024090920250612,2025-05-16 21:35:00,no traffic/low,909,734,17,2025-05-17,2025-05-17 22:06:00,2025-05-16 21:35:00,910,2025-05-19 18:31:00,46.04,11.119999,1,0.0,11.8,2025-05-16 21:00:00
