## Fetch hourly weather for trips
This notebook loads cleaned trip data, rounds start times to the nearest hour, queries Open-Meteo for temperature, rain, and weather code, and merges the results back to the trips.

In [34]:
import pandas as pd
import numpy as np
import requests_cache
from concurrent.futures import ThreadPoolExecutor, as_completed

# 1) Trips einlesen und Merge-Keys bauen
df = pd.read_csv('cleaned_trip_data.csv', parse_dates=['start_time'])
df['date_only']         = df['start_time'].dt.strftime('%Y-%m-%d')
df['time_only_rounded'] = df['start_time'].dt.round('h').dt.strftime('%H:%M:%S')

# 2) Unique (lat, lon) extrahieren
stations = (
    df[['start_lat','start_lon']]
      .drop_duplicates()
      .reset_index(drop=True)
)

# 3) In Batches aufteilen (z.B. 50 Stationen pro Request)
chunk_size = 50
batches = [
    stations.iloc[i:i+chunk_size].to_dict('records')
    for i in range(0, len(stations), chunk_size)
]

# 4) Session mit Cache (1h)
session = requests_cache.CachedSession('.cache', expire_after=3600)

# 5) Batch-Fetcher
def fetch_batch(batch):
    # Koordinaten als Komma-Strings
    lat_str = ",".join(str(p['start_lat']) for p in batch)
    lon_str = ",".join(str(p['start_lon']) for p in batch)

    params = {
        "latitude":   lat_str,
        "longitude":  lon_str,
        "start_date": df['date_only'].min(),
        "end_date":   df['date_only'].max(),
        "hourly":     "temperature_2m,rain,weathercode",
        "timezone":   "auto",   # → liefert lokale Iso-Strings
    }

    # JSON-Request
    url  = "https://historical-forecast-api.open-meteo.com/v1/forecast"
    r    = session.get(url, params=params)
    r.raise_for_status()
    js   = r.json()

    # Falls das API-Resultat eine Liste ist, nimm diese,
    # sonst wickle das Einzel-Objekt in eine Liste
    forecasts = js if isinstance(js, list) else [js]

    # Die Zeit‐Achse ist für alle Punkte gleich:
    times = pd.to_datetime(forecasts[0]["hourly"]["time"])
    n     = len(times)

    records = []
    for idx, loc in enumerate(batch):
        data_hourly = forecasts[idx]["hourly"]

        temp_arr = np.array(data_hourly["temperature_2m"])
        rain_arr = np.array(data_hourly["rain"])
        code_arr = np.array(data_hourly["weathercode"])

        # Datum/Uhrzeit-Strings in lokaler Zeit
        date_strs = times.strftime('%Y-%m-%d')
        time_strs = times.strftime('%H:%M:%S')

        # Baue Mini-DataFrame für diese Station
        df_loc = pd.DataFrame({
            "start_lat":         [loc['start_lat']] * n,
            "start_lon":         [loc['start_lon']] * n,
            "date_only":         date_strs,
            "time_only_rounded": time_strs,
            "temperature_2m":    temp_arr,
            "rain":              rain_arr,
            "weather_code":      code_arr,
        })
        records.append(df_loc)

    return records

# 6) Parallel alle Batches abarbeiten
all_dfs = []
with ThreadPoolExecutor(max_workers=5) as exe:
    futures = [exe.submit(fetch_batch, b) for b in batches]
    for fut in as_completed(futures):
        all_dfs.extend(fut.result())

# 7) Zu einem DataFrame zusammenführen
weather_full = pd.concat(all_dfs, ignore_index=True)

# 8) Rückmerge ins Original-Frame
result = df.merge(
    weather_full,
    on=['start_lat','start_lon','date_only','time_only_rounded'],
    how='left'
)

# 9) Kontrolle
print(result.head())


   duration          start_time             end_time  start_station  \
0         5 2025-01-01 00:12:00  2025-01-01 00:17:00           3030   
1         7 2025-01-01 00:12:00  2025-01-01 00:19:00           4558   
2        11 2025-01-01 00:13:00  2025-01-01 00:24:00           4212   
3        11 2025-01-01 00:13:00  2025-01-01 00:24:00           4212   
4        13 2025-01-01 00:27:00  2025-01-01 00:40:00           4472   

   start_lat   start_lon  end_station    end_lat     end_lon  bike_id  \
0  34.051941 -118.243530         4491  34.047440 -118.247940    13668   
1  34.025688 -118.395302         4569  34.026550 -118.408463    30021   
2  33.988129 -118.471741         4206  33.998341 -118.461014    14923   
3  33.988129 -118.471741         4206  33.998341 -118.461014    26704   
4  34.092602 -118.280930         4509  34.101639 -118.309174    14790   

  bike_type   date_only time_only_rounded  temperature_2m  rain  weather_code  
0  standard  2025-01-01          00:00:00             

In [35]:
result[result['rain'] > 0]

Unnamed: 0,duration,start_time,end_time,start_station,start_lat,start_lon,end_station,end_lat,end_lon,bike_id,bike_type,date_only,time_only_rounded,temperature_2m,rain,weather_code
2066,12,2025-01-03 08:57:00,2025-01-03 09:09:00,4652,34.027050,-118.485184,4546,34.031399,-118.453629,29600,electric,2025-01-03,09:00:00,10.0,0.1,51
2075,31,2025-01-03 09:09:00,2025-01-03 09:40:00,4652,34.027050,-118.485184,4652,34.027050,-118.485184,16086,standard,2025-01-03,09:00:00,10.0,0.1,51
23937,25,2025-01-25 21:57:00,2025-01-25 22:22:00,4573,34.030472,-118.420982,4575,34.000309,-118.402527,25378,electric,2025-01-25,22:00:00,12.0,1.7,61
23944,10,2025-01-25 22:10:00,2025-01-25 22:20:00,4549,34.022449,-118.438332,4564,34.035351,-118.434143,6419,standard,2025-01-25,22:00:00,12.1,1.7,61
23952,13,2025-01-25 22:29:00,2025-01-25 22:42:00,4553,33.999580,-118.441360,4207,34.000881,-118.468910,25429,electric,2025-01-25,22:00:00,12.7,0.1,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94945,4,2025-03-31 08:12:00,2025-03-31 08:16:00,4248,34.028351,-118.288673,4245,34.024040,-118.283409,12285,standard,2025-03-31,08:00:00,13.6,0.1,51
94946,18,2025-03-31 08:17:00,2025-03-31 08:35:00,3068,34.053200,-118.250954,4245,34.024040,-118.283409,13868,standard,2025-03-31,08:00:00,13.0,0.2,51
94947,2,2025-03-31 08:19:00,2025-03-31 08:21:00,4254,34.028679,-118.284111,4273,34.025860,-118.284103,12443,standard,2025-03-31,08:00:00,13.6,0.1,51
94949,6,2025-03-31 08:21:00,2025-03-31 08:27:00,4273,34.025860,-118.284103,4249,34.020302,-118.281181,20246,standard,2025-03-31,08:00:00,13.6,0.1,51


In [36]:

# 1) Calendar week (ISO standard, 1–53)
result['calendar_week'] = result['start_time'].dt.isocalendar().week

# 2) Weekday name and number (Monday=1 … Sunday=7)
result['weekday_name'] = result['start_time'].dt.day_name()        # e.g. "Monday"
result['weekday_num']  = result['start_time'].dt.weekday + 1        # Monday=1 … Sunday=7

# 3) Season (meteorological: Winter, Spring, Summer, Fall)
seasons = {0: 'Winter', 1: 'Spring', 2: 'Summer', 3: 'Fall'}
# (month % 12)//3 → 0 for Dec–Feb, 1 for Mar–May, 2 for Jun–Aug, 3 for Sep–Nov
result['season'] = (result['start_time'].dt.month % 12 // 3).map(seasons)

# 4) Time of day category
def time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

result['time_of_day'] = result['start_time'].dt.hour.map(time_of_day)

# 5) Weekend indicator
def is_weekend(weekday_num):
    return weekday_num >= 6  # Saturday=6, Sunday=7

result['is_weekend'] = result['weekday_num'].map(is_weekend)

# Inspect the new columns



In [37]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95916 entries, 0 to 95915
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   duration           95916 non-null  int64         
 1   start_time         95916 non-null  datetime64[ns]
 2   end_time           95916 non-null  object        
 3   start_station      95916 non-null  int64         
 4   start_lat          95916 non-null  float64       
 5   start_lon          95916 non-null  float64       
 6   end_station        95916 non-null  int64         
 7   end_lat            95916 non-null  float64       
 8   end_lon            95916 non-null  float64       
 9   bike_id            95916 non-null  int64         
 10  bike_type          95916 non-null  object        
 11  date_only          95916 non-null  object        
 12  time_only_rounded  95916 non-null  object        
 13  temperature_2m     95916 non-null  float64       
 14  rain  

In [38]:
result[result["is_weekend"]==1]

Unnamed: 0,duration,start_time,end_time,start_station,start_lat,start_lon,end_station,end_lat,end_lon,bike_id,...,time_only_rounded,temperature_2m,rain,weather_code,calendar_week,weekday_name,weekday_num,season,time_of_day,is_weekend
2905,8,2025-01-04 00:10:00,2025-01-04 00:18:00,4445,34.073639,-118.251572,4476,34.082520,-118.272720,15660,...,00:00:00,11.6,0.0,3,1,Saturday,6,Winter,Night,True
2906,6,2025-01-04 00:12:00,2025-01-04 00:18:00,4304,34.062580,-118.290092,4518,34.057968,-118.299751,20016,...,00:00:00,12.2,0.0,3,1,Saturday,6,Winter,Night,True
2907,29,2025-01-04 00:12:00,2025-01-04 00:41:00,3033,34.040989,-118.255798,4447,34.061600,-118.281990,27765,...,00:00:00,12.5,0.0,3,1,Saturday,6,Winter,Night,True
2908,18,2025-01-04 00:13:00,2025-01-04 00:31:00,4672,34.045021,-118.253189,3018,34.043732,-118.260139,17321,...,00:00:00,12.1,0.0,3,1,Saturday,6,Winter,Night,True
2909,13,2025-01-04 00:23:00,2025-01-04 00:36:00,4627,34.101849,-118.325142,4509,34.101639,-118.309174,13762,...,00:00:00,11.9,0.0,3,1,Saturday,6,Winter,Night,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94902,27,2025-03-30 23:26:00,2025-03-30 23:53:00,4255,33.973660,-118.422859,4583,33.976189,-118.418419,5801,...,23:00:00,13.7,0.0,3,13,Sunday,7,Spring,Night,True
94903,24,2025-03-30 23:29:00,2025-03-30 23:53:00,4255,33.973660,-118.422859,4583,33.976189,-118.418419,13869,...,23:00:00,13.7,0.0,3,13,Sunday,7,Spring,Night,True
94904,9,2025-03-30 23:37:00,2025-03-30 23:46:00,3042,34.049301,-118.238808,3000,0.000000,0.000000,22538,...,00:00:00,12.0,0.0,3,13,Sunday,7,Spring,Night,True
94905,13,2025-03-30 23:48:00,2025-03-31 00:01:00,3005,34.048500,-118.258537,3026,34.063179,-118.245880,24215,...,00:00:00,12.0,0.0,3,13,Sunday,7,Spring,Night,True


In [39]:
%pip install scikit-learn

import pandas as pd
from sklearn.cluster import KMeans

# ------------- 1. Kopie & Grund-Features ------------------
df = result.copy()                                # ← dein Trip-DataFrame
df['slot_ts'] = df['start_time'].dt.floor('h')     # stündliche Slots
df['hour_of_day'] = df['slot_ts'].dt.hour
df['weekday_num'] = df['slot_ts'].dt.weekday
df['is_weekend']  = df['weekday_num'].isin([5, 6])

# Temperatur in Klassen einteilen
def temp_class(t):
    if t < 10:   return 'cold'
    if t < 20:   return 'mid'
    if t < 28:   return 'warm'
    return 'heiss'

df['temp_class'] = df['temperature_2m'].apply(temp_class)
df['is_raining'] = df['rain'] >= 0.1

# ------------- 2. Stations-Stammdaten & Cluster ------------
stations = (
    df[['start_station', 'start_lat', 'start_lon']]
      .drop_duplicates()
      .rename(columns={'start_station': 'station_id',
                       'start_lat': 'lat',
                       'start_lon': 'lon'})
)

# einfache räumliche Cluster-ID (80 Cluster als Beispiel)
kmeans = KMeans(n_clusters=80, random_state=0)
stations['cluster_id'] = kmeans.fit_predict(stations[['lat', 'lon']])

# ------------- 3. Bikes taken / returned pro Stunde --------
taken = (df.groupby(['slot_ts', 'start_station'])
           .size().reset_index(name='bikes_taken'))

returned = (df.groupby(['slot_ts', 'end_station'])
             .size().reset_index(name='bikes_returned'))

agg = (pd.merge(taken, returned,
                left_on=['slot_ts', 'start_station'],
                right_on=['slot_ts', 'end_station'],
                how='outer')
         .rename(columns={'start_station': 'station_id'})
       )

# station_id von rechten Merge übernehmen, fehlende Counts = 0
agg['station_id'] = agg['station_id'].fillna(agg['end_station']).astype(int)
agg = agg.drop(columns='end_station')
agg[['bikes_taken', 'bikes_returned']] = agg[['bikes_taken',
                                             'bikes_returned']].fillna(0).astype(int)

# ------------- 4. Wetter stündlich mitteln -----------------
weather_hourly = (df.groupby('slot_ts')
                    .agg(temperature_2m=('temperature_2m', 'mean'),
                         temp_class=('temp_class',
                                     lambda x: x.mode().iloc[0] if not x.mode().empty else None),
                         rain_mm=('rain', 'mean'),
                         is_raining=('is_raining', 'max'),
                         weather_code=('weather_code',
                                       lambda x: x.mode().iloc[0] if not x.mode().empty else None))
                    .reset_index())

# ------------- 5. Alles zusammenführen --------------------
agg = (agg
       .merge(stations,       on='station_id', how='left')
       .merge(weather_hourly, on='slot_ts',    how='left'))

# Reihenfolge & Saisonzuteilung
season_map = {12:'Winter', 1:'Winter', 2:'Winter',
              3:'Spring', 4:'Spring', 5:'Spring',
              6:'Summer', 7:'Summer', 8:'Summer',
              9:'Fall', 10:'Fall', 11:'Fall'}
agg['season'] = agg['slot_ts'].dt.month.map(season_map)

# ------------- 6. SQL-Typ-Mapping printen ------------------
sql_map = {
    'int64':        'INTEGER',
    'int32':        'INTEGER',
    'float64':      'REAL',
    'bool':         'BOOLEAN',
    'datetime64[ns]':'TIMESTAMP',
    'object':       'TEXT',
    'category':     'VARCHAR'
}

print("\nSpalten  |  Pandas-dtype  →  SQL-Typ")
print("-" * 45)
for col, dt in agg.dtypes.items():
    print(f"{col:<15} {str(dt):<15} →  {sql_map.get(str(dt), 'TEXT')}")

print("\nBeispiel-Rows:")
print(agg.head())


Note: you may need to restart the kernel to use updated packages.

Spalten  |  Pandas-dtype  →  SQL-Typ
---------------------------------------------
slot_ts         datetime64[ns]  →  TIMESTAMP
station_id      int64           →  INTEGER
bikes_taken     int64           →  INTEGER
bikes_returned  int64           →  INTEGER
lat             float64         →  REAL
lon             float64         →  REAL
cluster_id      int32           →  INTEGER
temperature_2m  float64         →  REAL
temp_class      object          →  TEXT
rain_mm         float64         →  REAL
is_raining      bool            →  BOOLEAN
weather_code    int64           →  INTEGER
season          object          →  TEXT

Beispiel-Rows:
     slot_ts  station_id  bikes_taken  bikes_returned        lat         lon  \
0 2025-01-01        3026            0               1  34.063179 -118.245880   
1 2025-01-01        3028            1               0  34.058319 -118.246094   
2 2025-01-01        3030            1              

In [40]:
agg

Unnamed: 0,slot_ts,station_id,bikes_taken,bikes_returned,lat,lon,cluster_id,temperature_2m,temp_class,rain_mm,is_raining,weather_code,season
0,2025-01-01 00:00:00,3026,0,1,34.063179,-118.245880,17,10.200000,mid,0.0,False,3,Winter
1,2025-01-01 00:00:00,3028,1,0,34.058319,-118.246094,17,10.200000,mid,0.0,False,3,Winter
2,2025-01-01 00:00:00,3030,1,0,34.051941,-118.243530,53,10.200000,mid,0.0,False,3,Winter
3,2025-01-01 00:00:00,3042,1,0,34.049301,-118.238808,8,10.200000,mid,0.0,False,3,Winter
4,2025-01-01 00:00:00,3047,1,0,34.039982,-118.266403,52,10.200000,mid,0.0,False,3,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92670,2025-03-31 23:00:00,4627,0,1,34.101849,-118.325142,55,13.041176,mid,0.0,False,3,Spring
92671,2025-03-31 23:00:00,4643,2,0,34.072620,-118.449440,60,13.041176,mid,0.0,False,3,Spring
92672,2025-03-31 23:00:00,4664,0,1,34.055161,-118.308990,35,13.041176,mid,0.0,False,3,Spring
92673,2025-03-31 23:00:00,4672,0,1,34.045021,-118.253189,14,13.041176,mid,0.0,False,3,Spring
