In [85]:
import math

import numpy as np
import pandas as pd
import requests

In [21]:
# Load the training table; later cells expect it to contain a "timestamp" column.
df = pd.read_csv(filepath_or_buffer="../data/bicikelj_train.csv")

In [22]:
# Convert the timestamp column to pandas datetimes (later cells use its .dt accessor).
df["timestamp"] = df["timestamp"].pipe(pd.to_datetime)

In [23]:
# Reshape wide -> long: every column after the first becomes a (station, target)
# pair, repeated for each timestamp.
df = df.melt(
    id_vars=["timestamp"],
    value_vars=df.columns[1:],
    var_name="station",
    value_name="target",
)

In [124]:
# Station metadata; later cells read its "name", "position_lat" and "position_lon" columns.
# NOTE(review): this file is read from the parent directory, while the other
# inputs in this notebook live under ../data - confirm the path is intended.
stations_df = pd.read_csv(filepath_or_buffer="../bicikelj_postaje.csv")

In [126]:
# Reference point that station distances are measured to, as (latitude, longitude)
# in decimal degrees. NOTE(review): presumably the Ljubljana city centre - confirm.
fixed_coords = (
    46.051446,  # latitude
    14.506051,  # longitude
)

In [129]:
def haversine(lat1, lon1, lat2, lon2, radius=6371):
    """Return the great-circle distance between two points on a sphere.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.
        radius: Radius of the sphere. The result is expressed in the same unit.
            Defaults to 6371, the Earth's radius in kilometers; use 3956 for
            miles.

    Returns:
        The distance along the sphere's surface as a float, in the unit of
        ``radius``.
    """
    # Convert latitude and longitude from degrees to radians
    phi1, lam1, phi2, lam2 = map(math.radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlat = phi2 - phi1
    dlon = lam2 - lam1
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(dlon / 2) ** 2
    )

    # Floating-point rounding can push `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))

    central_angle = 2 * math.asin(math.sqrt(a))
    return central_angle * radius

# Great-circle distance from every station to the fixed reference point
stations_df["distance_to_center"] = stations_df.apply(
    lambda station: haversine(
        station["position_lat"],
        station["position_lon"],
        *fixed_coords,
    ),
    axis=1,
)

In [131]:
# Attach station metadata to every observation. This is an inner join on the
# station name, so observations whose station has no metadata row are dropped.
df = pd.merge(
    df,
    stations_df,
    how="inner",
    left_on="station",
    right_on="name",
)

In [5]:
# Fraction of the day that has elapsed at each timestamp
relative_time_of_day = (
    df["timestamp"].dt.hour.div(24)
    + df["timestamp"].dt.minute.div(1440)
    + df["timestamp"].dt.second.div(86400)
)
# Turn the fraction into an angle so the end of one day sits next to the start
# of the next, then encode it as a sine/cosine pair.
circular_relative_time = relative_time_of_day.mul(math.tau)
df["time_of_day_sin"], df["time_of_day_cos"] = (
    np.sin(circular_relative_time),
    np.cos(circular_relative_time),
)

In [6]:
# Load the weather table and parse its timestamp column into pandas datetimes.
weather_df = pd.read_csv("../data/open-meteo-ljubljana.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)

In [7]:
def add_weather_data(df, weather_df):
    """Attach to every row of ``df`` the weather row nearest in time.

    Each row of ``df`` is matched independently, so ``df`` does not have to be
    sorted by timestamp (the melted frame in this notebook restarts its
    timestamps for every station). ``weather_df`` is sorted internally.

    Args:
        df: Frame with a datetime ``"timestamp"`` column.
        weather_df: Frame with a datetime ``"timestamp"`` column plus the
            weather columns to attach.

    Returns:
        A new frame with the columns of ``df`` followed by every column of
        ``weather_df`` except ``"timestamp"``. Rows keep the order and index
        of ``df``. When a timestamp is exactly halfway between two weather
        rows, the earlier weather row is used. Weather columns keep the dtypes
        they have in ``weather_df``.

    Raises:
        IndexError: If ``weather_df`` is empty while ``df`` is not.
    """
    # Stable sort so rows sharing a timestamp keep their original relative order.
    weather_sorted = weather_df.sort_values("timestamp", kind="mergesort")
    weather_values = weather_sorted.drop(columns=["timestamp"])
    n_weather = len(weather_sorted)

    if n_weather == 0 and len(df) > 0:
        raise IndexError("weather_df has no rows to match against")

    weather_times = weather_sorted["timestamp"].to_numpy()
    query_times = df["timestamp"].to_numpy()
    last = max(n_weather - 1, 0)

    # `after` is the first weather row at or after each query time (clamped to
    # the table); `before` is the row just ahead of it.
    after = np.clip(
        np.searchsorted(weather_times, query_times, side="left"), 0, last
    )
    before = np.maximum(after - 1, 0)

    # Strict "<" keeps the earlier row on ties.
    later_is_closer = (
        np.abs(weather_times[after] - query_times)
        < np.abs(query_times - weather_times[before])
    )
    nearest = np.where(later_is_closer, after, before)

    # Give the matched rows df's own index so the column-wise concat lines up
    # row-for-row even when df does not have a default 0..n-1 index.
    matched = weather_values.iloc[nearest].set_axis(df.index, axis=0)
    return pd.concat([df, matched], axis=1)

In [8]:
# Append the weather columns to the observations.
df = add_weather_data(df=df, weather_df=weather_df)

In [10]:
# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are Saturday/Sunday.
df["is_weekend"] = df["timestamp"].dt.dayofweek.isin([5, 6])

In [11]:
# Show the assembled feature table (pandas renders a truncated preview).
df

Unnamed: 0,timestamp,station,target,time_of_day_sin,time_of_day_cos,temperature,precipitation,snow_depth,cloud_cover,is_day,is_weekend
0,2022-08-02 13:04:00,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE,17,-0.275637,-0.961262,25.6,0.1,0.0,37.0,1.0,False
1,2022-08-02 13:10:00,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE,18,-0.300706,-0.953717,25.6,0.1,0.0,37.0,1.0,False
2,2022-08-02 13:14:00,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE,17,-0.317305,-0.948324,25.6,0.1,0.0,37.0,1.0,False
3,2022-08-02 13:20:00,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE,18,-0.342020,-0.939693,25.6,0.1,0.0,37.0,1.0,False
4,2022-08-02 13:25:00,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE,20,-0.362438,-0.932008,25.6,0.1,0.0,37.0,1.0,False
...,...,...,...,...,...,...,...,...,...,...,...
642332,2022-10-01 18:03:00,SUPERNOVA LJUBLJANA - RUDNIK,7,-0.999914,0.013090,14.9,0.0,0.0,31.0,0.0,True
642333,2022-10-01 18:07:00,SUPERNOVA LJUBLJANA - RUDNIK,7,-0.999534,0.030539,14.9,0.0,0.0,31.0,0.0,True
642334,2022-10-01 18:13:00,SUPERNOVA LJUBLJANA - RUDNIK,7,-0.998392,0.056693,14.9,0.0,0.0,31.0,0.0,True
642335,2022-10-01 18:16:00,SUPERNOVA LJUBLJANA - RUDNIK,7,-0.997564,0.069756,14.9,0.0,0.0,31.0,0.0,True
