In [1]:
import numpy as np
import pandas as pd

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from is_holiday import check_holiday
from utilites import hour_features, day_features, dayofyear_features

from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib

In [2]:
def encode(original_dataframe: pd.DataFrame,
           feature_to_encode: str) -> pd.DataFrame:
    """Swap one column for its one-hot indicator columns.

    The indicators come from ``pd.get_dummies`` with integer values and are
    named ``<feature_to_encode>_<value>``. They are appended to the right of
    the remaining columns. The input frame itself is left untouched.

    Args:
        original_dataframe: Frame that contains the column to expand.
        feature_to_encode: Name of the column to expand.

    Returns:
        A new frame without ``feature_to_encode`` and with the indicator
        columns appended.
    """
    indicator_columns = pd.get_dummies(
        original_dataframe[[feature_to_encode]],
        dtype=int,
    )
    # Append first, then remove the source column by name.
    return pd.concat(
        [original_dataframe, indicator_columns],
        axis=1,
    ).drop(columns=feature_to_encode)

def load_and_preprocess(path: str) -> pd.DataFrame:
    """Load the pedestrian-count CSV and prepare it for feature engineering.

    Steps:
      1. Read the CSV at ``path`` and keep only the columns used downstream.
      2. Re-format ``timestamp`` (``%Y-%m-%dT%H:%M:%SZ``) into a ``date``
         string column of the form ``YYYY-MM-DD HH:MM:SS`` and drop
         ``timestamp``.
      3. One-hot encode ``weather_condition`` via ``encode``.

    Args:
        path: Location of the CSV file.

    Returns:
        Frame with ``location_id``, ``temperature``, ``pedestrians_count``,
        ``date`` and the ``weather_condition_*`` indicator columns.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
        ValueError: If a timestamp does not match the expected format.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
    kept_columns = ['timestamp', 'location_id', 'weather_condition', 'temperature', 'pedestrians_count']

    # Explicit copy: adding a column to a column-subset of another frame is
    # what triggers pandas' SettingWithCopyWarning.
    df = pd.read_csv(path)[kept_columns].copy()

    # Map over the single column instead of a row-wise DataFrame.apply:
    # same per-value parsing, without building a Series for every row, and
    # it still yields a Series when the frame is empty.
    df['date'] = df['timestamp'].map(
        lambda raw: str(datetime.strptime(raw, timestamp_format))
    )
    df = df.drop(columns='timestamp')

    # One-hot encoding for weather
    return encode(df, 'weather_condition')


In [3]:
# Input CSV, given relative to the notebook's working directory.
path = 'data/hystreet_fussgaengerfrequenzen_seit2021.csv'
# Load, re-format the timestamp into 'date' and one-hot encode the weather.
df = load_and_preprocess(path)
# Bare last expression so the notebook renders the (truncated) frame.
df

Unnamed: 0,location_id,temperature,pedestrians_count,date,weather_condition_clear-day,weather_condition_clear-night,weather_condition_cloudy,weather_condition_fog,weather_condition_partly-cloudy-day,weather_condition_partly-cloudy-night,weather_condition_rain,weather_condition_snow,weather_condition_wind
0,329,13.51,151,2021-09-28 22:00:00,0,0,1,0,0,0,0,0,0
1,331,13.51,118,2021-09-28 22:00:00,0,0,1,0,0,0,0,0,0
2,330,13.51,203,2021-09-28 22:00:00,0,0,1,0,0,0,0,0,0
3,329,13.46,19,2021-09-28 23:00:00,0,0,1,0,0,0,0,0,0
4,331,13.46,7,2021-09-28 23:00:00,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63979,331,6.80,2053,2024-03-05 12:00:00,0,0,0,0,0,0,1,0,0
63980,330,6.80,1394,2024-03-05 12:00:00,0,0,0,0,0,0,1,0,0
63981,329,6.90,644,2024-03-05 13:00:00,0,0,0,0,0,0,1,0,0
63982,331,6.90,477,2024-03-05 13:00:00,0,0,0,0,0,0,1,0,0


In [4]:
# Keep only the rows of one counting location (id 329). The id column is
# constant after the filter, so it is removed from the feature set.
df_mid = df.loc[df['location_id'] == 329]
# NOTE(review): plain-text debug dump, kept so the recorded output still
# matches; a rich display of df_mid.head() would read better.
print(df_mid)
df_mid = df_mid.drop(columns='location_id')

# Holiday flag computed from the date string.
df_mid['holiday'] = df_mid['date'].map(check_holiday)

# Cyclical encodings. Each helper is called once per row and its result is
# unpacked into both columns (element 0 -> *_cos, element 1 -> *_sin),
# instead of being evaluated twice through two row-wise apply calls.
for prefix, feature_fn in (
    ('time', hour_features),
    ('day', day_features),
    ('dayofyear', dayofyear_features),
):
    pairs = [feature_fn(stamp) for stamp in df_mid['date']]
    df_mid[f'{prefix}_cos'] = [pair[0] for pair in pairs]
    df_mid[f'{prefix}_sin'] = [pair[1] for pair in pairs]

# Snapshot that still carries 'date'; later cells use it for plotting and
# for inspecting individual rows.
df_mid_old = df_mid.copy()

# The raw date string is not a model input, so it is dropped from the
# frame that is split and fitted.
df_mid = df_mid.drop(columns='date')
df_mid

       location_id  temperature  pedestrians_count                 date  \
0              329        13.51                151  2021-09-28 22:00:00   
3              329        13.46                 19  2021-09-28 23:00:00   
6              329        13.26                 10  2021-09-29 00:00:00   
9              329        13.51                  6  2021-09-29 01:00:00   
12             329        13.44                 22  2021-09-29 02:00:00   
...            ...          ...                ...                  ...   
63969          329         6.00               1507  2024-03-05 09:00:00   
63972          329         6.00               2280  2024-03-05 10:00:00   
63975          329         7.00               3216  2024-03-05 11:00:00   
63978          329         6.80               2858  2024-03-05 12:00:00   
63981          329         6.90                644  2024-03-05 13:00:00   

       weather_condition_clear-day  weather_condition_clear-night  \
0                             

Unnamed: 0,temperature,pedestrians_count,weather_condition_clear-day,weather_condition_clear-night,weather_condition_cloudy,weather_condition_fog,weather_condition_partly-cloudy-day,weather_condition_partly-cloudy-night,weather_condition_rain,weather_condition_snow,weather_condition_wind,holiday,time_cos,time_sin,day_cos,day_sin,dayofyear_cos,dayofyear_sin
0,13.51,151,0,0,1,0,0,0,0,0,0,0,0.866025,-5.000000e-01,0.623490,0.781831,-0.047321,-0.998880
3,13.46,19,0,0,1,0,0,0,0,0,0,0,0.965926,-2.588190e-01,0.623490,0.781831,-0.047321,-0.998880
6,13.26,10,0,0,0,0,0,0,1,0,0,0,1.000000,0.000000e+00,-0.222521,0.974928,-0.030120,-0.999546
9,13.51,6,0,0,0,0,0,0,1,0,0,0,0.965926,2.588190e-01,-0.222521,0.974928,-0.030120,-0.999546
12,13.44,22,0,0,0,0,0,0,1,0,0,0,0.866025,5.000000e-01,-0.222521,0.974928,-0.030120,-0.999546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63969,6.00,1507,0,0,0,0,0,0,1,0,0,0,-0.707107,7.071068e-01,0.623490,0.781831,0.436651,0.899631
63972,6.00,2280,0,0,0,0,0,0,1,0,0,0,-0.866025,5.000000e-01,0.623490,0.781831,0.436651,0.899631
63975,7.00,3216,0,0,0,0,0,0,1,0,0,0,-0.965926,2.588190e-01,0.623490,0.781831,0.436651,0.899631
63978,6.80,2858,0,0,0,0,0,0,1,0,0,0,-1.000000,1.224647e-16,0.623490,0.781831,0.436651,0.899631


In [5]:
# Fixed seed so the split, and every score or plot derived from it, is
# reproducible under Restart & Run All.
# NOTE(review): the rows are time-ordered observations; a shuffled split
# mixes past and future rows. Consider a chronological hold-out instead.
train, test = train_test_split(df_mid, test_size=0.2, random_state=42)

In [11]:
# Linear regression with coefficients constrained to be non-negative
# (positive=True), fitted on every column except the target.
# NOTE(review): despite its name, `forest` is a linear model; the name is
# kept because the following cells reference it.
forest = LinearRegression(positive=True)
forest.fit(
    X=train.drop(columns='pedestrians_count'),
    y=train['pedestrians_count'],
)

In [12]:
# Coefficient of determination (R^2) of the fitted model on the held-out rows.
forest.score(
    X=test.drop(columns='pedestrians_count'),
    y=test['pedestrians_count'],
)

0.3056961247118155

In [9]:
# Observed counts, model predictions and parsed timestamps for every row of
# the selected location (train and test rows alike), ready for plotting.
reals = np.array(df_mid_old['pedestrians_count'])

# Same feature columns the model was fitted on: everything but target and date.
model_inputs = df_mid_old.drop(columns=['pedestrians_count', 'date'])
preds = np.array(forest.predict(model_inputs))

dates = np.array(list(map(
    lambda stamp: datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S"),
    df_mid_old['date'],
)))

In [10]:
%matplotlib tk
matplotlib.use("TkAgg")
plt.figure()
plt.plot(dates, reals)
plt.plot(dates, preds)
plt.show()

In [49]:
# Label-based slice (both ends inclusive): show the rows whose index labels
# lie between 4970 and 5020, including the 'date' column.
df_mid_old.loc[4970:5020]

Unnamed: 0,temperature,pedestrians_count,date,weather_condition_clear-day,weather_condition_clear-night,weather_condition_cloudy,weather_condition_fog,weather_condition_partly-cloudy-day,weather_condition_partly-cloudy-night,weather_condition_rain,weather_condition_snow,weather_condition_wind,holiday,time_cos,time_sin,day_cos,day_sin,dayofyear_cos,dayofyear_sin
4971,-1.15,77,2021-12-06 23:00:00,0,0,1,0,0,0,0,0,0,0,0.9659258,-0.258819,1.0,0.0,0.908818,-0.417194
4974,-0.09,11,2021-12-07 00:00:00,0,0,1,0,0,0,0,0,0,0,1.0,0.0,0.62349,0.781831,0.915864,-0.401488
4977,-0.14,11,2021-12-07 01:00:00,0,0,1,0,0,0,0,0,0,0,0.9659258,0.258819,0.62349,0.781831,0.915864,-0.401488
4980,0.13,12,2021-12-07 02:00:00,0,0,1,0,0,0,0,0,0,0,0.8660254,0.5,0.62349,0.781831,0.915864,-0.401488
4983,0.51,12,2021-12-07 03:00:00,0,0,1,0,0,0,0,0,0,0,0.7071068,0.7071068,0.62349,0.781831,0.915864,-0.401488
4986,0.8,127,2021-12-07 04:00:00,0,0,1,0,0,0,0,0,0,0,0.5,0.8660254,0.62349,0.781831,0.915864,-0.401488
4989,1.12,313,2021-12-07 05:00:00,0,0,0,0,0,1,0,0,0,0,0.258819,0.9659258,0.62349,0.781831,0.915864,-0.401488
4992,1.31,786,2021-12-07 06:00:00,0,0,0,0,0,1,0,0,0,0,6.123234000000001e-17,1.0,0.62349,0.781831,0.915864,-0.401488
4995,1.16,931,2021-12-07 07:00:00,0,0,0,0,0,1,0,0,0,0,-0.258819,0.9659258,0.62349,0.781831,0.915864,-0.401488
4998,1.33,1187,2021-12-07 08:00:00,0,0,0,0,1,0,0,0,0,0,-0.5,0.8660254,0.62349,0.781831,0.915864,-0.401488
