In [1]:
import numpy as np
import pandas as pd

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from is_holiday import check_holiday
from utilites import hour_features, day_features, dayofyear_features

In [2]:
def encode(original_dataframe: pd.DataFrame,
    feature_to_encode: str) -> pd.DataFrame:
    """Replace one column of a DataFrame with its one-hot (dummy) columns.

    The dummy columns are named ``<feature_to_encode>_<value>``, hold 0/1
    integers and are appended after the remaining original columns. The input
    frame is not modified.

    Args:
        original_dataframe: Frame containing the column to encode.
        feature_to_encode: Name of the column to expand into dummy columns.

    Returns:
        A new DataFrame without ``feature_to_encode`` and with one integer
        indicator column per distinct value of that column.

    Raises:
        KeyError: If ``feature_to_encode`` is not a column of the frame.
    """
    # `columns=` forces the encoding regardless of dtype. Without it,
    # get_dummies leaves numeric/boolean columns untouched, the concat below
    # then produces two columns with the same name, and the final drop removes
    # both of them -- the feature would vanish without ever being encoded.
    dummies = pd.get_dummies(
        original_dataframe[[feature_to_encode]],
        columns=[feature_to_encode],
        dtype=int,
    )
    encoded = pd.concat([original_dataframe, dummies], axis=1)
    return encoded.drop(columns=feature_to_encode)

def load_and_preprocess(path: str) -> pd.DataFrame:
    """Load the pedestrian-count CSV and prepare it for feature engineering.

    The function reads the CSV at ``path``, keeps only the columns
    ``timestamp``, ``location_id``, ``weather_condition``, ``temperature`` and
    ``pedestrians_count``, rewrites ``timestamp`` (format
    ``%Y-%m-%dT%H:%M:%SZ``) into a ``date`` string of the form
    ``YYYY-MM-DD HH:MM:SS``, and one-hot encodes ``weather_condition``.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A DataFrame with the columns ``location_id``, ``temperature``,
        ``pedestrians_count``, ``date`` followed by one
        ``weather_condition_<value>`` indicator column per weather value.

    Raises:
        KeyError: If one of the required columns is missing from the file.
        ValueError: If a timestamp does not match the expected format.
    """
    keep = ['timestamp', 'location_id', 'weather_condition', 'temperature', 'pedestrians_count']
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'

    # Drop unnecessary columns
    selected = pd.read_csv(path)[keep]

    # Rearrange date: parse on the column itself instead of a row-wise
    # DataFrame.apply, which builds a Series per row and does not return a
    # Series for an empty frame.
    dates = selected['timestamp'].map(
        lambda stamp: str(datetime.strptime(stamp, timestamp_format))
    )

    # drop + assign build a fresh frame, so no value is written into a
    # selection of the raw data (avoids SettingWithCopyWarning).
    df = selected.drop(columns='timestamp').assign(date=dates)

    # One-hot encoding for weather
    return encode(df, 'weather_condition')


In [3]:
# Input CSV, given relative to the notebook's working directory.
path = 'data/hystreet_fussgaengerfrequenzen_seit2021.csv'
# Read the file, keep the modelling columns and one-hot encode the weather condition.
df = load_and_preprocess(path)
# Bare last expression so the notebook renders the resulting frame.
df

Unnamed: 0,location_id,temperature,pedestrians_count,date,weather_condition_clear-day,weather_condition_clear-night,weather_condition_cloudy,weather_condition_fog,weather_condition_partly-cloudy-day,weather_condition_partly-cloudy-night,weather_condition_rain,weather_condition_snow,weather_condition_wind
0,329,13.51,151,2021-09-28 22:00:00,0,0,1,0,0,0,0,0,0
1,331,13.51,118,2021-09-28 22:00:00,0,0,1,0,0,0,0,0,0
2,330,13.51,203,2021-09-28 22:00:00,0,0,1,0,0,0,0,0,0
3,329,13.46,19,2021-09-28 23:00:00,0,0,1,0,0,0,0,0,0
4,331,13.46,7,2021-09-28 23:00:00,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63979,331,6.80,2053,2024-03-05 12:00:00,0,0,0,0,0,0,1,0,0
63980,330,6.80,1394,2024-03-05 12:00:00,0,0,0,0,0,0,1,0,0
63981,329,6.90,644,2024-03-05 13:00:00,0,0,0,0,0,0,1,0,0
63982,331,6.90,477,2024-03-05 13:00:00,0,0,0,0,0,0,1,0,0


In [4]:
# Counter whose rows are modelled in this notebook.
LOCATION_ID = 330

# Explicit copy: new columns are added below, and writing into a filtered
# selection of `df` would otherwise raise SettingWithCopyWarning.
df_mid = df.loc[df['location_id'] == LOCATION_ID].copy()
print(df_mid)
df_mid = df_mid.drop(columns='location_id')

# Every feature below is derived from the 'date' string, so pull it out once.
dates = df_mid['date'].tolist()

df_mid['holiday'] = [check_holiday(date) for date in dates]

# Each helper is indexed with [0] for the *_cos column and [1] for the *_sin
# column, exactly as before, but is now called once per row instead of twice.
# NOTE(review): this assumes the helpers are pure functions of the date string.
for prefix, featurizer in (
    ('time', hour_features),
    ('day', day_features),
    ('dayofyear', dayofyear_features),
):
    pairs = [featurizer(date) for date in dates]
    df_mid[f'{prefix}_cos'] = [pair[0] for pair in pairs]
    df_mid[f'{prefix}_sin'] = [pair[1] for pair in pairs]

# The raw date string is not a model input once the features above exist.
df_mid = df_mid.drop(columns='date')
df_mid

       location_id  temperature  pedestrians_count                 date  \
2              330        13.51                203  2021-09-28 22:00:00   
5              330        13.46                 25  2021-09-28 23:00:00   
8              330        13.26                 17  2021-09-29 00:00:00   
11             330        13.51                 15  2021-09-29 01:00:00   
14             330        13.44                 18  2021-09-29 02:00:00   
...            ...          ...                ...                  ...   
63971          330         6.00                995  2024-03-05 09:00:00   
63974          330         6.00               1257  2024-03-05 10:00:00   
63977          330         7.00               1561  2024-03-05 11:00:00   
63980          330         6.80               1394  2024-03-05 12:00:00   
63983          330         6.90                433  2024-03-05 13:00:00   

       weather_condition_clear-day  weather_condition_clear-night  \
2                             

Unnamed: 0,temperature,pedestrians_count,weather_condition_clear-day,weather_condition_clear-night,weather_condition_cloudy,weather_condition_fog,weather_condition_partly-cloudy-day,weather_condition_partly-cloudy-night,weather_condition_rain,weather_condition_snow,weather_condition_wind,holiday,time_cos,time_sin,day_cos,day_sin,dayofyear_cos,dayofyear_sin
2,13.51,203,0,0,1,0,0,0,0,0,0,0,0.866025,-5.000000e-01,0.623490,0.781831,-0.047321,-0.998880
5,13.46,25,0,0,1,0,0,0,0,0,0,0,0.965926,-2.588190e-01,0.623490,0.781831,-0.047321,-0.998880
8,13.26,17,0,0,0,0,0,0,1,0,0,0,1.000000,0.000000e+00,-0.222521,0.974928,-0.030120,-0.999546
11,13.51,15,0,0,0,0,0,0,1,0,0,0,0.965926,2.588190e-01,-0.222521,0.974928,-0.030120,-0.999546
14,13.44,18,0,0,0,0,0,0,1,0,0,0,0.866025,5.000000e-01,-0.222521,0.974928,-0.030120,-0.999546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63971,6.00,995,0,0,0,0,0,0,1,0,0,0,-0.707107,7.071068e-01,0.623490,0.781831,0.436651,0.899631
63974,6.00,1257,0,0,0,0,0,0,1,0,0,0,-0.866025,5.000000e-01,0.623490,0.781831,0.436651,0.899631
63977,7.00,1561,0,0,0,0,0,0,1,0,0,0,-0.965926,2.588190e-01,0.623490,0.781831,0.436651,0.899631
63980,6.80,1394,0,0,0,0,0,0,1,0,0,0,-1.000000,1.224647e-16,0.623490,0.781831,0.436651,0.899631


In [5]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle -- and therefore every downstream score -- reproducible after a
# kernel restart.
train, test = train_test_split(df_mid, test_size=0.2, random_state=42)

In [6]:
# Seed the forest as well: bootstrapping and feature sub-sampling are random,
# so an unseeded model gives a different fit on every run.
forest = RandomForestRegressor(random_state=42)
# Target is the pedestrian count; every other column is a feature.
forest.fit(train.drop(columns='pedestrians_count'), train['pedestrians_count'])

In [7]:
# Evaluate on the held-out rows; for a regressor, `score` reports R^2.
X_test = test.drop(columns='pedestrians_count')
y_test = test['pedestrians_count']
forest.score(X_test, y_test)

0.8941539101450569