In [1]:
import numpy as np
import pandas as pd

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from is_holiday import check_holiday
from utilites import hour_features, day_features, dayofyear_features

from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib

In [2]:
def encode(original_dataframe: pd.DataFrame, 
    feature_to_encode: str) -> pd.DataFrame:
    
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]], dtype=int)
    res = pd.concat([original_dataframe, dummies], axis=1)
    return res.drop(columns=feature_to_encode)

def load_and_preprocess(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    # Drop unnecessary columns
    df = df[['timestamp', 'location_id', 'weather_condition', 'temperature', 'pedestrians_count']]
    
    # Rearrange date
    df['date'] = df.apply(lambda row: str(datetime.strptime(row['timestamp'], '%Y-%m-%dT%H:%M:%SZ')), axis = 1)
    df = df.drop(columns='timestamp')

    # One-hot encoding for weather
    df = encode(df, 'weather_condition')
    return df


In [3]:
path = 'data/hystreet_fussgaengerfrequenzen_seit2021.csv'
df = load_and_preprocess(path)
df

Unnamed: 0,location_id,temperature,pedestrians_count,date,weather_condition_clear-day,weather_condition_clear-night,weather_condition_cloudy,weather_condition_fog,weather_condition_partly-cloudy-day,weather_condition_partly-cloudy-night,weather_condition_rain,weather_condition_snow,weather_condition_wind
0,329,13.51,151,2021-09-28 22:00:00,0,0,1,0,0,0,0,0,0
1,331,13.51,118,2021-09-28 22:00:00,0,0,1,0,0,0,0,0,0
2,330,13.51,203,2021-09-28 22:00:00,0,0,1,0,0,0,0,0,0
3,329,13.46,19,2021-09-28 23:00:00,0,0,1,0,0,0,0,0,0
4,331,13.46,7,2021-09-28 23:00:00,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63979,331,6.80,2053,2024-03-05 12:00:00,0,0,0,0,0,0,1,0,0
63980,330,6.80,1394,2024-03-05 12:00:00,0,0,0,0,0,0,1,0,0
63981,329,6.90,644,2024-03-05 13:00:00,0,0,0,0,0,0,1,0,0
63982,331,6.90,477,2024-03-05 13:00:00,0,0,0,0,0,0,1,0,0


In [4]:
df_seen = df.loc[df['location_id'] == 330]
df_seen = df_seen.loc[df['pedestrians_count'] <= 5000]

matplotlib.use("TkAgg")
plt.figure()
plt.plot([datetime.strptime(d, "%Y-%m-%d %H:%M:%S") for d in df_seen['date']], df_seen['pedestrians_count'])
plt.show()

plt.figure()
plt.hist(df_seen['pedestrians_count'], bins=50)
plt.show()

window_size = 24*365  # You can adjust this as needed
m_avg = df_seen['pedestrians_count'].rolling(window=window_size, center=True).mean()

# Plot original data and centered moving average
plt.figure()
plt.plot([datetime.strptime(d, "%Y-%m-%d %H:%M:%S") for d in df_seen['date']], df_seen['pedestrians_count'], label='Pedestrians Count')
plt.plot([datetime.strptime(d, "%Y-%m-%d %H:%M:%S") for d in df_seen['date']], m_avg, label='Moving Average', linestyle='--')
plt.xlabel('Date')
plt.ylabel('Pedestrians Count')
plt.title('Pedestrians Count with Centered Moving Average')
plt.legend()
plt.show()

df_seen = df_seen.drop(columns='location_id')

# Temporary drop date
df_seen['holiday'] = df_seen.apply(lambda row: check_holiday(row['date']), axis = 1)

df_seen['time_cos'] = df_seen.apply(lambda row: hour_features(row['date'])[0], axis = 1)
df_seen['time_sin'] = df_seen.apply(lambda row: hour_features(row['date'])[1], axis = 1)

df_seen['day_cos'] = df_seen.apply(lambda row: day_features(row['date'])[0], axis = 1)
df_seen['day_sin'] = df_seen.apply(lambda row: day_features(row['date'])[1], axis = 1)

df_seen['dayofyear_cos'] = df_seen.apply(lambda row: dayofyear_features(row['date'])[0], axis = 1)
df_seen['dayofyear_sin'] = df_seen.apply(lambda row: dayofyear_features(row['date'])[1], axis = 1)

df_unseen = df_seen[[datetime.strptime(d, '%Y-%m-%d %H:%M:%S') <= datetime(2022, 5, 5) for d in df_seen['date']]]
df_seen = df_seen[[datetime.strptime(d, '%Y-%m-%d %H:%M:%S') > datetime(2022, 5, 5) for d in df_seen['date']]]
df_mid_old = df_seen.copy()

df_seen = df_seen.drop(columns='date')
# df_mid['date'][0].split()
# 
df_seen

Unnamed: 0,temperature,pedestrians_count,weather_condition_clear-day,weather_condition_clear-night,weather_condition_cloudy,weather_condition_fog,weather_condition_partly-cloudy-day,weather_condition_partly-cloudy-night,weather_condition_rain,weather_condition_snow,weather_condition_wind,holiday,time_cos,time_sin,day_cos,day_sin,dayofyear_cos,dayofyear_sin
15707,9.26,11,0,0,0,0,0,1,0,0,0,0,0.965926,2.588190e-01,-0.900969,0.433884,-0.548843,0.835925
15710,8.76,21,0,0,0,0,0,1,0,0,0,0,0.866025,5.000000e-01,-0.900969,0.433884,-0.548843,0.835925
15713,7.99,149,0,0,0,0,0,1,0,0,0,0,0.707107,7.071068e-01,-0.900969,0.433884,-0.548843,0.835925
15716,7.82,307,0,0,0,0,0,1,0,0,0,0,0.500000,8.660254e-01,-0.900969,0.433884,-0.548843,0.835925
15719,8.31,689,0,0,0,0,1,0,0,0,0,0,0.258819,9.659258e-01,-0.900969,0.433884,-0.548843,0.835925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63971,6.00,995,0,0,0,0,0,0,1,0,0,0,-0.707107,7.071068e-01,0.623490,0.781831,0.436651,0.899631
63974,6.00,1257,0,0,0,0,0,0,1,0,0,0,-0.866025,5.000000e-01,0.623490,0.781831,0.436651,0.899631
63977,7.00,1561,0,0,0,0,0,0,1,0,0,0,-0.965926,2.588190e-01,0.623490,0.781831,0.436651,0.899631
63980,6.80,1394,0,0,0,0,0,0,1,0,0,0,-1.000000,1.224647e-16,0.623490,0.781831,0.436651,0.899631


In [7]:
train, test = train_test_split(df_seen, test_size=0.2)

In [14]:
forest = RandomForestRegressor(verbose=True, oob_score=True, n_estimators=2000)
forest.fit(train.drop(columns='pedestrians_count'), train['pedestrians_count'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:  2.8min finished


In [15]:
forest.score(test.drop(columns='pedestrians_count'), test['pedestrians_count'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    2.1s finished


0.9128174823097381

In [16]:
forest.score(df_unseen.drop(columns=['pedestrians_count', 'date']), df_unseen['pedestrians_count'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    2.2s finished


0.8066067638285555

In [17]:
reals = np.array(df_unseen['pedestrians_count'])
preds = np.array(forest.predict(df_unseen.drop(columns=['pedestrians_count', 'date'])))
dates = np.array([datetime.strptime(d, "%Y-%m-%d %H:%M:%S") for d in df_unseen['date']])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    2.0s finished


In [12]:
%matplotlib tk
matplotlib.use("TkAgg")
plt.figure()
plt.plot(dates, reals)
plt.plot(dates, preds)
plt.show()

In [13]:
forest.oob_prediction_

array([ 399.36842105,  120.19444444,  654.46341463, ..., 1894.25      ,
        979.97435897, 1532.94444444])