In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
from pathlib import Path

In [4]:
from bikesharing.ml_logic.data import get_raw_data, get_weather_data, get_polygons
from bikesharing.ml_logic.encoders import encode_district_label,encode_temporal_features
from bikesharing.ml_logic.preprocessor import group_rental_data_by_hour,preprocess_features
from bikesharing.ml_logic.feature_engineering import is_holiday, is_weekend, feature_selection
from bikesharing.interface.main import preprocess
from bikesharing.params import *

In [5]:
# Select every column and row of the raw MVG rentals table.
query = f'''
    SELECT *
    FROM `{GCP_PROJECT}.{BQ_DATASET}.raw_data_mvg`
'''

# Local CSV handed to get_raw_data as its cache; in the recorded run the data
# was loaded from this CSV (see the log output below the cell).
rentals_cache_csv = Path(f'{LOCAL_DATA_PATH}/raw/mvg_rentals_from_{START_YEAR}_to_{END_YEAR}.csv')

rental_data_df = get_raw_data(
    gcp_project=GCP_PROJECT,
    query=query,
    cache_path=rentals_cache_csv,
)

[34m
Load rental_data from local CSV...[0m


  df = pd.read_csv(cache_path, header='infer' if data_has_header else None)


✅ Data loaded, with shape (2804147, 10)


In [6]:
# Keep only the start timestamp and start coordinates of each rental, then
# remove rows that are exact duplicates across those three columns.
rental_relavent_cols_df = (
    rental_data_df
    .loc[:, ['STARTTIME', 'STARTLAT', 'STARTLON']]
    .drop_duplicates()
)

In [7]:
# Load the previously saved encoded rentals and discard the 'Unnamed: 0'
# column (pandas' name for a header-less first column — presumably the index
# written by an earlier to_csv call; confirm against the code that wrote it).
encoded_df = (
    pd.read_csv('../raw_data/encode_df.csv')
    .drop(columns=['Unnamed: 0'])
)

In [8]:
# Show the loaded frame: a STARTTIME column followed by one column per
# district (the rows displayed below contain 0.0 / 1.0 values).
encoded_df

Unnamed: 0,STARTTIME,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,...,Schwanthalerhöhe,Sendling,Sendling-Westpark,Südgiesing,Thalkirchen,Trudering,Trudering-Riem,Untergiesing,Untergiesing-Harlaching,Untermenzing-Allach
0,2019-01-01 15:29:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-01-05 12:19:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019-01-06 08:31:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019-01-07 17:32:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2019-01-08 08:59:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2631247,2022-07-20 13:13:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2631248,2022-07-28 17:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2631249,2022-08-11 18:08:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2631250,2022-08-11 20:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# Aggregate the encoded rentals with the project helper. Per the displayed
# result, rows are keyed by an hourly `rent_date_hour` and the district columns
# hold numeric totals — presumably rentals per district per hour (confirm in
# bikesharing.ml_logic.preprocessor).
grouped_df = group_rental_data_by_hour(encoded_df)
grouped_df

Unnamed: 0,rent_date_hour,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,...,Schwanthalerhöhe,Sendling,Sendling-Westpark,Südgiesing,Thalkirchen,Trudering,Trudering-Riem,Untergiesing,Untergiesing-Harlaching,Untermenzing-Allach
0,2019-01-01 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-01-01 01:00:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,2019-01-01 02:00:00,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,2019-01-01 03:00:00,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2019-01-01 04:00:00,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34627,2022-12-31 19:00:00,5.0,4.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,3.0,5.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0
34628,2022-12-31 20:00:00,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
34629,2022-12-31 21:00:00,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34630,2022-12-31 22:00:00,5.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [10]:
# Load the weather data (cache file name is spelled 'histotical' on purpose:
# it must match the file name used elsewhere), parse `time` into datetimes and
# drop the relative-humidity column, which is not used in this notebook.
weather_data_df = (
    get_weather_data(cache_path=Path(f'{LOCAL_DATA_PATH}/raw/histotical_weather_data_{START_YEAR}_to_{END_YEAR}.csv'))
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .drop(columns=['relativehumidity_2m'])
)
weather_data_df

[34m
Load weather_data from local CSV...[0m
✅ Data loaded, with shape (35064, 6)


Unnamed: 0,time,temperature_2m,apparent_temperature,windspeed_10m,precipitation
0,2019-01-01 00:00:00,3.3,0.5,9.0,0.2
1,2019-01-01 01:00:00,3.4,0.4,9.7,0.1
2,2019-01-01 02:00:00,3.5,0.2,12.0,0.2
3,2019-01-01 03:00:00,3.5,0.0,13.5,0.1
4,2019-01-01 04:00:00,3.5,-0.0,14.1,0.0
...,...,...,...,...,...
35059,2022-12-31 19:00:00,6.5,3.9,8.0,0.0
35060,2022-12-31 20:00:00,5.9,3.4,6.8,0.0
35061,2022-12-31 21:00:00,5.8,3.1,7.2,0.0
35062,2022-12-31 22:00:00,6.1,3.1,8.8,0.0


In [11]:
# Join hourly rental counts with hourly weather.
# - The outer join keeps weather hours that have no rentals at all.
# - `rent_date_hour` is then taken from the weather `time` column so those
#   rental-free hours still carry a timestamp.
# - Remaining gaps (e.g. district counts for rental-free hours) are filled
#   with 0. NOTE(review): fillna(0) applies to every column, so a missing
#   weather reading would also become 0 — confirm the weather data has no gaps.
merged_df = (
    grouped_df
    .merge(weather_data_df, left_on='rent_date_hour', right_on='time', how='outer')
    .assign(rent_date_hour=lambda frame: frame['time'])
    .sort_values(by='rent_date_hour')
    .drop(columns=['time'])
    .fillna(0)
)

# Drop 29 February so every year contributes the same set of calendar days.
# The datetime accessor replaces the former per-row string slicing and the
# temporary helper column; rows with a missing timestamp compare False and
# are kept, exactly as before.
is_leap_day = (
    (merged_df['rent_date_hour'].dt.month == 2)
    & (merged_df['rent_date_hour'].dt.day == 29)
)
# .copy() detaches the result from the filtered parent so later column
# assignments do not raise SettingWithCopyWarning.
merged_df = merged_df.loc[~is_leap_day].copy()

In [12]:
# Inspect the merged frame: hourly district columns followed by the weather
# columns (temperature_2m, apparent_temperature, windspeed_10m, precipitation).
merged_df

Unnamed: 0,rent_date_hour,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,...,Thalkirchen,Trudering,Trudering-Riem,Untergiesing,Untergiesing-Harlaching,Untermenzing-Allach,temperature_2m,apparent_temperature,windspeed_10m,precipitation
0,2019-01-01 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.3,0.5,9.0,0.2
1,2019-01-01 01:00:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,3.4,0.4,9.7,0.1
2,2019-01-01 02:00:00,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,3.5,0.2,12.0,0.2
3,2019-01-01 03:00:00,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,3.5,0.0,13.5,0.1
4,2019-01-01 04:00:00,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.5,-0.0,14.1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34627,2022-12-31 19:00:00,5.0,4.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,2.0,2.0,0.0,6.5,3.9,8.0,0.0
34628,2022-12-31 20:00:00,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,5.9,3.4,6.8,0.0
34629,2022-12-31 21:00:00,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.8,3.1,7.2,0.0
34630,2022-12-31 22:00:00,5.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,6.1,3.1,8.8,0.0


In [13]:
# Enrich the merged frame with calendar features. Each project helper gets a
# fresh single-column frame of timestamps and its result is joined back on
# `rent_date_hour`. Per the displayed result the joins add `is_holiday`,
# `is_weekend` and the sin/cos columns for hour, month, day and weekday.
#
# validate='one_to_one' makes pandas raise a MergeError if the timestamp key is
# duplicated on either side; without it an inner join on a duplicated key would
# silently multiply rows.
holidays = is_holiday(merged_df[['rent_date_hour']])
merged_df = merged_df.merge(
    holidays,
    on='rent_date_hour',
    how='inner',
    validate='one_to_one',
)

weekends = is_weekend(merged_df[['rent_date_hour']])
merged_df = merged_df.merge(
    weekends,
    on='rent_date_hour',
    how='inner',
    validate='one_to_one',
)

encoded_date = encode_temporal_features(merged_df[['rent_date_hour']])
merged_df = merged_df.merge(
    encoded_date,
    on='rent_date_hour',
    how='inner',
    validate='one_to_one',
)

In [14]:
# Inspect the frame after the joins above: it now ends with is_holiday,
# is_weekend and the sin/cos encoded hour, month, day and weekday columns.
merged_df

Unnamed: 0,rent_date_hour,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,...,is_holiday,is_weekend,hour_sin,hour_cos,month_sin,month_cos,day_sin,day_cos,weekday_sin,weekday_cos
0,2019-01-01 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,2.588190e-01,0.965926,5.000000e-01,0.866025,2.012985e-01,0.97953,0.974928,-0.222521
1,2019-01-01 01:00:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,1,0,5.000000e-01,0.866025,5.000000e-01,0.866025,2.012985e-01,0.97953,0.974928,-0.222521
2,2019-01-01 02:00:00,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,1,0,7.071068e-01,0.707107,5.000000e-01,0.866025,2.012985e-01,0.97953,0.974928,-0.222521
3,2019-01-01 03:00:00,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,8.660254e-01,0.500000,5.000000e-01,0.866025,2.012985e-01,0.97953,0.974928,-0.222521
4,2019-01-01 04:00:00,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,9.659258e-01,0.258819,5.000000e-01,0.866025,2.012985e-01,0.97953,0.974928,-0.222521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,2022-12-31 19:00:00,5.0,4.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0,1,-8.660254e-01,0.500000,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.781831,0.623490
35036,2022-12-31 20:00:00,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,1,-7.071068e-01,0.707107,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.781831,0.623490
35037,2022-12-31 21:00:00,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,1,-5.000000e-01,0.866025,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.781831,0.623490
35038,2022-12-31 22:00:00,5.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0,1,-2.588190e-01,0.965926,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.781831,0.623490


In [15]:
# Target columns: one column per district in the merged frame.
districts = [
    'Altstadt-Lehel',
    'Au - Haidhausen',
    'Aubing-Lochhausen-Langwied',
    'Berg am Laim',
    'Bogenhausen',
    'Feldmoching',
    'Hadern',
    'Harlaching',
    'Hasenbergl-Lerchenau Ost',
    'Laim',
    'Lochhausen',
    'Ludwigsvorstadt-Isarvorstadt',
    'Maxvorstadt',
    'Milbertshofen-Am Hart',
    'Moosach',
    'Neuhausen-Nymphenburg',
    'Obergiesing',
    'Obermenzing',
    'Obersendling',
    'Pasing',
    'Pasing-Obermenzing',
    'Ramersdorf-Perlach',
    'Schwabing-Freimann',
    'Schwabing-West',
    'Schwanthalerhöhe',
    'Sendling',
    'Sendling-Westpark',
    'Südgiesing',
    'Thalkirchen',
    'Trudering',
    'Trudering-Riem',
    'Untergiesing',
    'Untergiesing-Harlaching',
    'Untermenzing-Allach',
]

# Input columns handed to feature_selection: weather readings, calendar flags
# and the cyclical time encodings.
features = [
    'temperature_2m',
    'apparent_temperature',
    'windspeed_10m',
    'precipitation',
    'is_holiday',
    'is_weekend',
    'hour_sin',
    'hour_cos',
    'month_sin',
    'month_cos',
    'day_sin',
    'day_cos',
    'weekday_sin',
    'weekday_cos',
]

# Everything that is not a district column is a candidate input (this still
# includes `rent_date_hour`); the district columns are the targets.
X = merged_df.drop(columns=districts)
y = merged_df[districts]

print(f'X_shape: {X.shape}')
print(f'y_shape: {y.shape}')

selected_merged_df = feature_selection(X, features)

X_shape: (35040, 15)
y_shape: (35040, 34)


In [16]:
# Inspect the selected inputs: only the 14 columns listed in `features` remain.
selected_merged_df

Unnamed: 0,temperature_2m,apparent_temperature,windspeed_10m,precipitation,is_holiday,is_weekend,hour_sin,hour_cos,month_sin,month_cos,day_sin,day_cos,weekday_sin,weekday_cos
0,3.3,0.5,9.0,0.2,1,0,2.588190e-01,0.965926,5.000000e-01,0.866025,2.012985e-01,0.97953,0.974928,-0.222521
1,3.4,0.4,9.7,0.1,1,0,5.000000e-01,0.866025,5.000000e-01,0.866025,2.012985e-01,0.97953,0.974928,-0.222521
2,3.5,0.2,12.0,0.2,1,0,7.071068e-01,0.707107,5.000000e-01,0.866025,2.012985e-01,0.97953,0.974928,-0.222521
3,3.5,0.0,13.5,0.1,1,0,8.660254e-01,0.500000,5.000000e-01,0.866025,2.012985e-01,0.97953,0.974928,-0.222521
4,3.5,-0.0,14.1,0.0,1,0,9.659258e-01,0.258819,5.000000e-01,0.866025,2.012985e-01,0.97953,0.974928,-0.222521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,6.5,3.9,8.0,0.0,0,1,-8.660254e-01,0.500000,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.781831,0.623490
35036,5.9,3.4,6.8,0.0,0,1,-7.071068e-01,0.707107,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.781831,0.623490
35037,5.8,3.1,7.2,0.0,0,1,-5.000000e-01,0.866025,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.781831,0.623490
35038,6.1,3.1,8.8,0.0,0,1,-2.588190e-01,0.965926,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.781831,0.623490


In [17]:
# Scale the selected features with the project preprocessor.
# The recorded run emitted a SettingWithCopyWarning at
# `df[features] = scaler.fit_transform(X)` inside the helper — presumably
# because `selected_merged_df` is a slice of `X`. Passing an explicit copy gives
# the helper an independent frame to write into and leaves `selected_merged_df`
# unscaled for later inspection.
preproc_df = preprocess_features(selected_merged_df.copy())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[features] = scaler.fit_transform(X)


In [18]:
# Inspect the preprocessed features; every value in the rows displayed below
# lies in [0, 1] (presumably min-max scaling — confirm in preprocess_features).
preproc_df

Unnamed: 0,temperature_2m,apparent_temperature,windspeed_10m,precipitation,is_holiday,is_weekend,hour_sin,hour_cos,month_sin,month_cos,day_sin,day_cos,weekday_sin,weekday_cos
0,0.355408,0.342007,0.227848,0.017391,1,0,0.629410,0.982963,0.75,0.933013,0.600779,0.989739,1.000000,0.356896
1,0.357616,0.340149,0.245570,0.008696,1,0,0.750000,0.933013,0.75,0.933013,0.600779,0.989739,1.000000,0.356896
2,0.359823,0.336431,0.303797,0.017391,1,0,0.853553,0.853553,0.75,0.933013,0.600779,0.989739,1.000000,0.356896
3,0.359823,0.332714,0.341772,0.008696,1,0,0.933013,0.750000,0.75,0.933013,0.600779,0.989739,1.000000,0.356896
4,0.359823,0.332714,0.356962,0.000000,1,0,0.982963,0.629410,0.75,0.933013,0.600779,0.989739,1.000000,0.356896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,0.426049,0.405204,0.202532,0.000000,0,1,0.066987,0.750000,0.50,1.000000,0.500000,1.000000,0.099031,0.801938
35036,0.412804,0.395911,0.172152,0.000000,0,1,0.146447,0.853553,0.50,1.000000,0.500000,1.000000,0.099031,0.801938
35037,0.410596,0.390335,0.182278,0.000000,0,1,0.250000,0.933013,0.50,1.000000,0.500000,1.000000,0.099031,0.801938
35038,0.417219,0.390335,0.222785,0.000000,0,1,0.370590,0.982963,0.50,1.000000,0.500000,1.000000,0.099031,0.801938


In [19]:
# Persist the preprocessed feature matrix without the index column.
# NOTE(review): this path and its year range are hard-coded, whereas the load
# cells build their paths from LOCAL_DATA_PATH / START_YEAR / END_YEAR —
# consider deriving it the same way once LOCAL_DATA_PATH's value is confirmed.
processed_features_csv = '~/.lewagon/bikesharing/data/processed/X_processed_from_2019_to_2022.csv'
preproc_df.to_csv(processed_features_csv, index=False)

In [None]:
# Run the packaged end-to-end preprocessing and display the resulting inputs.
# This rebinds the `X` and `y` created manually earlier in the notebook.
# In the recorded run the helper logged loading rental and weather data from
# local CSVs and printed X_shape (35040, 14) / y_shape (35040, 34).
# NOTE(review): the displayed X contains relativehumidity_2m and no
# weekday_sin / weekday_cos, unlike the manual pipeline above, and the cell has
# no execution count — the output may be stale; re-run to confirm.
X,y = preprocess()
X

[34m
Preprocessing Data...[0m
[34m
Load rental_data from local CSV...[0m


  df = pd.read_csv(cache_path, header='infer' if data_has_header else None)


✅ Data loaded, with shape (2804147, 10)
[34m
Load weather_data from local CSV...[0m
✅ Data loaded, with shape (35064, 6)
X_shape: (35040, 14)
y_shape: (35040, 34)


Unnamed: 0,temperature_2m,relativehumidity_2m,apparent_temperature,windspeed_10m,precipitation,hour_sin,hour_cos,month_sin,month_cos,day_sin,day_cos,is_holiday,is_weekend
0,0.355408,1.000000,0.342007,0.227848,0.017391,0.629410,0.982963,0.75,0.933013,0.600779,0.989739,1,0
1,0.357616,0.987013,0.340149,0.245570,0.008696,0.750000,0.933013,0.75,0.933013,0.600779,0.989739,1,0
2,0.359823,1.000000,0.336431,0.303797,0.017391,0.853553,0.853553,0.75,0.933013,0.600779,0.989739,1,0
3,0.359823,0.987013,0.332714,0.341772,0.008696,0.933013,0.750000,0.75,0.933013,0.600779,0.989739,1,0
4,0.359823,1.000000,0.332714,0.356962,0.000000,0.982963,0.629410,0.75,0.933013,0.600779,0.989739,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,0.426049,0.779221,0.405204,0.202532,0.000000,0.066987,0.750000,0.50,1.000000,0.500000,1.000000,0,1
35036,0.412804,0.779221,0.395911,0.172152,0.000000,0.146447,0.853553,0.50,1.000000,0.500000,1.000000,0,1
35037,0.410596,0.753247,0.390335,0.182278,0.000000,0.250000,0.933013,0.50,1.000000,0.500000,1.000000,0,1
35038,0.417219,0.714286,0.390335,0.222785,0.000000,0.370590,0.982963,0.50,1.000000,0.500000,1.000000,0,1


In [None]:
# Inspect the targets returned by preprocess(): one column per district.
y

Unnamed: 0,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,Laim,...,Schwanthalerhöhe,Sendling,Sendling-Westpark,Südgiesing,Thalkirchen,Trudering,Trudering-Riem,Untergiesing,Untergiesing-Harlaching,Untermenzing-Allach
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,3.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,5.0,4.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,3.0,5.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0
35036,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
35037,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35038,5.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [None]:
# Call preprocess() a second time. In the recorded run it only logged
# "Load preprocessed data from local CSV...", i.e. it appears to reuse a
# previously saved result instead of recomputing (confirm in
# bikesharing.interface.main).
X,y = preprocess()
X

[34m
Load preprocessed data from local CSV...[0m


Unnamed: 0,temperature_2m,relativehumidity_2m,apparent_temperature,windspeed_10m,precipitation,hour_sin,hour_cos,month_sin,month_cos,day_sin,day_cos,is_holiday,is_weekend
0,0.355408,1.000000,0.342007,0.227848,0.017391,0.629410,0.982963,0.75,0.933013,0.600779,0.989739,1,0
1,0.357616,0.987013,0.340149,0.245570,0.008696,0.750000,0.933013,0.75,0.933013,0.600779,0.989739,1,0
2,0.359823,1.000000,0.336431,0.303797,0.017391,0.853553,0.853553,0.75,0.933013,0.600779,0.989739,1,0
3,0.359823,0.987013,0.332714,0.341772,0.008696,0.933013,0.750000,0.75,0.933013,0.600779,0.989739,1,0
4,0.359823,1.000000,0.332714,0.356962,0.000000,0.982963,0.629410,0.75,0.933013,0.600779,0.989739,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,0.426049,0.779221,0.405204,0.202532,0.000000,0.066987,0.750000,0.50,1.000000,0.500000,1.000000,0,1
35036,0.412804,0.779221,0.395911,0.172152,0.000000,0.146447,0.853553,0.50,1.000000,0.500000,1.000000,0,1
35037,0.410596,0.753247,0.390335,0.182278,0.000000,0.250000,0.933013,0.50,1.000000,0.500000,1.000000,0,1
35038,0.417219,0.714286,0.390335,0.222785,0.000000,0.370590,0.982963,0.50,1.000000,0.500000,1.000000,0,1


In [None]:
# Inspect the targets from the second preprocess() call; the displayed rows
# match those shown after the first call.
y

Unnamed: 0,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,Laim,...,Schwanthalerhöhe,Sendling,Sendling-Westpark,Südgiesing,Thalkirchen,Trudering,Trudering-Riem,Untergiesing,Untergiesing-Harlaching,Untermenzing-Allach
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,3.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,5.0,4.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,3.0,5.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0
35036,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
35037,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35038,5.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
