In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
from pathlib import Path

In [2]:
from bikesharing.ml_logic.data import get_raw_data ,get_weather_data ,get_polygons
from bikesharing.params import *

- Remove duplicates
- Deal with missing values
- Scale the features
- Encode features
- Perform cyclical engineering

In [3]:
dfs = []
meta = {'years':[], 'n_columns':[], 'n_rows':[]}
for year in range(2019,2023,1):
    df = pd.read_csv(f'../raw_data/MVG_Rad_Fahrten_{year}.csv', sep=';')
    cols = [col.strip() for col in df.columns]
    df.columns = cols
    dfs.append(df)
    meta['years'].append(year)
    meta['n_columns'].append(df.shape[1])
    meta['n_rows'].append(df.shape[0])

  df = pd.read_csv(f'../raw_data/MVG_Rad_Fahrten_{year}.csv', sep=';')
  df = pd.read_csv(f'../raw_data/MVG_Rad_Fahrten_{year}.csv', sep=';')
  df = pd.read_csv(f'../raw_data/MVG_Rad_Fahrten_{year}.csv', sep=';')


In [4]:
len(dfs)

4

In [5]:
for df in dfs:
    print(df.shape)

(753678, 11)
(721752, 11)
(619573, 11)
(709144, 11)


In [6]:
data = pd.concat(dfs, axis=0)

In [7]:
def pre_process_df(df):
  #vstrip column names
  cols = [col.strip() for col in df.columns]
  df.columns = cols

  # remove column 'Row'
  if 'Row' in df.columns:
    df.drop(columns='Row', inplace=True)

  # select relevant columns only
  #df = df[['STARTTIME', 'STARTLAT', 'STARTLON', 'RENTAL_IS_STATION']].copy()
  
  # make string replacements values
  df_obj = df.select_dtypes(include='object')
  df[df_obj.columns] = df_obj.applymap(lambda x: x.strip().replace(',', '.') if isinstance(x, str) else x)

  # handle datetime
  df.STARTTIME = pd.to_datetime(df.STARTTIME)

  # handle numeric columns
  df.replace('NULL', np.NAN, inplace=True)
  df.replace('', np.NAN, inplace=True)
  df[['STARTLAT', 'STARTLON', 'RENTAL_IS_STATION','ENDLAT', 'ENDLON', 'RETURN_IS_STATION']] = df[['STARTLAT', 'STARTLON', 'RENTAL_IS_STATION','ENDLAT', 'ENDLON', 'RETURN_IS_STATION']].astype(np.float32)
  
  return df

In [8]:
data = pre_process_df(data)

KeyboardInterrupt: 

In [15]:
def load_data_to_bq(
        data: pd.DataFrame,
        gcp_project:str,
        bq_dataset:str,
        table: str,
        truncate: bool
    ) -> None:
    """
    - Save the DataFrame to BigQuery
    - Empty the table beforehand if `truncate` is True, append otherwise
    """

    assert isinstance(data, pd.DataFrame)
    full_table_name = f"{gcp_project}.{bq_dataset}.{table}"

    #print(Fore.BLUE + f"\nSave data to BigQuery @ {full_table_name}...:" + Style.RESET_ALL)


    # reset column names
    data.columns = [f'_{col}' if isinstance(col, int) else col for col in data.columns]

    # Load data onto full_table_name
    client = bigquery.Client()

    write_mode = "WRITE_TRUNCATE" if truncate else "WRITE_APPEND"
    job_config = bigquery.LoadJobConfig(write_disposition=write_mode)

    job = client.load_table_from_dataframe(data, full_table_name, job_config=job_config)

    # 🎯 HINT for "*** TypeError: expected bytes, int found":
    # After preprocessing the data, your original column names are gone (print it to check),
    # so ensure that your column names are *strings* that start with either
    # a *letter* or an *underscore*, as BQ does not accept anything else

    print(f"✅ Data saved to bigquery, with shape {data.shape}")


In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2804147 entries, 0 to 709143
Data columns (total 10 columns):
 #   Column               Dtype         
---  ------               -----         
 0   STARTTIME            datetime64[ns]
 1   ENDTIME              object        
 2   STARTLAT             float32       
 3   STARTLON             float32       
 4   ENDLAT               float32       
 5   ENDLON               float32       
 6   RENTAL_IS_STATION    float32       
 7   RENTAL_STATION_NAME  object        
 8   RETURN_IS_STATION    float32       
 9   RETURN_STATION_NAME  object        
dtypes: datetime64[ns](1), float32(6), object(3)
memory usage: 171.2+ MB


In [32]:
load_data_to_bq(
        data=data,
        gcp_project=os.environ.get("GCP_PROJECT"),
        bq_dataset=os.environ.get("BQ_DATASET"),
        table='raw_data_mvg',
        truncate=True
    )

✅ Data saved to bigquery, with shape (2804147, 10)


In [5]:
query =f'''
        SELECT *
        FROM `{GCP_PROJECT}.{BQ_DATASET}.raw_data_mvg`
    '''

In [7]:
df = get_raw_data(GCP_PROJECT, query=query, cache_path=Path(f'{LOCAL_DATA_PATH}/raw/MVG_Rad_Fahrten_{START_YEAR}_to_{END_YEAR}.csv'))
df

[34m
Load rental_data from BigQuery server...[0m


KeyboardInterrupt: 

In [8]:
import requests

base_url = 'https://archive-api.open-meteo.com/v1/archive'

params = {
    'latitude': 48.70,
    'longitude': 13.46,
    'start_date' : f'{START_YEAR}-01-01',
    'end_date' : f'{END_YEAR}-12-31',
    'hourly': ['temperature_2m', 'relativehumidity_2m', 'apparent_temperature','windspeed_10m','precipitation']
}

historical_weather_data = requests.get(base_url , params=params).json()

In [4]:
df = get_weather_data(cache_path=Path(f'{LOCAL_DATA_PATH}/raw/Histotical_Weather_Data_{START_YEAR}_to_{END_YEAR}.csv'))
df

[34m
Load data from local CSV...[0m
✅ Data loaded, with shape (35064, 6)


Unnamed: 0,time,temperature_2m,relativehumidity_2m,apparent_temperature,windspeed_10m,precipitation
0,2019-01-01T00:00,3.3,100,0.5,9.0,0.2
1,2019-01-01T01:00,3.4,99,0.4,9.7,0.1
2,2019-01-01T02:00,3.5,100,0.2,12.0,0.2
3,2019-01-01T03:00,3.5,99,0.0,13.5,0.1
4,2019-01-01T04:00,3.5,100,-0.0,14.1,0.0
...,...,...,...,...,...,...
35059,2022-12-31T19:00,6.5,83,3.9,8.0,0.0
35060,2022-12-31T20:00,5.9,83,3.4,6.8,0.0
35061,2022-12-31T21:00,5.8,81,3.1,7.2,0.0
35062,2022-12-31T22:00,6.1,78,3.1,8.8,0.0


In [3]:
dict = get_polygons()
dict

{'Maxvorstadt': <POLYGON ((11.539 48.143, 11.54 48.143, 11.542 48.143, 11.544 48.142, 11.545...>,
 'Schwabing-West': <POLYGON ((11.55 48.161, 11.55 48.161, 11.551 48.16, 11.551 48.16, 11.552 48...>,
 'Au - Haidhausen': <POLYGON ((11.569 48.122, 11.569 48.122, 11.569 48.122, 11.57 48.122, 11.57 ...>,
 'Sendling': <POLYGON ((11.535 48.13, 11.536 48.13, 11.536 48.13, 11.536 48.129, 11.536 4...>,
 'Schwanthalerhöhe': <POLYGON ((11.526 48.137, 11.526 48.137, 11.526 48.136, 11.526 48.136, 11.52...>,
 'Moosach': <POLYGON ((11.466 48.205, 11.466 48.204, 11.467 48.204, 11.468 48.204, 11.46...>,
 'Berg am Laim': <POLYGON ((11.601 48.124, 11.602 48.124, 11.604 48.123, 11.606 48.121, 11.60...>,
 'Trudering': <POLYGON ((11.644 48.115, 11.644 48.115, 11.646 48.114, 11.647 48.113, 11.64...>,
 'Ramersdorf': <POLYGON ((7.14 50.718, 7.143 50.718, 7.145 50.717, 7.147 50.717, 7.147 50.7...>,
 'Obergiesing': <POLYGON ((11.574 48.112, 11.574 48.112, 11.574 48.112, 11.574 48.112, 11.57...>,
 'Untergiesing': 

# RNN

**Lecture Code**

In [3]:
from typing import Dict, List, Tuple, Sequence

def get_folds(
    df: pd.DataFrame,
    fold_length: int,
    fold_stride: int) -> List[pd.DataFrame]:
    """
    This function slides through the Time Series dataframe of shape (n_timesteps, n_features) to create folds
    - of equal `fold_length`
    - using `fold_stride` between each fold

    Args:
        df (pd.DataFrame): Overall dataframe
        fold_length (int): How long each fold should be in rows
        fold_stride (int): How many timesteps to move forward between taking each fold

    Returns:
        List[pd.DataFrame]: A list where each fold is a dataframe within
    """
    folds = []
    for idx in range(0, len(df), fold_stride):
        # Exits the loop as soon as the last fold index would exceed the last index
        if (idx + fold_length) > len(df):
            break
        fold = df.iloc[idx:idx + fold_length, :]
        folds.append(fold)
    return folds

def train_test_split(fold:pd.DataFrame,
                     train_test_ratio: float,
                     input_length: int) -> Tuple[pd.DataFrame]:
    """From a fold dataframe, take a train dataframe and test dataframe based on
    the split ratio.
    - df_train should contain all the timesteps until round(train_test_ratio * len(fold))
    - df_test should contain all the timesteps needed to create all (X_test, y_test) tuples

    Args:
        fold (pd.DataFrame): A fold of timesteps
        train_test_ratio (float): The ratio between train and test 0-1
        input_length (int): How long each X_i will be

    Returns:
        Tuple[pd.DataFrame]: A tuple of two dataframes (fold_train, fold_test)
    """

    # TRAIN SET
    last_train_idx = round(train_test_ratio * len(fold))
    fold_train = fold.iloc[0:last_train_idx, :]

    # TEST SET
    first_test_idx = last_train_idx - input_length
    fold_test = fold.iloc[first_test_idx:, :]

    return (fold_train, fold_test)

def get_Xi_yi(
    fold:pd.DataFrame,
    input_length:int,
    output_length:int) -> Tuple[pd.DataFrame]:
    """given a fold, it returns one sequence (X_i, y_i) as based on the desired
    input_length and output_length with the starting point of the sequence being chosen at random based

    Args:
        fold (pd.DataFrame): A single fold
        input_length (int): How long each X_i should be
        output_length (int): How long each y_i should be

    Returns:
        Tuple[pd.DataFrame]: A tuple of two dataframes (X_i, y_i)
    """

    first_possible_start = 0
    last_possible_start = len(fold) - (input_length + output_length) + 1
    random_start = np.random.randint(first_possible_start, last_possible_start)
    X_i = fold.iloc[random_start:random_start+input_length]
    y_i = fold.iloc[random_start+input_length:
                  random_start+input_length+output_length][TARGET]

    return (X_i, y_i)

def get_X_y(
    fold:pd.DataFrame,
    number_of_sequences:int,
    input_length:int,
    output_length:int) -> Tuple[np.array]:
    """Given a fold generate X and y based on the number of desired sequences
    of the given input_length and output_length

    Args:
        fold (pd.DataFrame): Fold dataframe
        number_of_sequences (int): The number of X_i and y_i pairs to include
        input_length (int): Length of each X_i
        output_length (int): Length of each y_i

    Returns:
        Tuple[np.array]: A tuple of numpy arrays (X, y)
    """
    X, y = [], []

    for i in range(number_of_sequences):
        (Xi, yi) = get_Xi_yi(fold, input_length, output_length)
        X.append(Xi)
        y.append(yi)

    return np.array(X), np.array(y)


In [4]:
from bikesharing.interface.main import preprocess
from bikesharing.ml_logic.data import get_raw_data
from bikesharing.params import *

query =f'''
        SELECT *
        FROM `{GCP_PROJECT}.{BQ_DATASET}.raw_data_mvg`
    '''
df = get_raw_data(GCP_PROJECT, query=query, cache_path=Path(f'{LOCAL_DATA_PATH}/raw/mvg_rentals_from_{START_YEAR}_to_{END_YEAR}.csv'))



[34m
Load rental_data from local CSV...[0m


  df = pd.read_csv(cache_path, header='infer' if data_has_header else None)


✅ Data loaded, with shape (2804147, 10)


## Preprocessing

In [5]:
from bikesharing.ml_logic.data import get_polygons, get_weather_data
from bikesharing.ml_logic.preprocessor import group_rental_data_by_hour, preprocess_features
from bikesharing.ml_logic.encoders import encode_district_label, encode_temporal_features
from bikesharing.ml_logic.feature_engineering import is_holiday, is_weekend, feature_selection

In [None]:
# 2. drop cols
rental_relavent_cols_df = df[['STARTTIME' , 'STARTLAT' , 'STARTLON']]

# 3. clean(rm duplicates)
rental_relavent_cols_df = rental_relavent_cols_df.drop_duplicates()

# 4. encode y
encoded_rental_df = encode_district_label(rental_relavent_cols_df , get_polygons())

# 5. aggregate by hour
aggregated_rental_df = group_rental_data_by_hour(encoded_rental_df)

In [None]:
aggregated_rental_df.to_csv(f'{LOCAL_DATA_PATH}/processed/aggregated_rental_df.csv', index=False)

In [7]:
aggregated_rental_df = pd.read_csv(f'{LOCAL_DATA_PATH}/processed/aggregated_rental_df.csv')
aggregated_rental_df.rent_date_hour = pd.to_datetime(aggregated_rental_df.rent_date_hour)

In [8]:
# 6. join with weather data
weather_data_df = get_weather_data(cache_path=Path(f'{LOCAL_DATA_PATH}/raw/histotical_weather_data_{START_YEAR}_to_{END_YEAR}.csv'))
weather_data_df['time'] = pd.to_datetime(weather_data_df['time'])
merged_df = aggregated_rental_df.merge(weather_data_df, right_on='time' , left_on='rent_date_hour' , how='outer')
merged_df['rent_date_hour'] = merged_df['time']
merged_df = merged_df.sort_values(by='rent_date_hour').drop(columns=['time'])

# 7. feature enginering & merge
holidays = is_holiday(merged_df[['rent_date_hour']])
merged_df = merged_df.merge(holidays , on='rent_date_hour' , how='inner')

weekends = is_weekend(merged_df[['rent_date_hour']])
merged_df = merged_df.merge(weekends , on='rent_date_hour' , how='inner')

encoded_date = encode_temporal_features(merged_df[['rent_date_hour']])
merged_df = merged_df.merge(encoded_date , on='rent_date_hour' , how='inner')

[34m
Load weather_data from local CSV...[0m
✅ Data loaded, with shape (35064, 6)


In [9]:
merged_df

Unnamed: 0,rent_date_hour,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,...,windspeed_10m,precipitation,is_holiday,is_weekend,hour_sin,hour_cos,month_sin,month_cos,day_sin,day_cos
0,2019-01-01 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,0.2,1,0,2.588190e-01,0.965926,5.000000e-01,0.866025,2.012985e-01,0.97953
1,2019-01-01 01:00:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,9.7,0.1,1,0,5.000000e-01,0.866025,5.000000e-01,0.866025,2.012985e-01,0.97953
2,2019-01-01 02:00:00,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,12.0,0.2,1,0,7.071068e-01,0.707107,5.000000e-01,0.866025,2.012985e-01,0.97953
3,2019-01-01 03:00:00,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.5,0.1,1,0,8.660254e-01,0.500000,5.000000e-01,0.866025,2.012985e-01,0.97953
4,2019-01-01 04:00:00,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14.1,0.0,1,0,9.659258e-01,0.258819,5.000000e-01,0.866025,2.012985e-01,0.97953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,2022-12-31 19:00:00,5.0,4.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,8.0,0.0,0,1,-8.660254e-01,0.500000,-2.449294e-16,1.000000,-2.449294e-16,1.00000
35060,2022-12-31 20:00:00,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,6.8,0.0,0,1,-7.071068e-01,0.707107,-2.449294e-16,1.000000,-2.449294e-16,1.00000
35061,2022-12-31 21:00:00,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,7.2,0.0,0,1,-5.000000e-01,0.866025,-2.449294e-16,1.000000,-2.449294e-16,1.00000
35062,2022-12-31 22:00:00,5.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,8.8,0.0,0,1,-2.588190e-01,0.965926,-2.449294e-16,1.000000,-2.449294e-16,1.00000


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import MinMaxScaler

def preprocess_features(df: pd.DataFrame):

    df = df.fillna(0)
    def create_preprocessor() -> ColumnTransformer:

        # SCALE PIPE
        scaler_pipe = Pipeline([
            ('scaler', MinMaxScaler())
        ])

        return scaler_pipe

    X = df[['temperature_2m', 'relativehumidity_2m', 'apparent_temperature',
       'windspeed_10m', 'precipitation','hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'day_sin', 'day_cos']]

    preprocessor = create_preprocessor()
    X_processed = preprocessor.fit_transform(X)

    return pd.concat([pd.DataFrame(X_processed) , df[['is_holiday', 'is_weekend']]] , axis=1)


In [27]:
# 8. feature selection
districts = ['Altstadt-Lehel', 'Au - Haidhausen',
       'Aubing-Lochhausen-Langwied', 'Berg am Laim', 'Bogenhausen',
       'Feldmoching', 'Hadern', 'Harlaching', 'Hasenbergl-Lerchenau Ost',
       'Laim', 'Lochhausen', 'Ludwigsvorstadt-Isarvorstadt', 'Maxvorstadt',
       'Milbertshofen-Am Hart', 'Moosach', 'Neuhausen-Nymphenburg',
       'Obergiesing', 'Obermenzing', 'Obersendling', 'Pasing',
       'Pasing-Obermenzing', 'Ramersdorf-Perlach', 'Schwabing-Freimann',
       'Schwabing-West', 'Schwanthalerhöhe', 'Sendling', 'Sendling-Westpark',
       'Südgiesing', 'Thalkirchen', 'Trudering', 'Trudering-Riem',
       'Untergiesing', 'Untergiesing-Harlaching', 'Untermenzing-Allach']

    
X = merged_df.drop(columns=districts)
y = merged_df[districts].fillna(0)

features = ['temperature_2m', 'relativehumidity_2m', 'apparent_temperature',
       'windspeed_10m', 'precipitation', 'is_holiday', 'is_weekend',
       'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'day_sin', 'day_cos']

selected_merged_df = feature_selection(X , features)

# 9. preproc-pipeline (Keep date_time for RNN)
X_processed = preprocess_features(selected_merged_df)

cache_path_X_preproc=Path(f'{LOCAL_DATA_PATH}/processed/X_processed_from_{START_YEAR}_to_{END_YEAR}.csv')
cache_path_y_preproc=Path(f'{LOCAL_DATA_PATH}/processed/y_processed_from_{START_YEAR}_to_{END_YEAR}.csv')

X_processed.columns = features
X_processed.to_csv(cache_path_X_preproc , header=True , index=False)
y.to_csv(cache_path_y_preproc , header=True , index=False)


In [28]:
X_processed.head()

Unnamed: 0,temperature_2m,relativehumidity_2m,apparent_temperature,windspeed_10m,precipitation,is_holiday,is_weekend,hour_sin,hour_cos,month_sin,month_cos,day_sin,day_cos
0,0.355408,1.0,0.342007,0.227848,0.017391,0.62941,0.982963,0.75,0.933013,0.600779,0.989739,1,0
1,0.357616,0.987013,0.340149,0.24557,0.008696,0.75,0.933013,0.75,0.933013,0.600779,0.989739,1,0
2,0.359823,1.0,0.336431,0.303797,0.017391,0.853553,0.853553,0.75,0.933013,0.600779,0.989739,1,0
3,0.359823,0.987013,0.332714,0.341772,0.008696,0.933013,0.75,0.75,0.933013,0.600779,0.989739,1,0
4,0.359823,1.0,0.332714,0.356962,0.0,0.982963,0.62941,0.75,0.933013,0.600779,0.989739,1,0


In [29]:
y.head()

Unnamed: 0,Altstadt-Lehel,Au - Haidhausen,Aubing-Lochhausen-Langwied,Berg am Laim,Bogenhausen,Feldmoching,Hadern,Harlaching,Hasenbergl-Lerchenau Ost,Laim,...,Schwanthalerhöhe,Sendling,Sendling-Westpark,Südgiesing,Thalkirchen,Trudering,Trudering-Riem,Untergiesing,Untergiesing-Harlaching,Untermenzing-Allach
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,3.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Sampling

In [30]:
data = pd.concat([X_processed, y], axis=1)
data.shape

(35064, 47)

In [31]:
districts = y.columns
districts

Index(['Altstadt-Lehel', 'Au - Haidhausen', 'Aubing-Lochhausen-Langwied',
       'Berg am Laim', 'Bogenhausen', 'Feldmoching', 'Hadern', 'Harlaching',
       'Hasenbergl-Lerchenau Ost', 'Laim', 'Lochhausen',
       'Ludwigsvorstadt-Isarvorstadt', 'Maxvorstadt', 'Milbertshofen-Am Hart',
       'Moosach', 'Neuhausen-Nymphenburg', 'Obergiesing', 'Obermenzing',
       'Obersendling', 'Pasing', 'Pasing-Obermenzing', 'Ramersdorf-Perlach',
       'Schwabing-Freimann', 'Schwabing-West', 'Schwanthalerhöhe', 'Sendling',
       'Sendling-Westpark', 'Südgiesing', 'Thalkirchen', 'Trudering',
       'Trudering-Riem', 'Untergiesing', 'Untergiesing-Harlaching',
       'Untermenzing-Allach'],
      dtype='object')

In [32]:
FOLD_LENGTH = 17520
FOLD_STRIDE = 2184
TRAIN_TEST_RATIO = 0.8
INPUT_LENGTH = 336 # 24 h * 14 d
OUTPUT_LENGTH = 24

In [33]:
folds = get_folds(data, FOLD_LENGTH, FOLD_STRIDE)

In [34]:
(fold_train, fold_test) = train_test_split(folds[0], TRAIN_TEST_RATIO, INPUT_LENGTH)

In [35]:
X_processed.columns

Index(['temperature_2m', 'relativehumidity_2m', 'apparent_temperature',
       'windspeed_10m', 'precipitation', 'is_holiday', 'is_weekend',
       'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'day_sin', 'day_cos'],
      dtype='object')

In [36]:
TARGET = districts
N_TARGETS = len(districts)
N_FEATURES = len(X_processed.columns)

In [37]:
X_train_i, y_train_i = get_Xi_yi(fold_train, INPUT_LENGTH, OUTPUT_LENGTH)
X_test_i, y_test_i = get_Xi_yi(fold_test, INPUT_LENGTH, OUTPUT_LENGTH)

In [38]:
X_train_i

Unnamed: 0,temperature_2m,relativehumidity_2m,apparent_temperature,windspeed_10m,precipitation,is_holiday,is_weekend,hour_sin,hour_cos,month_sin,...,Schwanthalerhöhe,Sendling,Sendling-Westpark,Südgiesing,Thalkirchen,Trudering,Trudering-Riem,Untergiesing,Untergiesing-Harlaching,Untermenzing-Allach
3318,0.567329,0.688312,0.544610,0.220253,0.0,0.982963,0.370590,0.75,0.066987,0.173895,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0
3319,0.626932,0.545455,0.592937,0.248101,0.0,0.933013,0.250000,0.75,0.066987,0.173895,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3320,0.657837,0.493506,0.624535,0.245570,0.0,0.853553,0.146447,0.75,0.066987,0.173895,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3321,0.684327,0.441558,0.665428,0.215190,0.0,0.750000,0.066987,0.75,0.066987,0.173895,...,2.0,1.0,1.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0
3322,0.708609,0.389610,0.698885,0.179747,0.0,0.629410,0.017037,0.75,0.066987,0.173895,...,2.0,12.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3649,0.545254,0.922078,0.555762,0.027848,0.0,0.750000,0.933013,0.50,0.000000,0.697431,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
3650,0.536424,0.922078,0.544610,0.037975,0.0,0.853553,0.853553,0.50,0.000000,0.697431,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3651,0.527594,0.909091,0.535316,0.055696,0.0,0.933013,0.750000,0.50,0.000000,0.697431,...,0.0,1.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
3652,0.527594,0.909091,0.533457,0.058228,0.0,0.982963,0.629410,0.50,0.000000,0.697431,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [39]:
N_TRAIN = 8000 # number_of_sequences_train
N_TEST =  2000 # number_of_sequences_test

X_train, y_train = get_X_y(fold_train, N_TRAIN, INPUT_LENGTH, OUTPUT_LENGTH)
X_test, y_test = get_X_y(fold_test, N_TEST, INPUT_LENGTH, OUTPUT_LENGTH)

In [40]:
print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}, y_test: {y_test.shape}')

X_train: (8000, 336, 47), y_train: (8000, 24, 34)
X_test: (2000, 336, 47), y_test: (2000, 24, 34)


In [41]:
X_train[0,0,:]

array([0.29580574, 0.7012987 , 0.23234201, 0.6       , 0.        ,
       1.        , 0.5       , 0.9330127 , 0.75      , 0.92486744,
       0.76387627, 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       2.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ])

In [42]:
y_train[0,0,:]

array([0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 0., 2., 1., 0., 0., 0., 1., 0., 0., 0., 2., 2., 0.])

## Modelling

In [43]:
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers, metrics
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras.layers.experimental.preprocessing import Normalization

2023-06-09 09:01:07.465021: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-09 09:01:07.504687: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-09 09:01:07.704419: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-09 09:01:07.705648: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [52]:
def init_model(X_train, y_train):
    # E1D1
    # n_features ==> no of features at each timestep in the data.
    #
    encoder_inputs = layers.Input(shape=X_train[0].shape)
    encoder_l1 = layers.LSTM(100, return_state=True)
    encoder_outputs1 = encoder_l1(encoder_inputs)

    encoder_states1 = encoder_outputs1[1:]

    #
    decoder_inputs = layers.RepeatVector(OUTPUT_LENGTH)(encoder_outputs1[0])

    #
    decoder_l1 = layers.LSTM(100, return_sequences=True)(decoder_inputs,initial_state = encoder_states1)
    decoder_outputs1 = layers.TimeDistributed(layers.Dense(N_FEATURES))(decoder_l1)

    #
    model_e1d1 = models.Model(encoder_inputs,decoder_outputs1)
    
    # 2 - Compiler
    # ======================    
    adam = optimizers.Adam(learning_rate=0.02)    
    model_e1d1.compile(loss='mse', optimizer=adam, metrics=["mae"])
    
    return model_e1d1


In [60]:
def init_model(X_train, y_train):
        
    # 0 - Normalization
    # ======================    
    normalizer = Normalization()
    normalizer.adapt(X_train)
    
    # 1 - RNN architecture
    # ======================    
    model = models.Sequential()
    ## 1.0 - All the rows will be standardized through the already adapted normalization layer
    model.add(normalizer)
    ## 1.1 - Recurrent Layer
    model.add(layers.LSTM(64, 
                          activation='tanh', 
                          return_sequences = True,
                          kernel_regularizer=L1L2(l1=0.05, l2=0.05),
                          ))
    ## 1.2 - Predictive Dense Layers
    output_length = y_train.shape[1:]
    model.add(layers.TimeDistributed(layers.Dense(1)))

    #model.add(layers.Dense(output_length, activation='linear'))

    # 2 - Compiler
    # ======================    
    adam = optimizers.Adam(learning_rate=0.02)    
    model.compile(loss='mse', optimizer=adam, metrics=["mae"])
    
    return model

In [53]:
model = init_model(X_train, y_train)
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 336, 47)]    0           []                               
                                                                                                  
 lstm_6 (LSTM)                  [(None, 100),        59200       ['input_4[0][0]']                
                                 (None, 100),                                                     
                                 (None, 100)]                                                     
                                                                                                  
 repeat_vector_3 (RepeatVector)  (None, 24, 100)     0           ['lstm_6[0][0]']                 
                                                                                            

2023-06-09 09:05:14.287112: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-09 09:05:14.288209: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-09 09:05:14.290647: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [49]:
def plot_history(history):
    
    fig, ax = plt.subplots(1,2, figsize=(20,7))
    # --- LOSS: MSE --- 
    ax[0].plot(history.history['loss'])
    ax[0].plot(history.history['val_loss'])
    ax[0].set_title('MSE')
    ax[0].set_ylabel('Loss')
    ax[0].set_xlabel('Epoch')
    ax[0].legend(['Train', 'Validation'], loc='best')
    ax[0].grid(axis="x",linewidth=0.5)
    ax[0].grid(axis="y",linewidth=0.5)
    
    # --- METRICS:MAE ---
    
    ax[1].plot(history.history['mae'])
    ax[1].plot(history.history['val_mae'])
    ax[1].set_title('MAE')
    ax[1].set_ylabel('MAE')
    ax[1].set_xlabel('Epoch')
    ax[1].legend(['Train', 'Validation'], loc='best')
    ax[1].grid(axis="x",linewidth=0.5)
    ax[1].grid(axis="y",linewidth=0.5)
                        
    return ax

In [54]:
from tensorflow import keras
from keras.callbacks import EarlyStopping

def fit_model(model: keras.Model, verbose=1) -> Tuple[keras.Model, dict]:

    es = EarlyStopping(monitor = "val_loss",
                      patience = 3,
                      mode = "min",
                      restore_best_weights = True)


    history = model.fit(X_train, y_train,
                        validation_split = 0.3,
                        shuffle = False,
                        batch_size = 32,
                        epochs = 50,
                        callbacks = [es],
                        verbose = verbose)

    return model, history

In [55]:
# 1 - Initialising the RNN model
# ====================================

model = init_model(X_train, y_train)
model.summary()

# 2 - Training
# ====================================
model, history = fit_model(model)

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 336, 47)]    0           []                               
                                                                                                  
 lstm_8 (LSTM)                  [(None, 100),        59200       ['input_5[0][0]']                
                                 (None, 100),                                                     
                                 (None, 100)]                                                     
                                                                                                  
 repeat_vector_4 (RepeatVector)  (None, 24, 100)     0           ['lstm_8[0][0]']                 
                                                                                            

2023-06-09 09:05:27.646459: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-09 09:05:27.648653: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-09 09:05:27.649892: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/50


2023-06-09 09:05:28.524085: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-09 09:05:28.528611: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-09 09:05:28.531614: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

ValueError: in user code:

    File "/home/sandra/.pyenv/versions/3.10.6/envs/bike_sharing_demand/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/home/sandra/.pyenv/versions/3.10.6/envs/bike_sharing_demand/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/sandra/.pyenv/versions/3.10.6/envs/bike_sharing_demand/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/home/sandra/.pyenv/versions/3.10.6/envs/bike_sharing_demand/lib/python3.10/site-packages/keras/engine/training.py", line 1051, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/sandra/.pyenv/versions/3.10.6/envs/bike_sharing_demand/lib/python3.10/site-packages/keras/engine/training.py", line 1109, in compute_loss
        return self.compiled_loss(
    File "/home/sandra/.pyenv/versions/3.10.6/envs/bike_sharing_demand/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/sandra/.pyenv/versions/3.10.6/envs/bike_sharing_demand/lib/python3.10/site-packages/keras/losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "/home/sandra/.pyenv/versions/3.10.6/envs/bike_sharing_demand/lib/python3.10/site-packages/keras/losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/sandra/.pyenv/versions/3.10.6/envs/bike_sharing_demand/lib/python3.10/site-packages/keras/losses.py", line 1470, in mean_squared_error
        return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1)

    ValueError: Dimensions must be equal, but are 13 and 34 for '{{node mean_squared_error/SquaredDifference}} = SquaredDifference[T=DT_FLOAT](model_4/time_distributed_4/Reshape_1, IteratorGetNext:1)' with input shapes: [32,24,13], [32,24,34].
