
Below I am working on merging the multimodal data by date/location.
<br>
Let us start by visualizing the multimodal_counters locations vs. the bike_counters locations.

In [22]:
import pandas as pd
import numpy as np
import folium
import os

data = pd.read_parquet(os.path.join("..", "Datasets", "train.parquet"))
mult_df = pd.read_csv(os.path.join("..", "Datasets", "multimodal_data.csv"))


# Centre the map on the mean latitude/longitude of the bike-counter rows.
# The mean is converted to a plain [lat, lon] list because folium reads the
# location positionally, which a label-indexed Series does not reliably allow.
map_center = data[["latitude", "longitude"]].mean(axis=0).tolist()
m = folium.Map(location=map_center, zoom_start=13)

# Blue markers: one per bike counter (first dataset, `data`).
bike_sites = data[["counter_name", "latitude", "longitude"]].drop_duplicates(
    "counter_name"
)
for site in bike_sites.itertuples(index=False):
    folium.Marker(
        location=[site.latitude, site.longitude],
        popup=str(site.counter_name),
        icon=folium.Icon(color="blue"),
    ).add_to(m)

# Red markers: one per multimodal counting site (second dataset, `mult_df`).
# itertuples keeps each column's own dtype; iterrows would coerce the whole
# row to one dtype, so an integer site_ID would be shown as e.g. "10004.0".
multimodal_sites = mult_df[["site_ID", "Latitude", "Longitude"]].drop_duplicates(
    "site_ID"
)
for site in multimodal_sites.itertuples(index=False):
    folium.Marker(
        location=[site.Latitude, site.Longitude],
        popup=str(site.site_ID),
        icon=folium.Icon(color="red"),
    ).add_to(m)

# Display the map
m


We notice that the multimodal counts we have are mainly recorded in the centre of Paris. We will therefore investigate the following two options:
- Taking an average value of the multimodal count for the entire city
- Inputting counter-specific values for the bike_counters in the city center only.
<br><br>
We start with option 1:

# Option 1
***

In [19]:
# Load the multimodal counts and, in the same step, parse the 'date' column
# into datetimes stored at microsecond resolution.
mult_df = pd.read_csv(
    os.path.join("..", "Datasets", "multimodal_data.csv")
).assign(
    date=lambda frame: pd.to_datetime(frame['date']).astype('datetime64[us]')
)

# Preview the first rows
mult_df.head()

Unnamed: 0,date,site_ID,count,Latitude,Longitude
0,2020-07-01 00:00:00,10004,654,48.858273,2.349109
1,2020-07-01 01:00:00,10004,345,48.858273,2.349109
2,2020-07-01 02:00:00,10004,254,48.858273,2.349109
3,2020-07-01 03:00:00,10004,116,48.858273,2.349109
4,2020-07-01 04:00:00,10004,96,48.858273,2.349109


In [None]:
class MergeMultimodal(BaseEstimator, TransformerMixin):
    """Attach a city-wide average multimodal count to every row of ``X``.

    The multimodal CSV is read, its ``count`` column is averaged over all
    sites for each timestamp, and the result is joined onto ``X`` with a
    backward as-of merge on ``date`` (each row receives the most recent
    multimodal value at or before its own date).

    NOTE: ``BaseEstimator`` and ``TransformerMixin`` come from
    ``sklearn.base`` and must be imported before this class is defined.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``X`` with ``average_multimodal_count`` added.

        Parameters
        ----------
        X : DataFrame with a ``date`` column (presumably ``datetime64[us]``,
            matching the cast applied to the multimodal dates - confirm).
        y : ignored.

        Returns
        -------
        DataFrame with the rows and index of ``X`` in their original order,
        the new ``average_multimodal_count`` column, and ``date`` removed.
        """
        X_copy = X.copy()

        # Import Multimodal Data
        mult_df = pd.read_csv(os.path.join("..", "Datasets", "multimodal_data.csv"))
        mult_df['date'] = pd.to_datetime(mult_df['date']).astype('datetime64[us]')

        # Average the count over all sites for each timestamp. groupby sorts
        # by 'date', which merge_asof requires of the right-hand frame.
        # Renaming before the merge avoids a clash with any 'count' column
        # already present in X.
        mult_df = (
            mult_df.groupby('date', as_index=False)['count']
            .mean()
            .rename(columns={'count': 'average_multimodal_count'})
        )

        # merge_asof needs the left frame sorted by the key and returns a
        # fresh RangeIndex, so remember each row's position to restore the
        # caller's row order and index afterwards.
        order_col = '_merge_multimodal_row_order'
        X_copy[order_col] = range(len(X_copy))
        X_sorted = X_copy.sort_values('date', kind='stable')

        # Merging data
        merged_data = pd.merge_asof(X_sorted, mult_df, on='date')
        merged_data = merged_data.sort_values(order_col, kind='stable')
        merged_data.index = X.index
        merged_data = merged_data.drop(columns=[order_col, 'date'])
        return merged_data

# Option 2 (incomplete)
***

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class MergeMultimodal(BaseEstimator, TransformerMixin):
    """Work-in-progress transformer for a location-aware multimodal merge.

    At this stage it only splits the ``coordinates`` column of ``X`` into
    numeric ``Latitude`` / ``Longitude`` columns. The multimodal data is
    loaded and its dates parsed, but it is not merged into the output yet.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``coordinates`` replaced by two numeric columns."""
        frame = X.copy()

        # Break the comma-separated 'coordinates' text into two new columns
        # and discard the source column.
        coordinate_parts = frame['coordinates'].str.split(',', expand=True)
        frame[['Latitude', 'Longitude']] = coordinate_parts
        frame = frame.drop(columns='coordinates')

        # The split pieces are text; turn both into numbers.
        for axis_name in ('Latitude', 'Longitude'):
            frame[axis_name] = pd.to_numeric(frame[axis_name])

        # Load the multimodal data and parse its dates (microsecond
        # resolution). Not used further yet - the merge step is still missing.
        multimodal = pd.read_csv(os.path.join("..", "Datasets", "multimodal_data.csv"))
        multimodal['date'] = pd.to_datetime(multimodal['date']).astype('datetime64[us]')

        # Hand back the modified copy
        return frame

# Apply the transformer to X and preview the first rows of the result.
# NOTE(review): X is not defined in the visible cells - presumably the feature
# frame loaded earlier in the notebook; confirm before running.
B = MergeMultimodal().fit_transform(X)
B.head()