# Sources

## Raw Citibike Data

https://s3.amazonaws.com/tripdata/2020-citibike-tripdata.zip

https://citibikenyc.com/system-data

## Subway Data
https://data.ny.gov/Transportation/MTA-Subway-Stations/39hk-dx4f/data_preview

#  Module: Dependencies

In [None]:
! pip install pycaret[full]
! pip install geopy
! pip install diskcache

# unfortunately these versions are locked now due to Pycaret
! pip install numpy==1.24.3
! pip install pandas==2.0.3

In [None]:
! pip install setuptools
! pip install folium

In [1]:
# core libraries for the project
import pandas as pd
import numpy as np
from pycaret.regression import *
import datetime

# libraries for Software Engineering Practices
from abc import ABC, abstractmethod
import time
import functools
from typing import Callable, Any, List, Tuple

# libraries for presentation
import seaborn as sns
import matplotlib.pyplot as plt
import IPython.display as ipydisplay
import folium
import json
from folium.plugins import Fullscreen
from IPython.display import IFrame

# libraries for feature engineering
from scipy.stats.mstats import winsorize
from sklearn.metrics.pairwise import haversine_distances
from geopy.distance import geodesic # will be dropped soon
from sklearn.cluster import KMeans # used for clusterization, not final
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder # not final

import warnings
import os
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
color = sns.color_palette("tab10")
sns.set_style('darkgrid')

# Module: Managers

## Download Manager

## Data Provider

In [None]:
class DataProvider():
  df_raw: pd.DataFrame
  last_file_name: str
  identifier: str

  def __init__(self):
    self.df_raw = pd.DataFrame()
    self.last_file_name = ''

  def load_jan2020(self):
    ''' This function is a shortcut '''
    # downloading only if file isnt available, obviously
    if not os.path.exists('/content/202001-citibike-tripdata_1.zip'):
      self.last_file_name = '202001-citibike-tripdata_1.csv'
      self.identifier = "Jan/2020"
    return self


  def load(self, year:str, month:str):
    '''This function is an API. The user requests some data and the magic happens underneath'''
    display(f'Downloading: {year}/{month}')
    file_name = f"{year}-citibike-tripdata"
    if not os.path.exists(f'/content/{file_name}.zip'):
      ! wget https://s3.amazonaws.com/tripdata/{file_name}+".zip"
      ! unzip {file_name}+".zip"
      ! unzip {file_name}/{year}-{month}-citibike-tripdata.zip

      ipydisplay.clear_output()
      display('Files ready')
      self.last_file_name = file_name+'.csv'
    return self

  def get_df(self) -> pd.DataFrame:
    if self.df_raw.empty:
      with open(self.last_file_name, 'r') as file:
        self.df_raw = pd.read_csv(file, parse_dates=['started_at', 'ended_at'])
    return self.df_raw.copy()

In [5]:
class GIS_Data_Provider():
  @classmethod
  def get_subway_stations(self) -> pd.DataFrame:
    ''' Wrapping the download from github cached data.'''
    if not os.path.exists('/content/MTA_Subway_Stations_20250217.csv'):
      display("downloading cached data from github")
      ! wget https://raw.githubusercontent.com/bicdev/tcc/refs/heads/main/cached%20data/MTA_Subway_Stations_20250217.csv -q
    return pd.read_csv('/content/MTA_Subway_Stations_20250217.csv')

In [6]:
GIS_Data_Provider.get_subway_stations()

'downloading cached data from github'

'wget' n�o � reconhecido como um comando interno
ou externo, um programa oper�vel ou um arquivo em lotes.


FileNotFoundError: [Errno 2] No such file or directory: '/content/MTA_Subway_Stations_20250217.csv'

In [None]:
#! wget https://s3.amazonaws.com/tripdata/2020-citibike-tripdata.zip
#! unzip 2020-citibike-tripdata.zip

## Experiment Manager

In [4]:
class ExperimentPipeline(ABC):
  '''Classic Pipeline Data Pattern'''
  def __init__(self, steps: List[Tuple[Callable, dict]] = None) -> None:
    self.steps = steps if steps is not None else []
    self.reports = {}

  def add_step(self, step: Callable, kwargs: dict = None) -> None:
    assert callable(step), 'Step must be a function or method'
    assert hasattr(step, '__name__'), 'Step must be named'
    self.steps.append((step, kwargs or {}))

  @abstractmethod
  def run(self, data: pd.DataFrame, mode: str) -> pd.DataFrame:
    pass

  def __repr__(self):
    '''Printable representation of the pipeline'''
    step_names = [
        (step.__name__ if hasattr(step, '__name__') else str(step), kwargs)
        for step, kwargs in self.steps
      ]
    return f"Pipeline(steps={step_names})"


In [5]:
class FeaturePipeline(ExperimentPipeline):
  def run(self, df: pd.DataFrame) -> pd.DataFrame:
    '''Pipeline logic for managing feature engineering methods. Each method
    call modifies the resulting dataframe,
    introducing the new feature, and, if sucessful, cleaning up the data.'''
    result = df.copy()

    for i, (step, kwargs) in enumerate(self.steps):
      step_name = step.__name__ # __name__ is a private property that returns the objects' name in code

      try:
        result, cleanup = step(result, **kwargs)
        if 'cleanup' not in self.reports:
          self.reports['cleanup'] = cleanup
        else:
          self.reports['cleanup'].extend(cleanup)
      except Exception as e:
        print(f"Error in step {i+1}: {step_name} with kwargs {kwargs} - {e}")
        raise  # Re-raise the exception to stop the pipeline

    result = result.drop(self.reports['cleanup'], axis=1)
    return result

In [6]:
class CleaningPipeline(ExperimentPipeline):
  def run(self, data: pd.DataFrame) -> pd.DataFrame:
    result = data.copy()

    for i, (step, kwargs) in enumerate(self.steps):
      step_name = step.__name__ # __name__ is a private property that returns the objects' name in code
      try:
        result, report = step(result, **kwargs)
        self.reports[step_name] = report
      except Exception as e:
        print(f"Error in step {i+1}: {step_name} with kwargs {kwargs} - {e}")
        raise  # Re-raise the exception to stop the pipeline
    return result

In [None]:
class ModelPipeline(ExperimentPipeline):
  def run(self, data: pd.DataFrame) -> pd.DataFrame:


In [7]:
class Experiment():
  def __init__(self, data_used: str, features: FeaturePipeline, cleaning: CleaningPipeline) -> None:
    self.data_used = data_used
    self.features = features
    self.cleaning = cleaning
    self.pipeline_definition = {'features': str(features), 'cleaning': str(cleaning)}

  def run(self, df: pd.DataFrame) -> pd.DataFrame:
    df_clean = self.cleaning.run(df)
    df_features = self.features.run(df_clean)
    self.pipeline_results = {'features': self.features.reports, 'cleaning': self.cleaning.reports}
    return df_features


#  Module: Data

## Cleaning

In [8]:
def step_handle_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
  ''' Drops duplicates from dataset. Returns number of rows affected '''
  before = data.shape[0]
  after = data.drop_duplicates()
  delta = before - after.shape[0]
  return after, delta

In [9]:
def step_drop_unused(data: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
  ''' Drops useless columns from dataset. '''
  before = data.shape[1]
  after = data.drop([
      'start_station_name', # station name where trip started
      'end_station_name', # station name where trip ended
      'start_station_id', # unique id of station where trip started
      'end_station_id', # unique id of station where trip ended,
      'rideable_type', # unused for now due to lack of representation
  ], axis=1)
  delta = before - after.shape[1]
  return after, delta

In [10]:
def step_handle_missing(data: pd.DataFrame) ->  Tuple[pd.DataFrame, int]:
  ''' Drops rows with missing values from dataset. Returns number of rows affected '''
  before = data.shape[0]
  after = data.dropna()
  delta = before - after.shape[0]
  return after, delta

## Feature Engineering

In [11]:
def features_timestamp(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
  '''This method extracts multiple features from timestamp based data'''
  df['day'] = df['started_at'].dt.day
  df['hour'] = df['started_at'].dt.hour
  df['weekday'] = df['started_at'].dt.dayofweek
  df['is_weekend'] = df['weekday'] >= 5
  df['trip_duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds()

  return df, ['started_at', 'ended_at']

In [12]:
cache = {} # memory based cache, not final

In [13]:
def calculate_trip_distance(row: pd.Series) -> float:
    # TODO: This guy is slow!
    start_coords = (row['start_lat'], row['start_lng'])
    end_coords = (row['end_lat'], row['end_lng'])
    return geodesic(start_coords, end_coords).kilometers

def faster_trip_distance(row):
  '''Faster distance calculation between coordinates pairs, uses Haversine formula.'''
  start_coords = (row['start_lat'], row['start_lng'])
  end_coords = (row['end_lat'], row['end_lng'])
  if (start_coords, end_coords) in cache:
    return cache[(start_coords, end_coords)]
  coords_1_rad = np.radians(start_coords)
  coords_1_rad = coords_1_rad.reshape(1, -1)  # Reshape to (1, 2)
  coords_2_rad = np.radians(end_coords)
  coords_2_rad = coords_2_rad.reshape(1, -1)

  earth_radius_km = 6371.0 # Earth's radius in kilometers
  distance_rad = haversine_distances(coords_1_rad, coords_2_rad)
  distance_km = distance_rad * earth_radius_km
  cache[(start_coords, end_coords)] = distance_km[0][0]
  return distance_km[0][0]

def feature_trip_distance(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
  '''Wrapper for trip distance feature extraction'''
  display(f'slow guy started')
  display(f'cache size: {len(cache)}')
  start = time.perf_counter()
  #df['trip_distance'] = df.apply(calculate_trip_distance, axis=1)
  df['trip_distance'] = df.apply(faster_trip_distance, axis=1)
  finish = time.perf_counter()
  display(f'slow guy took: {finish - start} seconds')
  display(f'cache size: {len(cache)}')
  return df, ['start_lat', 'start_lng', 'end_lat', 'end_lng']

In [14]:
def feature_member_ratio(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
  '''This feature determines the ratio of members to casuals in a specific hour of a day'''
  df_member = pd.DataFrame({
      'is_member' : df['member_casual'] == 'member',
      'hour' : df['hour'],
      'day' : df['day']
      })
  df_member = df_member[['is_member', 'hour', 'day']].groupby(['day','hour']).agg(member_ratio=('is_member', 'mean'))
  #df_final = pd.merge(df_member, df.drop_duplicates(subset=['day','hour']), on=['day','hour'])
  return df_member, ['member_casual']

In [15]:
def aggregate_by_hour(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
  member_ratio, cleanup = feature_member_ratio(df)

  hourly = df.groupby(['day','hour']).agg(
    trip_amount=('ride_id', 'count'),  # Count of 'ride_id'
    avg_trip_duration=('trip_duration', 'mean'),  # Average of 'trip_duration'
    avg_trip_distance=('trip_distance', 'mean')  # Average of 'trip_distance'
  ).reset_index()

  df_hourly = pd.merge(hourly, df.drop_duplicates(subset=['day','hour']), on=['day','hour'])
  df_final = pd.merge(member_ratio, df_hourly.drop_duplicates(subset=['day', 'hour']), on=['day','hour'])
  return df_final, ['ride_id','trip_duration','trip_distance', cleanup[0]]

In [16]:
def frequency_of_trips_per_station(df: pd.DataFrame) -> pd.DataFrame:
  df_frequency = df.groupby('start_station_id')['ride_id'].count().reset_index()
  df_frequency.rename(columns={'ride_id': 'ride_count', 'start_station_id': 'station_id'}, inplace=True)
  df_frequency.sort_values(by='ride_count', ascending=False).reset_index()

  return df_frequency

In [17]:
def clusterize_stations_by_frequency(df: pd.DataFrame, n_clusters: int) -> pd.DataFrame:
  station_data = pd.DataFrame(df)

  X = station_data[['ride_count']]
  scaler = StandardScaler()
  X_scaled = scaler.fit_transform(X)

  kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
  station_data['cluster'] = kmeans.fit_predict(X_scaled)

  return station_data.groupby('cluster')['ride_count'].agg(['mean', 'median', 'min', 'max', 'count'])

In [18]:
def clusterize_by_geolocation(df: pd.DataFrame, n_clusters: int) -> pd.DataFrame:
  station_locations = df[['start_lat', 'start_lng']].drop_duplicates().dropna()
  kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
  station_locations['cluster'] = kmeans.fit_predict(station_locations[['start_lat', 'start_lng']])

  return station_locations

In [19]:
subway_cache = {}

In [None]:
print(len(subway_cache.keys()))

0


In [20]:
def calculate_distance_to_subway(subway: pd.DataFrame, station_lat, station_lng, **kwargs) -> pd.DataFrame:
  #start = time.perf_counter()
  min_distance = float('inf') # to find nearest, we start with infinite distance then shrink with each comparison
  for index, row in subway.iterrows():
    subway_lat, subway_lng = row['GTFS Latitude'], row['GTFS Longitude']

    station_coords = (station_lat, station_lng)
    subway_coords = (subway_lat, subway_lng)

    if (station_coords, subway_coords) in cache:
      distance = subway_cache[(station_coords, subway_coords)]
    else:
      #print(f"cache miss, index: {index}")
      distance = geodesic(station_coords, subway_coords).kilometers
      subway_cache[(station_coords, subway_coords)] = distance
    min_distance = min(min_distance, distance)
  end = time.perf_counter()
  #print(f"{end-start}")
  return min_distance

def feature_distance_to_subway(df_stations: pd.DataFrame, df_subway: pd.DataFrame, **kwargs) -> pd.DataFrame:
  df_subway_distances = df_stations.copy()
  df_subway_distances['distance_to_subway'] = 0

  for index, row in df_subway_distances.iterrows():
    #print(f"index: {index}")
    df_subway_distances['distance_to_subway'] = df_subway_distances.apply(
        lambda var:
        calculate_distance_to_subway(
            df_subway,
            var['start_lat'],
            var['start_lng']
          ), axis=1)

  return df_subway_distances

#  Module: Visualize

In [21]:
def feature_overview(data: pd.DataFrame) -> pd.DataFrame:
  ''' Returns a DataFrame with key insights from dataset, intented to showcase the metrics of which we determine data quality. '''
  return pd.DataFrame({
      'feature': data.columns.values,  # feature names
      'data_type': data.dtypes.values,  # data types
      'null_value(%)': data.isna().mean().values * 100,  # percentage of null values
      'neg_value(%)': [len(data[col][data[col] < 0]) / len(data) * 100 if col in data.select_dtypes(include=[np.number]).columns else 0 for col in data.columns],  # percentage of negative values
      '0_value(%)': [len(data[col][data[col] == 0]) / len(data) * 100 if col in data.select_dtypes(include=[np.number]).columns else 0 for col in data.columns],  # percentage of zero values
      'duplicate': data.duplicated().sum(),  # amount of duplicates
      'n_unique': data.nunique().values,  # amount of unique values
      'sample_unique': [data[col].unique() for col in data.columns]  # sample of unique values
  }).round(3)

In [22]:
def present_distribution_across_cluster(cluster_summary: pd.DataFrame):
  plt.figure(figsize=(8, 8))
  plt.pie(cluster_summary['count'], labels=cluster_summary.index, autopct='%1.1f%%', startangle=90)
  plt.title('Distribution of Stations across Clusters')
  plt.axis('equal')
  plt.show()

In [23]:
def present_geolocation_of_clusters(station_locations: pd.DataFrame, colors = []):
  center_lat = station_locations['start_lat'].mean()
  center_lon = station_locations['start_lng'].mean()
  m = folium.Map(location=[center_lat, center_lon], zoom_start=12)

  for index, row in station_locations.iterrows():
      cluster = int(row['cluster'])
      folium.CircleMarker(
          location=[row['start_lat'], row['start_lng']],
          radius=5,
          color=colors[cluster % len(colors)],
          fill=True,
          fill_color=colors[cluster % len(colors)],
          fill_opacity=0.7,
          popup=f"Station: {index}, Cluster: {cluster}"
      ).add_to(m)

  m.save('geoclusterized.html')

  Fullscreen().add_to(m)
  display(m)

In [24]:
def present_subway_locations(subway_stations: pd.DataFrame):
  center_lat = subway_stations['GTFS Latitude'].mean()
  center_lon = subway_stations['GTFS Longitude'].mean()
  m = folium.Map(location=[center_lat, center_lon], zoom_start=12)
  for index, row in subway_stations.iterrows():
      folium.CircleMarker(
          location=[row['GTFS Latitude'], row['GTFS Longitude']],
          radius=5,
          color='red',
          fill=True,
          fill_color='red',
          fill_opacity=0.7,
          popup=f"Station: {subway_stations['Stop Name']}"
      ).add_to(m)

  m.save('subway_locations.html')

  Fullscreen().add_to(m)
  display(m)

In [25]:
def visualize_frequency_clusters():
  df_freq = frequency_of_trips_per_station(df_base)
  present_distribution_across_cluster(clusterize_stations_by_frequency(df_freq, 3))

#  Module: Train

## Global Model

In [26]:
def train_global_model(df: pd.DataFrame):
  global_model_setup = setup(
      data=df,
      target='trip_amount',
      numeric_features=['day', 'hour', 'avg_trip_duration', 'avg_trip_distance', 'member_ratio'],
      categorical_features=['weekday', 'is_weekend'],
      use_gpu = False,
      session_id=123)
  model_comparison = compare_models()
  best_model = create_model('xgboost')
  return best_model


def tune_global_model(model):
  tuned_model = tune_model(model)
  evaluate_model(tuned_model)
  return tuned_model

## Local Model

In [27]:
def prep_local_model(df: pd.DataFrame, df_global_model, slice_size: int) -> pd.DataFrame:
  df_base = df.copy()
  #print(df_base.columns)

  df_with_dates, cleanup = features_timestamp(df_base)
  #print(df_with_dates.columns)

  df_with_dates.drop(cleanup, axis=1, inplace=True)
  #print(df_with_dates.columns)

  df_aggregated = df_with_dates.groupby(['start_station_id', 'day', 'hour']).agg(
      trip_amount=('ride_id', 'count'),  # Count of 'ride_id'
  ).reset_index()
  #print(df_aggregated.columns)

  coords = df_base[['start_station_id', 'start_lat', 'start_lng']].drop_duplicates(subset=['start_station_id'])
  #print(coords.columns)

  df_with_coords = pd.merge(df_aggregated, coords, on='start_station_id', how='left')
  #print(df_with_coords.columns)

  trip_count = df_base[['start_station_id','ride_id']].groupby('start_station_id').count().reset_index()
  top5_ids = trip_count['start_station_id'].unique()[:slice_size]
  df_filtered = df_with_coords[df_with_coords['start_station_id'].isin(top5_ids)]
  #print(df_filtered.columns)
  df_filtered = df_filtered[['start_station_id', 'start_lat', 'start_lng']].drop_duplicates()
  #print(df_filtered.columns)

  station_data = GIS_Data_Provider.get_subway_stations()[['GTFS Latitude', 'GTFS Longitude', 'Station ID']]
  #print(station_data.columns)

  #print(f"calculate distance of {df_filtered['start_station_id']} stations")
  df_distances = feature_distance_to_subway(df_filtered, station_data)
  #print(df_distances.columns)

  df_station_features = df_distances.merge(trip_count, on='start_station_id').rename(columns={'ride_id': 'trip_count'})
  #print(df_station_features.columns)

  #print(df_filtered.columns)
  #print(df_global_model.columns)
  df_share = pd.merge(
    df_with_coords,
    df_global_model[['day', 'hour', 'trip_amount']],
    on=['day', 'hour'], how='left').rename(
        columns={'trip_amount_x': 'trip_amount','trip_amount_y':'total'})
  df_share['share'] = (df_share['trip_amount'] / df_share['total'])*100
  #print(df_share.columns)

  df_final = pd.merge(
    df_station_features[['start_station_id', 'distance_to_subway']],
    df_share,
    on=['start_station_id'], how='left')
  df_final.drop(['trip_amount'], axis=1, inplace=True)
  #print(df_final.columns)

  return df_final

In [28]:
def train_local_model(df_local_features: pd.DataFrame):
  station_setups = {}
  station_models = {}
  for station_id in df_local_features['start_station_id'].unique():
    station_setups[station_id] = setup(
      data=df_local_features,
      target='share',
      numeric_features=['day', 'hour', 'distance_to_subway', 'start_lat', 'start_lng', 'total'],
      categorical_features=['start_station_id'],
      use_gpu = False,
      session_id=123)
    best_model = compare_models()
    station_models[station_id] = create_model('xgboost')

  return station_models

In [29]:
def tune_local_model(model):
  tuned_model = tune_model(model)
  evaluate_model(tuned_model)
  return tuned_model

In [30]:
def inference(df_unseen: pd.DataFrame, model):
  return predict_model(model, data=df_unseen)

#  Module: Evaluate

# Main: Run

In [31]:
# run once
provider = DataProvider()
provider.load_jan2020()

'Files ready'

<__main__.DataProvider at 0x7e31d176fa90>

In [32]:
# run to restart dataframe
df_base = provider.get_df()

In [33]:
cleaning_steps = [
    (step_handle_duplicates,  {}), # these are pure functions
    (step_drop_unused,        {}),
    (step_handle_missing,     {}),
]
cleaning_pipeline = CleaningPipeline(steps=cleaning_steps)

In [34]:
feature_steps = [
    (features_timestamp,    {}),
    (feature_trip_distance, {}),
    #(feature_member_ratio, {}), # unfortunately I had to couple this feature into the aggregation
    (aggregate_by_hour,     {}),
]
feature_pipeline = FeaturePipeline(steps=feature_steps)

In [35]:
experiment = Experiment(
    data_used=provider.identifier,
    features=feature_pipeline,
    cleaning=cleaning_pipeline
)
experiment.pipeline_definition # just to check the pipeline

{'features': "Pipeline(steps=[('features_timestamp', {}), ('feature_trip_distance', {}), ('aggregate_by_hour', {})])",
 'cleaning': "Pipeline(steps=[('step_handle_duplicates', {}), ('step_drop_unused', {}), ('step_handle_missing', {})])"}

In [36]:
df_global_model = experiment.run(df_base) # runs the experiment

'slow guy started'

'cache size: 0'

'slow guy took: 53.001789658999996 seconds'

'cache size: 135064'

In [37]:
df_global_model

Unnamed: 0,day,hour,member_ratio,trip_amount,avg_trip_duration,avg_trip_distance,weekday,is_weekend
0,1,0,0.797561,410,5411.295278,1.763542,2,False
1,1,1,0.798371,491,1140.109837,1.711576,2,False
2,1,2,0.838624,378,1264.673077,1.777670,2,False
3,1,3,0.815385,195,822.333092,1.627566,2,False
4,1,4,0.921569,102,683.903598,1.519381,2,False
...,...,...,...,...,...,...,...,...
739,31,19,0.939876,2262,887.059826,1.567907,4,False
740,31,20,0.925479,1409,707.252162,1.459661,4,False
741,31,21,0.923211,573,7944.695380,1.473518,4,False
742,31,22,0.894061,623,771.196636,1.606232,4,False


In [None]:
experiment.pipeline_results # checks pipeline's results

{'features': {'cleanup': ['started_at',
   'ended_at',
   'start_lat',
   'start_lng',
   'end_lat',
   'end_lng',
   'ride_id',
   'trip_duration',
   'trip_distance',
   'member_casual']},
 'cleaning': {'step_handle_duplicates': 0,
  'step_drop_unused': 5,
  'step_handle_missing': 2438}}

In [None]:
feature_overview(df_global_model) # peek at final dataframe

Unnamed: 0,feature,data_type,null_value(%),neg_value(%),0_value(%),duplicate,n_unique,sample_unique
0,day,int32,0.0,0.0,0.0,0,31,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,hour,int32,0.0,0.0,4.167,0,24,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,member_ratio,float64,0.0,0.0,0.0,0,711,"[0.7975609756097561, 0.7983706720977597, 0.838..."
3,trip_amount,int64,0.0,0.0,0.0,0,638,"[410, 491, 378, 195, 102, 65, 98, 134, 306, 40..."
4,avg_trip_duration,float64,0.0,0.0,0.0,0,744,"[5411.295278048781, 1140.1098370672098, 1264.6..."
5,avg_trip_distance,float64,0.0,0.0,0.0,0,744,"[1.7635418672415926, 1.7115760695428668, 1.777..."
6,weekday,int32,0.0,0.0,12.903,0,7,"[2, 3, 4, 5, 6, 0, 1]"
7,is_weekend,bool,0.0,0.0,0.0,0,2,"[False, True]"


In [None]:
model = train_global_model(df_global_model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,trip_amount
2,Target type,Regression
3,Original data shape,"(744, 8)"
4,Transformed data shape,"(744, 14)"
5,Transformed train set shape,"(520, 14)"
6,Transformed test set shape,"(224, 14)"
7,Numeric features,5
8,Categorical features,2
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,162.5739,53329.3158,228.5727,0.9606,0.4988,0.3985,2.397
xgboost,Extreme Gradient Boosting,182.7006,79895.1521,274.8197,0.9413,0.5402,0.4464,0.26
et,Extra Trees Regressor,181.8177,79897.7155,277.6807,0.9396,0.4137,0.3971,0.262
gbr,Gradient Boosting Regressor,202.2219,83149.9706,284.4059,0.9383,0.5647,0.5275,0.209
rf,Random Forest Regressor,200.2953,97523.6225,301.8569,0.9271,0.4464,0.4352,0.343
lightgbm,Light Gradient Boosting Machine,205.812,100099.5106,300.9135,0.9258,0.5296,0.5241,0.161
dt,Decision Tree Regressor,238.7269,155020.0577,384.9685,0.8838,0.4978,0.4434,0.125
ada,AdaBoost Regressor,378.3704,215418.0126,460.469,0.8385,0.9061,1.5548,0.156
lasso,Lasso Regression,693.3596,721492.0704,848.9437,0.4517,1.1983,2.9369,0.122
llar,Lasso Least Angle Regression,693.3596,721492.5801,848.944,0.4517,1.1983,2.9369,0.071


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,247.4953,137269.9214,370.4996,0.9007,0.6223,0.7117
1,192.0677,89027.7999,298.3753,0.9311,0.5702,0.6275
2,196.4967,78981.2187,281.036,0.9344,0.4397,0.3516
3,124.6036,29224.0137,170.9503,0.978,0.3384,0.2633
4,169.8628,60374.9443,245.7131,0.9297,0.5748,0.5613
5,224.7999,162408.6027,402.9995,0.9008,0.7033,0.4624
6,147.6209,45627.9448,213.607,0.9671,0.5138,0.228
7,192.178,78475.2322,280.1343,0.9532,0.5584,0.4582
8,165.4217,57888.0026,240.5993,0.9562,0.4244,0.2957
9,166.4594,59673.8413,244.2823,0.9618,0.657,0.5037


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
tuned_model = tune_global_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,269.4372,163969.0702,404.9309,0.8813,0.6254,0.7111
1,168.9544,67281.2045,259.3862,0.9479,0.5254,0.5335
2,169.0516,57711.5105,240.2322,0.9521,0.3595,0.2311
3,144.2978,30674.2186,175.1406,0.9769,0.4947,0.3651
4,183.6359,66515.9554,257.9069,0.9225,0.5799,0.652
5,224.9868,126385.1956,355.507,0.9228,0.4644,0.4586
6,156.2527,55983.4312,236.6082,0.9597,0.3739,0.324
7,178.3376,71512.9301,267.419,0.9573,0.4473,0.3131
8,239.2452,112424.8033,335.2981,0.9149,0.4515,0.402
9,182.1889,60746.0429,246.4671,0.9611,0.5959,0.5823


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [39]:
predictions = predict_model(tuned_model)
predictions

NameError: name 'tuned_model' is not defined

In [38]:
# prompt: using the "predictions" dataframe, calculate the delta between "trip_amount" and "prediction_label" and then the correlation between this delta and "trip_amount", then plot it using a line plot on matplotlib

import matplotlib.pyplot as plt

# Calculate the delta
predictions['delta'] = predictions['trip_amount'] - predictions['prediction_label']

# Calculate the correlation
correlation = predictions['delta'].corr(predictions['trip_amount'])
print(f"Correlation between delta and trip_amount: {correlation}")

# Create the line plot
plt.figure(figsize=(10, 6))
plt.plot(predictions['trip_amount'], predictions['delta'])
plt.xlabel('Trip Amount')
plt.ylabel('Delta (Trip Amount - Prediction)')
plt.title('Delta vs. Trip Amount')
plt.grid(True)
plt.show()


NameError: name 'predictions' is not defined

In [None]:
metrics = pull()
metrics

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,174.7249,67752.628,260.2933,0.9549,0.5126,0.3776


In [51]:
df_local_model = df_base.copy()

In [52]:
df_local_model = prep_local_model(df_local_model, df_global_model,5)

In [None]:
local_model = train_local_model(df_local_model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,share
2,Target type,Regression
3,Original data shape,"(476, 8)"
4,Transformed data shape,"(476, 12)"
5,Transformed train set shape,"(333, 12)"
6,Transformed test set shape,"(143, 12)"
7,Numeric features,6
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.04,0.0057,0.0702,0.6451,0.0507,0.4006,0.182
rf,Random Forest Regressor,0.0396,0.0053,0.068,0.6415,0.0507,0.4008,0.233
knn,K Neighbors Regressor,0.043,0.006,0.0711,0.6314,0.053,0.4737,0.06
et,Extra Trees Regressor,0.0409,0.0064,0.0739,0.6182,0.0538,0.3857,0.181
catboost,CatBoost Regressor,0.0425,0.0072,0.0787,0.5837,0.056,0.3986,0.997
dt,Decision Tree Regressor,0.0394,0.0063,0.0764,0.5464,0.0575,0.3684,0.056
xgboost,Extreme Gradient Boosting,0.0451,0.0073,0.0824,0.5215,0.059,0.4326,0.161
ada,AdaBoost Regressor,0.0587,0.007,0.0801,0.5075,0.0616,0.8628,0.088
lightgbm,Light Gradient Boosting Machine,0.0559,0.013,0.1049,0.3364,0.0721,0.5007,0.295
huber,Huber Regressor,0.0595,0.0236,0.1323,0.2187,0.092,0.5103,0.077


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0491,0.0067,0.0821,-0.2526,0.0669,0.4274
1,0.0456,0.0084,0.0915,0.4728,0.069,0.4736
2,0.0497,0.0065,0.0805,0.3296,0.0646,0.3928
3,0.0538,0.017,0.1305,0.8117,0.0652,0.3434
4,0.0403,0.0047,0.0687,0.2211,0.0577,0.4958
5,0.0464,0.0081,0.09,0.7438,0.0585,0.4118
6,0.05,0.0081,0.0899,0.8687,0.0644,0.3948
7,0.024,0.0011,0.0329,0.8096,0.0292,0.3364
8,0.0462,0.008,0.0894,0.7496,0.0569,0.4509
9,0.0456,0.0047,0.0683,0.461,0.0575,0.5989


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,share
2,Target type,Regression
3,Original data shape,"(476, 8)"
4,Transformed data shape,"(476, 12)"
5,Transformed train set shape,"(333, 12)"
6,Transformed test set shape,"(143, 12)"
7,Numeric features,6
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.04,0.0057,0.0702,0.6451,0.0507,0.4006,0.121
rf,Random Forest Regressor,0.0396,0.0053,0.068,0.6415,0.0507,0.4008,0.296
knn,K Neighbors Regressor,0.043,0.006,0.0711,0.6314,0.053,0.4737,0.136
et,Extra Trees Regressor,0.0409,0.0064,0.0739,0.6182,0.0538,0.3857,0.296
catboost,CatBoost Regressor,0.0425,0.0072,0.0787,0.5837,0.056,0.3986,0.927
dt,Decision Tree Regressor,0.0394,0.0063,0.0764,0.5464,0.0575,0.3684,0.093
xgboost,Extreme Gradient Boosting,0.0451,0.0073,0.0824,0.5215,0.059,0.4326,0.103
ada,AdaBoost Regressor,0.0587,0.007,0.0801,0.5075,0.0616,0.8628,0.094
lightgbm,Light Gradient Boosting Machine,0.0559,0.013,0.1049,0.3364,0.0721,0.5007,0.1
huber,Huber Regressor,0.0595,0.0236,0.1323,0.2187,0.092,0.5103,0.163


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0491,0.0067,0.0821,-0.2526,0.0669,0.4274
1,0.0456,0.0084,0.0915,0.4728,0.069,0.4736
2,0.0497,0.0065,0.0805,0.3296,0.0646,0.3928
3,0.0538,0.017,0.1305,0.8117,0.0652,0.3434
4,0.0403,0.0047,0.0687,0.2211,0.0577,0.4958
5,0.0464,0.0081,0.09,0.7438,0.0585,0.4118
6,0.05,0.0081,0.0899,0.8687,0.0644,0.3948
7,0.024,0.0011,0.0329,0.8096,0.0292,0.3364
8,0.0462,0.008,0.0894,0.7496,0.0569,0.4509
9,0.0456,0.0047,0.0683,0.461,0.0575,0.5989


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,share
2,Target type,Regression
3,Original data shape,"(476, 8)"
4,Transformed data shape,"(476, 12)"
5,Transformed train set shape,"(333, 12)"
6,Transformed test set shape,"(143, 12)"
7,Numeric features,6
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.04,0.0057,0.0702,0.6451,0.0507,0.4006,0.119
rf,Random Forest Regressor,0.0396,0.0053,0.068,0.6415,0.0507,0.4008,0.244
knn,K Neighbors Regressor,0.043,0.006,0.0711,0.6314,0.053,0.4737,0.057
et,Extra Trees Regressor,0.0409,0.0064,0.0739,0.6182,0.0538,0.3857,0.314
catboost,CatBoost Regressor,0.0425,0.0072,0.0787,0.5837,0.056,0.3986,0.937
dt,Decision Tree Regressor,0.0394,0.0063,0.0764,0.5464,0.0575,0.3684,0.053
xgboost,Extreme Gradient Boosting,0.0451,0.0073,0.0824,0.5215,0.059,0.4326,0.099
ada,AdaBoost Regressor,0.0587,0.007,0.0801,0.5075,0.0616,0.8628,0.089
lightgbm,Light Gradient Boosting Machine,0.0559,0.013,0.1049,0.3364,0.0721,0.5007,0.135
huber,Huber Regressor,0.0595,0.0236,0.1323,0.2187,0.092,0.5103,0.074


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0491,0.0067,0.0821,-0.2526,0.0669,0.4274
1,0.0456,0.0084,0.0915,0.4728,0.069,0.4736
2,0.0497,0.0065,0.0805,0.3296,0.0646,0.3928
3,0.0538,0.017,0.1305,0.8117,0.0652,0.3434
4,0.0403,0.0047,0.0687,0.2211,0.0577,0.4958
5,0.0464,0.0081,0.09,0.7438,0.0585,0.4118
6,0.05,0.0081,0.0899,0.8687,0.0644,0.3948
7,0.024,0.0011,0.0329,0.8096,0.0292,0.3364
8,0.0462,0.008,0.0894,0.7496,0.0569,0.4509
9,0.0456,0.0047,0.0683,0.461,0.0575,0.5989


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,share
2,Target type,Regression
3,Original data shape,"(476, 8)"
4,Transformed data shape,"(476, 12)"
5,Transformed train set shape,"(333, 12)"
6,Transformed test set shape,"(143, 12)"
7,Numeric features,6
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.04,0.0057,0.0702,0.6451,0.0507,0.4006,0.119
rf,Random Forest Regressor,0.0396,0.0053,0.068,0.6415,0.0507,0.4008,0.263
knn,K Neighbors Regressor,0.043,0.006,0.0711,0.6314,0.053,0.4737,0.06
et,Extra Trees Regressor,0.0409,0.0064,0.0739,0.6182,0.0538,0.3857,0.267
catboost,CatBoost Regressor,0.0425,0.0072,0.0787,0.5837,0.056,0.3986,0.929
dt,Decision Tree Regressor,0.0394,0.0063,0.0764,0.5464,0.0575,0.3684,0.053
xgboost,Extreme Gradient Boosting,0.0451,0.0073,0.0824,0.5215,0.059,0.4326,0.143
ada,AdaBoost Regressor,0.0587,0.007,0.0801,0.5075,0.0616,0.8628,0.087
lightgbm,Light Gradient Boosting Machine,0.0559,0.013,0.1049,0.3364,0.0721,0.5007,0.224
huber,Huber Regressor,0.0595,0.0236,0.1323,0.2187,0.092,0.5103,0.076


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0491,0.0067,0.0821,-0.2526,0.0669,0.4274
1,0.0456,0.0084,0.0915,0.4728,0.069,0.4736
2,0.0497,0.0065,0.0805,0.3296,0.0646,0.3928
3,0.0538,0.017,0.1305,0.8117,0.0652,0.3434
4,0.0403,0.0047,0.0687,0.2211,0.0577,0.4958
5,0.0464,0.0081,0.09,0.7438,0.0585,0.4118
6,0.05,0.0081,0.0899,0.8687,0.0644,0.3948
7,0.024,0.0011,0.0329,0.8096,0.0292,0.3364
8,0.0462,0.008,0.0894,0.7496,0.0569,0.4509
9,0.0456,0.0047,0.0683,0.461,0.0575,0.5989


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,share
2,Target type,Regression
3,Original data shape,"(476, 8)"
4,Transformed data shape,"(476, 12)"
5,Transformed train set shape,"(333, 12)"
6,Transformed test set shape,"(143, 12)"
7,Numeric features,6
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.04,0.0057,0.0702,0.6451,0.0507,0.4006,0.12
rf,Random Forest Regressor,0.0396,0.0053,0.068,0.6415,0.0507,0.4008,0.358
knn,K Neighbors Regressor,0.043,0.006,0.0711,0.6314,0.053,0.4737,0.061
et,Extra Trees Regressor,0.0409,0.0064,0.0739,0.6182,0.0538,0.3857,0.181
catboost,CatBoost Regressor,0.0425,0.0072,0.0787,0.5837,0.056,0.3986,0.917
dt,Decision Tree Regressor,0.0394,0.0063,0.0764,0.5464,0.0575,0.3684,0.056
xgboost,Extreme Gradient Boosting,0.0451,0.0073,0.0824,0.5215,0.059,0.4326,0.107
ada,AdaBoost Regressor,0.0587,0.007,0.0801,0.5075,0.0616,0.8628,0.09
lightgbm,Light Gradient Boosting Machine,0.0559,0.013,0.1049,0.3364,0.0721,0.5007,0.159
huber,Huber Regressor,0.0595,0.0236,0.1323,0.2187,0.092,0.5103,0.077


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0491,0.0067,0.0821,-0.2526,0.0669,0.4274
1,0.0456,0.0084,0.0915,0.4728,0.069,0.4736
2,0.0497,0.0065,0.0805,0.3296,0.0646,0.3928
3,0.0538,0.017,0.1305,0.8117,0.0652,0.3434
4,0.0403,0.0047,0.0687,0.2211,0.0577,0.4958
5,0.0464,0.0081,0.09,0.7438,0.0585,0.4118
6,0.05,0.0081,0.0899,0.8687,0.0644,0.3948
7,0.024,0.0011,0.0329,0.8096,0.0292,0.3364
8,0.0462,0.008,0.0894,0.7496,0.0569,0.4509
9,0.0456,0.0047,0.0683,0.461,0.0575,0.5989


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
for model in local_model:
  tuned_local_model = tune_local_model(local_model[model])
  local_model[model] = tuned_local_model

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.041,0.0037,0.0607,0.3146,0.0521,0.4569
1,0.0436,0.0083,0.0911,0.4778,0.0674,0.4251
2,0.0475,0.0051,0.0715,0.4707,0.0581,0.3891
3,0.0555,0.0176,0.1327,0.8054,0.0657,0.4325
4,0.0373,0.0036,0.06,0.4051,0.0501,0.4569
5,0.0441,0.0051,0.0711,0.8399,0.0495,0.4422
6,0.075,0.02,0.1415,0.6749,0.0812,0.4134
7,0.0241,0.0009,0.0298,0.8438,0.0263,0.3223
8,0.0336,0.0024,0.0494,0.9235,0.0407,0.5294
9,0.0382,0.0032,0.0568,0.6267,0.0469,0.431


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.041,0.0037,0.0607,0.3146,0.0521,0.4569
1,0.0436,0.0083,0.0911,0.4778,0.0674,0.4251
2,0.0475,0.0051,0.0715,0.4707,0.0581,0.3891
3,0.0555,0.0176,0.1327,0.8054,0.0657,0.4325
4,0.0373,0.0036,0.06,0.4051,0.0501,0.4569
5,0.0441,0.0051,0.0711,0.8399,0.0495,0.4422
6,0.075,0.02,0.1415,0.6749,0.0812,0.4134
7,0.0241,0.0009,0.0298,0.8438,0.0263,0.3223
8,0.0336,0.0024,0.0494,0.9235,0.0407,0.5294
9,0.0382,0.0032,0.0568,0.6267,0.0469,0.431


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.041,0.0037,0.0607,0.3146,0.0521,0.4569
1,0.0436,0.0083,0.0911,0.4778,0.0674,0.4251
2,0.0475,0.0051,0.0715,0.4707,0.0581,0.3891
3,0.0555,0.0176,0.1327,0.8054,0.0657,0.4325
4,0.0373,0.0036,0.06,0.4051,0.0501,0.4569
5,0.0441,0.0051,0.0711,0.8399,0.0495,0.4422
6,0.075,0.02,0.1415,0.6749,0.0812,0.4134
7,0.0241,0.0009,0.0298,0.8438,0.0263,0.3223
8,0.0336,0.0024,0.0494,0.9235,0.0407,0.5294
9,0.0382,0.0032,0.0568,0.6267,0.0469,0.431


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.041,0.0037,0.0607,0.3146,0.0521,0.4569
1,0.0436,0.0083,0.0911,0.4778,0.0674,0.4251
2,0.0475,0.0051,0.0715,0.4707,0.0581,0.3891
3,0.0555,0.0176,0.1327,0.8054,0.0657,0.4325
4,0.0373,0.0036,0.06,0.4051,0.0501,0.4569
5,0.0441,0.0051,0.0711,0.8399,0.0495,0.4422
6,0.075,0.02,0.1415,0.6749,0.0812,0.4134
7,0.0241,0.0009,0.0298,0.8438,0.0263,0.3223
8,0.0336,0.0024,0.0494,0.9235,0.0407,0.5294
9,0.0382,0.0032,0.0568,0.6267,0.0469,0.431


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.041,0.0037,0.0607,0.3146,0.0521,0.4569
1,0.0436,0.0083,0.0911,0.4778,0.0674,0.4251
2,0.0475,0.0051,0.0715,0.4707,0.0581,0.3891
3,0.0555,0.0176,0.1327,0.8054,0.0657,0.4325
4,0.0373,0.0036,0.06,0.4051,0.0501,0.4569
5,0.0441,0.0051,0.0711,0.8399,0.0495,0.4422
6,0.075,0.02,0.1415,0.6749,0.0812,0.4134
7,0.0241,0.0009,0.0298,0.8438,0.0263,0.3223
8,0.0336,0.0024,0.0494,0.9235,0.0407,0.5294
9,0.0382,0.0032,0.0568,0.6267,0.0469,0.431


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
local_model.keys()

dict_keys([3460.01, 3501.01, 3651.04, 3665.06, 3696.05])

In [None]:
predictions_local = predict_model(local_model[3460.01])
predictions_local

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,0.0387,0.0044,0.0663,0.668,0.0516,0.4719


Unnamed: 0,start_station_id,distance_to_subway,day,hour,start_lat,start_lng,total,share,prediction_label
11,3460.01,0.486368,7,17,40.657089,-74.008705,4067,0.049176,0.033798
5,3460.01,0.486368,4,16,40.657089,-74.008705,2283,0.043802,0.063621
72,3501.01,0.599360,26,18,40.655399,-74.010628,1718,0.058207,0.054815
217,3651.04,0.077585,15,12,40.661064,-73.979454,1800,0.055556,0.091231
114,3651.04,0.077585,5,18,40.661064,-73.979454,1382,0.072359,0.112064
...,...,...,...,...,...,...,...,...,...
297,3651.04,0.077585,25,9,40.661064,-73.979454,1005,0.199005,0.122446
274,3651.04,0.077585,23,8,40.661064,-73.979454,4464,0.022401,0.046926
237,3651.04,0.077585,17,14,40.661064,-73.979454,1473,0.067889,0.110196
160,3651.04,0.077585,10,13,40.661064,-73.979454,1900,0.052632,0.099417


### Inference

### explore

In [None]:
#geoclusterized = clusterize_by_geolocation(df_base, 3)
#resent_geolocation_of_clusters(geoclusterized, ['red', 'blue', 'green', 'purple'])

In [None]:
evaluate_model(station_models['6584.12'])

In [None]:
predictions_station = predict_model(station_models['6584.12'])
predictions_station

In [None]:
present_subway_locations(subway_stations)

NameError: name 'subway_stations' is not defined