# Module: Tools

##  Module: Libraries

In [1]:
! pip install pycaret[full]
! pip install geopy
! pip install diskcache

Collecting pycaret[full]
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting pandas<2.2.0 (from pycaret[full])
  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret[full])
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m188.7 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib<1.4,>=1.2.0 (from pycaret[full])
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyod>=1.1.3 (from pycaret[full])
  Downloading pyod-2.0.3.tar.gz (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.6/169.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting category-encoders>=2.4.0 (from pycaret[full])
  Downloading category_encoders-2.8.0-py3-no

Collecting diskcache
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diskcache
Successfully installed diskcache-5.6.3


In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import IPython.display as ipydisplay
import warnings
import json
import datetime
import time
from sklearn.metrics.pairwise import haversine_distances
from abc import ABC, abstractmethod
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import OneHotEncoder
from pycaret.regression import *
from typing import Callable, Any, List, Tuple
import functools
import folium
from folium.plugins import Fullscreen
from IPython.display import IFrame

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
color = sns.color_palette("tab10")
sns.set_style('darkgrid')

## Raw Data Provider

In [5]:
class DataProvider():
  df_raw: pd.DataFrame
  last_file_name: str
  identifier: str

  def __init__(self):
    self.df_raw = pd.DataFrame()
    self.last_file_name = ''

  def load_jan2020(self):
    ''' This function is a shortcut '''
    ! wget https://s3.amazonaws.com/tripdata/2020-citibike-tripdata.zip
    ! unzip 2020-citibike-tripdata.zip
    ! unzip 2020-citibike-tripdata/202001-citibike-tripdata.zip # January/2020 data

    display('Clearing folders and raw files')
    ! rm -rf 2020-citibike-tripdata.zip
    ! rm -rf 2020-citibike-tripdata/
    ! rm -rf 202001-citibike-tripdata.zip

    ipydisplay.clear_output()
    display('Files ready')
    self.last_file_name = '202001-citibike-tripdata_1.csv'
    self.identifier = "Jan/2020"
    return self


  def load(self, year:str, month:str):
    '''This function is an API. The user requests some data and the magic happens underneath'''
    display(f'Downloading: {year}/{month}')
    file_name = f"{year}-citibike-tripdata"
    ! wget https://s3.amazonaws.com/tripdata/{file_name}+".zip"
    ! unzip {file_name}+".zip"
    ! unzip {file_name}/{year}-{month}-citibike-tripdata.zip

    ipydisplay.clear_output()
    display('Files ready')
    self.last_file_name = file_name+'.csv'
    return self

  def get_df(self) -> pd.DataFrame:
    if self.df_raw.empty:
      with open(self.last_file_name, 'r') as file:
        self.df_raw = pd.read_csv(file, parse_dates=['started_at', 'ended_at'])
    return self.df_raw.copy()

In [4]:
! wget https://s3.amazonaws.com/tripdata/2020-citibike-tripdata.zip
! unzip 2020-citibike-tripdata.zip

--2025-02-17 14:25:40--  https://s3.amazonaws.com/tripdata/2020-citibike-tripdata.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.111.230, 52.216.61.248, 16.182.72.136, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.111.230|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 751199232 (716M) [application/zip]
Saving to: ‘2020-citibike-tripdata.zip’


2025-02-17 14:25:55 (51.1 MB/s) - ‘2020-citibike-tripdata.zip’ saved [751199232/751199232]

Archive:  2020-citibike-tripdata.zip
   creating: 2020-citibike-tripdata/
  inflating: 2020-citibike-tripdata/202004-citibike-tripdata.zip  
  inflating: 2020-citibike-tripdata/202012-citibike-tripdata.zip  
  inflating: 2020-citibike-tripdata/202006-citibike-tripdata.zip  
  inflating: 2020-citibike-tripdata/202010-citibike-tripdata.zip  
  inflating: 2020-citibike-tripdata/202008-citibike-tripdata.zip  
  inflating: 2020-citibike-tripdata/202002-citibike-tripdata.zip  
  inflating: 2020-citibike-trip

## Experiment Manager

In [6]:
class ExperimentPipeline(ABC):
  '''Classic Pipeline Data Pattern'''
  def __init__(self, steps: List[Tuple[Callable, dict]] = None) -> None:
    self.steps = steps if steps is not None else []
    self.reports = {}

  def add_step(self, step: Callable, kwargs: dict = None) -> None:
    assert callable(step), 'Step must be a function or method'
    assert hasattr(step, '__name__'), 'Step must be named'
    self.steps.append((step, kwargs or {}))

  @abstractmethod
  def run(self, data: pd.DataFrame, mode: str) -> pd.DataFrame:
    pass

  def __repr__(self):
    '''Printable representation of the pipeline'''
    step_names = [(step.__name__ if hasattr(step, '__name__') else str(step), kwargs) for step, kwargs in self.steps]
    return f"Pipeline(steps={step_names})"


In [89]:
class FeaturePipeline(ExperimentPipeline):
  def run(self, df: pd.DataFrame) -> pd.DataFrame:
    '''Pipeline logic for managing feature engineering methods. Each method call modifies the resulting dataframe, introducing the new feature, and, if sucessful, cleaning up the data.'''
    result = df.copy()

    for i, (step, kwargs) in enumerate(self.steps):
      step_name = step.__name__ # __name__ is a private property that returns the objects' name in code

      try:
        result, cleanup = step(result, **kwargs)
        if 'cleanup' not in self.reports:
          self.reports['cleanup'] = cleanup
        else:
          self.reports['cleanup'].extend(cleanup)
      except Exception as e:
        print(f"Error in step {i+1}: {step_name} with kwargs {kwargs} - {e}")
        raise  # Re-raise the exception to stop the pipeline

    #print(self.reports['cleanup'])
    result = result.drop(self.reports['cleanup'], axis=1)
    return result

In [8]:
class CleaningPipeline(ExperimentPipeline):
  def run(self, data: pd.DataFrame) -> pd.DataFrame:
    result = data.copy()

    for i, (step, kwargs) in enumerate(self.steps):
      step_name = step.__name__ # __name__ is a private property that returns the objects' name in code
      try:
        result, report = step(result, **kwargs)
        self.reports[step_name] = report
      except Exception as e:
        print(f"Error in step {i+1}: {step_name} with kwargs {kwargs} - {e}")
        raise  # Re-raise the exception to stop the pipeline
    return result

In [9]:
class Experiment():
  def __init__(self, data_used: str, features: FeaturePipeline, cleaning: CleaningPipeline) -> None:
    self.data_used = data_used
    self.features = features
    self.cleaning = cleaning
    self.pipeline_definition = {'features': str(features), 'cleaning': str(cleaning)}

  def run(self, df: pd.DataFrame) -> pd.DataFrame:
    df_clean = self.cleaning.run(df)
    df_features = self.features.run(df_clean)
    self.pipeline_results = {'features': self.features.reports, 'cleaning': self.cleaning.reports}
    return df_features


#  Module: Data

## Acquire

## Cleaning

In [10]:
def step_handle_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
  ''' Drops duplicates from dataset. Returns number of rows affected '''
  before = data.shape[0]
  after = data.drop_duplicates()
  delta = before - after.shape[0]
  return after, delta

In [11]:
def step_drop_unused(data: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
  ''' Drops useless columns from dataset. '''
  before = data.shape[1]
  after = data.drop([
      'start_station_name', # station name where trip started
      'end_station_name', # station name where trip ended
      'start_station_id', # unique id of station where trip started
      'end_station_id', # unique id of station where trip ended,
      'rideable_type', # unused for now due to lack of representation
  ], axis=1)
  delta = before - after.shape[1]
  return after, delta

In [12]:
def step_handle_missing(data: pd.DataFrame) ->  Tuple[pd.DataFrame, int]:
  ''' Drops rows with missing values from dataset. Returns number of rows affected '''
  before = data.shape[0]
  after = data.dropna()
  delta = before - after.shape[0]
  return after, delta

## Feature Engineering

In [13]:
def features_timestamp(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
  '''This method extracts multiple features from timestamp based data'''
  df['day'] = df['started_at'].dt.day
  df['hour'] = df['started_at'].dt.hour
  df['weekday'] = df['started_at'].dt.dayofweek
  df['is_weekend'] = df['weekday'] >= 5
  df['trip_duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds()

  return df, ['started_at', 'ended_at']

In [15]:
cache = {}

In [32]:
def calculate_trip_distance(row: pd.Series) -> float:
    # TODO: This guy is slow!
    start_coords = (row['start_lat'], row['start_lng'])
    end_coords = (row['end_lat'], row['end_lng'])
    return geodesic(start_coords, end_coords).kilometers

def faster_trip_distance(row):
  start_coords = (row['start_lat'], row['start_lng'])
  end_coords = (row['end_lat'], row['end_lng'])
  if (start_coords, end_coords) in cache:
    return cache[(start_coords, end_coords)]
  coords_1_rad = np.radians(start_coords)
  coords_1_rad = coords_1_rad.reshape(1, -1)  # Reshape to (1, 2)
  coords_2_rad = np.radians(end_coords)
  coords_2_rad = coords_2_rad.reshape(1, -1)

  earth_radius_km = 6371.0 # Earth's radius in kilometers
  distance_rad = haversine_distances(coords_1_rad, coords_2_rad)
  distance_km = distance_rad * earth_radius_km
  cache[(start_coords, end_coords)] = distance_km[0][0]
  return distance_km[0][0]

def feature_trip_distance(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
  '''This feature calculates distance in kilometers between latitude pairs'''
  display(f'slow guy started')
  display(f'cache size: {len(cache)}')
  start = time.perf_counter()
  #df['trip_distance'] = df.apply(calculate_trip_distance, axis=1)
  df['trip_distance'] = df.apply(faster_trip_distance, axis=1)
  finish = time.perf_counter()
  display(f'slow guy took: {finish - start} seconds')
  display(f'cache size: {len(cache)}')
  return df, ['start_lat', 'start_lng', 'end_lat', 'end_lng']

In [157]:
def feature_member_ratio(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
  '''This feature determines the ratio of members to casuals in a specific hour of a day'''
  df_member = pd.DataFrame({
      'is_member' : df['member_casual'] == 'member',
      'hour' : df['hour'],
      'day' : df['day']
      })
  df_member = df_member[['is_member', 'hour', 'day']].groupby(['day','hour']).agg(member_ratio=('is_member', 'mean'))
  #df_final = pd.merge(df_member, df.drop_duplicates(subset=['day','hour']), on=['day','hour'])
  return df_member, ['member_casual']

In [165]:
def aggregate_by_hour(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
  member_ratio, cleanup = feature_member_ratio(df)

  hourly = df.groupby(['day','hour']).agg(
    trip_amount=('ride_id', 'count'),  # Count of 'ride_id'
    avg_trip_duration=('trip_duration', 'mean'),  # Average of 'trip_duration'
    avg_trip_distance=('trip_distance', 'mean')  # Average of 'trip_distance'
  ).reset_index()

  df_hourly = pd.merge(hourly, df.drop_duplicates(subset=['day','hour']), on=['day','hour'])
  df_final = pd.merge(member_ratio, df_hourly.drop_duplicates(subset=['day', 'hour']), on=['day','hour'])
  return df_final, ['ride_id','trip_duration','trip_distance', cleanup[0]]

In [19]:
def frequency_of_trips_per_station(df: pd.DataFrame) -> pd.DataFrame:
  df_frequency = df.groupby('start_station_id')['ride_id'].count().reset_index()
  df_frequency.rename(columns={'ride_id': 'ride_count', 'start_station_id': 'station_id'}, inplace=True)
  df_frequency.sort_values(by='ride_count', ascending=False).reset_index()

  return df_frequency

In [20]:
def clusterize_stations_by_frequency(df: pd.DataFrame, n_clusters: int) -> pd.DataFrame:
  station_data = pd.DataFrame(df)

  X = station_data[['ride_count']]
  scaler = StandardScaler()
  X_scaled = scaler.fit_transform(X)

  kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
  station_data['cluster'] = kmeans.fit_predict(X_scaled)

  return station_data.groupby('cluster')['ride_count'].agg(['mean', 'median', 'min', 'max', 'count'])

In [21]:
def clusterize_by_geolocation(df: pd.DataFrame, n_clusters: int) -> pd.DataFrame:
  station_locations = df[['start_lat', 'start_lng']].drop_duplicates().dropna()
  kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
  station_locations['cluster'] = kmeans.fit_predict(station_locations[['start_lat', 'start_lng']])

  return station_locations

#  Module: Visualize

In [22]:
def feature_overview(data: pd.DataFrame) -> pd.DataFrame:
  ''' Returns a DataFrame with key insights from dataset, intented to showcase the metrics of which we determine data quality. '''
  return pd.DataFrame({
      'feature': data.columns.values,  # feature names
      'data_type': data.dtypes.values,  # data types
      'null_value(%)': data.isna().mean().values * 100,  # percentage of null values
      'neg_value(%)': [len(data[col][data[col] < 0]) / len(data) * 100 if col in data.select_dtypes(include=[np.number]).columns else 0 for col in data.columns],  # percentage of negative values
      '0_value(%)': [len(data[col][data[col] == 0]) / len(data) * 100 if col in data.select_dtypes(include=[np.number]).columns else 0 for col in data.columns],  # percentage of zero values
      'duplicate': data.duplicated().sum(),  # amount of duplicates
      'n_unique': data.nunique().values,  # amount of unique values
      'sample_unique': [data[col].unique() for col in data.columns]  # sample of unique values
  }).round(3)

In [23]:
def present_distribution_across_cluster(cluster_summary: pd.DataFrame):
  plt.figure(figsize=(8, 8))
  plt.pie(cluster_summary['count'], labels=cluster_summary.index, autopct='%1.1f%%', startangle=90)
  plt.title('Distribution of Stations across Clusters')
  plt.axis('equal')
  plt.show()

In [24]:
def present_geolocation_of_clusters(station_locations: pd.DataFrame, colors = []):
  center_lat = station_locations['start_lat'].mean()
  center_lon = station_locations['start_lng'].mean()
  m = folium.Map(location=[center_lat, center_lon], zoom_start=12)

  for index, row in station_locations.iterrows():
      cluster = int(row['cluster'])
      folium.CircleMarker(
          location=[row['start_lat'], row['start_lng']],
          radius=5,
          color=colors[cluster % len(colors)],
          fill=True,
          fill_color=colors[cluster % len(colors)],
          fill_opacity=0.7,
          popup=f"Station: {index}, Cluster: {cluster}"
      ).add_to(m)

  m.save('geoclusterized.html')

  Fullscreen().add_to(m)
  display(m)

#  Module: Train

#  Module: Evaluate

# Main: Run

In [25]:
# run once
provider = DataProvider()
provider.load_jan2020()

'Files ready'

<__main__.DataProvider at 0x7d2aa91e8d50>

In [144]:
# run to restart dataframe
df_base = provider.get_df()

In [145]:
cleaning_steps = [
    (step_handle_duplicates,  {}), # these are pure functions
    (step_drop_unused,        {}),
    (step_handle_missing,     {}),
]
cleaning_pipeline = CleaningPipeline(steps=cleaning_steps)

In [166]:
feature_steps = [
    (features_timestamp,    {}),
    (feature_trip_distance, {}),
    #(feature_member_ratio, {}),
    (aggregate_by_hour,     {}),
    #(feature_trip_distance_vectorized, {}),
    #(, {'feature_name':''}),
]
feature_pipeline = FeaturePipeline(steps=feature_steps)

In [167]:
experiment = Experiment(
    data_used=provider.identifier,
    features=feature_pipeline,
    cleaning=cleaning_pipeline
)

In [160]:
experiment.pipeline_definition

{'features': "Pipeline(steps=[('features_timestamp', {}), ('feature_trip_distance', {}), ('feature_member_ratio', {}), ('aggregate_by_hour', {})])",
 'cleaning': "Pipeline(steps=[('step_handle_duplicates', {}), ('step_drop_unused', {}), ('step_handle_missing', {})])"}

In [168]:
df = experiment.run(df_base)

'slow guy started'

'cache size: 135064'

'slow guy took: 16.60303949199988 seconds'

'cache size: 135064'

In [169]:
experiment.pipeline_results

{'features': {'cleanup': ['started_at',
   'ended_at',
   'start_lat',
   'start_lng',
   'end_lat',
   'end_lng',
   'ride_id',
   'trip_duration',
   'trip_distance',
   'member_casual']},
 'cleaning': {'step_handle_duplicates': 0,
  'step_drop_unused': 5,
  'step_handle_missing': 2438}}

In [170]:
feature_overview(df)

Unnamed: 0,feature,data_type,null_value(%),neg_value(%),0_value(%),duplicate,n_unique,sample_unique
0,day,int32,0.0,0.0,0.0,0,31,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,hour,int32,0.0,0.0,4.167,0,24,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,member_ratio,float64,0.0,0.0,0.0,0,711,"[0.7975609756097561, 0.7983706720977597, 0.838..."
3,trip_amount,int64,0.0,0.0,0.0,0,638,"[410, 491, 378, 195, 102, 65, 98, 134, 306, 40..."
4,avg_trip_duration,float64,0.0,0.0,0.0,0,744,"[5411.295278048781, 1140.1098370672098, 1264.6..."
5,avg_trip_distance,float64,0.0,0.0,0.0,0,744,"[1.7635418672415926, 1.7115760695428668, 1.777..."
6,weekday,int32,0.0,0.0,12.903,0,7,"[2, 3, 4, 5, 6, 0, 1]"
7,is_weekend,bool,0.0,0.0,0.0,0,2,"[False, True]"


In [171]:
df.head()

Unnamed: 0,day,hour,member_ratio,trip_amount,avg_trip_duration,avg_trip_distance,weekday,is_weekend
0,1,0,0.797561,410,5411.295278,1.763542,2,False
1,1,1,0.798371,491,1140.109837,1.711576,2,False
2,1,2,0.838624,378,1264.673077,1.77767,2,False
3,1,3,0.815385,195,822.333092,1.627566,2,False
4,1,4,0.921569,102,683.903598,1.519381,2,False


In [172]:
setup = setup(
    data=df,
    target='trip_amount',
    numeric_features=['day', 'hour', 'avg_trip_duration', 'avg_trip_distance', 'member_ratio'],
    categorical_features=['weekday', 'is_weekend'],
    use_gpu = True,
    session_id=123)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] This

Unnamed: 0,Description,Value
0,Session id,123
1,Target,trip_amount
2,Target type,Regression
3,Original data shape,"(744, 8)"
4,Transformed data shape,"(744, 14)"
5,Transformed train set shape,"(520, 14)"
6,Transformed test set shape,"(224, 14)"
7,Numeric features,5
8,Categorical features,2
9,Preprocess,True


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0


In [173]:
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
xgboost,Extreme Gradient Boosting,182.7006,79895.1521,274.8197,0.9413,0.5402,0.4464,0.18
et,Extra Trees Regressor,181.8177,79897.7155,277.6807,0.9396,0.4137,0.3971,0.359
gbr,Gradient Boosting Regressor,202.2219,83149.9706,284.4059,0.9383,0.5647,0.5275,0.2
rf,Random Forest Regressor,200.2953,97523.6225,301.8569,0.9271,0.4464,0.4352,0.55
lightgbm,Light Gradient Boosting Machine,205.812,100099.5106,300.9135,0.9258,0.5296,0.5241,0.109
dt,Decision Tree Regressor,238.7269,155020.0577,384.9685,0.8838,0.4978,0.4434,0.068
ada,AdaBoost Regressor,378.3704,215418.0126,460.469,0.8385,0.9061,1.5548,0.167
lasso,Lasso Regression,693.3596,721492.0704,848.9437,0.4517,1.1983,2.9369,0.067
llar,Lasso Least Angle Regression,693.3596,721492.5801,848.944,0.4517,1.1983,2.9369,0.068
ridge,Ridge Regression,691.0464,724924.0566,851.0028,0.4508,1.1708,2.8123,0.066


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [177]:
best_model = create_model('xgboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,247.4953,137269.9214,370.4996,0.9007,0.6223,0.7117
1,192.0677,89027.7999,298.3753,0.9311,0.5702,0.6275
2,196.4967,78981.2187,281.036,0.9344,0.4397,0.3516
3,124.6036,29224.0137,170.9503,0.978,0.3384,0.2633
4,169.8628,60374.9443,245.7131,0.9297,0.5748,0.5613
5,224.7999,162408.6027,402.9995,0.9008,0.7033,0.4624
6,147.6209,45627.9448,213.607,0.9671,0.5138,0.228
7,192.178,78475.2322,280.1343,0.9532,0.5584,0.4582
8,165.4217,57888.0026,240.5993,0.9562,0.4244,0.2957
9,166.4594,59673.8413,244.2823,0.9618,0.657,0.5037


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [178]:
tuned_model = tune_model(best_model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,269.4372,163969.0702,404.9309,0.8813,0.6254,0.7111
1,168.9544,67281.2045,259.3862,0.9479,0.5254,0.5335
2,169.0516,57711.5105,240.2322,0.9521,0.3595,0.2311
3,144.2978,30674.2186,175.1406,0.9769,0.4947,0.3651
4,183.6359,66515.9554,257.9069,0.9225,0.5799,0.652
5,224.9868,126385.1956,355.507,0.9228,0.4644,0.4586
6,156.2527,55983.4312,236.6082,0.9597,0.3739,0.324
7,178.3376,71512.9301,267.419,0.9573,0.4473,0.3131
8,239.2452,112424.8033,335.2981,0.9149,0.4515,0.402
9,182.1889,60746.0429,246.4671,0.9611,0.5959,0.5823


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [179]:
evaluate_model(tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [180]:
predictions = predict_model(tuned_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,174.7249,67752.628,260.2933,0.9549,0.5126,0.3776


In [183]:
metrics = pull()
metrics

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,174.7249,67752.628,260.2933,0.9549,0.5126,0.3776


### explore

In [None]:
df_freq = frequency_of_trips_per_station(df_base)

In [None]:
present_distribution_across_cluster(clusterize_stations_by_frequency(df_freq, 3))

In [None]:
geoclusterized = clusterize_by_geolocation(df_base, 4)

In [None]:
present_geolocation_of_clusters(geoclusterized, ['red', 'blue', 'green', 'purple'])