# Sources

## Raw Citibike Data

https://s3.amazonaws.com/tripdata/2020-citibike-tripdata.zip

https://citibikenyc.com/system-data

## Subway Data
https://data.ny.gov/Transportation/MTA-Subway-Stations/39hk-dx4f/data_preview

## USGOV Census Data

### Big shoutout to Guilherme Cavo for his notebook:
https://colab.research.google.com/drive/1D1DzH5OFXBWQtHwg0_3W-FAJIHB9aPxU?usp=sharing

#  Module: Dependencies

In [None]:
# pip install --upgrade --force-reinstall numpy
# pip install --upgrade --force-reinstall pandas
# pip install pycaret[full]
# pip install geopy
# pip install diskcache
# pip install geodatasets

In [1]:
# core libraries for the project
from pycaret.regression import *
import pandas as pd
import numpy as np
import datetime
import geodatasets
from shapely.geometry import Polygon, LineString, Point, MultiPolygon
from shapely import wkt, union_all
import geopandas as gpd

# libraries for Software Engineering Practices
from abc import ABC, abstractmethod
import time
import functools
from typing import Callable, Any, List, Tuple

# libraries for presentation
import seaborn as sns
import matplotlib.pyplot as plt
import IPython.display as ipydisplay
import folium
import json
from folium.plugins import Fullscreen
from IPython.display import IFrame

# libraries for feature engineering
from scipy.stats.mstats import winsorize
from sklearn.metrics.pairwise import haversine_distances
from geopy.distance import geodesic # will be dropped soon
from sklearn.cluster import KMeans # used for clusterization, not final
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder # not final

import warnings
import os
import sys
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Seaborn "tab10" palette kept in a module-level name, and the dark grid plot style.
color = sns.color_palette("tab10")
sns.set_style('darkgrid')

In [2]:
import pathlib
import zipfile

In [3]:
# Put the parent of the current working directory on sys.path so the
# project-local `modules` package can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    print(f"Adding {project_root} to sys.path")
    sys.path.insert(0, project_root) # Insert at beginning to prioritize modules here
else:
    print(f"{project_root} already in sys.path")
# Imported after the sys.path change above so `modules` is looked up in project_root.
import importlib
from modules import data_loader

Adding e:\TCC\tcc to sys.path
data_loader imported


In [4]:
# Re-import data_loader so edits made to the module on disk are picked up without restarting the kernel.
importlib.reload(data_loader)

data_loader imported


<module 'modules.data_loader' from 'e:\\TCC\\tcc\\modules\\data_loader.py'>

# Module: Managers

## Data Provider

In [5]:
class GIS_Data_Provider():
  '''Access point for the MTA subway station table.'''

  @classmethod
  def old_get_subway_stations(cls) -> pd.DataFrame:
    '''Legacy loader: reads the station CSV from a fixed /content path.

    The download command is commented out, so when the file is missing this
    only announces the download and pd.read_csv is still called on the path.
    '''
    csv_path = '/content/MTA_Subway_Stations_20250217.csv'
    file_is_missing = not os.path.exists(csv_path)
    if file_is_missing:
      display("downloading cached data from github")
      # wget https://raw.githubusercontent.com/bicdev/tcc/refs/heads/main/cached%20data/MTA_Subway_Stations_20250217.csv -q
    return pd.read_csv(csv_path)

  @classmethod
  def get_subway_stations(cls) -> pd.DataFrame:
    '''Return the result of data_loader.load_subway().'''
    stations = data_loader.load_subway()
    return stations

In [6]:
class Census_Data_Provider():
  '''Access point for the census tract geodata.'''

  @classmethod
  def old_get_tract_data(cls) -> gpd.GeoDataFrame:
    '''Legacy loader: reads the tract GeoJSON from a fixed /content path.

    The download command is commented out, so when the file is missing this
    only announces the download and gpd.read_file is still called on the path.
    '''
    geojson_path = '/content/race_comparison_manhattan.geojson'
    if not os.path.exists(geojson_path):
      display("downloading cached data from github")
      # wget https://raw.githubusercontent.com/bicdev/tcc/7c10dd978fcac26e6d6eee38629a25e159697faa/cached%20data/race_comparison_manhattan.geojson -O /content/race_comparison_manhattan.geojson -q
    return gpd.read_file(geojson_path)

  @classmethod
  def get_tract_data(cls) -> gpd.GeoDataFrame:
    '''Return the result of data_loader.load_census().'''
    # NOTE(review): annotated as GeoDataFrame because downstream code reads a 'geometry' column - confirm against data_loader
    return data_loader.load_census()

In [4]:
# Absolute, machine-specific folder of CSV files; the monthly loop further down iterates over it.
p = pathlib.Path("E:/TCC/tcc/cached data/csvs")

NameError: name 'pathlib' is not defined

In [None]:
# Unpack each monthly 2020 archive ("01" .. "12") into the shared csv folder.
for month in (f"{number:02d}" for number in range(1, 13)):
    with zipfile.ZipFile(
        f"cached data/2020-citibike-tripdata/2020{month}-citibike-tripdata.zip",
        mode='r',
    ) as zip_ref:
        zip_ref.extractall(path='cached data/csvs')

In [7]:
class DataProvider():
  '''Loads the raw Citibike trips lazily and holds the census tract data.'''
  df_raw: pd.DataFrame       # raw trips, filled on the first get_df() call
  df_census: pd.DataFrame    # whatever Census_Data_Provider.get_tract_data() returns
  last_file_name: str        # CSV file name (inside 'cached data') selected by the last load
  identifier: str            # human readable label of the selected dataset

  def __init__(self):
    self.df_raw = pd.DataFrame()
    self.df_census = Census_Data_Provider.get_tract_data()
    # The result is discarded; presumably this call only warms data_loader's cache - TODO confirm
    GIS_Data_Provider.get_subway_stations()
    self.last_file_name = ''
    # Initialised here so reading it before any load does not raise AttributeError.
    self.identifier = ''

  def load_jan2020(self):
    '''Select the January 2020 dataset and return self for chaining.'''
    data_loader.load_citibike('2020', '01')
    self.last_file_name = '202001-citibike-tripdata_1.csv'
    self.identifier = "Jan/2020"
    return self

  #def load_and_get2020(self):

  def load(self, year:str, month:str):
    '''Placeholder for a generic loader; currently does nothing.'''
    pass

  def get_df(self) -> pd.DataFrame:
    '''Return a copy of the raw trips, reading the selected CSV on first use.

    Raises:
      FileNotFoundError: if no dataset was selected yet, or the CSV is missing.
    '''
    if self.df_raw.empty:
      if not self.last_file_name:
        # Without this guard open() is pointed at the cache directory itself.
        raise FileNotFoundError('No dataset selected; call a load method (e.g. load_jan2020) first')

      # The cache folder sits next to the working directory: <parent of cwd>/cached data
      module_path = os.path.dirname(os.getcwd())
      cache_path = os.path.join(module_path, 'cached data')
      path = os.path.join(cache_path, self.last_file_name)

      with open(path, 'r') as file:
        self.df_raw = pd.read_csv(file, parse_dates=['started_at', 'ended_at'])
    # A copy is handed out so callers can mutate it without touching the cached frame.
    return self.df_raw.copy()

In [None]:
# Show the cache folder path: '<parent of the working directory>/cached data'.
cache_path = os.path.join(os.path.dirname(os.getcwd()), 'cached data')
cache_path

## Experiment Manager

In [8]:
class ExperimentPipeline(ABC):
  '''Classic Pipeline Data Pattern

  Holds an ordered list of (callable, kwargs) steps plus a `reports` dict that
  concrete pipelines fill while running.
  '''
  def __init__(self, steps: List[Tuple[Callable, dict]] = None) -> None:
    # Copy the list so add_step() never mutates the list object owned by the caller.
    self.steps = list(steps) if steps is not None else []
    self.reports = {}

  def add_step(self, step: Callable, kwargs: dict = None) -> None:
    '''Append a named callable and the keyword arguments it will be called with.'''
    assert callable(step), 'Step must be a function or method'
    assert hasattr(step, '__name__'), 'Step must be named'
    self.steps.append((step, kwargs or {}))

  @abstractmethod
  def run(self, data: pd.DataFrame, mode: str = None) -> pd.DataFrame:
    '''Execute the steps on `data`; `mode` is optional because the subclasses in this file do not take it.'''
    pass

  def __repr__(self):
    '''Printable representation of the pipeline'''
    step_names = [
        (step.__name__ if hasattr(step, '__name__') else str(step), kwargs)
        for step, kwargs in self.steps
      ]
    return f"Pipeline(steps={step_names})"


In [9]:
class FeaturePipeline(ExperimentPipeline):
  def run(self, df: pd.DataFrame) -> pd.DataFrame:
    '''Pipeline logic for managing feature engineering methods. Each method
    call modifies the resulting dataframe, introducing the new feature, and
    returns the columns to clean up once every step was successful.

    Each step is called as step(result, **kwargs) and must return a
    (dataframe, columns_to_drop) pair. The collected columns are stored in
    self.reports['cleanup'] and dropped from the final dataframe.
    '''
    result = df.copy()
    # Rebuilt on every run so repeated runs on the same pipeline object
    # do not keep piling up the columns reported by earlier runs.
    cleanup_columns = []

    for i, (step, kwargs) in enumerate(self.steps):
      step_name = step.__name__ # __name__ holds the name the step was defined with

      try:
        result, cleanup = step(result, **kwargs)
      except Exception as e:
        print(f"Error in step {i+1}: {step_name} with kwargs {kwargs} - {e}")
        raise  # Re-raise the exception to stop the pipeline

      # Keep first-seen order and skip columns already scheduled for removal.
      for column in cleanup:
        if column not in cleanup_columns:
          cleanup_columns.append(column)

    # Always set, even with zero steps, so the drop below cannot hit a missing key.
    self.reports['cleanup'] = cleanup_columns
    result = result.drop(cleanup_columns, axis=1)
    return result

In [10]:
class CleaningPipeline(ExperimentPipeline):
  def run(self, data: pd.DataFrame) -> pd.DataFrame:
    '''Apply every cleaning step in order to a copy of `data`.

    Each step is called as step(frame, **kwargs) and returns (frame, report);
    the report is stored in self.reports under the step's function name.
    '''
    current = data.copy()

    for position, (func, options) in enumerate(self.steps, start=1):
      name = func.__name__
      try:
        current, outcome = func(current, **options)
        self.reports[name] = outcome
      except Exception as e:
        print(f"Error in step {position}: {name} with kwargs {options} - {e}")
        raise  # stop the pipeline, keeping the original traceback
    return current

In [11]:
class Experiment():
  '''Couples a cleaning pipeline and a feature pipeline for one dataset.'''
  def __init__(self, data_used: str, features: FeaturePipeline, cleaning: CleaningPipeline) -> None:
    self.data_used = data_used
    self.features = features
    self.cleaning = cleaning
    # String snapshot of both pipelines taken at construction time.
    self.pipeline_definition = {'features': str(features), 'cleaning': str(cleaning)}
    # Filled by run(); defined here so it can be read before the first run.
    self.pipeline_results = {}

  def run(self, df: pd.DataFrame) -> pd.DataFrame:
    '''Clean `df`, engineer its features, and store both pipelines' reports.'''
    df_clean = self.cleaning.run(df)
    df_features = self.features.run(df_clean)
    self.pipeline_results = {'features': self.features.reports, 'cleaning': self.cleaning.reports}
    return df_features


#  Module: Data

## Cleaning

In [12]:
def step_handle_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
  ''' Removes fully duplicated rows. Returns the new frame and how many rows were removed. '''
  deduplicated = data.drop_duplicates()
  removed_rows = len(data) - len(deduplicated)
  return deduplicated, removed_rows

In [13]:
def step_drop_unused(data: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
  ''' Removes the columns this analysis does not use. Returns the new frame and how many columns were removed. '''
  unused_columns = [
      'start_station_name', # station name where trip started
      'end_station_name', # station name where trip ended
      'start_station_id', # unique id of station where trip started
      'end_station_id', # unique id of station where trip ended
      'rideable_type', # unused for now due to lack of representation
  ]
  trimmed = data.drop(columns=unused_columns)
  removed_columns = data.shape[1] - trimmed.shape[1]
  return trimmed, removed_columns

In [14]:
def step_handle_missing(data: pd.DataFrame) ->  Tuple[pd.DataFrame, int]:
  ''' Removes every row that has at least one missing value. Returns the new frame and how many rows were removed. '''
  complete_rows = data.dropna()
  removed_rows = len(data) - len(complete_rows)
  return complete_rows, removed_rows

## Feature Engineering

In [15]:
def features_timestamp(df: pd.DataFrame, **kwargs) -> Tuple[pd.DataFrame, List[str]]:
  '''This method extracts multiple features from timestamp based data.

  Adds 'day', 'hour', 'weekday', 'is_weekend' and 'trip_duration' (seconds)
  to `df` in place, derived from the datetime columns 'started_at' and
  'ended_at'.

  Returns the same dataframe and the list of source columns to drop later.
  '''
  started_at = df['started_at']
  df['day'] = started_at.dt.day
  df['hour'] = started_at.dt.hour
  df['weekday'] = started_at.dt.dayofweek
  df['is_weekend'] = df['weekday'] >= 5  # dayofweek: Monday=0, so 5 and 6 are Saturday/Sunday
  df['trip_duration'] = (df['ended_at'] - started_at).dt.total_seconds()

  return df, ['started_at', 'ended_at']

In [16]:
# Maps (start_coords, end_coords) to the trip distance in km; filled by faster_trip_distance.
cache = {} # memory based cache, not final

In [17]:
def calculate_trip_distance(row: pd.Series) -> float:
    '''Geodesic distance in kilometers between a trip's start and end coordinates.'''
    # TODO: This guy is slow
    origin = (row['start_lat'], row['start_lng'])
    destination = (row['end_lat'], row['end_lng'])
    trip = geodesic(origin, destination)
    return trip.kilometers

def faster_trip_distance(row):
  '''Haversine distance in kilometers between a trip's start and end coordinates.

  Results are memoised in the module-level `cache`, keyed by the coordinate pair.
  '''
  origin = (row['start_lat'], row['start_lng'])
  destination = (row['end_lat'], row['end_lng'])
  key = (origin, destination)
  if key in cache:
    return cache[key]

  # haversine_distances expects 2D arrays of [lat, lng] in radians, hence the (1, 2) reshape
  origin_rad = np.radians(origin).reshape(1, -1)
  destination_rad = np.radians(destination).reshape(1, -1)

  earth_radius_km = 6371.0 # Earth's radius in kilometers
  distance_km = haversine_distances(origin_rad, destination_rad) * earth_radius_km
  cache[key] = distance_km[0][0]
  return distance_km[0][0]

def feature_trip_distance(df: pd.DataFrame, **kwargs) -> Tuple[pd.DataFrame, List[str]]:
  '''Wrapper for trip distance feature extraction.

  Adds a 'trip_distance' column (kilometers, via faster_trip_distance) to
  `df` in place and reports the elapsed time and the cache size before/after.

  Returns the same dataframe and the coordinate columns to drop later.
  '''
  display(f'cache size: {len(cache)}')
  start = time.perf_counter()
  # calculate_trip_distance (geodesic) is the slower alternative to the function used here
  df['trip_distance'] = df.apply(faster_trip_distance, axis=1)
  finish = time.perf_counter()
  display(f'slow guy took: {finish - start} seconds')
  display(f'cache size: {len(cache)}')
  return df, ['start_lat', 'start_lng', 'end_lat', 'end_lng']

In [18]:
def feature_member_ratio(df: pd.DataFrame, **kwargs) -> Tuple[pd.DataFrame, List[str]]:
  '''This feature determines the share of member trips in each hour of each day.

  Expects the columns 'member_casual', 'hour' and 'day'. A trip counts as a
  member trip when 'member_casual' equals 'member'.

  Returns a dataframe indexed by (day, hour) with a single 'member_ratio'
  column, and the list of source columns to drop later.
  '''
  df_member = pd.DataFrame({
      'is_member' : df['member_casual'] == 'member',
      'hour' : df['hour'],
      'day' : df['day']
      })
  # The mean of a boolean column is the fraction of True values.
  df_member = df_member.groupby(['day', 'hour']).agg(member_ratio=('is_member', 'mean'))
  return df_member, ['member_casual']

In [19]:
def aggregate_by_hour(df: pd.DataFrame, **kwargs) -> Tuple[pd.DataFrame, List[str]]:
  '''Collapse trip-level rows into one row per (day, hour).

  Each output row carries the trip count, the mean trip duration, the mean
  trip distance and the member ratio of that hour, plus the remaining columns
  of the first trip seen for that (day, hour).

  Returns the aggregated dataframe and the trip-level columns to drop later.
  '''
  member_ratio, cleanup = feature_member_ratio(df)

  hourly = df.groupby(['day','hour']).agg(
    trip_amount=('ride_id', 'count'),  # Count of 'ride_id'
    avg_trip_duration=('trip_duration', 'mean'),  # Average of 'trip_duration'
    avg_trip_distance=('trip_distance', 'mean')  # Average of 'trip_distance'
  ).reset_index()

  # One representative trip per (day, hour) carries over the per-hour columns (e.g. weekday).
  df_hourly = pd.merge(hourly, df.drop_duplicates(subset=['day','hour']), on=['day','hour'])
  # member_ratio is indexed by (day, hour); merging on those names joins its index levels.
  df_final = pd.merge(member_ratio, df_hourly.drop_duplicates(subset=['day', 'hour']), on=['day','hour'])
  # Forward every column feature_member_ratio asks to drop, not only the first one.
  return df_final, ['ride_id','trip_duration','trip_distance', *cleanup]

In [20]:
def frequency_of_trips_per_station(df: pd.DataFrame) -> pd.DataFrame:
  '''Count trips per start station.

  Returns a dataframe with the columns 'station_id' and 'ride_count', sorted
  from the busiest station to the least busy one.
  '''
  df_frequency = df.groupby('start_station_id')['ride_id'].count().reset_index()
  df_frequency = df_frequency.rename(columns={'ride_id': 'ride_count', 'start_station_id': 'station_id'})
  # sort_values returns a new frame, so its result has to be the one returned.
  return df_frequency.sort_values(by='ride_count', ascending=False).reset_index(drop=True)

In [21]:
def clusterize_stations_by_frequency(df: pd.DataFrame, n_clusters: int) -> pd.DataFrame:
  '''Group stations into `n_clusters` KMeans clusters by their standardised 'ride_count'.

  Returns one row per cluster with the mean, median, min, max and count of 'ride_count'.
  '''
  station_data = pd.DataFrame(df)

  scaled_counts = StandardScaler().fit_transform(station_data[['ride_count']])
  model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
  station_data['cluster'] = model.fit_predict(scaled_counts)

  counts_per_cluster = station_data.groupby('cluster')['ride_count']
  return counts_per_cluster.agg(['mean', 'median', 'min', 'max', 'count'])

In [22]:
def clusterize_by_geolocation(df: pd.DataFrame, n_clusters: int) -> pd.DataFrame:
  '''Cluster the distinct, non-null start coordinates into `n_clusters` KMeans clusters.

  Returns the coordinates with an added 'cluster' column.
  '''
  coordinate_columns = ['start_lat', 'start_lng']
  station_locations = df[coordinate_columns].drop_duplicates().dropna()
  model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
  labels = model.fit_predict(station_locations[coordinate_columns])
  station_locations['cluster'] = labels

  return station_locations

In [23]:
# Maps (station_coords, subway_coords) to their geodesic distance in km; filled by calculate_distance_to_subway.
subway_cache = {}

In [24]:
# Number of distance pairs memoised so far.
print(len(subway_cache))

0


In [25]:
def calculate_distance_to_subway(subway: pd.DataFrame, station_lat, station_lng, **kwargs) -> float:
  '''Geodesic distance in kilometers from one point to its nearest subway station.

  `subway` must have the columns 'GTFS Latitude' and 'GTFS Longitude'.
  Pairwise distances are memoised in the module-level `subway_cache`.

  Returns float('inf') when `subway` has no rows.
  '''
  station_coords = (station_lat, station_lng)
  min_distance = float('inf') # to find nearest, we start with infinite distance then shrink with each comparison

  for subway_lat, subway_lng in zip(subway['GTFS Latitude'], subway['GTFS Longitude']):
    key = (station_coords, (subway_lat, subway_lng))
    # Look up and store in the same dict (the trip-distance `cache` holds different pairs).
    if key not in subway_cache:
      subway_cache[key] = geodesic(*key).kilometers
    min_distance = min(min_distance, subway_cache[key])
  return min_distance

def feature_distance_to_subway(df_stations: pd.DataFrame, df_subway: pd.DataFrame, **kwargs) -> pd.DataFrame:
  '''Add a 'distance_to_subway' column (km to the nearest subway station).

  `df_stations` needs 'start_lat' and 'start_lng'; it is copied, not modified.
  '''
  df_subway_distances = df_stations.copy()
  df_subway_distances['distance_to_subway'] = 0

  # One row-wise pass is enough: apply() already visits every row, so wrapping it
  # in a second loop over the rows only repeated the same work once per row.
  if not df_subway_distances.empty:
    df_subway_distances['distance_to_subway'] = df_subway_distances.apply(
        lambda var:
        calculate_distance_to_subway(
            df_subway,
            var['start_lat'],
            var['start_lng']
          ), axis=1)

  return df_subway_distances

## Sociodemographic Selection

In [26]:
def get_tracts_racialized(df_race_census: pd.DataFrame, slice_size: int) -> pd.DataFrame:
  '''Pick the census tracts with the strongest white / non-white majority.

  For every tract the single-race population counts are split into whites and
  non-whites. 'Racial Label' names the larger group and 'Racial Ratio' is that
  group's share of the two combined. Tracts where either group is zero, or
  where both are equal, are labelled "None" and discarded.

  Returns the `slice_size` highest-ratio tracts of each label, concatenated
  (whites first) into one GeoDataFrame with CRS 4326.
  '''
  white_col = 'Total:  >  Population of one race:  >  White alone'
  non_white_cols = [
    'Total:  >  Population of one race:  >  Black or African American alone',
    'Total:  >  Population of one race:  >  American Indian and Alaska Native alone',
    'Total:  >  Population of one race:  >  Asian alone',
    'Total:  >  Population of one race:  >  Native Hawaiian and Other Pacific Islander alone',
    'Total:  >  Population of one race:  >  Some Other Race alone',
  ]

  # .copy() so the columns added below land on an independent frame,
  # not on a slice of the caller's dataframe.
  df_racial_delta = df_race_census[['NAME', 'geometry', white_col] + non_white_cols].copy()

  df_racial_delta['Racial Ratio'] = pd.Series()
  df_racial_delta['Racial Label'] = pd.Series()

  for i, row in df_racial_delta.iterrows():
    whites = row[white_col]
    non_whites = sum(row[col] for col in non_white_cols)

    if whites > non_whites:
      label = "Whites"
    elif whites < non_whites:
      label = "Non-Whites"
    else:
      label = "None"

    if whites == 0.0 or non_whites == 0.0:
      # a tract with an empty group is excluded from the comparison
      ratio = 0
      label = "None"
    else:
      # share of the larger group among the two groups
      total = (whites + non_whites)
      bigger = max(whites, non_whites)
      ratio =  bigger / total

    df_racial_delta.loc[i,'Racial Ratio'] = ratio
    df_racial_delta.loc[i,'Racial Label'] = label

  df_racial_delta.sort_values(by='Racial Ratio', ascending=False, inplace=True)
  df_racial_delta.reset_index(inplace=True)
  ids_to_drop = df_racial_delta[df_racial_delta['Racial Label'] == "None"].index
  df_racial_delta.drop(ids_to_drop, inplace=True)

  gdf_top_slice_racialized = gpd.GeoDataFrame(
    df_racial_delta,
    geometry=df_racial_delta['geometry'],
    crs=4326)

  top_whites = gdf_top_slice_racialized[gdf_top_slice_racialized['Racial Label'] == 'Whites'].sort_values(by='Racial Ratio', ascending=False)[:slice_size]
  top_nonwhites = gdf_top_slice_racialized[gdf_top_slice_racialized['Racial Label'] == 'Non-Whites'].sort_values(by='Racial Ratio', ascending=False)[:slice_size]
  sliced_tracts = pd.concat([top_whites, top_nonwhites])
  sliced_tracts.reset_index(inplace=True)

  return sliced_tracts

In [27]:
def build_racialized_df(df: pd.DataFrame, df_tracts: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
  '''Tag every start station with the label and ratio of the tract that contains it.

  Returns one row per 'start_station_id' with a point geometry (EPSG:4326),
  'racial_bias' (the tract's 'Racial Label', or None when the station lies in
  none of `df_tracts`) and 'racial_ratio' (the tract's 'Racial Ratio', else 0.0).
  '''
  # Non-inplace drop_duplicates: the column selection is a slice of the caller's frame.
  stations = df[['start_station_id', 'start_lat', 'start_lng']].drop_duplicates(subset=['start_station_id'])
  gdf = gpd.GeoDataFrame(
    stations,
    geometry=gpd.points_from_xy(stations.start_lng, stations.start_lat),
    crs="EPSG:4326")
  gdf['racial_bias'] = None
  gdf['racial_ratio'] = 0.0

  # Tracts are visited in order, so a station inside several tracts keeps the last match.
  for _, t in df_tracts.iterrows():
    inside = gdf.geometry.within(t.geometry)  # boolean mask over all stations at once
    gdf.loc[inside, 'racial_bias'] = t['Racial Label']
    gdf.loc[inside, 'racial_ratio'] = t['Racial Ratio']

  return gdf

## Geographic Selection

In [28]:
def get_manhattan_gdf() -> gpd.GeoDataFrame:
  '''Return a one-row GeoDataFrame (EPSG:4326) holding the borough polygon at row 3 of the "nybb" dataset.'''
  boroughs = gpd.read_file(geodatasets.get_path("nybb")).to_crs(epsg=4326)
  # NOTE(review): row 3 is assumed to be Manhattan - confirm against the dataset's row order
  manhattan_polygon = boroughs.iloc[3]['geometry']
  return gpd.GeoDataFrame(geometry=[manhattan_polygon], crs="EPSG:4326")

In [29]:
def filter_by_manhattan(df: pd.DataFrame):
  '''Keep only the rows whose start point lies within the polygon from get_manhattan_gdf().

  Returns the spatial-join result as a GeoDataFrame.
  '''
  manhattan_gdf = get_manhattan_gdf()
  start_points = gpd.points_from_xy(df.start_lng, df.start_lat)
  trips = gpd.GeoDataFrame(df, geometry=start_points, crs="EPSG:4326")
  return gpd.sjoin(trips, manhattan_gdf, predicate='within')

#  Module: Visualize

In [30]:
def feature_overview(data: pd.DataFrame) -> pd.DataFrame:
  ''' Returns a DataFrame with key insights from dataset, intended to showcase the metrics by which we determine data quality.

  One row per column of `data`: dtype, percentage of nulls, percentage of
  negative and of zero values (numeric columns only, 0 otherwise), the number
  of duplicated rows in the whole frame, the number of unique values and the
  unique values themselves. Numeric results are rounded to 3 decimals.
  '''
  # Computed once instead of once per column.
  numeric_columns = set(data.select_dtypes(include=[np.number]).columns)
  n_rows = len(data)

  def percentage(col, condition):
    # Non-numeric columns and empty frames report 0 (also avoids dividing by zero).
    if col not in numeric_columns or n_rows == 0:
      return 0
    return int(condition(data[col]).sum()) / n_rows * 100

  return pd.DataFrame({
      'feature': data.columns.values,  # feature names
      'data_type': data.dtypes.values,  # data types
      'null_value(%)': data.isna().mean().values * 100,  # percentage of null values
      'neg_value(%)': [percentage(col, lambda s: s < 0) for col in data.columns],  # percentage of negative values
      '0_value(%)': [percentage(col, lambda s: s == 0) for col in data.columns],  # percentage of zero values
      'duplicate': data.duplicated().sum(),  # amount of duplicates
      'n_unique': data.nunique().values,  # amount of unique values
      'sample_unique': [data[col].unique() for col in data.columns]  # sample of unique values
  }).round(3)

In [31]:
def present_distribution_across_cluster(cluster_summary: pd.DataFrame):
  '''Show a pie chart of the 'count' column, one slice per index entry of `cluster_summary`.'''
  fig, ax = plt.subplots(figsize=(8, 8))
  ax.pie(
    cluster_summary['count'],
    labels=cluster_summary.index,
    autopct='%1.1f%%',
    startangle=90,
  )
  ax.set_title('Distribution of Stations across Clusters')
  ax.axis('equal')  # equal aspect ratio keeps the pie circular
  plt.show()

In [32]:
def present_geolocation_of_clusters(station_locations: pd.DataFrame, colors = None):
  '''Draw every station on a folium map, coloured by its 'cluster' value.

  `station_locations` needs 'start_lat', 'start_lng' and 'cluster'.
  `colors` is a list of colour names cycled through by cluster number; when it
  is omitted or empty a built-in list is used.

  Saves the map to 'geoclusterized.html' and displays it.
  '''
  if not colors:
    # Fallback so `cluster % len(colors)` never divides by zero.
    colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'cadetblue', 'black']

  center_lat = station_locations['start_lat'].mean()
  center_lon = station_locations['start_lng'].mean()
  m = folium.Map(location=[center_lat, center_lon], zoom_start=12)

  for index, row in station_locations.iterrows():
      cluster = int(row['cluster'])
      marker_color = colors[cluster % len(colors)]
      folium.CircleMarker(
          location=[row['start_lat'], row['start_lng']],
          radius=5,
          color=marker_color,
          fill=True,
          fill_color=marker_color,
          fill_opacity=0.7,
          popup=f"Station: {index}, Cluster: {cluster}"
      ).add_to(m)

  m.save('geoclusterized.html')

  # Added after saving, so the fullscreen control is only on the displayed map.
  Fullscreen().add_to(m)
  display(m)

In [33]:
def present_subway_locations(subway_stations: pd.DataFrame):
  '''Draw every subway station as a red marker on a folium map.

  `subway_stations` needs 'GTFS Latitude', 'GTFS Longitude' and 'Stop Name'.
  Saves the map to 'subway_locations.html' and displays it.
  '''
  center_lat = subway_stations['GTFS Latitude'].mean()
  center_lon = subway_stations['GTFS Longitude'].mean()
  m = folium.Map(location=[center_lat, center_lon], zoom_start=12)
  for index, row in subway_stations.iterrows():
      folium.CircleMarker(
          location=[row['GTFS Latitude'], row['GTFS Longitude']],
          radius=5,
          color='red',
          fill=True,
          fill_color='red',
          fill_opacity=0.7,
          # This row's own name; the whole column used to be printed into every popup.
          popup=f"Station: {row['Stop Name']}"
      ).add_to(m)

  m.save('subway_locations.html')

  # Added after saving, so the fullscreen control is only on the displayed map.
  Fullscreen().add_to(m)
  display(m)

In [34]:
def visualize_frequency_clusters():
  '''Plot how the stations of the global `df_base` spread over 3 ride-frequency clusters.'''
  station_frequencies = frequency_of_trips_per_station(df_base)
  cluster_summary = clusterize_stations_by_frequency(station_frequencies, 3)
  present_distribution_across_cluster(cluster_summary)

#  Module: Train

## Global Model

In [52]:
def train_global_model(df: pd.DataFrame):
  '''Set up a PyCaret regression experiment on `df` with target 'trip_amount',
  compare the available models and return the result of create_model on the
  model picked by compare_models.'''
  experiment_settings = dict(
      data=df,
      target='trip_amount',
      transform_target=True,
      numeric_features=['day', 'hour', 'avg_trip_duration', 'avg_trip_distance', 'member_ratio'],
      categorical_features=['weekday', 'is_weekend'],
      use_gpu = True,
      session_id=123)
  setup(**experiment_settings)
  top_candidate = compare_models()
  return create_model(top_candidate)


def tune_global_model(model):
  '''Tune `model` with PyCaret's tune_model, show its evaluation and return the tuned model.'''
  tuned = tune_model(model)
  evaluate_model(tuned)
  return tuned

## Local Model

### Data prep

In [36]:
def ranks_by_trip_amount(df, slice_size):
  '''For every 'racial_bias' value, keep the `slice_size` start stations with the most trips.

  Returns one row per selected station: 'start_station_id', 'ride_id' (the
  trip count), 'racial_bias' and 'racial_ratio'.
  '''
  ranked = pd.DataFrame({})
  station_attributes = df[['racial_bias', 'racial_ratio', 'start_station_id']]

  for label in df['racial_bias'].unique():
    trips_of_label = df.loc[df['racial_bias'] == label, ['start_station_id', 'ride_id']]
    counts = trips_of_label.groupby('start_station_id').count().reset_index()
    busiest = counts.sort_values(by='ride_id', ascending=False)[['start_station_id', 'ride_id']]
    ranked = pd.concat([ranked, busiest[:slice_size]])

  # Re-attach the label/ratio of each station; the left join repeats rows per trip, hence drop_duplicates.
  with_attributes = pd.merge(ranked, station_attributes, on='start_station_id', how='left')
  return with_attributes.drop_duplicates()

In [37]:
def prep_local_model(df: pd.DataFrame, df_global_model, df_census, slice_size: int) -> pd.DataFrame:
  '''Build the per-station (local model) training table.

  Steps, in order: select the top `slice_size` tracts per racial label, tag the
  start stations that fall inside them, count trips per station/day/hour,
  compute each busy station's distance to the nearest subway station, and
  express the station's hourly trips as a percentage ('share') of the
  system-wide hourly trips taken from `df_global_model`.

  Returns one row per selected station, day and hour.
  '''
  df_base = df.copy()

  df_tracts = get_tracts_racialized(df_census, slice_size)

  df_racialized = build_racialized_df(df_base, df_tracts)

  df_filtered_racialized = pd.merge(df_base, df_racialized[['start_station_id', 'racial_bias', 'racial_ratio']], on='start_station_id', how='right')
  # this introduces the two new racial features and drops every row of non-racialized boroughs

  # features_timestamp adds its columns in place, so df_with_dates and
  # df_filtered_racialized are the same object; the in-place drop below affects both.
  df_with_dates, cleanup = features_timestamp(df_filtered_racialized)
  df_with_dates.drop(cleanup, axis=1, inplace=True)

  df_aggregated = df_with_dates.groupby(['start_station_id', 'day', 'hour']).agg(
      trip_amount=('ride_id', 'count'),  # Count of 'ride_id'
  ).reset_index()

  # one coordinate pair per station
  coords = df_filtered_racialized[['start_station_id', 'start_lat', 'start_lng']].drop_duplicates(subset=['start_station_id'])

  df_with_coords = pd.merge(df_aggregated, coords, on='start_station_id', how='left')

  most_trips_ids = ranks_by_trip_amount(df_filtered_racialized, slice_size)

  # keep only the busiest stations, one row each, before the expensive distance computation
  df_filtered = df_with_coords[df_with_coords['start_station_id'].isin(most_trips_ids['start_station_id'])]
  df_filtered = df_filtered[['start_station_id', 'start_lat', 'start_lng']].drop_duplicates()

  station_data = GIS_Data_Provider.get_subway_stations()[['GTFS Latitude', 'GTFS Longitude', 'Station ID']]

  df_distances = feature_distance_to_subway(df_filtered, station_data)
  df_distances = df_distances.merge(df_racialized[['start_station_id', 'racial_bias', 'racial_ratio']], on='start_station_id', how='left')

  # Both sides already carry racial_bias/racial_ratio, so this merge yields _x/_y suffixed copies.
  df_station_features = df_distances.merge(most_trips_ids, on='start_station_id').rename(columns={'ride_id': 'trip_count'})

  # Brings back unsuffixed racial_bias/racial_ratio columns, which the final selection below relies on.
  df_station_features = df_station_features.merge(df_racialized[['start_station_id', 'racial_bias', 'racial_ratio']], on='start_station_id', how='left')

  # trip_amount exists on both sides: _x is the station's count, _y the system-wide count
  df_share = pd.merge(
    df_with_coords,
    df_global_model[['day', 'hour', 'trip_amount']],
    on=['day', 'hour'], how='left').rename(
        columns={'trip_amount_x': 'trip_amount','trip_amount_y':'total'})
  
  # station's percentage of all trips in that day/hour
  df_share['share'] = (df_share['trip_amount'] / df_share['total'])*100
  #df_share['share'] = df_share['total'] - df_share['trip_amount']

  df_final = pd.merge(
    df_station_features[['start_station_id', 'distance_to_subway', 'racial_bias', 'racial_ratio']],
    df_share,
    on=['start_station_id'], how='left')
  # 'share' is derived from trip_amount, so the raw count is removed from the output
  df_final.drop(['trip_amount'], axis=1, inplace=True)

  return df_final

### Train

In [38]:
def train_tune_eval(df_local_features: pd.DataFrame):
  '''Train, tune and evaluate one PyCaret regression model per start station.

  For each 'start_station_id' the station's rows are modelled with target
  'share'; the best model from compare_models is re-created and tuned for R2.

  Returns a dict mapping station id to the score grid of its tuned model.
  '''
  d_metrics = {}
  for station_id in df_local_features['start_station_id'].unique():
    station_rows = df_local_features[df_local_features['start_station_id'] == station_id]
    # Non-inplace drop: station_rows is a filtered slice of the caller's frame.
    this_stations_data: pd.DataFrame = station_rows.drop(columns=['start_station_id', 'total', 'start_lat', 'start_lng'])
    setup(
      data=this_stations_data,
      target='share',
      transform_target=True,
      numeric_features=['day', 'hour', 'distance_to_subway', 'racial_ratio'],# 'total'],# 'start_lat', 'start_lng'],
      categorical_features=['racial_bias'],
      #use_gpu = True,
      verbose = False)
    best_model = compare_models()#verbose=False)
    model = create_model(best_model)#, verbose=False)
    tuned_model = tune_model(model, optimize="R2")#, verbose=False)
    # pull() returns the last displayed score grid, i.e. the tuned model's scores;
    # get_metrics() only lists which metrics are available in the experiment.
    d_metrics[station_id] = pull()
    evaluate_model(tuned_model)

    display(f'{station_id} model: {str(tuned_model)}')

  return d_metrics

In [39]:
def train_local_model(df_local_features: pd.DataFrame):
  '''Fit one PyCaret regression model (target 'share') per start station.

  Returns a dict mapping station id to the model returned by create_model.
  '''
  station_setups = {}
  station_models = {}
  station_ids = df_local_features['start_station_id'].unique()

  for station_id in station_ids:
    is_this_station = df_local_features['start_station_id'] == station_id
    this_stations_data = df_local_features[is_this_station]

    station_setups[station_id] = setup(
      data=this_stations_data,
      target='share',
      transform_target=True,
      numeric_features=['day', 'hour', 'distance_to_subway', 'start_lat', 'start_lng', 'total', 'racial_ratio'],
      categorical_features=['start_station_id', 'racial_bias'],
      use_gpu = True,
      verbose = False,
      session_id=123)
    top_candidate = compare_models()
    station_models[station_id] = create_model(top_candidate)

  return station_models

In [40]:
def tune_local_model(model):
  '''Tune `model` with PyCaret's tune_model, show its evaluation and return the tuned model.'''
  tuned = tune_model(model)
  evaluate_model(tuned)
  return tuned

In [41]:
def inference(df_unseen: pd.DataFrame, model):
  '''Run PyCaret's predict_model with `model` on `df_unseen` and return its result.'''
  predictions = predict_model(model, data=df_unseen)
  return predictions

#  Module: Evaluate

# Main: Run

## setup

In [42]:
# run once
provider = DataProvider()
provider.load_jan2020()

<__main__.DataProvider at 0x153853443a0>

In [43]:
# run to restart dataframe
df_base = provider.get_df()

In [44]:
# Cleaning order: duplicates first, then unused columns, then rows with missing values.
# Each step is a pure function paired with its (empty) keyword arguments.
cleaning_steps = [
    (cleaning_step, {})
    for cleaning_step in (step_handle_duplicates, step_drop_unused, step_handle_missing)
]
cleaning_pipeline = CleaningPipeline(steps=cleaning_steps)

In [45]:
# Feature order matters: the hourly aggregation needs the timestamp and distance columns.
# feature_member_ratio is not listed on its own; it is coupled into aggregate_by_hour.
feature_steps = [
    (feature_step, {})
    for feature_step in (features_timestamp, feature_trip_distance, aggregate_by_hour)
]
feature_pipeline = FeaturePipeline(steps=feature_steps)

In [46]:
# Bundle both pipelines into one experiment; data_used is only a descriptive label.
experiment = Experiment(
    data_used="2020_complete",
    features=feature_pipeline,
    cleaning=cleaning_pipeline
)
experiment.pipeline_definition # just to check the pipeline

{'features': "Pipeline(steps=[('features_timestamp', {}), ('feature_trip_distance', {}), ('aggregate_by_hour', {})])",
 'cleaning': "Pipeline(steps=[('step_handle_duplicates', {}), ('step_drop_unused', {}), ('step_handle_missing', {})])"}

In [91]:
# Per-month results of the two data preparations, keyed by month string.
cache_dfs = {scope: {} for scope in ('global', 'local')}

In [None]:
# Build the global and local frames for every month whose first CSV part ("..._1.csv")
# is in the folder, skipping whatever is already stored in cache_dfs.
for file in p.iterdir():
    if not file.is_file():
        continue
    month = file.name.split('-')[0][-2:]   # last two characters before the first '-'
    suff = file.name.split('.')[0][-2:]    # last two characters before the first '.'
    if suff != "_1":
        continue

    display(f'starting: {month}')
    needs_global = month not in cache_dfs['global']
    needs_local = month not in cache_dfs['local']

    # Read this month's trips whenever either result is missing, so the local
    # step never works on a frame left over from a previous iteration.
    if needs_global or needs_local:
        df_base = pd.read_csv(file, parse_dates=['started_at', 'ended_at'])

    if needs_global:
        cache_dfs['global'][month] = experiment.run(df_base) # runs the experiment
    # Always take this month's global frame, whether cached or freshly computed.
    df_global_model = cache_dfs['global'][month]
    display(f'global done')

    if needs_local:
        df_local_model = prep_local_model(df_base.copy(), df_global_model, provider.df_census, 30)
        cache_dfs['local'][month] = df_local_model
    display(f'local done')
%store cache_dfs

In [1]:
%store -r cache_dfs

In [None]:
# Empty frame that the next cell extends with every month's global data.
dfs_global_merged = pd.DataFrame()

In [None]:
# Stack all monthly global frames into one, tagging each with its month key.
for k, v in cache_dfs['global'].items():
    v['month'] = k  # written onto the cached frame itself, not onto a copy
    dfs_global_merged = pd.concat([dfs_global_merged, v], ignore_index=True)
    #print(dfs_global_merged.shape)

In [2]:
# Empty frame that the next cell extends with every month's local data.
dfs_local_merged = pd.DataFrame()

NameError: name 'pd' is not defined

In [219]:
# Stack all monthly local frames into one, tagging each with its month key.
for k, v in cache_dfs['local'].items():
    v['month'] = k  # written onto the cached frame itself, not onto a copy
    dfs_local_merged = pd.concat([dfs_local_merged, v], ignore_index=True)

In [47]:
# Clean df_base and engineer its features; the result has one row per (day, hour).
df_global_model = experiment.run(df_base) # runs the experiment

'cache size: 0'

'slow guy took: 27.854617199999893 seconds'

'cache size: 135064'

In [None]:
# Display the aggregated hourly frame.
df_global_model

In [48]:
# Reports stored by the last run: the feature pipeline's cleanup columns and each cleaning step's count.
experiment.pipeline_results # checks pipeline's results

{'features': {'cleanup': ['started_at',
   'ended_at',
   'start_lat',
   'start_lng',
   'end_lat',
   'end_lng',
   'ride_id',
   'trip_duration',
   'trip_distance',
   'member_casual']},
 'cleaning': {'step_handle_duplicates': 0,
  'step_drop_unused': 5,
  'step_handle_missing': 2438}}

In [49]:
# Per-column quality summary (dtype, nulls, negatives, zeros, uniques) of the frame used for training.
feature_overview(df_global_model) # peek at final dataframe

Unnamed: 0,feature,data_type,null_value(%),neg_value(%),0_value(%),duplicate,n_unique,sample_unique
0,day,int32,0.0,0.0,0.0,0,31,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,hour,int32,0.0,0.0,4.167,0,24,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,member_ratio,float64,0.0,0.0,0.0,0,711,"[0.7975609756097561, 0.7983706720977597, 0.838..."
3,trip_amount,int64,0.0,0.0,0.0,0,638,"[410, 491, 378, 195, 102, 65, 98, 134, 306, 40..."
4,avg_trip_duration,float64,0.0,0.0,0.0,0,744,"[5411.295278048781, 1140.1098370672098, 1264.6..."
5,avg_trip_distance,float64,0.0,0.0,0.0,0,744,"[1.7635418672415926, 1.7115760695428668, 1.777..."
6,weekday,int32,0.0,0.0,12.903,0,7,"[2, 3, 4, 5, 6, 0, 1]"
7,is_weekend,bool,0.0,0.0,0.0,0,2,"[False, True]"


## global training

In [53]:
# Train on the single-month frame; the commented line is the variant that uses all merged months.
model = train_global_model(df_global_model)
#model = train_global_model(dfs_global_merged)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1031, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1031, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Dev

Unnamed: 0,Description,Value
0,Session id,123
1,Target,trip_amount
2,Target type,Regression
3,Original data shape,"(744, 8)"
4,Transformed data shape,"(744, 14)"
5,Transformed train set shape,"(520, 14)"
6,Transformed test set shape,"(224, 14)"
7,Numeric features,5
8,Categorical features,2
9,Preprocess,True


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1031, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1031, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,163.2123,62681.0717,246.2611,0.9535,0.3165,0.2587,0.13
xgboost,Extreme Gradient Boosting,171.528,70956.4887,260.5561,0.9466,0.3721,0.3125,0.126
lightgbm,Light Gradient Boosting Machine,185.4945,74995.9974,268.9869,0.9449,0.3841,0.3096,0.415
gbr,Gradient Boosting Regressor,188.4818,76030.6217,271.6348,0.9434,0.3742,0.3019,0.124
rf,Random Forest Regressor,192.5525,94192.9948,299.0795,0.928,0.3999,0.3648,0.16
dt,Decision Tree Regressor,259.5135,179824.3635,409.4879,0.8631,0.53,0.4335,0.053
ada,AdaBoost Regressor,383.3817,276887.3754,523.0362,0.7937,0.5291,0.5226,0.11
lr,Linear Regression,670.5927,736036.437,855.5841,0.4429,0.9534,1.4506,0.052
lar,Least Angle Regression,664.347,742965.555,858.9625,0.4332,0.9751,1.4282,0.052
br,Bayesian Ridge,679.2719,755040.5986,866.7735,0.4301,0.9507,1.4591,0.05


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,180.9611,78524.8349,280.2228,0.9432,0.3299,0.29
1,142.2975,44526.4594,211.0129,0.9655,0.3299,0.2634
2,143.7552,37394.9588,193.3778,0.969,0.2851,0.2022
3,137.429,42561.8997,206.3054,0.9679,0.2583,0.2033
4,160.8146,69041.9595,262.7584,0.9196,0.5134,0.5175
5,210.8943,120337.7099,346.8973,0.9265,0.2712,0.1979
6,139.926,45728.7004,213.8427,0.9671,0.2676,0.1834
7,197.1409,82207.037,286.7177,0.9509,0.3913,0.3501
8,158.8767,54101.7638,232.5979,0.9591,0.2403,0.1561
9,160.0275,52385.394,228.8786,0.9665,0.2782,0.223


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
tuned_model = tune_global_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,149.7727,61563.4485,248.1198,0.9575,0.3754,0.4619
1,148.9762,60942.1943,246.8647,0.9533,0.3946,0.8927
2,133.7653,49353.1838,222.1558,0.9628,0.3525,0.7786
3,150.0184,65718.2144,256.3556,0.9575,0.4111,0.6257
4,156.2058,63544.4625,252.0803,0.9498,0.2655,0.2166
5,153.7008,62169.5852,249.3383,0.9504,0.3172,0.229
6,161.0996,79167.5023,281.3672,0.9479,0.282,0.2185
7,134.6224,42925.0657,207.1837,0.967,0.3615,0.5047
8,161.4943,86273.1654,293.7229,0.9427,0.4811,1.9707
9,162.4585,83513.1413,288.9864,0.9404,0.3061,0.2256


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [126]:
predictions = predict_model(tuned_model)
predictions

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,145.8166,60236.9842,245.4322,0.9593,0.3134,0.2697


Unnamed: 0,day,hour,member_ratio,avg_trip_duration,avg_trip_distance,weekday,is_weekend,month,trip_amount,prediction_label
6971,17,8,0.806452,1323.685059,2.057690,5,True,10,992,1018.371887
6185,15,0,0.555556,1103.198486,2.011301,1,False,09,207,245.840820
3096,8,1,0.602151,1059.580566,1.459651,4,False,05,93,66.266029
6643,3,16,0.617921,1696.373535,2.237588,5,True,10,3973,3656.745605
7316,31,17,0.745570,1050.436768,1.995518,5,True,10,2370,2233.386963
...,...,...,...,...,...,...,...,...,...,...
6833,11,14,0.634721,4950.624023,2.186579,6,True,10,3099,2985.665527
6054,9,13,0.786963,1040.258667,1.953840,2,False,09,1887,1873.997803
1266,22,18,0.865431,11781.930664,1.569582,5,True,02,2051,1599.925781
3354,18,19,0.746672,1614.018677,1.915291,0,False,05,2404,2274.067871


In [127]:
metrics = pull()
metrics

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,145.8166,60236.9842,245.4322,0.9593,0.3134,0.2697


In [None]:
# Unique start-station ids found in the base trips frame.
# Selecting the column and calling drop_duplicates() already produces a new Series,
# so copying the whole frame first is not needed.
total_stations = df_base['start_station_id'].drop_duplicates()

In [None]:
total_stations.shape

In [None]:
df_local_model = df_base.copy()

In [None]:
# Build the local-model frame from a copy of the base trips, the global-model frame
# and the provider's census data.
# NOTE(review): the trailing 30 is presumably the size of the station slice (the CSV
# written afterwards is named "..._slice_30") — confirm against prep_local_model.
df_local_model = prep_local_model(df_base.copy(), df_global_model, provider.df_census, 30)

In [None]:
df_local_model.to_csv("df_local_model_slice_30.csv", index=False)

## local training

In [None]:
local_model = train_local_model(dfs_local_merged)

In [None]:
local_model = train_local_model(df_local_model)

In [None]:
print(local_model)

In [210]:
metrics = {}

In [None]:
# Tune every station's model in place and record the results table that follows
# each tuning run.
# The loop variable is `station_id` (not `model`) so the notebook-level `model`
# holding the trained global model is not overwritten. Iterating over a snapshot
# of the items keeps the reassignment of `local_model` entries inside the loop safe.
for station_id, station_model in list(local_model.items()):
    local_model[station_id] = tune_local_model(station_model)
    metrics[station_id] = pull()

In [None]:
metrics = train_tune_eval(dfs_local_merged)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.0001,0.0,0.0001,1.0,0.0,0.0,0.012
rf,Random Forest Regressor,6.0517,312.559,16.9132,0.9998,0.0084,0.0032,0.04
et,Extra Trees Regressor,15.9762,1435.5373,36.8239,0.999,0.0257,0.0093,0.031
lightgbm,Light Gradient Boosting Machine,81.063,15698.1716,124.5292,0.989,0.0947,0.0546,0.043
gbr,Gradient Boosting Regressor,281.2559,165337.2156,406.1222,0.8844,0.2273,0.168,0.023
knn,K Neighbors Regressor,308.9401,222778.1,468.8274,0.8456,0.2957,0.2305,0.013
ada,AdaBoost Regressor,545.0313,453326.505,672.8093,0.6818,0.4185,0.3916,0.029
dummy,Dummy Regressor,964.1523,1465147.2625,1209.7305,-0.02,0.8014,1.2114,0.012


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0001,0.0,0.0001,1.0,0.0,0.0
1,0.0001,0.0,0.0001,1.0,0.0,0.0
2,0.0001,0.0,0.0001,1.0,0.0,0.0
3,0.0001,0.0,0.0001,1.0,0.0,0.0
4,0.0001,0.0,0.0001,1.0,0.0,0.0
5,0.0001,0.0,0.0001,1.0,0.0,0.0
6,0.0001,0.0,0.0001,1.0,0.0,0.0
7,0.0001,0.0,0.0001,1.0,0.0,0.0
8,0.0001,0.0,0.0002,1.0,0.0,0.0
9,0.0001,0.0,0.0001,1.0,0.0,0.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0001,0.0,0.0001,1.0,0.0,0.0
1,0.0001,0.0,0.0001,1.0,0.0,0.0
2,0.0001,0.0,0.0001,1.0,0.0,0.0
3,0.0001,0.0,0.0001,1.0,0.0,0.0
4,1.2695,146.1035,12.0873,0.9999,0.0055,0.0006
5,0.0001,0.0,0.0001,1.0,0.0,0.0
6,0.0001,0.0,0.0001,1.0,0.0,0.0
7,2.1251,307.097,17.5242,0.9998,0.0068,0.0008
8,1.5077,206.0818,14.3555,0.9999,0.0067,0.0007
9,0.0001,0.0,0.0001,1.0,0.0,0.0


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

'5128.04 model: DecisionTreeRegressor(random_state=1692)'

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.1836,51.0125,2.2587,1.0,0.0007,0.0001,0.014
rf,Random Forest Regressor,5.8773,258.9907,15.4548,0.9998,0.0159,0.0046,0.049
et,Extra Trees Regressor,17.1889,1477.9898,38.1734,0.9989,0.0297,0.0113,0.041
lightgbm,Light Gradient Boosting Machine,78.7791,12371.3206,110.9999,0.9911,0.0988,0.0596,0.045
gbr,Gradient Boosting Regressor,295.3505,180487.0803,424.3104,0.8706,0.3039,0.2094,0.024
knn,K Neighbors Regressor,309.1555,202464.3531,449.375,0.8544,0.3268,0.2646,0.015
ada,AdaBoost Regressor,546.4747,442757.7522,664.7584,0.683,0.4528,0.4386,0.043
dummy,Dummy Regressor,971.6777,1421470.675,1191.1049,-0.0159,0.86,1.4445,0.012


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.8351,510.1246,22.5859,0.9997,0.0066,0.0005
1,0.0001,0.0,0.0001,1.0,0.0,0.0
2,0.0001,0.0,0.0001,1.0,0.0,0.0
3,0.0001,0.0,0.0001,1.0,0.0,0.0
4,0.0001,0.0,0.0001,1.0,0.0,0.0
5,0.0001,0.0,0.0001,1.0,0.0,0.0
6,0.0001,0.0,0.0001,1.0,0.0,0.0
7,0.0001,0.0,0.0001,1.0,0.0,0.0
8,0.0001,0.0,0.0001,1.0,0.0,0.0
9,0.0001,0.0,0.0001,1.0,0.0,0.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,52.4931,23764.527,154.1575,0.9844,0.0868,0.0306
1,34.6799,6812.7778,82.5396,0.995,0.0541,0.021
2,50.7141,19185.1094,138.5103,0.986,0.08,0.0302
3,56.3444,20580.1483,143.4578,0.9833,0.0634,0.0248
4,49.2097,12544.5961,112.0027,0.9898,0.0638,0.0271
5,44.4157,11772.7616,108.5024,0.9917,0.0586,0.0222
6,38.5574,8275.7194,90.971,0.9946,0.0515,0.0203
7,45.2309,10697.9386,103.4308,0.9932,0.0535,0.0225
8,52.4769,18539.1325,136.1585,0.9855,0.063,0.0244
9,43.6229,10677.5705,103.3323,0.9928,0.0586,0.0225


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

'5159.04 model: DecisionTreeRegressor(random_state=1085)'

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.1996,31.765,2.4328,1.0,0.0011,0.0001,0.011
rf,Random Forest Regressor,5.5805,214.321,14.0151,0.9998,0.0088,0.0033,0.04
et,Extra Trees Regressor,16.1634,2442.4459,46.6454,0.9983,0.0252,0.0087,0.029
lightgbm,Light Gradient Boosting Machine,83.6861,15996.8381,126.0544,0.9887,0.1118,0.063,0.039
gbr,Gradient Boosting Regressor,273.0851,169209.764,409.6404,0.8816,0.2715,0.1642,0.022
knn,K Neighbors Regressor,323.3546,236914.2078,484.8775,0.8348,0.2978,0.2378,0.023
ada,AdaBoost Regressor,510.3769,412519.4559,641.5693,0.709,0.4156,0.3912,0.025
dummy,Dummy Regressor,983.116,1458842.25,1205.8125,-0.0175,0.8522,1.4215,0.012


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0001,0.0,0.0001,1.0,0.0,0.0
1,0.0001,0.0,0.0001,1.0,0.0,0.0
2,0.0001,0.0,0.0001,1.0,0.0,0.0
3,0.0001,0.0,0.0001,1.0,0.0,0.0
4,1.1016,239.0304,15.4606,0.9998,0.0084,0.0006
5,0.0001,0.0,0.0001,1.0,0.0,0.0
6,0.0001,0.0,0.0001,1.0,0.0,0.0
7,0.0001,0.0,0.0001,1.0,0.0,0.0
8,0.8935,78.6196,8.8668,1.0,0.0028,0.0003
9,0.0001,0.0,0.0001,1.0,0.0,0.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Exception ignored in: <function tqdm.__del__ at 0x0000022AF351BAF0>
Traceback (most recent call last):
  File "C:\Users\Gabriel\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
KeyboardInterrupt: 


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.1866,300.8475,17.345,0.9998,0.0055,0.0009
1,8.5857,946.4163,30.7639,0.9993,0.0105,0.003
2,28.4785,21144.0277,145.4099,0.9862,0.0455,0.0088
3,16.5649,4169.1621,64.569,0.9975,0.0244,0.0063
4,11.7827,2238.8036,47.316,0.9982,0.0169,0.0043
5,17.5623,4721.3572,68.7121,0.9962,0.0297,0.0071
6,12.595,2789.4114,52.8149,0.998,0.0185,0.0046
7,10.6464,2866.8057,53.5426,0.998,0.0171,0.0036
8,6.891,1413.0073,37.59,0.9992,0.0198,0.0034
9,11.6988,2391.3951,48.9019,0.9983,0.0155,0.0041


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

'5167.06 model: DecisionTreeRegressor(random_state=3065)'

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [159]:
df_model_comparison = pd.DataFrame(d)

In [163]:
df_model_comparison.columns = ["start_station_id", "model"]

In [165]:
df_model_per_station = pd.merge(df_metrics, df_model_comparison, how="left", on="start_station_id")

In [144]:
local_model.keys()

dict_keys([5128.04, 5159.04, 5167.06, 5190.07, 5198.04, 5270.05, 5303.08, 5326.06, 5351.03, 5436.09, 5506.14, 5575.08, 5755.01, 5763.03, 5788.13, 5914.03, 5922.08, 5947.04, 6030.04, 6072.06, 6072.14, 6822.09, 6925.09, 7407.13, '5128.04', '5159.04', '5167.06', '5190.07', '5198.04', '5270.05', '5303.08', '5326.06', '5351.03', '5436.09', '5506.10', '5506.14', '5532.01', '5569.06', '5575.08', '5755.01', '5763.03', '5788.13', '5797.01', '5880.02', '5914.03', '5914.08', '5922.07', '5922.08', '5947.04', '5964.01', '6030.04', '6039.06', '6072.06', '6072.14', '6822.09', '6925.09', '7407.13', '7627.10', '7640.04', '7756.10', 5575.12, 6039.06, '5575.12', '7084.12', '7281.09', '6860.12', '7092.06', '7128.08', '7146.04', '7154.07', '7188.13', '7204.08', '7243.04', '7304.08', '7456.03', '7579.01', '7599.02', '7622.12', '7662.13', '7795.09', '7886.02', 7756.1, 7886.02, '5644.05', '7293.10', '5955.12', '6969.08', '7028.04', '7113.08', '7648.16', '7769.06', '7774.02', '7832.04', '7809.13', '7893.05', '

In [157]:
# (station key, estimator repr) pairs — one per entry of `local_model`.
# NOTE(review): the keys printed for `local_model` mix floats and strings for the
# same station id; they are kept as-is here so downstream merges see the same keys.
d = [(k, str(v)) for k, v in local_model.items()]

In [209]:
metrics.keys()

dict_keys([5128.04, 5159.04, 5167.06, 5190.07, 5198.04, 5270.05, 5303.08, 5326.06, 5351.03, 5436.09, 5506.14, 5575.08, 5755.01, 5763.03, 5788.13, 5914.03, 5922.08, 5947.04, 6030.04, 6072.06, 6072.14, 6822.09, 6925.09, 7407.13, '5128.04', '5159.04', '5167.06', '5190.07', '5198.04', '5270.05', '5303.08', '5326.06', '5351.03', '5436.09', '5506.10', '5506.14', '5532.01', '5569.06', '5575.08', '5755.01', '5763.03', '5788.13', '5797.01', '5880.02', '5914.03', '5914.08', '5922.07', '5922.08', '5947.04', '5964.01', '6030.04', '6039.06', '6072.06', '6072.14', '6822.09', '6925.09', '7407.13', '7627.10', '7640.04', '7756.10', 5575.12, 6039.06, '5575.12', '7084.12', '7281.09', '6860.12', '7092.06', '7128.08', '7146.04', '7154.07', '7188.13', '7204.08', '7243.04', '7304.08', '7456.03', '7579.01', '7599.02', '7622.12', '7662.13', '7795.09', '7886.02', 7756.1, 7886.02, '5644.05', '7293.10', '5955.12', '6969.08', '7028.04', '7113.08', '7648.16', '7769.06', '7774.02', '7832.04', '7809.13', '7893.05', '

In [134]:
# Empty skeleton for the per-station metrics table.
# The id column is called `start_station_id` because that is the name the rows
# appended later and the subsequent merges use; declaring it as `station_id`
# left a stray, always-empty column in the final table.
df_metrics = pd.DataFrame({
    'start_station_id': [],
    'MAE': [],
    'MSE': [],
    'RMSE': [],
    'R2': [],
    'RMSLE': [],
    'MAPE': [],
})

In [135]:
# Append one row per station holding the 'Mean' row of that station's results table.
# The rows are built first and concatenated once, instead of re-copying the
# accumulated frame for every station.
metric_columns = ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
station_rows = [
    pd.DataFrame([v.loc['Mean'].values], columns=metric_columns).assign(start_station_id=k)
    for k, v in metrics.items()
]
df_metrics = pd.concat([df_metrics, *station_rows], ignore_index=True)

In [137]:
# Attach each station's racial_bias / racial_ratio (one row per station) to its metrics.
station_bias = df_local_model[['start_station_id', 'racial_bias', 'racial_ratio']]
station_bias = station_bias.drop_duplicates(subset=['start_station_id'])
df_metrics = pd.merge(df_metrics, station_bias, on='start_station_id')

In [138]:
# One (initially empty) metric -> mean mapping per racial-bias group.
metrics_means = {group: {} for group in ('Non-Whites', 'Whites')}

In [139]:
# Average every metric separately for the stations of each racial-bias group.
# The rows of a group are selected once per group rather than once per metric.
metric_columns = ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
for bias in ['Non-Whites', 'Whites']:
    group_rows = df_metrics[df_metrics['racial_bias'] == bias]
    for key in metric_columns:
        metrics_means[bias][key] = group_rows[key].mean()

In [140]:
df_metrics_means = pd.DataFrame(metrics_means)

## explore

#### Every station

In [None]:
ny_map = folium.Map(location=[40.70, -73.94], zoom_start=12, tiles="CartoDB positron")

In [None]:
# One row per start station (first occurrence in df_base): its id and coordinates.
station_columns = ['start_station_id', 'start_lat', 'start_lng']
unique_station_rows = df_base.drop_duplicates(subset=['start_station_id'])
df_all_stations = unique_station_rows[station_columns]
df_all_stations

In [None]:
for _, r in df_final.iterrows():
  folium.Marker(
      location=[r.start_lat, r.start_lng],
      popup=f"Station id: {r.start_station_id}",
  ).add_to(n)

In [None]:
n

#### Manhattan's top 5 stations

In [None]:
manhattan_map = folium.Map(location=[40.70, -73.94], zoom_start=12, tiles="CartoDB positron")

In [None]:
df_manhattan_stations = df_local_model.drop_duplicates(subset=['start_station_id'])
df_manhattan_stations

In [None]:
# Pin every Manhattan station on its map, with the station id in the popup.
for _, station in df_manhattan_stations.iterrows():
    station_marker = folium.Marker(
        location=[station.start_lat, station.start_lng],
        popup=f"Station id: {station.start_station_id}",
    )
    station_marker.add_to(manhattan_map)

In [None]:
manhattan_map

#### NYC census tracts

In [None]:
n = folium.Map(location=[40.70, -73.94], zoom_start=12, tiles="CartoDB positron")

In [None]:
gdf_top_slice_racialized = get_tracts_racialized(df_race_census, 50)

In [None]:
tracts_filter = ['159','157','44','38','317.04','218','216','162'] # hardcoded for now

In [None]:
filtered_by_tract = gdf_top_slice_racialized[gdf_top_slice_racialized['NAME'].isin(tracts_filter)]

In [None]:
# Draw each selected census tract as a simplified outline with a name/delta popup.
for _, tract in filtered_by_tract.iterrows():
    outline = gpd.GeoSeries(tract["geometry"]).simplify(tolerance=0.001)
    outline_json = outline.to_json()

    # Tracts labelled "Whites" are filled red, every other label blue.
    fill = "red" if tract["Racial Label"] == "Whites" else "blue"
    # The colour is bound as a default argument so each layer keeps its own value.
    layer = folium.GeoJson(
        data=outline_json,
        style_function=lambda x, fill=fill: {"fillColor": fill},
    )

    folium.Popup(f"Name: {tract['NAME']} Delta: {tract['Racial Delta']}").add_to(layer)
    layer.add_to(n)

In [None]:
# Id and coordinates of each station in the local-model frame, one row per station.
station_columns = ['start_station_id', 'start_lat', 'start_lng']
df_manhattan_top = df_local_model[station_columns].drop_duplicates(subset=['start_station_id'])
df_manhattan_top

In [None]:
# Geo-enable the station table: point geometry from (lng, lat), declared as EPSG:4326.
station_points = gpd.points_from_xy(df_manhattan_top.start_lng, df_manhattan_top.start_lat)
gdf_manhattan_top = gpd.GeoDataFrame(df_manhattan_top, geometry=station_points, crs=4326)
gdf_manhattan_top

In [None]:
# One row per station with its bias label, ordered from the largest racial delta
# magnitude to the smallest and re-indexed from 0.
pin_columns = ['start_station_id', 'start_lat', 'start_lng', 'racial_bias', 'racial_delta_magnitude']
pins = (
    df_local_model[pin_columns]
    .drop_duplicates(subset=['start_station_id'])
    .sort_values(by='racial_delta_magnitude', ascending=False)
    .reset_index(drop=True)
)
pins

In [None]:
# Add a marker for every pinned station; the popup lists its id, bias label and delta.
for _, pin in pins.iterrows():
    pin_marker = folium.Marker(
        location=[pin['start_lat'], pin['start_lng']],
        popup=f"id:{pin['start_station_id']}\nbias:{pin['racial_bias']}\ndelta:{pin['racial_delta_magnitude']}",
    )
    pin_marker.add_to(n)

In [None]:
n