## ML for foot traffic score
Prediction for the foot traffic score using the Yellow Taxi Data from 2020 - 2023 to predict the foot traffic score during daytimes (morning, afternoon, evening, night)

### Cleaning and preparing foot traffic scores from 2020 - 2023

**2020**

In [4]:
import pandas as pd
import geopandas as gpd
from sklearn.preprocessing import MinMaxScaler

# 1. Load the CSV file
df_2020 = pd.read_csv('2020_Yellow_Taxi_Trip_Data.csv')

# 2. Convert pickup datetime using the correct format (e.g., "01/01/2021 12:30:10 AM")
df_2020['tpep_pickup_datetime'] = pd.to_datetime(
    df_2020['tpep_pickup_datetime'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce'
)

df_2020['tpep_dropoff_datetime'] = pd.to_datetime(
    df_2020['tpep_dropoff_datetime'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce'
)

# 3. Drop rows where conversion failed
df_2020 = df_2020.dropna(subset=['tpep_pickup_datetime'])
df_2020 = df_2020.dropna(subset=['tpep_dropoff_datetime'])

# 3b. Filter for Manhattan trips only (pickup and dropoff)
manhattan_zone_ids = [
    4, 12, 13, 14, 24, 41, 42, 43, 45, 48, 50, 68, 74, 75, 79, 87, 88, 90,
    100, 107, 113, 114, 116, 125, 127, 128, 137, 140, 141, 142, 143, 144, 148,
    151, 152, 153, 158, 161, 162, 163, 164, 166, 170, 186, 230, 231, 232, 233, 234
]
df_2020 = df_2020[
    df_2020['PULocationID'].isin(manhattan_zone_ids) &
    df_2020['DOLocationID'].isin(manhattan_zone_ids)
]

# 4. Extract pickup hour
df_2020['pickup_hour'] = df_2020['tpep_pickup_datetime'].dt.hour
df_2020['dropoff_hour'] = df_2020['tpep_dropoff_datetime'].dt.hour

# 5. Assign time of day
def get_time_of_day(hour):
    if 7 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 18:
        return 'afternoon'
    else:
        return 'evening'  # Consistent with your previous naming
    
#Remove unnecessary columns
df_2020 = df_2020.drop(['VendorID','trip_distance','RatecodeID','store_and_fwd_flag','payment_type',
 'fare_amount','extra','mta_tax','tip_amount','tolls_amount','improvement_surcharge','total_amount','congestion_surcharge'],axis=1)

  df_2020 = pd.read_csv('2020_Yellow_Taxi_Trip_Data.csv')


In [None]:
df_2020['pickup_time'] = df_2020['pickup_hour'].apply(get_time_of_day)
df_2020['dropoff_time'] = df_2020['dropoff_hour'].apply(get_time_of_day)

#Count pick ups and drop offs (raw)
pickup_counts_2020 = df_2020.groupby("PULocationID").size().reset_index(name="pickup_count")
dropoff_counts_2020 = df_2020.groupby("DOLocationID").size().reset_index(name="dropoff_count")
zones = gpd.read_file("taxi_zones/taxi_zones.shp") 

#Merge zones with pick up and drop off data
zones = zones[zones["borough"] == "Manhattan"]
zones_2020 = zones.merge(pickup_counts_2020, left_on="LocationID", right_on="PULocationID", how="left")
zones_2020 = zones_2020.merge(dropoff_counts_2020, left_on="LocationID", right_on="DOLocationID", how="left")
zones_2020 = zones_2020.fillna(0) 

#Normalize pickups and drop offs (normalized)
scaler = MinMaxScaler((1, 10))
zones_2020["dropoff_count2020"] = scaler.fit_transform(zones_2020[["dropoff_count"]])
zones_2020["pickup_count2020"] = scaler.fit_transform(zones_2020[["pickup_count"]])

#weighted sum of normalized score foot traffic score_2020 (0.7drop off + 0.3pickup)
zones_2020["foot_traffic_score2020"] = (0.7 * zones_2020["dropoff_count2020"] + 0.3 * zones_2020["pickup_count2020"])

Get Average Foot traffic score for each zone per daytime

In [28]:
from itertools import product
# Add a 'trip_date' column to df_2020
df_2020['trip_date'] = df_2020['tpep_pickup_datetime'].dt.date
df_2020['trip_date'] = pd.to_datetime(df_2020['trip_date']) # Convert to datetime objects for consistency

# Load taxi zones and filter for Manhattan (needed for all_manhattan_combinations)
zones = gpd.read_file("taxi_zones/taxi_zones.shp")
zones = zones[zones["borough"] == "Manhattan"].copy()
manhattan_location_ids = zones['LocationID'].unique() # Get all unique Manhattan zone IDs


# Calculate DAILY pickup and dropoff counts per zone per daytime
daily_pickup_counts_2020 = df_2020.groupby(['trip_date', 'PULocationID', 'pickup_time']).size().reset_index(name='daily_pickup_count')
daily_dropoff_counts_2020 = df_2020.groupby(['trip_date', 'DOLocationID', 'dropoff_time']).size().reset_index(name='daily_dropoff_count')


# Create a comprehensive set of ALL possible daily zone-daytime combinations
time_of_day_categories = ['morning', 'afternoon', 'evening']
all_2020_dates = df_2020['trip_date'].unique() # Get all unique dates in the 2020 data

all_daily_zone_daytime_combinations = pd.DataFrame(
    list(product(all_2020_dates, manhattan_location_ids, time_of_day_categories)),
    columns=['trip_date', 'LocationID', 'daytime_category']
)
all_daily_zone_daytime_combinations['trip_date'] = pd.to_datetime(all_daily_zone_daytime_combinations['trip_date'])


# Merge daily counts onto the combinations (left join to keep all combinations)
daily_zone_daytime_data_2020 = all_daily_zone_daytime_combinations.merge(
    daily_pickup_counts_2020,
    left_on=['trip_date', 'LocationID', 'daytime_category'],
    right_on=['trip_date', 'PULocationID', 'pickup_time'],
    how='left'
).drop(columns=['PULocationID', 'pickup_time'], errors='ignore')

daily_zone_daytime_data_2020 = daily_zone_daytime_data_2020.merge(
    daily_dropoff_counts_2020,
    left_on=['trip_date', 'LocationID', 'daytime_category'],
    right_on=['trip_date', 'DOLocationID', 'dropoff_time'],
    how='left'
).drop(columns=['DOLocationID', 'dropoff_time'], errors='ignore')

# Fill NaN values (where a zone had no pickups/dropoffs for a specific date/daytime) with 0
daily_zone_daytime_data_2020 = daily_zone_daytime_data_2020.fillna(0)


# Normalize DAILY pickups and drop offs
scaler_pickup = MinMaxScaler(feature_range=(1, 10))
scaler_dropoff = MinMaxScaler(feature_range=(1, 10))

daily_zone_daytime_data_2020["daily_dropoff_count_scaled"] = scaler_dropoff.fit_transform(daily_zone_daytime_data_2020[["daily_dropoff_count"]])
daily_zone_daytime_data_2020["daily_pickup_count_scaled"] = scaler_pickup.fit_transform(daily_zone_daytime_data_2020[["daily_pickup_count"]])


# Calculate the DAILY foot traffic score
daily_zone_daytime_data_2020["daily_foot_traffic_score"] = (
    0.7 * daily_zone_daytime_data_2020["daily_dropoff_count_scaled"] +
    0.3 * daily_zone_daytime_data_2020["daily_pickup_count_scaled"]
)

# Calculate the AVERAGE foot traffic score per zone per daytime
average_zone_daytime_scores_2020 = daily_zone_daytime_data_2020.groupby(['LocationID', 'daytime_category']).agg(
    average_foot_traffic_score=('daily_foot_traffic_score', 'mean'),
    std_foot_traffic_score=('daily_foot_traffic_score', 'std'), # Optional: standard deviation of daily scores
    num_days_recorded=('daily_foot_traffic_score', 'count') # How many days contributed to this average
).reset_index()

print("\n--- Average Foot Traffic Score per Zone per Daytime for 2020 ---")
print(average_zone_daytime_scores_2020.head(10)) # Display first 10 rows
print(f"\nShape of average_zone_daytime_scores_2020: {average_zone_daytime_scores_2020.shape}")
print(average_zone_daytime_scores_2020.info())


--- Average Foot Traffic Score per Zone per Daytime for 2020 ---
   LocationID daytime_category  average_foot_traffic_score  \
0           4        afternoon                    1.118557   
1           4          evening                    1.210089   
2           4          morning                    1.047516   
3          12        afternoon                    1.013198   
4          12          evening                    1.007476   
5          12          morning                    1.014248   
6          13        afternoon                    1.226177   
7          13          evening                    1.231847   
8          13          morning                    1.184655   
9          24        afternoon                    1.117078   

   std_foot_traffic_score  num_days_recorded  
0                0.086755                377  
1                0.292994                377  
2                0.039084                377  
3                0.019630                377  
4               

In [7]:
#FILTERS: View or use specific time-of-day groups

# Filter for MORNING trips
morning_df_2020 = df_2020[(df_2020['pickup_time'] == 'morning') & (df_2020['dropoff_time'] == 'morning')]
print("Morning Manhattan trips sample:")
print(morning_df_2020.head())

# Filter for AFTERNOON trips
afternoon_df_2020 = df_2020[(df_2020['pickup_time'] == 'afternoon') & (df_2020['dropoff_time'] == 'afternoon')]
print("\nAfternoon Manhattan trips sample:")
print(afternoon_df_2020.head())

# Filter for EVENING/NIGHT trips
evening_df_2020 = df_2020[(df_2020['pickup_time'] == 'evening') & (df_2020['dropoff_time'] == 'evening')]
print("\nEvening Manhattan trips sample:")
print(evening_df_2020.head())

Morning Manhattan trips sample:
      tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
11405  2020-07-04 07:18:32   2020-07-04 07:18:50              1.0   
11496  2020-07-04 09:26:57   2020-07-04 09:30:18              1.0   
16478  2020-07-28 09:02:59   2020-07-28 09:08:19              0.0   
35671  2020-07-06 11:47:16   2020-07-06 11:57:06              NaN   
35809  2020-07-06 11:51:00   2020-07-06 11:55:44              NaN   

       PULocationID  DOLocationID  pickup_hour  dropoff_hour pickup_time  \
11405           100           100            7             7     morning   
11496            75            75            9             9     morning   
16478            43            43            9             9     morning   
35671           143           161           11            11     morning   
35809            48           161           11            11     morning   

      dropoff_time  
11405      morning  
11496      morning  
16478      morning  
35671      m

**2021**

In [12]:
import pandas as pd

# 1. Load the CSV file
df_2021 = pd.read_csv('2021_Yellow_Taxi_Trip_Data.csv')

# 2. Convert pickup datetime using the correct format (e.g., "01/01/2021 12:30:10 AM")
df_2021['tpep_pickup_datetime'] = pd.to_datetime(
    df_2021['tpep_pickup_datetime'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce'
)

df_2021['tpep_dropoff_datetime'] = pd.to_datetime(
    df_2021['tpep_dropoff_datetime'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce'
)

# 3. Drop rows where conversion failed
df_2021 = df_2021.dropna(subset=['tpep_pickup_datetime'])
df_2021 = df_2021.dropna(subset=['tpep_dropoff_datetime'])

# 3b. Filter for Manhattan trips only (pickup and dropoff)
manhattan_zone_ids = [
    4, 12, 13, 14, 24, 41, 42, 43, 45, 48, 50, 68, 74, 75, 79, 87, 88, 90,
    100, 107, 113, 114, 116, 125, 127, 128, 137, 140, 141, 142, 143, 144, 148,
    151, 152, 153, 158, 161, 162, 163, 164, 166, 170, 186, 230, 231, 232, 233, 234
]
df_2021 = df_2021[
    df_2021['PULocationID'].isin(manhattan_zone_ids) &
    df_2021['DOLocationID'].isin(manhattan_zone_ids)
]

# 4. Extract pickup hour
df_2021['pickup_hour'] = df_2021['tpep_pickup_datetime'].dt.hour
df_2021['dropoff_hour'] = df_2021['tpep_dropoff_datetime'].dt.hour

# 5. Assign time of day
def get_time_of_day(hour):
    if 7 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 18:
        return 'afternoon'
    else:
        return 'evening'  # Consistent with your previous naming

#Remove unnecessary columns
df_2021 = df_2021.drop(['VendorID','trip_distance','RatecodeID','store_and_fwd_flag','payment_type',
 'fare_amount','extra','mta_tax','tip_amount','tolls_amount','improvement_surcharge','total_amount','congestion_surcharge'],axis=1)

  df_2021 = pd.read_csv('2021_Yellow_Taxi_Trip_Data.csv')


In [13]:
df_2021['pickup_time'] = df_2021['pickup_hour'].apply(get_time_of_day)
df_2021['dropoff_time'] = df_2021['dropoff_hour'].apply(get_time_of_day)

#Count pick ups and drop offs
pickup_counts_2021 = df_2021.groupby("PULocationID").size().reset_index(name="pickup_count")
dropoff_counts_2021 = df_2021.groupby("DOLocationID").size().reset_index(name="dropoff_count")

#Merge zones with pick up and drop off data
zones = zones[zones["borough"] == "Manhattan"]
zones_2021 = zones.merge(pickup_counts_2021, left_on="LocationID", right_on="PULocationID", how="left")
zones_2021 = zones_2021.merge(dropoff_counts_2021, left_on="LocationID", right_on="DOLocationID", how="left")
zones_2021 = zones_2021.fillna(0) 

#Normalize pickups and drop offs
scaler = MinMaxScaler((1, 10))
zones_2021["dropoff_count2021"] = scaler.fit_transform(zones_2021[["dropoff_count"]])
zones_2021["pickup_count2021"] = scaler.fit_transform(zones_2021[["pickup_count"]])

#weighted sum of normalized score foot traffic score_2020 (0.7drop off + 0.3pickup)
zones_2021["foot_traffic_score2021"] = (0.7 * zones_2021["dropoff_count2021"] + 0.3 * zones_2021["pickup_count2021"])

In [None]:
# Add a 'trip_date' column to df_2021
df_2021['trip_date'] = df_2021['tpep_pickup_datetime'].dt.date
df_2021['trip_date'] = pd.to_datetime(df_2021['trip_date']) # Convert to datetime objects for consistency

# Load taxi zones and filter for Manhattan (needed for all_manhattan_combinations)
zones = gpd.read_file("taxi_zones/taxi_zones.shp")
zones = zones[zones["borough"] == "Manhattan"].copy()
manhattan_location_ids = zones['LocationID'].unique() # Get all unique Manhattan zone IDs


# Calculate DAILY pickup and dropoff counts per zone per daytime
daily_pickup_counts_2021 = df_2021.groupby(['trip_date', 'PULocationID', 'pickup_time']).size().reset_index(name='daily_pickup_count')
daily_dropoff_counts_2021 = df_2021.groupby(['trip_date', 'DOLocationID', 'dropoff_time']).size().reset_index(name='daily_dropoff_count')


# Create a comprehensive set of ALL possible daily zone-daytime combinations
time_of_day_categories = ['morning', 'afternoon', 'evening']
all_2021_dates = df_2021['trip_date'].unique() # Get all unique dates in the 2020 data

all_daily_zone_daytime_combinations = pd.DataFrame(
    list(product(all_2021_dates, manhattan_location_ids, time_of_day_categories)),
    columns=['trip_date', 'LocationID', 'daytime_category']
)
all_daily_zone_daytime_combinations['trip_date'] = pd.to_datetime(all_daily_zone_daytime_combinations['trip_date'])


# Merge daily counts onto the combinations (left join to keep all combinations)
daily_zone_daytime_data_2021 = all_daily_zone_daytime_combinations.merge(
    daily_pickup_counts_2021,
    left_on=['trip_date', 'LocationID', 'daytime_category'],
    right_on=['trip_date', 'PULocationID', 'pickup_time'],
    how='left'
).drop(columns=['PULocationID', 'pickup_time'], errors='ignore')

daily_zone_daytime_data_2021 = daily_zone_daytime_data_2021.merge(
    daily_dropoff_counts_2021,
    left_on=['trip_date', 'LocationID', 'daytime_category'],
    right_on=['trip_date', 'DOLocationID', 'dropoff_time'],
    how='left'
).drop(columns=['DOLocationID', 'dropoff_time'], errors='ignore')

# Fill NaN values (where a zone had no pickups/dropoffs for a specific date/daytime) with 0
daily_zone_daytime_data_2021 = daily_zone_daytime_data_2021.fillna(0)


# Normalize DAILY pickups and drop offs
scaler_pickup = MinMaxScaler(feature_range=(1, 10))
scaler_dropoff = MinMaxScaler(feature_range=(1, 10))

daily_zone_daytime_data_2021["daily_dropoff_count_scaled"] = scaler_dropoff.fit_transform(daily_zone_daytime_data_2021[["daily_dropoff_count"]])
daily_zone_daytime_data_2021["daily_pickup_count_scaled"] = scaler_pickup.fit_transform(daily_zone_daytime_data_2021[["daily_pickup_count"]])


# Calculate the DAILY foot traffic score
daily_zone_daytime_data_2021["daily_foot_traffic_score"] = (
    0.7 * daily_zone_daytime_data_2021["daily_dropoff_count_scaled"] +
    0.3 * daily_zone_daytime_data_2021["daily_pickup_count_scaled"]
)

# Calculate the AVERAGE foot traffic score per zone per daytime
average_zone_daytime_scores_2021 = daily_zone_daytime_data_2021.groupby(['LocationID', 'daytime_category']).agg(
    average_foot_traffic_score=('daily_foot_traffic_score', 'mean'),
    std_foot_traffic_score=('daily_foot_traffic_score', 'std'), # Optional: standard deviation of daily scores
    num_days_recorded=('daily_foot_traffic_score', 'count') # How many days contributed to this average
).reset_index()

print("\n--- Average Foot Traffic Score per Zone per Daytime for 2021 ---")
print(average_zone_daytime_scores_2021.head(10)) # Display first 10 rows
print(f"\nShape of average_zone_daytime_scores_2021: {average_zone_daytime_scores_2021.shape}")
print(average_zone_daytime_scores_2021.info())


--- Average Foot Traffic Score per Zone per Daytime for 2020 ---
   LocationID daytime_category  average_foot_traffic_score  \
0           4        afternoon                    1.286019   
1           4          evening                    1.454015   
2           4          morning                    1.098795   
3          12        afternoon                    1.043264   
4          12          evening                    1.016260   
5          12          morning                    1.041157   
6          13        afternoon                    1.550791   
7          13          evening                    1.511595   
8          13          morning                    1.355082   
9          24        afternoon                    1.275241   

   std_foot_traffic_score  num_days_recorded  
0                0.092383                386  
1                0.252382                386  
2                0.036534                386  
3                0.030516                386  
4               

In [14]:
# 6. FILTERS: View or use specific time-of-day groups

# Filter for MORNING trips
morning_df_2021 = df_2021[(df_2021['pickup_time'] == 'morning') & (df_2021['dropoff_time'] == 'morning')]
print("Morning Manhattan trips sample:")
print(morning_df_2021.head())

# Filter for AFTERNOON trips
afternoon_df_2021 = df_2021[(df_2021['pickup_time'] == 'afternoon') & (df_2021['dropoff_time'] == 'afternoon')]
print("\nAfternoon Manhattan trips sample:")
print(afternoon_df_2021.head())

# Filter for EVENING/NIGHT trips
evening_df_2021 = df_2021[(df_2021['pickup_time'] == 'evening') & (df_2021['dropoff_time'] == 'evening')]
print("\nEvening Manhattan trips sample:")
print(evening_df_2021.head())

Morning Manhattan trips sample:
     tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
4056  2021-01-01 07:01:47   2021-01-01 07:09:00              1.0   
4077  2021-01-01 07:00:08   2021-01-01 07:09:59              1.0   
4338  2021-01-01 07:46:26   2021-01-01 08:01:36              1.0   
4340  2021-01-01 07:09:54   2021-01-01 07:13:45              1.0   
4341  2021-01-01 07:23:29   2021-01-01 07:30:48              1.0   

      PULocationID  DOLocationID  pickup_hour  dropoff_hour pickup_time  \
4056            75            24            7             7     morning   
4077            75            42            7             7     morning   
4338           142           158            7             8     morning   
4340            79           170            7             7     morning   
4341            79           162            7             7     morning   

     dropoff_time  
4056      morning  
4077      morning  
4338      morning  
4340      morning  
4341    

**2022**

In [16]:
import pandas as pd

# 1. Load the CSV file
df_2022 = pd.read_csv('2022_Yellow_Taxi_Trip_Data.csv')

# 2. Convert pickup datetime using the correct format (e.g., "01/01/2021 12:30:10 AM")
df_2022['tpep_pickup_datetime'] = pd.to_datetime(
    df_2022['tpep_pickup_datetime'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce'
)

df_2022['tpep_dropoff_datetime'] = pd.to_datetime(
    df_2022['tpep_dropoff_datetime'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce'
)

# 3. Drop rows where conversion failed
df_2022 = df_2022.dropna(subset=['tpep_pickup_datetime'])
df_2022 = df_2022.dropna(subset=['tpep_dropoff_datetime'])

# 3b. Filter for Manhattan trips only (pickup and dropoff)

df_2022 = df_2022[
    df_2022['PULocationID'].isin(manhattan_zone_ids) &
    df_2022['DOLocationID'].isin(manhattan_zone_ids)
]

# 4. Extract pickup hour
df_2022['pickup_hour'] = df_2022['tpep_pickup_datetime'].dt.hour
df_2022['dropoff_hour'] = df_2022['tpep_dropoff_datetime'].dt.hour

# 5. Assign time of day
def get_time_of_day(hour):
    if 7 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 18:
        return 'afternoon'
    else:
        return 'evening'  # Consistent with your previous naming
    
#Remove unnecessary columns
df_2022 = df_2022.drop(['VendorID','trip_distance','RatecodeID','store_and_fwd_flag','payment_type',
 'fare_amount','extra','mta_tax','tip_amount','tolls_amount','improvement_surcharge','total_amount','congestion_surcharge'],axis=1)

  df_2022 = pd.read_csv('2022_Yellow_Taxi_Trip_Data.csv')


In [17]:
df_2022['pickup_time'] = df_2022['pickup_hour'].apply(get_time_of_day)
df_2022['dropoff_time'] = df_2022['dropoff_hour'].apply(get_time_of_day)

#Count pick ups and drop offs
pickup_counts_2022 = df_2022.groupby("PULocationID").size().reset_index(name="pickup_count")
dropoff_counts_2022 = df_2022.groupby("DOLocationID").size().reset_index(name="dropoff_count")

#Merge zones with pick up and drop off data
zones = zones[zones["borough"] == "Manhattan"]
zones_2022 = zones.merge(pickup_counts_2022, left_on="LocationID", right_on="PULocationID", how="left")
zones_2022 = zones_2022.merge(dropoff_counts_2022, left_on="LocationID", right_on="DOLocationID", how="left")
zones_2022 = zones_2022.fillna(0) 

#Normalize pickups and drop offs
scaler = MinMaxScaler((1, 10))
zones_2022["dropoff_count2022"] = scaler.fit_transform(zones_2022[["dropoff_count"]])
zones_2022["pickup_count2022"] = scaler.fit_transform(zones_2022[["pickup_count"]])

#weighted sum of normalized score foot traffic score_2020 (0.7drop off + 0.3pickup)
zones_2022["foot_traffic_score2022"] = (0.7 * zones_2022["dropoff_count2022"] + 0.3 * zones_2022["pickup_count2022"])

In [31]:
from itertools import product
# Add a 'trip_date' column to df_2022
df_2022['trip_date'] = df_2022['tpep_pickup_datetime'].dt.date
df_2022['trip_date'] = pd.to_datetime(df_2022['trip_date']) # Convert to datetime objects for consistency

# Load taxi zones and filter for Manhattan (needed for all_manhattan_combinations)
zones = gpd.read_file("taxi_zones/taxi_zones.shp")
zones = zones[zones["borough"] == "Manhattan"].copy()
manhattan_location_ids = zones['LocationID'].unique() # Get all unique Manhattan zone IDs


# Calculate DAILY pickup and dropoff counts per zone per daytime
daily_pickup_counts_2022 = df_2022.groupby(['trip_date', 'PULocationID', 'pickup_time']).size().reset_index(name='daily_pickup_count')
daily_dropoff_counts_2022 = df_2022.groupby(['trip_date', 'DOLocationID', 'dropoff_time']).size().reset_index(name='daily_dropoff_count')


# Create a comprehensive set of ALL possible daily zone-daytime combinations
time_of_day_categories = ['morning', 'afternoon', 'evening']
all_2022_dates = df_2022['trip_date'].unique() # Get all unique dates in the 2020 data

all_daily_zone_daytime_combinations = pd.DataFrame(
    list(product(all_2022_dates, manhattan_location_ids, time_of_day_categories)),
    columns=['trip_date', 'LocationID', 'daytime_category']
)
all_daily_zone_daytime_combinations['trip_date'] = pd.to_datetime(all_daily_zone_daytime_combinations['trip_date'])


# Merge daily counts onto the combinations (left join to keep all combinations)
daily_zone_daytime_data_2022 = all_daily_zone_daytime_combinations.merge(
    daily_pickup_counts_2022,
    left_on=['trip_date', 'LocationID', 'daytime_category'],
    right_on=['trip_date', 'PULocationID', 'pickup_time'],
    how='left'
).drop(columns=['PULocationID', 'pickup_time'], errors='ignore')

daily_zone_daytime_data_2022 = daily_zone_daytime_data_2022.merge(
    daily_dropoff_counts_2022,
    left_on=['trip_date', 'LocationID', 'daytime_category'],
    right_on=['trip_date', 'DOLocationID', 'dropoff_time'],
    how='left'
).drop(columns=['DOLocationID', 'dropoff_time'], errors='ignore')

# Fill NaN values (where a zone had no pickups/dropoffs for a specific date/daytime) with 0
daily_zone_daytime_data_2022 = daily_zone_daytime_data_2022.fillna(0)


# Normalize DAILY pickups and drop offs
scaler_pickup = MinMaxScaler(feature_range=(1, 10))
scaler_dropoff = MinMaxScaler(feature_range=(1, 10))

daily_zone_daytime_data_2022["daily_dropoff_count_scaled"] = scaler_dropoff.fit_transform(daily_zone_daytime_data_2022[["daily_dropoff_count"]])
daily_zone_daytime_data_2022["daily_pickup_count_scaled"] = scaler_pickup.fit_transform(daily_zone_daytime_data_2022[["daily_pickup_count"]])


# Calculate the DAILY foot traffic score
daily_zone_daytime_data_2022["daily_foot_traffic_score"] = (
    0.7 * daily_zone_daytime_data_2022["daily_dropoff_count_scaled"] +
    0.3 * daily_zone_daytime_data_2022["daily_pickup_count_scaled"]
)

# Calculate the AVERAGE foot traffic score per zone per daytime
average_zone_daytime_scores_2022 = daily_zone_daytime_data_2022.groupby(['LocationID', 'daytime_category']).agg(
    average_foot_traffic_score=('daily_foot_traffic_score', 'mean'),
    std_foot_traffic_score=('daily_foot_traffic_score', 'std'), # Optional: standard deviation of daily scores
    num_days_recorded=('daily_foot_traffic_score', 'count') # How many days contributed to this average
).reset_index()

print("\n--- Average Foot Traffic Score per Zone per Daytime for 2022 ---")
print(average_zone_daytime_scores_2022.head(10)) # Display first 10 rows
print(f"\nShape of average_zone_daytime_scores_2022: {average_zone_daytime_scores_2022.shape}")
print(average_zone_daytime_scores_2022.info())


--- Average Foot Traffic Score per Zone per Daytime for 2022 ---
   LocationID daytime_category  average_foot_traffic_score  \
0           4        afternoon                    1.279625   
1           4          evening                    1.648151   
2           4          morning                    1.101747   
3          12        afternoon                    1.068417   
4          12          evening                    1.019805   
5          12          morning                    1.088647   
6          13        afternoon                    1.752623   
7          13          evening                    1.745907   
8          13          morning                    1.642062   
9          24        afternoon                    1.278982   

   std_foot_traffic_score  num_days_recorded  
0                0.082010                379  
1                0.252846                379  
2                0.032626                379  
3                0.035571                379  
4               

In [19]:
# 6. FILTERS: View or use specific time-of-day groups
# Filter for MORNING trips
morning_df_2022 = df_2022[(df_2022['pickup_time'] == 'morning') & (df_2022['dropoff_time'] == 'morning')]
print("Morning Manhattan trips sample:")
print(morning_df_2022.head())

# Filter for AFTERNOON trips
afternoon_df_2022 = df_2022[(df_2022['pickup_time'] == 'afternoon') & (df_2022['dropoff_time'] == 'afternoon')]
print("\nAfternoon Manhattan trips sample:")
print(afternoon_df_2022.head())

# Filter for EVENING/NIGHT trips
evening_df_2022 = df_2022[(df_2022['pickup_time'] == 'evening') & (df_2022['dropoff_time'] == 'evening')]
print("\nEvening Manhattan trips sample:")
print(evening_df_2022.head())

Morning Manhattan trips sample:
      tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
17786  2022-01-01 07:04:30   2022-01-01 07:09:38              1.0   
17966  2022-01-01 07:08:43   2022-01-01 07:10:46              2.0   
17974  2022-01-01 07:36:46   2022-01-01 07:44:54              0.0   
17986  2022-01-01 07:48:30   2022-01-01 08:05:50              1.0   
17988  2022-01-01 07:24:58   2022-01-01 07:32:53              1.0   

       PULocationID  DOLocationID  airport_fee  pickup_hour  dropoff_hour  \
17786           163           143          0.0            7             7   
17966           186            48          0.0            7             7   
17974           107           140          0.0            7             7   
17986           186            79          0.0            7             8   
17988           151            50          0.0            7             7   

      pickup_time dropoff_time  
17786     morning      morning  
17966     morning      m

**2023**

In [22]:
import pandas as pd

# 1. Load the CSV file
df_2023 = pd.read_csv('2023_Yellow_Taxi_Trip_Data.csv')

# 2. Convert pickup datetime using the correct format (e.g., "01/01/2021 12:30:10 AM")
df_2023['tpep_pickup_datetime'] = pd.to_datetime(
    df_2023['tpep_pickup_datetime'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce'
)

df_2023['tpep_dropoff_datetime'] = pd.to_datetime(
    df_2023['tpep_dropoff_datetime'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce'
)

# 3. Drop rows where conversion failed
df_2023 = df_2023.dropna(subset=['tpep_pickup_datetime'])
df_2023 = df_2023.dropna(subset=['tpep_dropoff_datetime'])

# 3b. Filter for Manhattan trips only (pickup and dropoff)

df_2023 = df_2023[
    df_2023['PULocationID'].isin(manhattan_zone_ids) &
    df_2023['DOLocationID'].isin(manhattan_zone_ids)
]

# 4. Extract pickup hour
df_2023['pickup_hour'] = df_2023['tpep_pickup_datetime'].dt.hour
df_2023['dropoff_hour'] = df_2023['tpep_dropoff_datetime'].dt.hour

# 5. Assign time of day
def get_time_of_day(hour):
    if 7 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 18:
        return 'afternoon'
    else:
        return 'evening'  # Consistent with your previous naming

#Remove unnecessary columns
df_2023 = df_2023.drop(['trip_distance', 'fare_amount','total_amount'],axis=1)

In [23]:
df_2023['pickup_time'] = df_2023['pickup_hour'].apply(get_time_of_day)
df_2023['dropoff_time'] = df_2023['dropoff_hour'].apply(get_time_of_day)

#Count pick ups and drop offs
pickup_counts_2023 = df_2023.groupby("PULocationID").size().reset_index(name="pickup_count")
dropoff_counts_2023 = df_2023.groupby("DOLocationID").size().reset_index(name="dropoff_count")

#Merge zones with pick up and drop off data
zones = zones[zones["borough"] == "Manhattan"]
zones_2023 = zones.merge(pickup_counts_2023, left_on="LocationID", right_on="PULocationID", how="left")
zones_2023 = zones_2023.merge(dropoff_counts_2023, left_on="LocationID", right_on="DOLocationID", how="left")
zones_2023 = zones_2023.fillna(0) 

#Normalize pickups and drop offs
scaler = MinMaxScaler((1, 10))
zones_2023["dropoff_count2023"] = scaler.fit_transform(zones_2023[["dropoff_count"]])
zones_2023["pickup_count2023"] = scaler.fit_transform(zones_2023[["pickup_count"]])

#weighted sum of normalized score foot traffic score_2020 (0.7drop off + 0.3pickup)
zones_2023["foot_traffic_score2023"] = (0.7 * zones_2023["dropoff_count2023"] + 0.3 * zones_2023["pickup_count2023"])

In [32]:
from itertools import product
# Add a 'trip_date' column to df_2023
df_2023['trip_date'] = df_2023['tpep_pickup_datetime'].dt.date
df_2023['trip_date'] = pd.to_datetime(df_2023['trip_date']) # Convert to datetime objects for consistency

# Load taxi zones and filter for Manhattan (needed for all_manhattan_combinations)
zones = gpd.read_file("taxi_zones/taxi_zones.shp")
zones = zones[zones["borough"] == "Manhattan"].copy()
manhattan_location_ids = zones['LocationID'].unique() # Get all unique Manhattan zone IDs


# Calculate DAILY pickup and dropoff counts per zone per daytime
daily_pickup_counts_2023 = df_2023.groupby(['trip_date', 'PULocationID', 'pickup_time']).size().reset_index(name='daily_pickup_count')
daily_dropoff_counts_2023 = df_2023.groupby(['trip_date', 'DOLocationID', 'dropoff_time']).size().reset_index(name='daily_dropoff_count')


# Create a comprehensive set of ALL possible daily zone-daytime combinations
time_of_day_categories = ['morning', 'afternoon', 'evening']
all_2023_dates = df_2023['trip_date'].unique() # Get all unique dates in the 2020 data

all_daily_zone_daytime_combinations = pd.DataFrame(
    list(product(all_2020_dates, manhattan_location_ids, time_of_day_categories)),
    columns=['trip_date', 'LocationID', 'daytime_category']
)
all_daily_zone_daytime_combinations['trip_date'] = pd.to_datetime(all_daily_zone_daytime_combinations['trip_date'])


# Merge daily counts onto the combinations (left join to keep all combinations)
daily_zone_daytime_data_2023 = all_daily_zone_daytime_combinations.merge(
    daily_pickup_counts_2023,
    left_on=['trip_date', 'LocationID', 'daytime_category'],
    right_on=['trip_date', 'PULocationID', 'pickup_time'],
    how='left'
).drop(columns=['PULocationID', 'pickup_time'], errors='ignore')

daily_zone_daytime_data_2023 = daily_zone_daytime_data_2023.merge(
    daily_dropoff_counts_2023,
    left_on=['trip_date', 'LocationID', 'daytime_category'],
    right_on=['trip_date', 'DOLocationID', 'dropoff_time'],
    how='left'
).drop(columns=['DOLocationID', 'dropoff_time'], errors='ignore')

# Fill NaN values (where a zone had no pickups/dropoffs for a specific date/daytime) with 0
daily_zone_daytime_data_2023 = daily_zone_daytime_data_2023.fillna(0)


# Normalize DAILY pickups and drop offs
scaler_pickup = MinMaxScaler(feature_range=(1, 10))
scaler_dropoff = MinMaxScaler(feature_range=(1, 10))

daily_zone_daytime_data_2023["daily_dropoff_count_scaled"] = scaler_dropoff.fit_transform(daily_zone_daytime_data_2023[["daily_dropoff_count"]])
daily_zone_daytime_data_2023["daily_pickup_count_scaled"] = scaler_pickup.fit_transform(daily_zone_daytime_data_2023[["daily_pickup_count"]])


# Calculate the DAILY foot traffic score
daily_zone_daytime_data_2023["daily_foot_traffic_score"] = (
    0.7 * daily_zone_daytime_data_2023["daily_dropoff_count_scaled"] +
    0.3 * daily_zone_daytime_data_2023["daily_pickup_count_scaled"]
)

# Calculate the AVERAGE foot traffic score per zone per daytime
average_zone_daytime_scores_2023 = daily_zone_daytime_data_2020.groupby(['LocationID', 'daytime_category']).agg(
    average_foot_traffic_score=('daily_foot_traffic_score', 'mean'),
    std_foot_traffic_score=('daily_foot_traffic_score', 'std'), # Optional: standard deviation of daily scores
    num_days_recorded=('daily_foot_traffic_score', 'count') # How many days contributed to this average
).reset_index()

print("\n--- Average Foot Traffic Score per Zone per Daytime for 2023 ---")
print(average_zone_daytime_scores_2023.head(10)) # Display first 10 rows
print(f"\nShape of average_zone_daytime_scores_2023: {average_zone_daytime_scores_2023.shape}")
print(average_zone_daytime_scores_2023.info())


--- Average Foot Traffic Score per Zone per Daytime for 2023 ---
   LocationID daytime_category  average_foot_traffic_score  \
0           4        afternoon                    1.118557   
1           4          evening                    1.210089   
2           4          morning                    1.047516   
3          12        afternoon                    1.013198   
4          12          evening                    1.007476   
5          12          morning                    1.014248   
6          13        afternoon                    1.226177   
7          13          evening                    1.231847   
8          13          morning                    1.184655   
9          24        afternoon                    1.117078   

   std_foot_traffic_score  num_days_recorded  
0                0.086755                377  
1                0.292994                377  
2                0.039084                377  
3                0.019630                377  
4               

In [46]:
daily_zone_daytime_data_2023.head()

Unnamed: 0,trip_date,LocationID,daytime_category,daily_pickup_count,daily_dropoff_count,daily_dropoff_count_scaled,daily_pickup_count_scaled,daily_foot_traffic_score
0,2020-01-01,4,morning,0.0,0.0,1.0,1.0,1.0
1,2020-01-01,4,afternoon,0.0,0.0,1.0,1.0,1.0
2,2020-01-01,4,evening,0.0,0.0,1.0,1.0,1.0
3,2020-01-01,12,morning,0.0,0.0,1.0,1.0,1.0
4,2020-01-01,12,afternoon,0.0,0.0,1.0,1.0,1.0


In [47]:
average_zone_daytime_scores_2023.head()

Unnamed: 0,LocationID,daytime_category,average_foot_traffic_score,std_foot_traffic_score,num_days_recorded
0,4,afternoon,1.118557,0.086755,377
1,4,evening,1.210089,0.292994,377
2,4,morning,1.047516,0.039084,377
3,12,afternoon,1.013198,0.01963,377
4,12,evening,1.007476,0.009928,377


In [24]:
# Filter for MORNING trips
morning_df_2023 = df_2023[(df_2023['pickup_time'] == 'morning') & (df_2023['dropoff_time'] == 'morning')]
print("Morning Manhattan trips sample:")
print(morning_df_2023.head())

# Filter for AFTERNOON trips
afternoon_df_2023 = df_2023[(df_2023['pickup_time'] == 'afternoon') & (df_2023['dropoff_time'] == 'afternoon')]
print("\nAfternoon Manhattan trips sample:")
print(afternoon_df_2023.head())

# Filter for EVENING/NIGHT trips
evening_df_2023 = df_2023[(df_2023['pickup_time'] == 'evening') & (df_2023['dropoff_time'] == 'evening')]
print("\nEvening Manhattan trips sample:")
print(evening_df_2023.head())

Morning Manhattan trips sample:
      tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
22337  2023-01-01 07:00:22   2023-01-01 07:03:44              6.0   
22535  2023-01-01 07:47:11   2023-01-01 07:48:42              0.0   
22537  2023-01-01 07:27:55   2023-01-01 07:35:20              6.0   
22541  2023-01-01 07:22:09   2023-01-01 07:37:24              1.0   
22542  2023-01-01 07:48:36   2023-01-01 08:06:16              1.0   

       PULocationID  DOLocationID  tip_amount  pickup_hour  dropoff_hour  \
22337            79           137        0.00            7             7   
22535            79           107        0.00            7             7   
22537           100           107        4.20            7             7   
22541           186            12        5.18            7             7   
22542           100           116        0.00            7             8   

      pickup_time dropoff_time  
22337     morning      morning  
22535     morning      morning

In [26]:
#Save foot traffic score incl. daytime filter of each year as csv
zones_2020.to_csv("foot_traffic_scores2020.csv", index=False)
zones_2021.to_csv("foot_traffic_scores2021.csv", index=False)
zones_2022.to_csv("foot_traffic_scores2022.csv", index=False)
zones_2023.to_csv("foot_traffic_scores2023.csv", index=False)

In [33]:
# --- Concatenate all yearly average scores into a master DataFrame ---
master_prediction_data = pd.concat([
    average_zone_daytime_scores_2020,
    average_zone_daytime_scores_2021,
    average_zone_daytime_scores_2022,
    average_zone_daytime_scores_2023
], ignore_index=True)

print("--- Master Prediction Data (Combined for all years) ---")
print(master_prediction_data.head())
print(master_prediction_data.tail())
print(f"Total shape of master_prediction_data: {master_prediction_data.shape}")
print(master_prediction_data.info())

--- Master Prediction Data (Combined for all years) ---
   LocationID daytime_category  average_foot_traffic_score  \
0           4        afternoon                    1.118557   
1           4          evening                    1.210089   
2           4          morning                    1.047516   
3          12        afternoon                    1.013198   
4          12          evening                    1.007476   

   std_foot_traffic_score  num_days_recorded  
0                0.086755                377  
1                0.292994                377  
2                0.039084                377  
3                0.019630                377  
4                0.009928                377  
     LocationID daytime_category  average_foot_traffic_score  \
799         262          evening                         1.0   
800         262          morning                         1.0   
801         263        afternoon                         1.0   
802         263          evening 

### ML 

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

features = ['LocationID', 'daytime_category'] # 'year' is removed
target = 'average_foot_traffic_score'

X = master_prediction_data[features]
y = master_prediction_data[target]

categorical_features = ['LocationID', 'daytime_category']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # 'remainder' will have no effect here as there are no numerical features other than what's encoded
)

print("\n--- Preprocessor Setup Complete (without 'year' feature) ---")


--- Preprocessor Setup Complete (without 'year' feature) ---


**Training Model**

*Random Forest*

In [44]:
# Using a standard random train-test split instead.
print("\n--- Performing Random Train-Test Split (no 'year' column for chronological split) ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 20% for testing

print(f"Train set shape: {X_train.shape}, {y_train.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))])

print("\n--- Training the RandomForestRegressor model ---")
model.fit(X_train, y_train)
print("Model training complete.")

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n--- Model Evaluation (on random 20% test data) ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2): {r2:.4f}")



--- Performing Random Train-Test Split (no 'year' column for chronological split) ---
Train set shape: (643, 2), (643,)
Test set shape: (161, 2), (161,)

--- Training the RandomForestRegressor model ---
Model training complete.

--- Model Evaluation (on random 20% test data) ---
Mean Absolute Error (MAE): 0.3094
R-squared (R2): 0.4323


**Prediction**

In [45]:
print("\n--- Demonstrating a Prediction (without 'year' in input) ---")

# Example prediction for a specific scenario: Zone 14, Afternoon
# Note: 'year' cannot be specified in the input for prediction as it's not a feature.
new_data_point = pd.DataFrame([[14, 'afternoon']], # LocationID, daytime_category
                              columns=features) # 'features' list no longer contains 'year'

predicted_score = model.predict(new_data_point)[0]

print(f"Predicting for: LocationID=14, Daytime='afternoon' (Prediction will be a general average, not year-specific)")
print(f"Predicted average foot traffic score: {predicted_score:.4f}")

# Example for comparison: Same zone, different daytime categories
new_data_points_comparison = pd.DataFrame([
    [14, 'morning'],
    [14, 'afternoon'],
    [14, 'evening']
], columns=features)

predicted_scores_comparison = model.predict(new_data_points_comparison)

print(f"\nPredicted scores for Zone 14 (different daytimes):")
print(f"  Morning: {predicted_scores_comparison[0]:.4f}")
print(f"  Afternoon: {predicted_scores_comparison[1]:.4f}")
print(f"  Evening: {predicted_scores_comparison[2]:.4f}")


--- Demonstrating a Prediction (without 'year' in input) ---
Predicting for: LocationID=14, Daytime='afternoon' (Prediction will be a general average, not year-specific)
Predicted average foot traffic score: 1.0000

Predicted scores for Zone 14 (different daytimes):
  Morning: 1.0000
  Afternoon: 1.0000
  Evening: 1.0000
