In [61]:
# The dataset contains 1.8 million trip records spread across 263 taxi zones.

In [62]:
import pandas as pd
#import geopandas as gpd

# Input locations, relative to the notebook's working directory
TRIPS_PATH = "Data/fhvhv_tripdata_2022-11.parquet"
ZONES_PATH = "Data/tlc-nyc-taxi-zones/taxi_zones.csv"

# Trip records from the 2022-11 parquet file
df_trips = pd.read_parquet(TRIPS_PATH)

# Zone table that maps each LocationID to its zone name
df_zones = pd.read_csv(ZONES_PATH)

# Optional: zone geometries for a later visualization (needs geopandas)
#gdf_zones = gpd.read_file("Data/tlc-nyc-taxi-zones/NYC Taxi Zones.geojson")

# Optional sanity check of what was loaded
#print("Trips:", df_trips.shape)
#print("Zones:", df_zones.shape)

In [63]:
# The zone-name column may be spelled 'Zone' or 'zone'; use whichever is present
zone_col = 'zone'
if 'Zone' in df_zones.columns:
    zone_col = 'Zone'

# Lower-cased copy of the zone names, used for case-insensitive searching
df_zones['zone_lower'] = df_zones[zone_col].str.lower()

def get_location_id(zone_name, zones=None):
    """Resolve free-text user input to a taxi-zone LocationID.

    The search is a case-insensitive, literal substring match against the
    zone-name column. If more than one zone matches (for example, typing
    "east" matches every zone whose name contains "east"), the candidates
    are listed and the user is asked to pick one by number.

    Args:
        zone_name: Text typed by the user; surrounding whitespace is ignored.
        zones: Optional DataFrame with a 'LocationID' column and a 'Zone'
            (or 'zone') name column. Defaults to the notebook-level df_zones.

    Returns:
        The LocationID of the chosen zone, or None when the input is empty,
        nothing matches, or the user's selection is invalid.
    """
    if zones is None:
        zones = df_zones
    name_col = 'Zone' if 'Zone' in zones.columns else 'zone'

    zone_name = zone_name.strip().lower()
    if not zone_name:
        # "" is a substring of every name, so it would otherwise match all zones.
        print("Please enter at least part of a zone name.")
        return None

    # Literal (non-regex) search so characters such as "/" or "(" in the input
    # are not read as a pattern; na=False skips rows whose name is missing.
    hit_mask = zones[name_col].str.lower().str.contains(zone_name, regex=False, na=False)
    hits = zones.loc[hit_mask]
    matches = list(zip(hits['LocationID'].tolist(), hits[name_col].tolist()))

    if len(matches) == 0:
        print(f"No match found for '{zone_name}'")
        return None

    if len(matches) == 1:
        loc_id, name = matches[0]
        print(f"Match found: {name} → LocationID: {loc_id}")
        return loc_id

    print(f"\nMultiple matches for '{zone_name}':")
    for i, (loc_id, name) in enumerate(matches):
        print(f"  {i+1}. {name} (LocationID: {loc_id})")
    selection = input("Enter the number of the correct match: ").strip()
    # isdecimal() rather than isdigit(): isdigit() also accepts characters such
    # as "²", which int() cannot parse.
    if selection.isdecimal():
        index = int(selection) - 1
        if 0 <= index < len(matches):
            return matches[index][0]
    print("Invalid selection.")
    return None

# Resolve the trip endpoints interactively; the resulting LocationIDs are
# saved for the search algorithms that follow.
def _prompt_for_location(prompt_text):
    """Keep asking for a zone name until it resolves to a LocationID.

    Re-prompts whenever get_location_id returns None (no match, or an
    invalid pick from a list of several matches).

    Args:
        prompt_text: The text shown to the user at each attempt.

    Returns:
        A (typed_name, location_id) tuple: the stripped, lower-cased text the
        user entered on the successful attempt and the LocationID it
        resolved to.
    """
    while True:
        typed_name = input(prompt_text).strip().lower()
        location_id = get_location_id(typed_name)
        if location_id is not None:
            return typed_name, location_id


start_name, start_id = _prompt_for_location("Enter your START location: ")
end_name, end_id = _prompt_for_location("Enter your END location: ")

# Uncomment to check the resolved IDs
#print(f"\nStart ID: {start_id}")
#print(f"End ID: {end_id}")

Match found: Clinton East → LocationID: 48
Match found: Co-Op City → LocationID: 51


In [64]:
# Order the trips by (pickup zone, drop-off zone) so that all trips for a given
# pair form one contiguous run that a binary search can locate.
# The sort costs O(n log n); pandas performs this multi-key sort itself (it is
# not Python's built-in Timsort).
df_trips = (
    df_trips
    .sort_values(by=['PULocationID', 'DOLocationID'])
    .reset_index(drop=True)  # positions 0..n-1 now follow the sorted order
)

# Map license codes to company names
company_map = {
    'HV0002': 'Juno',
    'HV0003': 'Uber',
    'HV0004': 'Via',
    'HV0005': 'Lyft'
}

# LocationID -> zone name, for readable output
zone_col = 'Zone' if 'Zone' in df_zones.columns else 'zone'
zone_lookup = dict(zip(df_zones['LocationID'], df_zones[zone_col]))

# Fall back to the raw ID when a LocationID has no entry in the zone table
start_zone = zone_lookup.get(start_id, f"ID {start_id}")
end_zone = zone_lookup.get(end_id, f"ID {end_id}")

def lower_bound(trips, start_id, end_id):
    """Binary-search sorted trips for the first (start_id, end_id) row.

    Args:
        trips: DataFrame with 'PULocationID' and 'DOLocationID' columns,
            sorted ascending by that pair. Rows are read by position.
        start_id: Pickup LocationID to look for.
        end_id: Drop-off LocationID to look for.

    Returns:
        The position of the first row whose (pickup, drop-off) pair equals
        (start_id, end_id), or -1 if no row has that pair.
    """
    target = (start_id, end_id)
    first_hit = -1
    lo, hi = 0, len(trips) - 1

    while lo <= hi:
        mid = (lo + hi) // 2
        probe = trips.iloc[mid]
        key = (probe['PULocationID'], probe['DOLocationID'])

        if key < target:
            # Everything up to and including mid is too small
            lo = mid + 1
            continue

        if key == target:
            first_hit = mid  # remember it, then keep looking further left
        hi = mid - 1

    return first_hit

def collect_all_matches(trips, start_idx, start_id, end_id, company_names=None):
    """Collect the run of consecutive trips matching a pickup/drop-off pair.

    Starting at position start_idx, rows are read in order until the first
    row whose pickup or drop-off zone differs. Because the trips are expected
    to be sorted by (pickup, drop-off), that first mismatch ends the run.

    Args:
        trips: DataFrame with 'PULocationID', 'DOLocationID',
            'hvfhs_license_num', 'pickup_datetime' and 'dropoff_datetime'
            columns. Rows are read by position.
        start_idx: Position of the first row to examine. A negative value
            (such as the -1 "not found" result of lower_bound) yields an
            empty list instead of wrapping around to the end of the frame.
        start_id: Pickup LocationID to match.
        end_id: Drop-off LocationID to match.
        company_names: Optional mapping from license code to company name.
            Defaults to the notebook-level company_map.

    Returns:
        A list of (pickup_datetime, dropoff_datetime, company) tuples, one per
        matching trip, in row order. Unmapped license codes give 'Unknown'.
    """
    if company_names is None:
        company_names = company_map
    if start_idx < 0:
        return []

    # Walk only the needed columns from start_idx onward rather than
    # building a full row object for every trip.
    tail = trips.iloc[start_idx:]
    columns = zip(
        tail['PULocationID'],
        tail['DOLocationID'],
        tail['hvfhs_license_num'],
        tail['pickup_datetime'],
        tail['dropoff_datetime'],
    )

    results = []
    for pickup_zone, dropoff_zone, license_num, pickup, dropoff in columns:
        if pickup_zone != start_id or dropoff_zone != end_id:
            break  # since it's sorted, we can stop early
        company = company_names.get(license_num, 'Unknown')
        results.append((pickup, dropoff, company))
    return results

# Locate the first trip for the chosen pair with the binary search, then list
# every trip in that run.
first_match_idx = lower_bound(df_trips, start_id, end_id)

if first_match_idx == -1:
    # -1 is lower_bound's "no such pair" result
    print(f"No matching trips found from {start_zone} to {end_zone}.")
else:
    matches = collect_all_matches(df_trips, first_match_idx, start_id, end_id)
    print(f"\nFound {len(matches)} matching trips from {start_zone} to {end_zone}:\n")
    for i, (pickup, dropoff, company) in enumerate(matches, start=1):
        print(f"{i}. From: {start_zone}  →  To: {end_zone}  |  Pickup: {pickup}  |  Dropoff: {dropoff}  |  Company: {company}")



Found 85 matching trips from Clinton East to Co-Op City:

1. From: Clinton East  →  To: Co-Op City  |  Pickup: 2022-11-02 21:20:51  |  Dropoff: 2022-11-02 21:51:33  |  Company: Uber
2. From: Clinton East  →  To: Co-Op City  |  Pickup: 2022-11-03 00:11:33  |  Dropoff: 2022-11-03 00:34:03  |  Company: Uber
3. From: Clinton East  →  To: Co-Op City  |  Pickup: 2022-11-03 21:12:40  |  Dropoff: 2022-11-03 21:48:10  |  Company: Uber
4. From: Clinton East  →  To: Co-Op City  |  Pickup: 2022-11-04 01:47:38  |  Dropoff: 2022-11-04 02:14:32  |  Company: Uber
5. From: Clinton East  →  To: Co-Op City  |  Pickup: 2022-11-04 07:11:56  |  Dropoff: 2022-11-04 07:52:54  |  Company: Uber
6. From: Clinton East  →  To: Co-Op City  |  Pickup: 2022-11-04 10:58:41  |  Dropoff: 2022-11-04 11:28:33  |  Company: Uber
7. From: Clinton East  →  To: Co-Op City  |  Pickup: 2022-11-05 01:53:13  |  Dropoff: 2022-11-05 02:35:07  |  Company: Lyft
8. From: Clinton East  →  To: Co-Op City  |  Pickup: 2022-11-05 02:06:29 