# Chicago Taxi Trips Data Collection  
**Purpose**: Extract taxi trip data from Chicago Open Data API  
**Dataset**: `ajtu-isnz` - Chicago taxi trips with pickup/drop-off centroid coordinates  
**Output**: Raw taxi DataFrame for weather correlation and analysis

In [1]:
import requests
import pandas as pd
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Chicago Open Data API configuration
DATASET_ID = "ajtu-isnz"  # Chicago taxi trips dataset
BASE_URL = f"https://data.cityofchicago.org/resource/{DATASET_ID}.json"

# API token for higher rate limits (optional), read from the environment.
# The X-App-Token header is attached only when a non-blank token is configured,
# so an unset variable yields a plain anonymous request instead of sending an
# empty token value to the API.
_app_token = os.environ.get("CHICAGO_API_TOKEN", "").strip()
headers = {"X-App-Token": _app_token} if _app_token else {}

In [2]:
# Pick the calendar day two months before "now" so the taxi data lines up
# with the timeframe used for the weather data.
target_date = datetime.now() - relativedelta(months=2)
date_str = format(target_date, "%Y-%m-%d")

print(f"Fetching taxi data for: {date_str}")

# Filter on trip start time: from the first second of the day through the
# last second of the same day (both bounds inclusive).
lower_bound = f"trip_start_timestamp >= '{date_str}T00:00:00' "
upper_bound = f"AND trip_start_timestamp <= '{date_str}T23:59:59'"
where_clause = lower_bound + upper_bound

# Query-string parameters for the request; $limit caps the rows requested in one call.
params = {"$where": where_clause}
params["$limit"] = 30000

print(f"API Query: {where_clause}")

Fetching taxi data for: 2025-07-03
API Query: trip_start_timestamp >= '2025-07-03T00:00:00' AND trip_start_timestamp <= '2025-07-03T23:59:59'


In [3]:
# Fetch taxi trips data from Chicago Open Data API.
# One GET request is issued using BASE_URL, headers and params from the cells above.
response = requests.get(BASE_URL, headers=headers, params=params, timeout=60)
response.raise_for_status()  # stop on HTTP errors rather than parsing an error body
taxi_data = response.json()

print(f"Raw taxi records retrieved: {len(taxi_data)}")

# Only a single page is requested, so a result that reaches $limit most likely
# means rows were cut off. Flag it so a partial day is not mistaken for a full one.
row_limit = params.get("$limit")
if row_limit is not None and len(taxi_data) >= int(row_limit):
    print(
        f"WARNING: result reached the $limit of {row_limit} rows - the day is "
        "probably truncated; raise $limit or page through results with $offset."
    )

# Convert to DataFrame for processing
taxi_df = pd.DataFrame(taxi_data)
print(f"DataFrame shape: {taxi_df.shape}")

# Show sample data structure
taxi_df.head(3)

Raw taxi records retrieved: 16688
DataFrame shape: (16688, 23)


Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,...,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
0,601c5ca6ec9b8d34385fa82db331d8a1f25af018,e3a458804fe9298cd1ab49287e64a8a8354d9a0766f062...,2025-07-03T23:45:00.000,2025-07-04T00:00:00.000,660,0.0,6,8,0.0,0.0,...,Cash,Taxi Affiliation Services,41.944226601,-87.655998182,"{'type': 'Point', 'coordinates': [-87.65599818...",41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",,
1,610b029fca3fae89fda3a88b756185727e7df2cb,fc68d8f57ebef02a03efc9ed6c5ebd4488403fa6458091...,2025-07-03T23:45:00.000,2025-07-04T00:00:00.000,1238,15.47,76,8,38.0,8.7,...,Credit Card,Taxicab Insurance Agency Llc,41.97907082,-87.903039661,"{'type': 'Point', 'coordinates': [-87.90303966...",41.90749193,-87.63576009,"{'type': 'Point', 'coordinates': [-87.63576009...",17031980000.0,17031080300.0
2,63afa364a241c6d2e962ca03d261bb467a579189,e5274d6c103515af3ce705182d0bbbea7ca077a6f23b17...,2025-07-03T23:45:00.000,2025-07-03T23:45:00.000,270,0.4,8,8,4.75,0.0,...,Cash,Globe Taxi,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",,


In [4]:
# Data quality summary: row count, column names, and null count per column.
# The pieces are collected first and emitted with one print call, one per line.
report_lines = (
    "=== TAXI DATA QUALITY REPORT ===",
    f"Total records: {len(taxi_df):,}",
    f"Columns: {list(taxi_df.columns)}",
    "\nMissing values per column:",
    taxi_df.isnull().sum(),
)
print(*report_lines, sep="\n")

=== TAXI DATA QUALITY REPORT ===
Total records: 16,688
Columns: ['trip_id', 'taxi_id', 'trip_start_timestamp', 'trip_end_timestamp', 'trip_seconds', 'trip_miles', 'pickup_community_area', 'dropoff_community_area', 'fare', 'tips', 'tolls', 'extras', 'trip_total', 'payment_type', 'company', 'pickup_centroid_latitude', 'pickup_centroid_longitude', 'pickup_centroid_location', 'dropoff_centroid_latitude', 'dropoff_centroid_longitude', 'dropoff_centroid_location', 'pickup_census_tract', 'dropoff_census_tract']

Missing values per column:
trip_id                           0
taxi_id                           0
trip_start_timestamp              0
trip_end_timestamp                0
trip_seconds                     11
trip_miles                        0
pickup_community_area           460
dropoff_community_area         1405
fare                             31
tips                             31
tolls                            31
extras                           31
trip_total                    

In [5]:
# Write the raw taxi DataFrame to CSV for the processing pipeline.
output_path = "csv/raw_taxi_trips.csv"

# Make sure the target folder exists before writing (no error if it already does).
output_dir = os.path.split(output_path)[0]
os.makedirs(output_dir, exist_ok=True)

# index=False keeps the DataFrame's row index out of the file.
taxi_df.to_csv(output_path, index=False)

print(
    f"Raw taxi data saved: {output_path}",
    f"Records saved: {len(taxi_df):,}",
    sep="\n",
)

Raw taxi data saved: csv/raw_taxi_trips.csv
Records saved: 16,688
