# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1VERPjEZcC1XSs4-02aM-DbkNr_yaJVbFjLJxaYQswqA/edit#)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_Anything in italics (prose) or comments (in code) is meant to provide you with guidance. **Remove the italic lines and provided comments** before submitting the project, if you choose to use this scaffolding. We don't need the guidance when grading._

_**All code below should be consider "pseudo-code" - not functional by itself, and only a suggestion at the approach.**_

## Project Setup

In [3]:
# all import statements needed for the project, for example:
import os
import bs4
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import requests
import sqlalchemy as db
import re
from datetime import datetime, timedelta
import numpy as np
import fiona
import math

In [4]:
# any constants you might need; some have been added for you, and 
# some you need to fill in

TLC_URL = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

TAXI_ZONES_DIR = ""
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [5]:
# Make sure the QUERY_DIRECTORY exists
try:
    os.mkdir(QUERY_DIRECTORY)
except Exception as e:
    if e.errno == 17:
        # the directory already exists
        pass
    else:
        raise

## Part 1: Data Preprocessing

### Load Taxi Zones

In [8]:
def load_taxi_zones(file_path):
    geofile = gpd.read_file(file_path)
    return geofile
    
taxi_zones = load_taxi_zones("taxi_zones.shp")

In [9]:
def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones):
    if loaded_taxi_zones.crs is None:
        loaded_taxi_zones = loaded_taxi_zones.set_crs(epsg=2263)

    # Find the zone with the matching LocationID
    zone = loaded_taxi_zones[loaded_taxi_zones['LocationID'] == zone_loc_id]

    # If no match is found, return None
    if zone.empty:
        return None

    # Temporarily reproject to a projected CRS for accurate centroid calculation
    projected_zone = zone.to_crs(epsg=2263)
    centroid = projected_zone.geometry.centroid.iloc[0]

    # Transform the centroid back to geographic CRS (latitude/longitude)
    centroid_geo = gpd.GeoSeries([centroid], crs=2263).to_crs(epsg=4326)

    # Return the latitude and longitude as a tuple
    return (centroid_geo.geometry.iloc[0].y, centroid_geo.geometry.iloc[0].x)

### Calculate Sample Size

In [11]:
def calculate_sample_size(population, p = 0.5) -> int:
    """
    Calculates the required sample size using Cochran's formula.

    Args:
        population (int): The total population size.
        confidence_level (float): Confidence level as a proportion (default is 0.95 for 95% confidence).
        margin_of_error (float): Desired margin of error as a proportion (default is 0.05 for 5%).

    Returns:
        int: Calculated sample size.
    """
    # Z-value for confidence level (default: 1.96 for 95%)
    z = 1.96
    margin_of_error = 0.05
    q = 1 - p  # Complementary proportion
    
    # Cochran's sample size formula for infinite population
    n_0 = (z**2 * p * q) / (margin_of_error**2)
    
    # Adjust for finite population size
    sample_size = n_0 / (1 + (n_0 - 1) / population)
    
    return math.ceil(sample_size)

### Common Functions

In [13]:
def get_all_urls_from_page(page_url):
    """
    Fetches all URLs from a given webpage.

    Args:
        page_url (str): URL of the webpage to scrape.

    Returns:
        list: List of all URLs found on the webpage.
    """
    try:
        # Send a GET request to the page
        response = requests.get(page_url)
        response.raise_for_status()  # Raise an HTTPError for bad responses
    except requests.exceptions.RequestException as e:
        raise Exception(f"Failed to access the URL: {page_url}. Error: {e}")
    
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find all anchor tags with href attributes
    links = soup.find_all("a", href=True)
    
    # Extract and return all href attributes
    all_urls = [link["href"] for link in links]
    
    return all_urls

In [14]:
def filter_parquet_urls(links):
    parquet_urls = []
    for url in links:
        # Normalize the URL (strip whitespace, handle cases like trailing slashes)
        url = url.strip()
        # Use regex to ensure matching even with query parameters
        if re.search(r"\.parquet(\?.*)?$", url):
            parquet_urls.append(url)
    return parquet_urls

### Process Taxi Data

In [16]:
def get_and_clean_taxi_month(parquet_url: str) -> pd.DataFrame:
    """
    Downloads, processes, and saves Yellow Taxi dataset for a given month.

    Args:
        parquet_url (str): URL of the Yellow Taxi Parquet file.

    Returns:
        pd.DataFrame: Sampled and processed DataFrame.
    """
    # Default directory for processed Yellow Taxi data
    save_dir = "processed_data/yellow_taxi"

    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # Extract file name and define local path
    file_name = parquet_url.split("/")[-1]
    local_file_path = os.path.join(save_dir, file_name)

    # Download the file if not already downloaded
    if not os.path.exists(local_file_path):
        response = requests.get(parquet_url, stream=True)
        response.raise_for_status()
        with open(local_file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                if chunk:
                    f.write(chunk)

    # Load the dataset
    data = pd.read_parquet(local_file_path)

    # Determine population size
    population = len(data)

    # Calculate sample size (using p = 0.5 for Yellow Taxi data)
    sample_size = calculate_sample_size(population, p = 0.5)

    # Sample the dataset
    sampled_data = data.sample(n=sample_size, random_state=42) if population > sample_size else data

    # Save the sampled dataset
    processed_file_path = os.path.join(save_dir, f"sampled_{file_name}")
    sampled_data.to_parquet(processed_file_path)
    return sampled_data


In [17]:
def get_and_clean_taxi_data(parquet_urls):
    all_taxi_dataframes = []
    yellow_taxi_pattern = re.compile(r"yellow_tripdata_(2020-(0[1-9]|1[0-2])|202[1-3]-(0[1-9]|1[0-2])|2024-(0[1-8]))\.parquet")

    # Filter URLs matching the pattern
    yellow_taxi_urls = [url for url in parquet_urls if yellow_taxi_pattern.search(url)]
    
    for url in yellow_taxi_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_taxi_month(url)
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function
        
        all_taxi_dataframes.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data = pd.concat(all_taxi_dataframes)
    
    return taxi_data

In [18]:
def clean_taxi_data(taxi_data):
    """
    Cleans the taxi data by retaining specified columns, normalizing column names,
    converting column types, and removing invalid trips. Also unifies column names 
    across different data sources using a mapping.

    Args:
        taxi_data (pd.DataFrame): The input taxi data DataFrame.
        column_mapping (dict): A dictionary mapping original column names to standardized names.

    Returns:
        pd.DataFrame: The cleaned and filtered taxi data.
    """

    # Step 0: Normalize column names using the column_mapping
    column_mapping = {
    'tpep_pickup_datetime': 'pickup_datetime',
    'tpep_dropoff_datetime': 'dropoff_datetime',
    'trip_distance': 'trip_miles'}
   
    taxi_data = taxi_data.rename(columns=column_mapping)

    
    # Add latitude and longitude for PULocationID and DOLocationID
    taxi_data["PU_coords"] = taxi_data["PULocationID"].apply(lambda x: lookup_coords_for_taxi_zone_id(x, taxi_zones))
    taxi_data["DO_coords"] = taxi_data["DOLocationID"].apply(lambda x: lookup_coords_for_taxi_zone_id(x, taxi_zones))

    # Remove trips with invalid location IDs (where coordinates could not be found)
    taxi_data = taxi_data.dropna(subset=["PU_coords", "DO_coords"]).reset_index(drop=True)

    # Split coordinates into latitude and longitude for pickups and dropoffs
    taxi_data[["PU_lat", "PU_lon"]] = pd.DataFrame(taxi_data["PU_coords"].tolist(), index=taxi_data.index)
    taxi_data[["DO_lat", "DO_lon"]] = pd.DataFrame(taxi_data["DO_coords"].tolist(), index=taxi_data.index)

    # Drop temporary coordinate columns
    taxi_data = taxi_data.drop(columns=["PU_coords", "DO_coords"])
    
    # Step 1: Retain only the required columns
    required_columns = [
        'pickup_datetime', 'dropoff_datetime', 'trip_miles',
        'PU_lat', 'PU_lon', 'DO_lat', 'DO_lon', 'total_amount'
    ]
    taxi_data = taxi_data[required_columns]

    # Removing Invalid Data Points
    # Remove rows where trip distance is less than or equal to 0
    taxi_data = taxi_data[taxi_data["trip_miles"] > 0]

    # Remove rows where fare amount or total amount is less than or equal to 0
    taxi_data = taxi_data[taxi_data["total_amount"] > 0]

    # Remove rows where pickup time is after dropoff time
    taxi_data = taxi_data[taxi_data["pickup_datetime"] < taxi_data["dropoff_datetime"]]

    # Reset index after filtering
    taxi_data = taxi_data.reset_index(drop=True)

    #Normalize column names
    taxi_data.columns = [col.lower() for col in taxi_data.columns]

    # Normalizing and Using Appropriate Column Types
    taxi_data["pickup_datetime"] = pd.to_datetime(taxi_data["pickup_datetime"], errors="coerce")
    taxi_data["dropoff_datetime"] = pd.to_datetime(taxi_data["dropoff_datetime"], errors="coerce")

    # Ensure numeric columns are float or int
    numeric_columns = ["trip_miles", "pu_lat", "pu_lon", "do_lat", "do_lon", "total_amount"]
    taxi_data[numeric_columns] = taxi_data[numeric_columns].apply(pd.to_numeric, errors="coerce")

    # Drop rows with invalid datetime or numeric values
    taxi_data = taxi_data.dropna(subset=["pickup_datetime", "dropoff_datetime"] + numeric_columns)

    # Reset index after dropping invalid rows
    taxi_data = taxi_data.reset_index(drop=True)

    # Removing Trips Outside the Latitude/Longitude Bounding Box
    # Latitude and longitude bounding box
    lat_min, lon_min = 40.560445, -74.242330
    lat_max, lon_max = 40.908524, -73.717047

    # Filter rows where pickup and dropoff locations are within the bounding box
    taxi_data = taxi_data[
        (taxi_data["pu_lat"] >= lat_min) & (taxi_data["pu_lat"] <= lat_max) &
        (taxi_data["pu_lon"] >= lon_min) & (taxi_data["pu_lon"] <= lon_max) &
        (taxi_data["do_lat"] >= lat_min) & (taxi_data["do_lat"] <= lat_max) &
        (taxi_data["do_lon"] >= lon_min) & (taxi_data["do_lon"] <= lon_max)
    ]

    # Reset index after filtering
    taxi_data = taxi_data.reset_index(drop=True)

    return taxi_data

In [19]:
def get_taxi_data():
    all_urls = get_all_urls_from_page(TLC_URL)
    all_parquet_urls = filter_parquet_urls(all_urls)
    taxi_data = get_and_clean_taxi_data(all_parquet_urls)
    return taxi_data

In [20]:
taxi_data = get_taxi_data()
taxi_data = clean_taxi_data(taxi_data)

In [21]:
taxi_data.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,trip_miles,pu_lat,pu_lon,do_lat,do_lon,total_amount
0,2024-01-20 13:31:30,2024-01-20 14:03:25,17.14,40.646985,-73.78653,40.749914,-73.970443,90.96
1,2024-01-18 21:52:46,2024-01-18 22:03:21,2.49,40.764421,-73.977569,40.790011,-73.94575,22.5
2,2024-01-01 03:43:58,2024-01-01 03:50:47,1.84,40.866075,-73.919308,40.857779,-73.885867,12.5
3,2024-01-19 22:20:12,2024-01-19 22:50:12,3.6,40.748497,-73.992438,40.778766,-73.95101,33.95
4,2024-01-06 22:41:50,2024-01-06 22:43:24,0.04,40.791705,-73.973049,40.791705,-73.973049,6.2


In [22]:
taxi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20727 entries, 0 to 20726
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   pickup_datetime   20727 non-null  datetime64[ns]
 1   dropoff_datetime  20727 non-null  datetime64[ns]
 2   trip_miles        20727 non-null  float64       
 3   pu_lat            20727 non-null  float64       
 4   pu_lon            20727 non-null  float64       
 5   do_lat            20727 non-null  float64       
 6   do_lon            20727 non-null  float64       
 7   total_amount      20727 non-null  float64       
dtypes: datetime64[ns](2), float64(6)
memory usage: 1.3 MB


In [23]:
taxi_data.describe()

Unnamed: 0,pickup_datetime,dropoff_datetime,trip_miles,pu_lat,pu_lon,do_lat,do_lon,total_amount
count,20727,20727,20727.0,20727.0,20727.0,20727.0,20727.0,20727.0
mean,2022-04-30 03:11:05.817195008,2022-04-30 03:27:08.452115712,3.271662,40.75335,-73.966955,40.755752,-73.970757,22.482544
min,2020-01-01 00:11:06,2020-01-01 00:30:50,0.01,40.576961,-74.029893,40.576961,-74.174002,1.0
25%,2021-02-27 15:22:24,2021-02-27 15:32:43.500000,1.09,40.740337,-73.989845,40.740337,-73.989845,12.8
50%,2022-04-29 09:06:52,2022-04-29 09:22:22,1.81,40.758028,-73.977698,40.758028,-73.977698,17.02
75%,2023-06-28 11:02:35,2023-06-28 11:17:36,3.31,40.773633,-73.961764,40.775932,-73.959635,24.5
max,2024-08-31 22:43:47,2024-08-31 23:26:23,67.9,40.899528,-73.739337,40.899528,-73.726655,262.7
std,,,4.110867,0.032385,0.045035,0.03313,0.036128,17.359922


In [55]:
output_file = "taxi_data_cleaned.csv"

# Save the DataFrame to a CSV file
taxi_data.to_csv(output_file, index=False)

### Processing Uber Data

In [25]:
def get_and_clean_uber_month(parquet_url):
    save_dir = "processed_data/hvhf"

    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # Extract file name and define local path
    file_name = parquet_url.split("/")[-1]
    local_file_path = os.path.join(save_dir, file_name)

    # Download the file if not already downloaded
    if not os.path.exists(local_file_path):
        response = requests.get(parquet_url, stream=True)
        response.raise_for_status()
        with open(local_file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                if chunk:
                    f.write(chunk)

    data = pd.read_parquet(local_file_path)

    # Determine population size
    population = len(data)

    # Calculate sample size (using p = 0.5 for Yellow Taxi data)
    sample_size = calculate_sample_size(population, p = 0.4)

    # Sample the dataset
    sampled_data = data.sample(n=sample_size, random_state=42) if population > sample_size else data

    # Save the sampled dataset
    processed_file_path = os.path.join(save_dir, f"sampled_{file_name}")
    if not os.path.exists(processed_file_path):
        sampled_data.to_parquet(processed_file_path)
    return sampled_data

In [26]:
def get_and_clean_uber_data(parquet_urls):
    all_uber_dataframes = []
    hvfhv_pattern = re.compile(r"fhvhv_tripdata_(2020-(0[1-9]|1[0-2])|202[1-3]-(0[1-9]|1[0-2])|2024-(0[1-8]))\.parquet")
    hvfhv_urls = [url for url in parquet_urls if hvfhv_pattern.search(url)]
    for url in hvfhv_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_uber_month(url)
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function
        
        all_uber_dataframes.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    uber_data = pd.concat(all_uber_dataframes)
    return uber_data

In [27]:
def load_and_clean_uber_data(uber_data):
    uber_data['hvfhs_license_num'] = uber_data['hvfhs_license_num'].astype(str)
    uber_data = uber_data[uber_data['hvfhs_license_num'] == 'HV0003'].copy()

    # Coords matching
    uber_data["PU_coords"] = uber_data["PULocationID"].apply(lambda x: lookup_coords_for_taxi_zone_id(x, taxi_zones))
    uber_data["DO_coords"] = uber_data["DOLocationID"].apply(lambda x: lookup_coords_for_taxi_zone_id(x, taxi_zones))

    # Remove trips with invalid location IDs (where coordinates could not be found)
    uber_data = uber_data.dropna(subset=["PU_coords", "DO_coords"]).reset_index(drop=True)

    # Split coordinates into latitude and longitude for pickups and dropoffs
    uber_data[["PU_lat", "PU_lon"]] = pd.DataFrame(uber_data["PU_coords"].tolist(), index=uber_data.index)
    uber_data[["DO_lat", "DO_lon"]] = pd.DataFrame(uber_data["DO_coords"].tolist(), index=uber_data.index)

    # Drop temporary coordinate columns
    uber_data = uber_data.drop(columns=["PU_coords", "DO_coords"])

    # Step 1: Retain only the required columns
    required_columns = [
        'pickup_datetime', 'dropoff_datetime',
        'PU_lat', 'PU_lon', 'DO_lat', 'DO_lon',
        'trip_miles', 'base_passenger_fare', 'tolls', 'bcf',
        'sales_tax', 'congestion_surcharge', 'tips','airport_fee'
    ]
    uber_data = uber_data[required_columns]

    # Removing Invalid Data Points
    # Remove rows where trip distance is less than or equal to 0
    uber_data = uber_data[uber_data["trip_miles"] > 0]

    # Remove rows where fare amount or total amount is less than or equal to 0
    uber_data = uber_data[uber_data["base_passenger_fare"] > 0]

    # Remove rows where pickup time is after dropoff time
    uber_data = uber_data[uber_data["pickup_datetime"] < uber_data["dropoff_datetime"]]

    # Normalize column names
    uber_data.columns = [col.lower() for col in uber_data.columns]

    # Ensure numeric columns are float or int
    numeric_columns = [
        'pu_lat', 'pu_lon', 'do_lat', 'do_lon',
        'trip_miles', 'base_passenger_fare', 'tolls', 'bcf',
        'sales_tax', 'congestion_surcharge', 'tips', 'airport_fee'
    ]
    uber_data[numeric_columns] = uber_data[numeric_columns].apply(pd.to_numeric, errors="coerce")

    # Drop rows with invalid datetime or numeric values
    uber_data = uber_data.dropna(subset=["pickup_datetime", "dropoff_datetime"] + numeric_columns)
    uber_data['total_amount'] = (uber_data['base_passenger_fare'] + uber_data['tolls'] + uber_data['bcf'] + uber_data['sales_tax'] + uber_data['congestion_surcharge'] +
                                 uber_data['airport_fee'] + uber_data['tips'])
    updated_columns = ['pickup_datetime', 'dropoff_datetime', 'pu_lat', 'pu_lon', 'do_lat', 'do_lon','trip_miles','total_amount']
    uber_data = uber_data[updated_columns]

    # Removing Trips Outside the Latitude/Longitude Bounding Box
    # Latitude and longitude bounding box
    lat_min, lon_min = 40.560445, -74.242330
    lat_max, lon_max = 40.908524, -73.717047

    # Filter rows where pickup and dropoff locations are within the bounding box
    uber_data = uber_data[
        (uber_data["pu_lat"] >= lat_min) & (uber_data["pu_lat"] <= lat_max) &
        (uber_data["pu_lon"] >= lon_min) & (uber_data["pu_lon"] <= lon_max) &
        (uber_data["do_lat"] >= lat_min) & (uber_data["do_lat"] <= lat_max) &
        (uber_data["do_lon"] >= lon_min) & (uber_data["do_lon"] <= lon_max)
    ]

    # Reset index after filtering
    uber_data = uber_data.reset_index(drop=True)
    
    return uber_data

In [28]:
def get_uber_data():
    all_urls = get_all_urls_from_page(TLC_URL)
    all_parquet_urls = filter_parquet_urls(all_urls)
    uber_data = get_and_clean_uber_data(all_parquet_urls)
    uber_data = load_and_clean_uber_data(uber_data)
    return uber_data

In [29]:
uber_data = get_uber_data()

In [30]:
uber_data.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pu_lat,pu_lon,do_lat,do_lon,trip_miles,total_amount
0,2024-01-26 08:07:17,2024-01-26 08:35:38,40.646116,-73.951623,40.666559,-73.895364,4.29,30.69
1,2024-01-19 02:17:05,2024-01-19 02:29:12,40.882403,-73.910665,40.857108,-73.932832,2.55,16.9
2,2024-01-21 01:44:00,2024-01-21 02:08:30,40.748575,-73.985156,40.71537,-73.936794,6.37,33.19
3,2024-01-20 12:58:40,2024-01-20 13:15:42,40.758028,-73.977698,40.753309,-74.004016,1.99,23.91
4,2024-01-02 08:40:48,2024-01-02 08:54:28,40.666559,-73.895364,40.676644,-73.913632,2.23,17.95


In [31]:
uber_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10545 entries, 0 to 10544
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   pickup_datetime   10545 non-null  datetime64[ns]
 1   dropoff_datetime  10545 non-null  datetime64[ns]
 2   pu_lat            10545 non-null  float64       
 3   pu_lon            10545 non-null  float64       
 4   do_lat            10545 non-null  float64       
 5   do_lon            10545 non-null  float64       
 6   trip_miles        10545 non-null  float64       
 7   total_amount      10545 non-null  float64       
dtypes: datetime64[ns](2), float64(6)
memory usage: 659.2 KB


In [32]:
uber_data.describe()

Unnamed: 0,pickup_datetime,dropoff_datetime,pu_lat,pu_lon,do_lat,do_lon,trip_miles,total_amount
count,10545,10545,10545.0,10545.0,10545.0,10545.0,10545.0,10545.0
mean,2022-12-17 14:17:10.341299200,2022-12-17 14:35:41.576481536,40.736481,-73.9373,40.73583,-73.937359,4.455277,28.322973
min,2021-03-29 10:06:06,2021-03-29 10:18:54,40.561994,-74.170885,40.561994,-74.174002,0.01,0.68
25%,2022-02-07 08:12:21,2022-02-07 09:06:15,40.691507,-73.985937,40.691507,-73.985156,1.55,13.96
50%,2022-12-16 22:51:49,2022-12-16 23:02:11,40.736824,-73.95101,40.736824,-73.94954,2.85,21.94
75%,2023-10-28 23:36:51,2023-10-28 23:41:22,40.77157,-73.901709,40.773633,-73.899735,5.64,35.15
max,2024-08-31 21:37:05,2024-08-31 21:57:29,40.899528,-73.726655,40.899528,-73.726655,33.17,239.83
std,,,0.066292,0.065408,0.066639,0.069448,4.37185,21.284107


In [57]:
output_file = "uber_data_cleaned.csv"

# Save the DataFrame to a CSV file
uber_data.to_csv(output_file, index=False)

### Processing Weather Data

In [34]:
def get_all_weather_csvs(directory):
    """
    Get all CSV files in the specified directory.

    Args:
        directory (str): Path to the directory containing CSV files.

    Returns:
        List[str]: List of file paths to all CSV files.
    """
    csv_files = []
    for file in os.listdir(directory):
        if file.endswith(".csv"):
            csv_files.append(os.path.join(directory, file))
    return csv_files

In [35]:
def clean_month_weather_data_hourly(csv_file):
    """
    Clean and process hourly weather data from a CSV file.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Processed hourly weather data.
    """    
    try:
        df = pd.read_csv(csv_file, low_memory=False)
        df.rename(columns=lambda x: x.strip(), inplace=True)
        df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce")
        
        hourly_data = df[["DATE", "HourlyPrecipitation", "HourlyWindSpeed"]].copy()
        hourly_data.replace("T", 0, inplace=True)
        hourly_data.replace(regex=r"[^\d.]", value=np.nan, inplace=True)
        
        hourly_data.columns = ["date", "precipitation", "wind_speed"]
        hourly_data["precipitation"] = pd.to_numeric(hourly_data["precipitation"], errors="coerce")
        hourly_data["wind_speed"] = pd.to_numeric(hourly_data["wind_speed"], errors="coerce")
        
        return hourly_data
    except Exception as e:
        print(f"Error processing file {csv_file}: {e}")
        return pd.DataFrame()

In [36]:
def clean_month_weather_data_daily(csv_file):
    try:
        # 读取文件
        df = pd.read_csv(csv_file, low_memory=False)
        
        # 修整列名，解析日期
        df.rename(columns=lambda x: x.strip(), inplace=True)
        df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce")
        
        # 提取需要的列：这里假设 Daily 数据已经存在
        daily_data = df[["DATE", "DailyPrecipitation", "DailyAverageWindSpeed", "DailySnowfall"]].copy()
        
        # 处理特殊值 "T" 和其他非数字字符
        daily_data.replace("T", 0, inplace=True)
        daily_data.replace(regex=r"[^\d.]", value=np.nan, inplace=True)
        
        # 重命名列
        daily_data.columns = ["date", "precipitation", "average_wind_speed", "snowfall"]
        
        # 转换为数值类型
        daily_data["precipitation"] = pd.to_numeric(daily_data["precipitation"], errors="coerce")
        daily_data["average_wind_speed"] = pd.to_numeric(daily_data["average_wind_speed"], errors="coerce")
        daily_data["snowfall"] = pd.to_numeric(daily_data["snowfall"], errors="coerce")
        
        # 提取非空值的数据
        daily_data = daily_data.dropna(subset=["precipitation", "average_wind_speed", "snowfall"], how="all")
        
        # 确保日期列只保留日期部分
        daily_data["date"] = daily_data["date"].dt.date
        
        return daily_data
    except Exception as e:
        print(f"Error processing file {csv_file}: {e}")
        return pd.DataFrame()

In [37]:
def load_and_clean_weather_data(directory):
    """
    Load and clean weather data from all CSV files in the directory.

    Args:
        directory (str): Path to the directory containing CSV files.

    Returns:
        tuple: Two DataFrames - hourly and daily weather data.
    """
    
    weather_csv_files = get_all_weather_csvs(directory)
    
    hourly_dataframes = []
    daily_dataframes = []
        
    for csv_file in weather_csv_files:
        print(f"Processing {csv_file}...")
        hourly_dataframe = clean_month_weather_data_hourly(csv_file)
        daily_dataframe = clean_month_weather_data_daily(csv_file)
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        
    # Concatenate dataframes
    hourly_data = pd.concat(hourly_dataframes, ignore_index=True)
    daily_data = pd.concat(daily_dataframes, ignore_index=True)
    
    return hourly_data, daily_data

In [59]:
if __name__ == "__main__":
    
    WEATHER_CSV_DIR = "weather/"  # Path to your weather data directory

    # Process weather data
    hourly_weather_data, daily_weather_data = load_and_clean_weather_data(WEATHER_CSV_DIR)

    # Replace NaN values with 0
    hourly_weather_data.fillna(0, inplace=True)
    daily_weather_data.fillna(0, inplace=True)

    # Save cleaned data to CSV
    hourly_weather_data.to_csv("cleaned_hourly_weather_data.csv", index=False)
    daily_weather_data.to_csv("cleaned_daily_weather_data.csv", index=False)
    print("Cleaned data saved.")

Processing weather/2020_weather.csv...
Processing weather/2023_weather.csv...
Processing weather/2021_weather.csv...
Processing weather/2024_weather.csv...
Processing weather/2022_weather.csv...
Cleaned data saved.


In [61]:
hourly_weather_data.head()

Unnamed: 0,date,precipitation,wind_speed
0,2020-01-01 00:51:00,0.0,8.0
1,2020-01-01 01:51:00,0.0,8.0
2,2020-01-01 02:51:00,0.0,14.0
3,2020-01-01 03:51:00,0.0,11.0
4,2020-01-01 04:51:00,0.0,6.0


In [63]:
hourly_weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56098 entries, 0 to 56097
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           56098 non-null  datetime64[ns]
 1   precipitation  56098 non-null  float64       
 2   wind_speed     56098 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 1.3 MB


In [65]:
hourly_weather_data.describe()

Unnamed: 0,date,precipitation,wind_speed
count,56098,56098.0,56098.0
mean,2022-05-29 21:14:19.618881024,0.010288,4.537238
min,2020-01-01 00:51:00,0.0,0.0
25%,2021-03-18 19:01:45,0.0,0.0
50%,2022-05-28 01:21:00,0.0,5.0
75%,2023-08-15 05:39:00,0.0,7.0
max,2024-10-22 18:51:00,3.47,2237.0
std,,0.056033,13.883208


In [67]:
daily_weather_data.head()

Unnamed: 0,date,precipitation,average_wind_speed,snowfall
0,2020-01-01,0.0,8.6,0.0
1,2020-01-02,0.0,5.4,0.0
2,2020-01-03,0.15,3.4,0.0
3,2020-01-04,0.27,4.4,0.0
4,2020-01-05,0.0,11.3,0.0


In [69]:
daily_weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1755 entries, 0 to 1754
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   date                1755 non-null   object 
 1   precipitation       1755 non-null   float64
 2   average_wind_speed  1755 non-null   float64
 3   snowfall            1755 non-null   float64
dtypes: float64(3), object(1)
memory usage: 55.0+ KB


In [71]:
daily_weather_data.describe()

Unnamed: 0,precipitation,average_wind_speed,snowfall
count,1755.0,1755.0,1755.0
mean,0.141966,4.835499,0.039088
std,0.414574,2.467952,0.493457
min,0.0,0.0,0.0
25%,0.0,3.0,0.0
50%,0.0,4.5,0.0
75%,0.06,6.2,0.0
max,7.13,14.2,14.8


## Part 2: Storing Cleaned Data

In [None]:
engine = db.create_engine(DATABASE_URL)
from sqlalchemy import create_engine

# Load cleaned datasets
daily_data = pd.read_csv("/cleaned_daily_weather_data.csv")
hourly_data = pd.read_csv("/cleaned_hourly_weather_data.csv")
taxi_data = pd.read_csv("/taxi_data_cleaned.csv")
uber_data = pd.read_csv("/uber_data_cleaned.csv")

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
HOURLY_WEATHER_SCHEMA = """
CREATE TABLE hourly_weather (
    datetime TEXT NOT NULL,
    temperature FLOAT,
    precipitation FLOAT,
    wind_speed FLOAT
);
"""

DAILY_WEATHER_SCHEMA = """
CREATE TABLE daily_weather (
    date TEXT NOT NULL,
    avg_precipitation FLOAT,
    avg_wind_speed FLOAT,
    total_snowfall FLOAT
);
"""

TAXI_TRIPS_SCHEMA = """
CREATE TABLE taxi_trips (
    pickup_datetime TEXT NOT NULL,
    dropoff_datetime TEXT NOT NULL,
    pu_lat FLOAT,
    pu_lon FLOAT,
    do_lat FLOAT,
    do_lon FLOAT,
    trip_miles FLOAT,
    total_amount FLOAT
);
"""

UBER_TRIPS_SCHEMA = """
CREATE TABLE uber_trips (
    pickup_datetime TEXT NOT NULL,
    dropoff_datetime TEXT NOT NULL,
    pu_lat FLOAT,
    pu_lon FLOAT,
    do_lat FLOAT,
    do_lon FLOAT,
    trip_miles FLOAT,
    total_amount FLOAT
);
"""


In [None]:
# create that required schema.sql file
DATABASE_SCHEMA_FILE = "schema.sql"

with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [None]:
# Add DROP TABLE statements
DROP_TABLES = [
    "DROP TABLE IF EXISTS hourly_weather;",
    "DROP TABLE IF EXISTS daily_weather;",
    "DROP TABLE IF EXISTS taxi_trips;",
    "DROP TABLE IF EXISTS uber_trips;"
]

# Create SQLite engine
DATABASE_URL = "sqlite:///project.db"
engine = create_engine(DATABASE_URL)

# Create tables using raw connection
conn = engine.raw_connection()
try:
    cursor = conn.cursor()
    # Drop existing tables individually
    for drop_table in DROP_TABLES:
        cursor.execute(drop_table)
    
    # Create new tables
    cursor.execute(HOURLY_WEATHER_SCHEMA.strip())
    cursor.execute(DAILY_WEATHER_SCHEMA.strip())
    cursor.execute(TAXI_TRIPS_SCHEMA.strip())
    cursor.execute(UBER_TRIPS_SCHEMA.strip())
    conn.commit()  # Commit changes
finally:
    cursor.close()  # Always close the cursor
    conn.close()    # Always close the connection


### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    """
    Writes dataframes to the corresponding SQL tables.
    Args:
        table_to_df_dict (dict): A dictionary where keys are table names
                                 and values are the respective DataFrames.
    """
    for table_name, df in table_to_df_dict.items():
        df.to_sql(table_name, con=engine, if_exists='replace', index=False)

# Map table names to cleaned DataFrames
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_data,
    "daily_weather": daily_data,
}


In [None]:
# Write data to the database
write_dataframes_to_table(map_table_name_to_dataframe)

In [None]:
# Example: Query the first 5 rows of the taxi_trips table
taxi_data_preview = pd.read_sql("SELECT * FROM taxi_trips LIMIT 5;", con=engine)
print(taxi_data_preview)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

### Query 1

In [None]:
QUERY_1_FILENAME = ""

QUERY_1 = """
TODO
"""

In [None]:
# execute query either via sqlalchemy
with engine.connect() as con:
    results = con.execute(db.text(QUERY_1)).fetchall()
results

# or via pandas
pd.read_sql(QUERY_1, con=engine)

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)