# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1VERPjEZcC1XSs4-02aM-DbkNr_yaJVbFjLJxaYQswqA/edit#)_

_This scaffolding notebook may be used to help set up your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_Anything in italics (prose) or comments (in code) is meant to provide you with guidance. **Remove the italic lines and provided comments** before submitting the project, if you choose to use this scaffolding. We don't need the guidance when grading._

_**All code below should be considered "pseudo-code" - not functional by itself, and only a suggestion at the approach.**_

## Project Setup

In [24]:
# all import statements needed for the project, for example:

import os

import bs4
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import requests
import sqlalchemy as db
import re
from datetime import datetime, timedelta
import numpy as np
import fiona
import math

ModuleNotFoundError: No module named 'geopandas'

In [3]:
# any constants you might need; some have been added for you, and 
# some you need to fill in

TLC_URL = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# Directory holding the taxi-zone shapefile; "" means the current working directory.
TAXI_ZONES_DIR = ""
# Built with os.path.join so that an empty TAXI_ZONES_DIR gives the relative
# path "taxi_zones.shp" instead of the filesystem-root path "/taxi_zones.shp".
TAXI_ZONES_SHAPEFILE = os.path.join(TAXI_ZONES_DIR, "taxi_zones.shp")
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [5]:
# Make sure the QUERY_DIRECTORY exists
# exist_ok=True makes this a no-op when the directory is already present, while
# any other failure (for example a permissions error) still propagates.
os.makedirs(QUERY_DIRECTORY, exist_ok=True)

## Part 1: Data Preprocessing

### Load Taxi Zones

In [5]:
def load_taxi_zones(file_path):
    """Read the file at ``file_path`` with geopandas and return the loaded object."""
    return gpd.read_file(file_path)
    
taxi_zones = load_taxi_zones("taxi_zones.shp")

In [7]:
def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones):
    """
    Return the centroid of one taxi zone as a ``(latitude, longitude)`` tuple.

    Args:
        zone_loc_id: Value to match against the ``LocationID`` column.
        loaded_taxi_zones: Taxi-zone table with a ``LocationID`` column, a
            ``crs`` attribute and a geometry column. When its CRS is unset it
            is treated as EPSG:2263.

    Returns:
        tuple | None: ``(lat, lon)`` in EPSG:4326, or ``None`` when no zone has
        the requested ``LocationID``.
    """
    zones = loaded_taxi_zones
    if zones.crs is None:
        zones = zones.set_crs(epsg=2263)

    matching_zone = zones[zones['LocationID'] == zone_loc_id]
    if matching_zone.empty:
        # Unknown zone ID: callers treat None as "no coordinates available".
        return None

    # Compute the centroid in the projected CRS, then convert that single
    # point to latitude/longitude.
    centroid_projected = matching_zone.to_crs(epsg=2263).geometry.centroid.iloc[0]
    centroid_lat_lon = gpd.GeoSeries([centroid_projected], crs=2263).to_crs(epsg=4326)
    point = centroid_lat_lon.geometry.iloc[0]

    return (point.y, point.x)

### Calculate Sample Size

In [9]:
def calculate_sample_size(population, p=0.5, margin_of_error=0.05, z=1.96) -> int:
    """
    Calculate the required sample size using Cochran's formula with a
    finite-population correction.

    Args:
        population (int): The total population size.
        p (float): Estimated proportion of the attribute of interest
            (default 0.5).
        margin_of_error (float): Desired margin of error as a proportion
            (default 0.05 for 5%).
        z (float): Z-value for the desired confidence level
            (default 1.96 for 95% confidence).

    Returns:
        int: Calculated sample size, rounded up. Returns 0 when
        ``population`` is not positive, since there is nothing to sample.
    """
    # Guard against the division by zero in the finite-population correction.
    if population <= 0:
        return 0

    q = 1 - p  # Complementary proportion

    # Cochran's sample size formula for an infinite population
    n_0 = (z**2 * p * q) / (margin_of_error**2)

    # Adjust for finite population size
    sample_size = n_0 / (1 + (n_0 - 1) / population)

    return math.ceil(sample_size)

### Common Functions

In [11]:
def get_all_urls_from_page(page_url):
    """
    Fetch a webpage and return the ``href`` of every anchor tag on it.

    Args:
        page_url (str): URL of the webpage to scrape.

    Returns:
        list: ``href`` values of all ``<a>`` tags that have one, in document
        order and exactly as written in the page (they are not resolved
        against ``page_url``).

    Raises:
        Exception: If the request fails or returns an error status. The
        underlying ``requests`` exception is attached as ``__cause__``.
    """
    try:
        # A timeout keeps an unresponsive server from hanging the notebook.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()  # Raise an HTTPError for bad responses
    except requests.exceptions.RequestException as e:
        raise Exception(f"Failed to access the URL: {page_url}. Error: {e}") from e

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Collect the href of every anchor tag that has one
    return [link["href"] for link in soup.find_all("a", href=True)]

In [13]:
def filter_parquet_urls(links):
    """
    Keep only the links that point at ``.parquet`` files.

    Each link is stripped of surrounding whitespace first. A link qualifies
    when it ends in ``.parquet``, optionally followed by a query string.

    Args:
        links: Iterable of URL strings.

    Returns:
        list: The stripped links that matched, in their original order.
    """
    parquet_suffix = re.compile(r"\.parquet(\?.*)?$")
    stripped_links = (link.strip() for link in links)
    return [candidate for candidate in stripped_links if parquet_suffix.search(candidate)]

### Process Taxi Data

In [24]:
def get_and_clean_taxi_month(parquet_url: str) -> pd.DataFrame:
    """
    Download (if not already cached), sample, and save one month of Yellow Taxi data.

    The raw file is cached under ``processed_data/yellow_taxi`` and the sample
    is written next to it as ``sampled_<file name>``.

    Args:
        parquet_url (str): URL of the Yellow Taxi Parquet file.

    Returns:
        pd.DataFrame: The sampled rows (or every row when the file has no more
        rows than the computed sample size).

    Raises:
        requests.exceptions.RequestException: If the download fails.
    """
    # Default directory for processed Yellow Taxi data
    save_dir = "processed_data/yellow_taxi"
    os.makedirs(save_dir, exist_ok=True)

    # Extract file name and define local path
    file_name = parquet_url.split("/")[-1]
    local_file_path = os.path.join(save_dir, file_name)

    # Download the file if not already downloaded
    if not os.path.exists(local_file_path):
        # Stream into a temporary ".part" file and rename only on success, so
        # an interrupted download never leaves a truncated file that a later
        # run would mistake for a complete one.
        partial_file_path = f"{local_file_path}.part"
        with requests.get(parquet_url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(partial_file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                    if chunk:
                        f.write(chunk)
        os.replace(partial_file_path, local_file_path)

    # Load the dataset
    data = pd.read_parquet(local_file_path)

    # Determine population size
    population = len(data)

    # Calculate sample size (using p = 0.5 for Yellow Taxi data)
    sample_size = calculate_sample_size(population, p=0.5)

    # Sample with a fixed seed so re-runs select the same rows
    sampled_data = data.sample(n=sample_size, random_state=42) if population > sample_size else data

    # Save the sampled dataset
    processed_file_path = os.path.join(save_dir, f"sampled_{file_name}")
    sampled_data.to_parquet(processed_file_path)
    return sampled_data


In [148]:
def get_and_clean_taxi_data(parquet_urls):
    """
    Build one DataFrame from every monthly Yellow Taxi file in the study window.

    Only URLs naming ``yellow_tripdata_YYYY-MM.parquet`` for 2020-01 through
    2024-08 are used; each is handed to ``get_and_clean_taxi_month`` and the
    monthly results are concatenated in URL order.

    Args:
        parquet_urls: Iterable of Parquet file URLs.

    Returns:
        pd.DataFrame: Concatenation of the monthly DataFrames.

    Raises:
        ValueError: From ``pd.concat`` when no URL matches the pattern.
    """
    month_file_pattern = re.compile(r"yellow_tripdata_(2020-(0[1-9]|1[0-2])|202[1-3]-(0[1-9]|1[0-2])|2024-(0[1-8]))\.parquet")

    monthly_frames = [
        get_and_clean_taxi_month(month_url)
        for month_url in parquet_urls
        if month_file_pattern.search(month_url)
    ]

    return pd.concat(monthly_frames)

In [152]:
def clean_taxi_data(taxi_data):
    """
    Clean the taxi data: attach zone-centroid coordinates, keep the required
    columns, normalize column names and types, and drop invalid trips.

    The input DataFrame is not modified; all work happens on a copy.

    Args:
        taxi_data (pd.DataFrame): Raw taxi trips with ``PULocationID`` and
            ``DOLocationID`` columns plus the fare/time columns listed in
            ``required_columns`` below.

    Returns:
        pd.DataFrame: The cleaned and filtered trips with lower-case column
        names and a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's DataFrame does not gain helper columns.
    taxi_data = taxi_data.copy()

    # Resolve every distinct zone ID once rather than once per trip: each
    # lookup reprojects geometry, so per-row lookups dominate the runtime.
    distinct_zone_ids = pd.concat(
        [taxi_data["PULocationID"], taxi_data["DOLocationID"]]
    ).dropna().unique()
    coords_by_zone = {
        zone_id: lookup_coords_for_taxi_zone_id(zone_id, taxi_zones)
        for zone_id in distinct_zone_ids
    }
    taxi_data["PU_coords"] = taxi_data["PULocationID"].map(coords_by_zone.get)
    taxi_data["DO_coords"] = taxi_data["DOLocationID"].map(coords_by_zone.get)

    # Remove trips with invalid location IDs (where coordinates could not be found)
    taxi_data = taxi_data.dropna(subset=["PU_coords", "DO_coords"]).reset_index(drop=True)

    # Split coordinates into latitude and longitude. Naming the columns keeps
    # the assignment valid even when no rows survived the step above.
    taxi_data[["PU_lat", "PU_lon"]] = pd.DataFrame(
        taxi_data["PU_coords"].tolist(), index=taxi_data.index, columns=["PU_lat", "PU_lon"]
    )
    taxi_data[["DO_lat", "DO_lon"]] = pd.DataFrame(
        taxi_data["DO_coords"].tolist(), index=taxi_data.index, columns=["DO_lat", "DO_lon"]
    )

    # Drop temporary coordinate columns
    taxi_data = taxi_data.drop(columns=["PU_coords", "DO_coords"])

    # Retain only the required columns
    required_columns = [
        'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance',
        'PU_lat', 'PU_lon', 'DO_lat', 'DO_lon',
        'fare_amount', 'extra', 'mta_tax', 'tip_amount',
        'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge'
    ]
    taxi_data = taxi_data[required_columns]

    # Remove invalid trips: non-positive distance, fare or total, and trips
    # whose pickup time is not strictly before the dropoff time.
    taxi_data = taxi_data[taxi_data["trip_distance"] > 0]
    taxi_data = taxi_data[taxi_data["fare_amount"] > 0]
    taxi_data = taxi_data[taxi_data["total_amount"] > 0]
    taxi_data = taxi_data[taxi_data["tpep_pickup_datetime"] < taxi_data["tpep_dropoff_datetime"]]
    taxi_data = taxi_data.reset_index(drop=True)

    # Normalize column names
    taxi_data.columns = [col.lower() for col in taxi_data.columns]

    # Normalize column types; unparseable values become NaT/NaN and are dropped below
    taxi_data["tpep_pickup_datetime"] = pd.to_datetime(taxi_data["tpep_pickup_datetime"], errors="coerce")
    taxi_data["tpep_dropoff_datetime"] = pd.to_datetime(taxi_data["tpep_dropoff_datetime"], errors="coerce")

    numeric_columns = [
        "trip_distance", "pu_lat", "pu_lon", "do_lat", "do_lon",
        "fare_amount", "extra", "mta_tax", "tip_amount",
        "tolls_amount", "improvement_surcharge", "total_amount", "congestion_surcharge"
    ]
    taxi_data[numeric_columns] = taxi_data[numeric_columns].apply(pd.to_numeric, errors="coerce")

    # Drop rows with invalid datetime or numeric values
    taxi_data = taxi_data.dropna(subset=["tpep_pickup_datetime", "tpep_dropoff_datetime"] + numeric_columns)
    taxi_data = taxi_data.reset_index(drop=True)

    # Keep only trips that start and end inside the New York bounding box
    # (shared constant, stored as ((lat_min, lon_min), (lat_max, lon_max))).
    (lat_min, lon_min), (lat_max, lon_max) = NEW_YORK_BOX_COORDS
    taxi_data = taxi_data[
        (taxi_data["pu_lat"] >= lat_min) & (taxi_data["pu_lat"] <= lat_max) &
        (taxi_data["pu_lon"] >= lon_min) & (taxi_data["pu_lon"] <= lon_max) &
        (taxi_data["do_lat"] >= lat_min) & (taxi_data["do_lat"] <= lat_max) &
        (taxi_data["do_lon"] >= lon_min) & (taxi_data["do_lon"] <= lon_max)
    ]

    # Reset index after filtering
    taxi_data = taxi_data.reset_index(drop=True)

    return taxi_data

In [154]:
def get_taxi_data():
    """
    Scrape the TLC page, select its Parquet links, and load the sampled
    Yellow Taxi data for the months in the study window.

    Returns:
        pd.DataFrame: The DataFrame produced by ``get_and_clean_taxi_data``.
    """
    page_links = get_all_urls_from_page(TLC_URL)
    return get_and_clean_taxi_data(filter_parquet_urls(page_links))

In [156]:
taxi_data = get_taxi_data()
taxi_data = clean_taxi_data(taxi_data)

In [164]:
taxi_data.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pu_lat,pu_lon,do_lat,do_lon,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,2024-01-20 13:31:30,2024-01-20 14:03:25,17.14,40.646985,-73.78653,40.749914,-73.970443,70.0,0.0,0.5,8.27,6.94,1.0,90.96,2.5
1,2024-01-18 21:52:46,2024-01-18 22:03:21,2.49,40.764421,-73.977569,40.790011,-73.94575,13.5,1.0,0.5,4.0,0.0,1.0,22.5,2.5
2,2024-01-01 03:43:58,2024-01-01 03:50:47,1.84,40.866075,-73.919308,40.857779,-73.885867,10.0,1.0,0.5,0.0,0.0,1.0,12.5,0.0
3,2024-01-19 22:20:12,2024-01-19 22:50:12,3.6,40.748497,-73.992438,40.778766,-73.95101,23.3,3.5,0.5,5.65,0.0,1.0,33.95,2.5
4,2024-01-06 22:41:50,2024-01-06 22:43:24,0.04,40.791705,-73.973049,40.791705,-73.973049,3.7,1.0,0.5,0.0,0.0,1.0,6.2,0.0


In [160]:
taxi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19669 entries, 0 to 19668
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   tpep_pickup_datetime   19669 non-null  datetime64[ns]
 1   tpep_dropoff_datetime  19669 non-null  datetime64[ns]
 2   trip_distance          19669 non-null  float64       
 3   pu_lat                 19669 non-null  float64       
 4   pu_lon                 19669 non-null  float64       
 5   do_lat                 19669 non-null  float64       
 6   do_lon                 19669 non-null  float64       
 7   fare_amount            19669 non-null  float64       
 8   extra                  19669 non-null  float64       
 9   mta_tax                19669 non-null  float64       
 10  tip_amount             19669 non-null  float64       
 11  tolls_amount           19669 non-null  float64       
 12  improvement_surcharge  19669 non-null  float64       
 13  t

In [162]:
taxi_data.describe()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pu_lat,pu_lon,do_lat,do_lon,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
count,19669,19669,19669.0,19669.0,19669.0,19669.0,19669.0,19669.0,19669.0,19669.0,19669.0,19669.0,19669.0,19669.0,19669.0
mean,2022-04-29 05:40:59.003304704,2022-04-29 05:56:50.285322240,3.166626,40.753658,-73.967372,40.756144,-73.971694,14.95525,1.270525,0.497916,2.735081,0.416819,0.550704,22.142688,2.329935
min,2020-01-01 00:11:06,2020-01-01 00:30:50,0.01,40.576961,-74.029893,40.576961,-74.174002,1.0,0.0,0.0,0.0,0.0,0.0,1.3,0.0
25%,2021-03-08 15:59:21,2021-03-08 16:22:45,1.06,40.740439,-73.989845,40.740337,-73.989845,7.2,0.0,0.5,0.0,0.0,0.3,12.6,2.5
50%,2022-04-30 09:25:30,2022-04-30 12:04:02,1.77,40.758028,-73.977698,40.758028,-73.977698,10.5,0.5,0.5,2.16,0.0,0.3,16.63,2.5
75%,2023-06-18 22:51:55,2023-06-18 23:00:48,3.2,40.773633,-73.965146,40.775932,-73.959635,16.5,2.5,0.5,3.46,0.0,1.0,23.8,2.5
max,2024-08-31 22:43:47,2024-08-31 23:26:23,67.9,40.899528,-73.739337,40.899528,-73.726655,209.5,11.75,0.5,50.0,40.0,1.0,262.7,2.5
std,,,4.037265,0.030998,0.044858,0.031462,0.034564,13.469103,1.519187,0.032217,3.180876,1.756269,0.336082,17.338749,0.629492


### Processing Uber Data

In [15]:
def get_and_clean_uber_month(parquet_url):
    """
    Download (if not already cached), sample, and save one month of
    high-volume for-hire vehicle trip data.

    The raw file is cached under ``processed_data/hvhf``; the sample is written
    next to it as ``sampled_<file name>`` unless that file already exists.

    Args:
        parquet_url (str): URL of the monthly Parquet file.

    Returns:
        pd.DataFrame: The sampled rows (or every row when the file has no more
        rows than the computed sample size).

    Raises:
        requests.exceptions.RequestException: If the download fails.
    """
    save_dir = "processed_data/hvhf"
    os.makedirs(save_dir, exist_ok=True)

    # Extract file name and define local path
    file_name = parquet_url.split("/")[-1]
    local_file_path = os.path.join(save_dir, file_name)

    # Download the file if not already downloaded
    if not os.path.exists(local_file_path):
        # Stream into a temporary ".part" file and rename only on success, so
        # an interrupted download never leaves a truncated file that a later
        # run would mistake for a complete one.
        partial_file_path = f"{local_file_path}.part"
        with requests.get(parquet_url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(partial_file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                    if chunk:
                        f.write(chunk)
        os.replace(partial_file_path, local_file_path)

    data = pd.read_parquet(local_file_path)

    # Determine population size
    population = len(data)

    # Calculate sample size (this dataset uses p = 0.4)
    sample_size = calculate_sample_size(population, p=0.4)

    # Sample with a fixed seed so re-runs select the same rows
    sampled_data = data.sample(n=sample_size, random_state=42) if population > sample_size else data

    # Save the sampled dataset only if it has not been written before
    processed_file_path = os.path.join(save_dir, f"sampled_{file_name}")
    if not os.path.exists(processed_file_path):
        sampled_data.to_parquet(processed_file_path)
    return sampled_data

In [17]:
def get_and_clean_uber_data(parquet_urls):
    """
    Build one DataFrame from every monthly ``fhvhv_tripdata`` file in the study window.

    Only URLs naming ``fhvhv_tripdata_YYYY-MM.parquet`` for 2020-01 through
    2024-08 are used; each is handed to ``get_and_clean_uber_month`` and the
    monthly results are concatenated in URL order.

    Args:
        parquet_urls: Iterable of Parquet file URLs.

    Returns:
        pd.DataFrame: Concatenation of the monthly DataFrames.

    Raises:
        ValueError: From ``pd.concat`` when no URL matches the pattern.
    """
    month_file_pattern = re.compile(r"fhvhv_tripdata_(2020-(0[1-9]|1[0-2])|202[1-3]-(0[1-9]|1[0-2])|2024-(0[1-8]))\.parquet")

    monthly_frames = [
        get_and_clean_uber_month(month_url)
        for month_url in parquet_urls
        if month_file_pattern.search(month_url)
    ]

    return pd.concat(monthly_frames)

In [60]:
def load_and_clean_uber_data(uber_data):
    """
    Clean the for-hire vehicle data: keep license ``HV0003`` trips, attach
    zone-centroid coordinates, keep the required columns, normalize column
    names and types, and drop invalid trips.

    The input DataFrame is not modified; all work happens on a copy.

    Args:
        uber_data (pd.DataFrame): Raw trips with ``hvfhs_license_num``,
            ``PULocationID`` and ``DOLocationID`` columns plus the fare/time
            columns listed in ``required_columns`` below.

    Returns:
        pd.DataFrame: The cleaned and filtered trips with lower-case column
        names and a fresh 0..n-1 index.
    """
    # Keep only HV0003 rows (presumably Uber's license number - confirm
    # against the TLC data dictionary). Comparing on a temporary string
    # Series avoids rewriting a column of the caller's DataFrame.
    license_numbers = uber_data['hvfhs_license_num'].astype(str)
    uber_data = uber_data[license_numbers == 'HV0003'].copy()

    # Resolve every distinct zone ID once rather than once per trip: each
    # lookup reprojects geometry, so per-row lookups dominate the runtime.
    distinct_zone_ids = pd.concat(
        [uber_data["PULocationID"], uber_data["DOLocationID"]]
    ).dropna().unique()
    coords_by_zone = {
        zone_id: lookup_coords_for_taxi_zone_id(zone_id, taxi_zones)
        for zone_id in distinct_zone_ids
    }
    uber_data["PU_coords"] = uber_data["PULocationID"].map(coords_by_zone.get)
    uber_data["DO_coords"] = uber_data["DOLocationID"].map(coords_by_zone.get)

    # Remove trips with invalid location IDs (where coordinates could not be found)
    uber_data = uber_data.dropna(subset=["PU_coords", "DO_coords"]).reset_index(drop=True)

    # Split coordinates into latitude and longitude. Naming the columns keeps
    # the assignment valid even when no rows survived the step above.
    uber_data[["PU_lat", "PU_lon"]] = pd.DataFrame(
        uber_data["PU_coords"].tolist(), index=uber_data.index, columns=["PU_lat", "PU_lon"]
    )
    uber_data[["DO_lat", "DO_lon"]] = pd.DataFrame(
        uber_data["DO_coords"].tolist(), index=uber_data.index, columns=["DO_lat", "DO_lon"]
    )

    # Drop temporary coordinate columns
    uber_data = uber_data.drop(columns=["PU_coords", "DO_coords"])

    # Retain only the required columns
    required_columns = [
        'pickup_datetime', 'dropoff_datetime',
        'PU_lat', 'PU_lon', 'DO_lat', 'DO_lon',
        'trip_miles', 'base_passenger_fare', 'tolls', 'bcf',
        'sales_tax', 'congestion_surcharge', 'tips', 'driver_pay'
    ]
    uber_data = uber_data[required_columns]

    # Remove invalid trips: non-positive distance, fare or driver pay, and
    # trips whose pickup time is not strictly before the dropoff time.
    uber_data = uber_data[uber_data["trip_miles"] > 0]
    uber_data = uber_data[uber_data["base_passenger_fare"] > 0]
    uber_data = uber_data[uber_data["driver_pay"] > 0]
    uber_data = uber_data[uber_data["pickup_datetime"] < uber_data["dropoff_datetime"]]
    uber_data = uber_data.reset_index(drop=True)

    # Normalize column names
    uber_data.columns = [col.lower() for col in uber_data.columns]

    # Normalize column types (same treatment as the taxi cleaner);
    # unparseable values become NaT/NaN and are dropped below
    uber_data["pickup_datetime"] = pd.to_datetime(uber_data["pickup_datetime"], errors="coerce")
    uber_data["dropoff_datetime"] = pd.to_datetime(uber_data["dropoff_datetime"], errors="coerce")

    numeric_columns = [
        'pu_lat', 'pu_lon', 'do_lat', 'do_lon',
        'trip_miles', 'base_passenger_fare', 'tolls', 'bcf',
        'sales_tax', 'congestion_surcharge', 'tips', 'driver_pay'
    ]
    uber_data[numeric_columns] = uber_data[numeric_columns].apply(pd.to_numeric, errors="coerce")

    # Drop rows with invalid datetime or numeric values
    uber_data = uber_data.dropna(subset=["pickup_datetime", "dropoff_datetime"] + numeric_columns)

    # Keep only trips that start and end inside the New York bounding box
    # (shared constant, stored as ((lat_min, lon_min), (lat_max, lon_max))).
    (lat_min, lon_min), (lat_max, lon_max) = NEW_YORK_BOX_COORDS
    uber_data = uber_data[
        (uber_data["pu_lat"] >= lat_min) & (uber_data["pu_lat"] <= lat_max) &
        (uber_data["pu_lon"] >= lon_min) & (uber_data["pu_lon"] <= lon_max) &
        (uber_data["do_lat"] >= lat_min) & (uber_data["do_lat"] <= lat_max) &
        (uber_data["do_lon"] >= lon_min) & (uber_data["do_lon"] <= lon_max)
    ]

    # Reset index after filtering
    uber_data = uber_data.reset_index(drop=True)

    return uber_data

In [62]:
def get_uber_data():
    """
    Scrape the TLC page, select its Parquet links, load the sampled
    ``fhvhv_tripdata`` months in the study window, and clean the result.

    Returns:
        pd.DataFrame: The DataFrame produced by ``load_and_clean_uber_data``.
    """
    page_links = get_all_urls_from_page(TLC_URL)
    sampled_trips = get_and_clean_uber_data(filter_parquet_urls(page_links))
    return load_and_clean_uber_data(sampled_trips)

In [64]:
uber_data = get_uber_data()

In [65]:
uber_data

Unnamed: 0,pickup_datetime,dropoff_datetime,pu_lat,pu_lon,do_lat,do_lon,trip_miles,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,tips,driver_pay
0,2024-01-26 08:07:17,2024-01-26 08:35:38,40.646116,-73.951623,40.666559,-73.895364,4.29,27.49,0.0,0.76,2.44,0.00,0.00,24.88
1,2024-01-19 02:17:05,2024-01-19 02:29:12,40.882403,-73.910665,40.857108,-73.932832,2.55,15.14,0.0,0.42,1.34,0.00,0.00,10.19
2,2024-01-21 01:44:00,2024-01-21 02:08:30,40.748575,-73.985156,40.715370,-73.936794,6.37,24.57,0.0,0.68,2.18,2.75,3.01,23.00
3,2024-01-20 12:58:40,2024-01-20 13:15:42,40.758028,-73.977698,40.753309,-74.004016,1.99,18.96,0.0,0.52,1.68,2.75,0.00,12.47
4,2024-01-02 08:40:48,2024-01-02 08:54:28,40.666559,-73.895364,40.676644,-73.913632,2.23,16.08,0.0,0.44,1.43,0.00,0.00,10.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14309,2020-12-31 14:17:53,2020-12-31 14:30:13,40.849058,-73.905122,40.865264,-73.905911,1.83,12.00,0.0,0.36,1.06,0.00,0.00,8.21
14310,2020-12-22 20:22:12,2020-12-22 20:37:34,40.849172,-73.831582,40.828987,-73.924410,9.01,25.48,0.0,0.76,2.26,0.00,0.00,17.66
14311,2020-12-18 17:51:59,2020-12-18 18:00:14,40.857108,-73.932832,40.841708,-73.941399,1.29,6.54,0.0,0.20,0.58,0.00,0.00,5.57
14312,2020-12-27 22:52:25,2020-12-27 23:07:06,40.882157,-73.858949,40.827902,-73.869680,5.06,15.35,0.0,0.46,1.36,0.00,0.00,12.96


In [68]:
uber_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14314 entries, 0 to 14313
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   pickup_datetime       14314 non-null  datetime64[ns]
 1   dropoff_datetime      14314 non-null  datetime64[ns]
 2   pu_lat                14314 non-null  float64       
 3   pu_lon                14314 non-null  float64       
 4   do_lat                14314 non-null  float64       
 5   do_lon                14314 non-null  float64       
 6   trip_miles            14314 non-null  float64       
 7   base_passenger_fare   14314 non-null  float64       
 8   tolls                 14314 non-null  float64       
 9   bcf                   14314 non-null  float64       
 10  sales_tax             14314 non-null  float64       
 11  congestion_surcharge  14314 non-null  float64       
 12  tips                  14314 non-null  float64       
 13  driver_pay      

In [70]:
uber_data.describe()

Unnamed: 0,pickup_datetime,dropoff_datetime,pu_lat,pu_lon,do_lat,do_lon,trip_miles,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,tips,driver_pay
count,14314,14314,14314.0,14314.0,14314.0,14314.0,14314.0,14314.0,14314.0,14314.0,14314.0,14314.0,14314.0,14314.0
mean,2022-05-04 22:47:47.122257920,2022-05-04 23:05:35.212868352,40.737286,-73.935201,40.737268,-73.935344,4.375069,21.038508,0.680329,0.616612,1.868726,1.056047,0.796023,16.981945
min,2020-01-01 03:27:51,2020-01-01 03:30:21,40.561994,-74.170885,40.561994,-74.174002,0.01,0.61,0.0,0.0,0.0,0.0,0.0,0.28
25%,2021-02-28 22:55:11,2021-02-28 23:04:53.500000,40.69089,-73.984197,40.691201,-73.984197,1.55,10.49,0.0,0.29,0.91,0.0,0.0,8.41
50%,2022-05-03 21:00:57.500000,2022-05-03 21:09:21.500000,40.736824,-73.948788,40.737698,-73.948136,2.83,16.69,0.0,0.47,1.45,0.0,0.0,13.47
75%,2023-07-04 21:57:38,2023-07-04 22:06:56.249999872,40.774376,-73.900317,40.774376,-73.899735,5.52,26.37,0.0,0.77,2.35,2.75,0.0,21.635
max,2024-08-31 21:37:05,2024-08-31 21:57:29,40.899528,-73.726655,40.899528,-73.726655,33.17,188.58,43.31,5.45,17.57,2.75,50.0,117.92
std,,,0.068439,0.064694,0.068843,0.068135,4.267862,15.215758,2.718339,0.494388,1.414169,1.333108,2.413868,12.024392


### Processing Weather Data

In [8]:
import pandas as pd
import os
import numpy as np
# Function 1: Clean hourly weather data
def clean_month_weather_data_hourly(csv_file):
    """
    Load one weather CSV and return its hourly precipitation and wind speed.

    Column names are stripped of surrounding whitespace, ``DATE`` is parsed to
    datetimes (unparseable values become NaT), cells equal to ``"T"`` become 0,
    and any other cell that is not purely digits and dots becomes NaN.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        pd.DataFrame: Columns ``date``, ``precipitation`` and ``wind_speed``.
        On any error a message is printed and an empty DataFrame is returned.
    """
    try:
        raw = pd.read_csv(csv_file, low_memory=False)
        raw.columns = [name.strip() for name in raw.columns]
        raw["DATE"] = pd.to_datetime(raw["DATE"], errors="coerce")

        hourly = raw[["DATE", "HourlyPrecipitation", "HourlyWindSpeed"]].copy()
        hourly = hourly.replace("T", 0)
        hourly = hourly.replace(regex=r"[^\d.]", value=np.nan)
        hourly.columns = ["date", "precipitation", "wind_speed"]

        for value_column in ("precipitation", "wind_speed"):
            hourly[value_column] = pd.to_numeric(hourly[value_column], errors="coerce")
        return hourly
    except Exception as e:
        print(f"Error processing file {csv_file}: {e}")
        return pd.DataFrame()

# Function 2: Clean daily weather data
def clean_month_weather_data_daily(csv_file):
    """
    Load one weather CSV and return one row of daily measurements per date.

    Column names are stripped of surrounding whitespace, ``DATE`` is parsed to
    datetimes, cells equal to ``"T"`` become 0, and any other cell that is not
    purely digits and dots becomes NaN. Rows are then grouped by calendar
    date: precipitation and snowfall are summed, wind speed is averaged.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        pd.DataFrame: Columns ``date``, ``precipitation``,
        ``average_wind_speed`` and ``snowfall``. On any error a message is
        printed and an empty DataFrame is returned.
    """
    try:
        # Read the file, tidy the column names and parse the dates
        raw = pd.read_csv(csv_file, low_memory=False)
        raw.columns = [name.strip() for name in raw.columns]
        raw["DATE"] = pd.to_datetime(raw["DATE"], errors="coerce")

        # Extract the needed columns
        daily = raw[["DATE", "DailyPrecipitation", "DailyAverageWindSpeed", "DailySnowfall"]].copy()

        # Handle the special value "T" and other non-numeric characters
        daily = daily.replace("T", 0)
        daily = daily.replace(regex=r"[^\d.]", value=np.nan)

        # Rename the columns and convert the measurements to numeric types
        daily.columns = ["date", "precipitation", "average_wind_speed", "snowfall"]
        for measurement in ("precipitation", "average_wind_speed", "snowfall"):
            daily[measurement] = pd.to_numeric(daily[measurement], errors="coerce")

        # Keep only the date part of the timestamp
        daily["date"] = pd.to_datetime(daily["date"]).dt.date

        # Aggregate per date: total precipitation and snowfall, mean wind speed
        per_column_aggregation = {
            "precipitation": "sum",
            "average_wind_speed": "mean",
            "snowfall": "sum",
        }
        return daily.groupby("date").agg(per_column_aggregation).reset_index()
    except Exception as e:
        print(f"Error processing file {csv_file}: {e}")
        return pd.DataFrame()


# Function 3: Process all files in a directory
def load_and_clean_weather_data(directory_path):
    """
    Clean every ``.csv`` file in a directory and combine the results.

    Files are processed in sorted name order so the combined row order is the
    same on every run and platform (``os.listdir`` order is arbitrary).

    Args:
        directory_path: Directory containing the weather CSV files.

    Returns:
        tuple: ``(hourly_weather_data, daily_weather_data)`` DataFrames, each
        the concatenation of the per-file results with a fresh index. Both are
        empty DataFrames when the directory holds no ``.csv`` files.
    """
    hourly_frames = []
    daily_frames = []

    for file in sorted(os.listdir(directory_path)):
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(directory_path, file)
        print(f"Processing {file_path}...")
        hourly_frames.append(clean_month_weather_data_hourly(file_path))
        daily_frames.append(clean_month_weather_data_daily(file_path))

    # Concatenate once at the end instead of re-copying the accumulated frame
    # per file; pd.concat rejects an empty list, hence the fallback.
    hourly_weather_data = pd.concat(hourly_frames, ignore_index=True) if hourly_frames else pd.DataFrame()
    daily_weather_data = pd.concat(daily_frames, ignore_index=True) if daily_frames else pd.DataFrame()

    return hourly_weather_data, daily_weather_data

# Function 4: Analyze non-zero values and statistics
def analyze_weather_data(df, columns_to_analyze):
    """
    Print summary statistics and the count of positive values for each column.

    Args:
        df (pd.DataFrame): Data to summarize.
        columns_to_analyze: Names of the columns of ``df`` to report on.

    Returns:
        None. Everything is written to standard output. The figure labelled
        "Non-zero count" is the number of values strictly greater than 0.
    """
    print("\nStatistics and non-zero value counts:")
    for column_name in columns_to_analyze:
        series = df[column_name]
        print(f"\nColumn: {column_name}")
        print(series.describe())
        positive_total = (series > 0).sum()
        print(f"Non-zero count: {positive_total}")

# Main script: clean every weather CSV, fill gaps, save the results and print a summary
directory_path = "weather/"  # Update to your directory path
hourly_weather_data, daily_weather_data = load_and_clean_weather_data(directory_path)


# Replace missing values with 0
# NOTE(review): fillna(0) applies to every column, so a missing date would
# also become 0 - confirm that only the measurement columns should be filled.
daily_weather_data.fillna(0, inplace=True)
hourly_weather_data.fillna(0, inplace=True)

# Save cleaned data to CSV files in the current working directory
hourly_weather_data.to_csv("cleaned_hourly_weather_data.csv", index=False)
daily_weather_data.to_csv("cleaned_daily_weather_data.csv", index=False)
print("Cleaned data saved.")

# Analyze daily weather data
columns_to_analyze = ["precipitation", "average_wind_speed", "snowfall"]
analyze_weather_data(daily_weather_data, columns_to_analyze)


# Preview data
print("\nHourly Weather Data (Preview):")
print(hourly_weather_data.head())

print("\nDaily Weather Data (Preview):")
print(daily_weather_data.head())

Processing weather/2020_weather.csv...
Processing weather/2023_weather.csv...
Processing weather/2021_weather.csv...
Processing weather/2024_weather.csv...
Processing weather/2022_weather.csv...
Cleaned data saved.

Statistics and non-zero value counts:

Column: precipitation
count    1462.000000
mean        0.144097
std         0.419189
min         0.000000
25%         0.000000
50%         0.000000
75%         0.070000
max         7.130000
Name: precipitation, dtype: float64
Non-zero count: 512

Column: average_wind_speed
count    1462.000000
mean        4.848974
std         2.499979
min         0.000000
25%         3.000000
50%         4.600000
75%         6.300000
max        14.200000
Name: average_wind_speed, dtype: float64
Non-zero count: 1406

Column: snowfall
count    1462.000000
mean        0.041792
std         0.530350
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max        14.800000
Name: snowfall, dtype: float64
Non-zero count: 27

Hour

In [10]:
directory_path = 'weather'  # Update to your directory path

# Walk the CSV files and print the earliest and latest DATE value in each,
# as a sanity check that every file covers the period its name suggests
for file in os.listdir(directory_path):
    if file.endswith(".csv"):
        file_path = os.path.join(directory_path, file)
        df = pd.read_csv(file_path, low_memory=False)
        # Files without a DATE column are skipped silently
        if "DATE" in df.columns:
            # Unparseable dates become NaT, which min()/max() ignore
            df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce")
            print(f"{file}: {df['DATE'].min()} to {df['DATE'].max()}")


2020_weather.csv: 2020-01-01 00:51:00 to 2020-12-31 23:59:00
2023_weather.csv: 2023-01-01 00:20:00 to 2023-12-31 23:51:00
2021_weather.csv: 2021-01-01 00:51:00 to 2021-12-31 23:59:00
2024_weather.csv: 2020-01-01 00:51:00 to 2020-01-01 04:51:00
2022_weather.csv: 2022-01-01 00:51:00 to 2022-12-31 23:59:00


In [None]:
def get_all_weather_csvs(directory):
    """
    Return the paths of all ``.csv`` files directly inside ``directory``.

    Args:
        directory (str): Directory to scan. An empty string means the current
            working directory.

    Returns:
        list: Paths (``directory`` joined with each file name), sorted so the
        order is the same on every run.

    Raises:
        OSError: If the directory cannot be listed.
    """
    # os.listdir("") raises FileNotFoundError, so map "" to the current directory.
    file_names = os.listdir(directory or ".")
    return sorted(
        os.path.join(directory, file_name)
        for file_name in file_names
        if file_name.endswith(".csv")
    )

In [None]:
def clean_month_weather_data_hourly(csv_file):
    # NOTE(review): scaffold placeholder. It has the same name as the working
    # implementation defined earlier in this notebook, so running this cell
    # afterwards replaces that implementation with one that always raises.
    # Remove this cell or skip it when executing the notebook.
    raise NotImplementedError()

In [None]:
def clean_month_weather_data_daily(csv_file):
    # NOTE(review): scaffold placeholder. It has the same name as the working
    # implementation defined earlier in this notebook, so running this cell
    # afterwards replaces that implementation with one that always raises.
    # Remove this cell or skip it when executing the notebook.
    raise NotImplementedError()

In [None]:
def load_and_clean_weather_data(directory_path=None):
    """
    Clean every weather CSV in a directory and combine the results.

    Args:
        directory_path: Directory containing the weather CSV files. Defaults
            to ``WEATHER_CSV_DIR`` when omitted, so both the no-argument call
            and the ``load_and_clean_weather_data(directory_path)`` call used
            elsewhere in this notebook work with this definition.

    Returns:
        tuple: ``(hourly_data, daily_data)`` DataFrames built from the per-file
        results with a fresh index. Both are empty DataFrames when no CSV
        files are found.
    """
    if directory_path is None:
        directory_path = WEATHER_CSV_DIR
    weather_csv_files = get_all_weather_csvs(directory_path)

    hourly_dataframes = []
    daily_dataframes = []

    for csv_file in weather_csv_files:
        hourly_dataframes.append(clean_month_weather_data_hourly(csv_file))
        daily_dataframes.append(clean_month_weather_data_daily(csv_file))

    # create two dataframes with hourly & daily data from every month;
    # pd.concat rejects an empty list, hence the fallback to an empty frame
    hourly_data = pd.concat(hourly_dataframes, ignore_index=True) if hourly_dataframes else pd.DataFrame()
    daily_data = pd.concat(daily_dataframes, ignore_index=True) if daily_dataframes else pd.DataFrame()

    return hourly_data, daily_data

In [None]:
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
hourly_weather_data.head()

In [None]:
hourly_weather_data.info()

In [None]:
hourly_weather_data.describe()

In [None]:
daily_weather_data.head()

In [None]:
daily_weather_data.info()

In [None]:
daily_weather_data.describe()

## Part 2: Storing Cleaned Data

In [None]:
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
HOURLY_WEATHER_SCHEMA = """
TODO
"""

DAILY_WEATHER_SCHEMA = """
TODO
"""

TAXI_TRIPS_SCHEMA = """
TODO
"""

UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [None]:
# create the tables with the schema files
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    """
    Write each DataFrame to the database table it is mapped to (not yet implemented).

    Args:
        table_to_df_dict (dict): Maps a table name to the DataFrame whose rows
            belong in that table.

    Raises:
        NotImplementedError: Always, until the function is implemented.
    """
    # NotImplemented is a comparison sentinel, not an exception; calling it
    # raises TypeError. NotImplementedError is the exception for placeholders.
    raise NotImplementedError()

In [None]:
# Map each database table name to the DataFrame whose rows populate it.
# The weather entries use the module-level names assigned when the weather
# data is loaded (hourly_weather_data / daily_weather_data).
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_weather_data,
    "daily_weather": daily_weather_data,
}

In [None]:
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    """Placeholder for writing ``query`` to ``outfile``; always raises NotImplementedError."""
    raise NotImplementedError()

### Query 1

In [None]:
QUERY_1_FILENAME = ""

QUERY_1 = """
TODO
"""

In [None]:
# execute query either via sqlalchemy
with engine.connect() as con:
    results = con.execute(db.text(QUERY_1)).fetchall()
results

# or via pandas
pd.read_sql(QUERY_1, con=engine)

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    """
    Template for a single-axes matplotlib figure; shows the plot and returns None.

    NOTE(review): ``values`` and the second ``axes.plot`` argument are "..."
    placeholders and ``dataframe`` is not used yet - replace them before
    relying on this function.
    """
    figure, axes = plt.subplots(figsize=(20, 10))

    values = "..."  # use the dataframe to pull out values needed to plot

    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")

    plt.show()

In [None]:
def get_data_for_visual_1():
    """Placeholder for fetching the data behind visualization 1; always raises NotImplementedError."""
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)