# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1VERPjEZcC1XSs4-02aM-DbkNr_yaJVbFjLJxaYQswqA/edit#)_

_This scaffolding notebook may be used to help set up your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_Anything in italics (prose) or comments (in code) is meant to provide you with guidance. **Remove the italic lines and provided comments** before submitting the project, if you choose to use this scaffolding. We don't need the guidance when grading._

_**All code below should be considered "pseudo-code" - not functional by itself, and only a suggestion at the approach.**_

## Project Setup

In [6]:
# all import statements needed for the project, for example:

import os

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
import re
from datetime import datetime, timedelta

In [None]:
# any constants you might need; some have been added for you, and
# some you need to fill in (empty strings are placeholders)

TLC_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

TAXI_ZONES_DIR = ""
# os.path.join keeps the path relative while TAXI_ZONES_DIR is still empty;
# formatting it as f"{TAXI_ZONES_DIR}/..." would yield the filesystem-root
# path "/taxi_zones.shp".
TAXI_ZONES_SHAPEFILE = os.path.join(TAXI_ZONES_DIR, "taxi_zones.shp")
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# Bounding boxes, each given as ((min_lat, min_lon), (max_lat, max_lon))
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [None]:
# Make sure the QUERY_DIRECTORY exists.
# exist_ok=True tolerates a directory that is already there while still
# raising on real failures (e.g. permissions). This avoids catching every
# Exception and reading `.errno`, which only OSError instances carry.
os.makedirs(QUERY_DIRECTORY, exist_ok=True)

## Part 1: Data Preprocessing

In [8]:
# Generate the date range for Yellow Taxi and HVFHV datasets
start_date = "2009-01"
end_date = "2024-08"

# Network settings for the downloads below
REQUEST_TIMEOUT_SECONDS = 60  # fail instead of hanging forever on a stalled connection
DOWNLOAD_CHUNK_SIZE = 1024 * 1024  # stream to disk in 1 MiB pieces


def _month_range(first_month, last_month):
    """Return every "YYYY-MM" string from first_month through last_month, inclusive."""
    first = datetime.strptime(first_month, "%Y-%m")
    last = datetime.strptime(last_month, "%Y-%m")

    months = []
    year, month = first.year, first.month
    while (year, month) <= (last.year, last.month):
        months.append(f"{year:04d}-{month:02d}")
        # roll over to January of the next year after December
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)
    return months


def _download_file(file_url, local_file_path, label):
    """Stream file_url to local_file_path; return True if the file is available locally.

    A file that already exists is not downloaded again (delete it to force a
    re-download). Data is written to a temporary ".part" file and renamed only
    after the transfer completes, so an interrupted or failed download never
    leaves a truncated file under the final name.
    """
    if os.path.exists(local_file_path):
        print(f"Skipping {label} file (already downloaded): {local_file_path}")
        return True

    print(f"Downloading {label} file: {file_url} ...")
    partial_path = f"{local_file_path}.part"
    try:
        # the context manager releases the connection even if writing fails
        with requests.get(file_url, stream=True, timeout=REQUEST_TIMEOUT_SECONDS) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, local_file_path)
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {file_url}: {e}")
        return False
    finally:
        # only still present when the download did not complete
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"File saved to: {local_file_path}")
    return True


dates = _month_range(start_date, end_date)

# Base URLs for Yellow Taxi and HVFHV datasets
yellow_taxi_base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"
hvhf_base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_"

# Save directories for Yellow Taxi and HVFHV data
yellow_taxi_save_dir = "data/yellow_taxi"
hvhf_save_dir = "data/hvhf"

# Ensure the directories exist
os.makedirs(yellow_taxi_save_dir, exist_ok=True)
os.makedirs(hvhf_save_dir, exist_ok=True)

# Download Yellow Taxi data
for date in dates:
    _download_file(
        f"{yellow_taxi_base_url}{date}.parquet",
        os.path.join(yellow_taxi_save_dir, f"{date}.parquet"),
        "Yellow Taxi",
    )

# Download HVFHV data
# NOTE(review): the same month range is requested for both datasets; months
# the server does not have are reported as failed downloads and skipped.
for date in dates:
    _download_file(
        f"{hvhf_base_url}{date}.parquet",
        os.path.join(hvhf_save_dir, f"{date}.parquet"),
        "HVFHV",
    )


Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet ...
File saved to: data/yellow_taxi/2009-01.parquet
Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-02.parquet ...
File saved to: data/yellow_taxi/2009-02.parquet
Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-03.parquet ...
File saved to: data/yellow_taxi/2009-03.parquet
Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-04.parquet ...
File saved to: data/yellow_taxi/2009-04.parquet
Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-05.parquet ...
File saved to: data/yellow_taxi/2009-05.parquet
Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-06.parquet ...
File saved to: data/yellow_taxi/2009-06.parquet
Downloading Yellow Taxi file

KeyboardInterrupt: 

In [None]:
def filter_uber_rides(hvhf_data: pd.DataFrame) -> pd.DataFrame:
    """
    Filters out non-Uber rides from the HVFHV dataset.

    Rides are selected on the ``hvfhs_license_num`` column, where the TLC
    HVFHV data dictionary assigns ``HV0003`` to Uber. A prefix test on
    ``dispatching_base_num`` is not used because it does not identify the
    company and fails on rows where that column is missing. Rows whose
    license number is missing are treated as non-Uber.

    Args:
        hvhf_data (pd.DataFrame): The raw HVFHV dataset.

    Returns:
        pd.DataFrame: Filtered dataset (a copy) containing only Uber rides.

    Raises:
        KeyError: If ``hvhf_data`` has no ``hvfhs_license_num`` column.
    """
    uber_license_num = "HV0003"

    if "hvfhs_license_num" not in hvhf_data.columns:
        raise KeyError("hvfhs_license_num column is required to identify Uber rides")

    # `==` evaluates to False for missing values, so the mask is always boolean
    is_uber = hvhf_data["hvfhs_license_num"] == uber_license_num
    uber_only_data = hvhf_data[is_uber].copy()
    print(f"Filtered Uber rides: {len(uber_only_data)} out of {len(hvhf_data)} total rides.")
    return uber_only_data


def cochran_sample_size(population_size: int, confidence_level: float = 0.95, margin_of_error: float = 0.05, p: float = 0.5) -> int:
    """
    Calculate the sample size using Cochran's formula.

    The infinite-population size ``z^2 * p * (1 - p) / e^2`` is computed
    first and then, when ``population_size`` is positive, reduced with the
    finite-population correction. The result is rounded up.

    Args:
        population_size (int): The total number of data points in the population.
            A value <= 0 skips the finite-population correction.
        confidence_level (float): The confidence level (default is 0.95).
        margin_of_error (float): The margin of error (default is 0.05).
        p (float): The estimated proportion of the population (default is 0.5).

    Returns:
        int: The calculated sample size.

    Raises:
        ValueError: If ``confidence_level`` is not strictly between 0 and 1.
    """
    # Imported locally so the function does not depend on a file-level import.
    import math
    from statistics import NormalDist

    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1")

    # Conventional rounded z-scores for the common levels; any other level is
    # derived from the normal distribution instead of silently using 1.96.
    z = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}.get(confidence_level)
    if z is None:
        z = NormalDist().inv_cdf(1 - (1 - confidence_level) / 2)

    numerator = (z**2) * p * (1 - p)
    denominator = margin_of_error**2
    sample_size = numerator / denominator
    if population_size > 0:
        adjusted_sample_size = sample_size / (1 + (sample_size - 1) / population_size)
    else:
        adjusted_sample_size = sample_size
    return math.ceil(adjusted_sample_size)


def sample_dataset(data: pd.DataFrame, sample_size: int) -> pd.DataFrame:
    """
    Draw a reproducible random sample of rows from a dataset.

    Args:
        data (pd.DataFrame): The dataset to sample from.
        sample_size (int): Number of rows to draw.

    Returns:
        pd.DataFrame: ``sample_size`` rows drawn with a fixed seed (42), or
        ``data`` itself (not a copy) when it has no more than ``sample_size`` rows.
    """
    if sample_size < len(data):
        # fixed seed so repeated runs pick the same rows
        return data.sample(n=sample_size, random_state=42)

    print("Sample size is larger than or equal to the dataset size. Returning the entire dataset.")
    return data


def process_dataset(file_path: str, output_dir: str, filter_uber: bool = False) -> None:
    """
    Load one parquet file, optionally keep only Uber rides, sample it, and save it.

    The sample size comes from ``cochran_sample_size`` (95% confidence, 5%
    margin of error) applied to the row count after filtering. The output
    keeps the input's file name and is written inside ``output_dir``.

    Args:
        file_path (str): Path to the dataset file.
        output_dir (str): Directory to save the processed file.
        filter_uber (bool): Whether to filter for Uber rides (default is False).
    """
    print(f"Processing file: {file_path}")

    frame = pd.read_parquet(file_path)
    if filter_uber:
        frame = filter_uber_rides(frame)

    # population = rows left after the optional filter
    n_required = cochran_sample_size(len(frame), confidence_level=0.95, margin_of_error=0.05)
    print(f"Calculated sample size: {n_required}")

    sampled = sample_dataset(frame, n_required)

    destination = os.path.join(output_dir, os.path.basename(file_path))
    sampled.to_parquet(destination)
    print(f"Processed file saved to: {destination}")


# Example Workflow

# Raw datasets are read from these directories ...
yellow_taxi_dir = "data/yellow_taxi"
hvhf_dir = "data/hvhf"

# ... and the sampled results are written to these
processed_yellow_taxi_dir = "processed_data/yellow_taxi"
processed_hvhf_dir = "processed_data/hvhf"

# Ensure output directories exist
for target_dir in (processed_yellow_taxi_dir, processed_hvhf_dir):
    os.makedirs(target_dir, exist_ok=True)

# (input dir, output dir, keep only Uber rides?) - Yellow Taxi first, then HVFHV
for raw_dir, target_dir, uber_only in (
    (yellow_taxi_dir, processed_yellow_taxi_dir, False),
    (hvhf_dir, processed_hvhf_dir, True),
):
    for entry in os.listdir(raw_dir):
        if not entry.endswith(".parquet"):
            continue
        process_dataset(
            file_path=os.path.join(raw_dir, entry),
            output_dir=target_dir,
            filter_uber=uber_only,
        )

### Load Taxi Zones

In [None]:
def load_taxi_zones(shapefile):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to load the taxi zones from ``shapefile``
    - confirm the expected return type when implementing.
    """
    raise NotImplementedError

In [None]:
def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to return coordinates for ``zone_loc_id``
    using ``loaded_taxi_zones`` - confirm when implementing.
    """
    raise NotImplementedError

### Calculate Sample Size

In [None]:
def calculate_sample_size(population):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to derive a sample size from
    ``population`` - confirm when implementing.
    """
    raise NotImplementedError

### Common Functions

In [None]:
def get_all_urls_from_tlc_page(taxi_page):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to collect the links found on the page at
    ``taxi_page`` - confirm when implementing.
    """
    raise NotImplementedError

In [None]:
def filter_parquet_urls(all_urls):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to keep only the parquet links from
    ``all_urls`` - confirm when implementing.
    """
    raise NotImplementedError

### Process Taxi Data

In [None]:
def get_and_clean_taxi_month(url):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to return one month of cleaned taxi data
    for the parquet file at ``url`` - confirm when implementing.
    """
    raise NotImplementedError

In [None]:
def get_and_clean_taxi_data(parquet_urls):
    """Clean every monthly taxi file and combine the results into one dataframe.

    Args:
        parquet_urls: Iterable of URLs, each passed to
            ``get_and_clean_taxi_month``.

    Returns:
        pd.DataFrame: The monthly dataframes concatenated in input order, or
        an empty dataframe when ``parquet_urls`` yields nothing.
    """
    all_taxi_dataframes = []

    for parquet_url in parquet_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_taxi_month(parquet_url)
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function

        all_taxi_dataframes.append(dataframe)

    # pd.concat raises ValueError on an empty list, so handle that case here
    if not all_taxi_dataframes:
        return pd.DataFrame()

    # create one gigantic dataframe with data from every month needed
    taxi_data = pd.concat(all_taxi_dataframes)
    return taxi_data

In [None]:
def get_taxi_data():
    """Collect the links from the TLC page, filter them, and build the taxi dataframe.

    Uses the helpers this notebook actually defines
    (``get_all_urls_from_tlc_page`` and ``filter_parquet_urls``).

    Returns:
        Whatever ``get_and_clean_taxi_data`` returns for the filtered URLs.
    """
    all_urls = get_all_urls_from_tlc_page(TLC_URL)
    all_parquet_urls = filter_parquet_urls(all_urls)
    taxi_data = get_and_clean_taxi_data(all_parquet_urls)
    return taxi_data

In [None]:
# Build the combined taxi dataset
taxi_data = get_taxi_data()

In [None]:
# Preview the first rows
taxi_data.head()

In [None]:
# Column names, dtypes and non-null counts
taxi_data.info()

In [None]:
# Summary statistics
taxi_data.describe()

### Processing Uber Data

In [None]:
def get_and_clean_uber_month(url):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to return one month of cleaned Uber data
    for the parquet file at ``url`` - confirm when implementing.
    """
    raise NotImplementedError

In [None]:
def get_and_clean_uber_data(parquet_urls):
    """Clean every monthly Uber file and combine the results into one dataframe.

    Args:
        parquet_urls: Iterable of URLs, each passed to
            ``get_and_clean_uber_month``.

    Returns:
        pd.DataFrame: The monthly dataframes concatenated in input order, or
        an empty dataframe when ``parquet_urls`` yields nothing.
    """
    all_uber_dataframes = []

    for parquet_url in parquet_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_uber_month(parquet_url)
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function

        all_uber_dataframes.append(dataframe)

    # pd.concat raises ValueError on an empty list, so handle that case here
    if not all_uber_dataframes:
        return pd.DataFrame()

    # create one gigantic dataframe with data from every month needed
    uber_data = pd.concat(all_uber_dataframes)
    return uber_data

In [None]:
def load_and_clean_uber_data():
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to load and clean the Uber data - confirm
    the intended inputs and return value when implementing.
    """
    raise NotImplementedError

In [None]:
def get_uber_data():
    """Collect the links from the TLC page, filter them, and build the Uber dataframe.

    Uses ``filter_parquet_urls``, the URL filter this notebook actually
    defines.

    Returns:
        Whatever ``get_and_clean_uber_data`` returns for the filtered URLs.
    """
    all_urls = get_all_urls_from_tlc_page(TLC_URL)
    all_parquet_urls = filter_parquet_urls(all_urls)
    uber_data = get_and_clean_uber_data(all_parquet_urls)
    return uber_data

In [None]:
# Build the combined Uber dataset
uber_data = get_uber_data()

In [None]:
# Preview the first rows
uber_data.head()

In [None]:
# Column names, dtypes and non-null counts
uber_data.info()

In [None]:
# Summary statistics
uber_data.describe()

### Processing Weather Data

In [None]:
def get_all_weather_csvs(directory):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to list the weather CSV files found in
    ``directory`` - confirm when implementing.
    """
    raise NotImplementedError

In [None]:
def clean_month_weather_data_hourly(csv_file):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to return cleaned hourly weather data
    from ``csv_file`` - confirm when implementing.
    """
    raise NotImplementedError

In [None]:
def clean_month_weather_data_daily(csv_file):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to return cleaned daily weather data
    from ``csv_file`` - confirm when implementing.
    """
    raise NotImplementedError

In [None]:
def load_and_clean_weather_data():
    """Clean every weather CSV and return the combined hourly and daily dataframes.

    Each file returned by ``get_all_weather_csvs(WEATHER_CSV_DIR)`` is cleaned
    twice: once by the hourly cleaner and once by the daily cleaner.

    Returns:
        tuple: ``(hourly_data, daily_data)``, each the concatenation of the
        per-file results in file order.
    """
    hourly_frames, daily_frames = [], []

    for weather_csv in get_all_weather_csvs(WEATHER_CSV_DIR):
        hourly_frames.append(clean_month_weather_data_hourly(weather_csv))
        daily_frames.append(clean_month_weather_data_daily(weather_csv))

    # one dataframe per granularity, covering every file
    return pd.concat(hourly_frames), pd.concat(daily_frames)

In [None]:
# Build the combined hourly and daily weather datasets
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
# Hourly data: preview the first rows
hourly_weather_data.head()

In [None]:
# Hourly data: column names, dtypes and non-null counts
hourly_weather_data.info()

In [None]:
# Hourly data: summary statistics
hourly_weather_data.describe()

In [None]:
# Daily data: preview the first rows
daily_weather_data.head()

In [None]:
# Daily data: column names, dtypes and non-null counts
daily_weather_data.info()

In [None]:
# Daily data: summary statistics
daily_weather_data.describe()

## Part 2: Storing Cleaned Data

In [None]:
# SQLAlchemy engine for the database at DATABASE_URL; reused by the cells below
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
# Each constant below is still a TODO placeholder; the strings are written
# verbatim, in this order, to DATABASE_SCHEMA_FILE.
HOURLY_WEATHER_SCHEMA = """
TODO
"""

DAILY_WEATHER_SCHEMA = """
TODO
"""

TAXI_TRIPS_SCHEMA = """
TODO
"""

UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file by writing the four schema strings
# back to back (hourly weather, daily weather, taxi trips, uber trips)
with open(DATABASE_SCHEMA_FILE, "w") as f:
    for schema_sql in (
        HOURLY_WEATHER_SCHEMA,
        DAILY_WEATHER_SCHEMA,
        TAXI_TRIPS_SCHEMA,
        UBER_TRIPS_SCHEMA,
    ):
        f.write(schema_sql)

In [None]:
# create the tables with the schema files
# TODO: nothing is executed yet - the connection is opened and closed
# without running any statement, so no tables are created by this cell.
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    ``NotImplementedError`` is the exception for unfinished code;
    ``NotImplemented`` is a non-callable constant, so calling it would fail
    with a ``TypeError`` instead.

    NOTE(review): presumably meant to write each dataframe in
    ``table_to_df_dict`` to the table named by its key - confirm when
    implementing.
    """
    raise NotImplementedError

In [None]:
# Table name -> dataframe to store in it. The weather entries use the names
# assigned from load_and_clean_weather_data() earlier in the notebook.
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_weather_data,
    "daily_weather": daily_weather_data,
}

In [None]:
# Hand the table-name -> dataframe mapping to the writer function
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    NOTE(review): presumably meant to save the text of ``query`` to
    ``outfile`` - confirm when implementing.
    """
    raise NotImplementedError

### Query 1

In [None]:
# File name the query text is saved under (placeholder - fill in)
QUERY_1_FILENAME = ""

# SQL text of query 1 (placeholder - fill in)
QUERY_1 = """
TODO
"""

In [None]:
# execute query either via sqlalchemy
# (db.text wraps the raw SQL string; fetchall() collects every result row)
with engine.connect() as con:
    results = con.execute(db.text(QUERY_1)).fetchall()
# NOTE(review): a notebook cell displays only its last expression, so this
# line shows nothing unless the pandas call below is removed.
results

# or via pandas
pd.read_sql(QUERY_1, con=engine)

In [None]:
# Save the text of query 1 under QUERY_1_FILENAME
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    """Template for a visualization: draw on a 20x10 figure and show it.

    ``dataframe`` is not used yet; both ``values`` and the second argument
    to ``plot`` are ``"..."`` placeholders to be replaced with real data.
    """
    fig, ax = plt.subplots(figsize=(20, 10))

    # use the dataframe to pull out values needed to plot
    values = "..."

    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other
    # than ax.plot) you can use
    ax.plot(values, "...")

    # there are other methods to use to label your axes, to style
    # and set up axes labels, etc
    ax.set_title("Some Descriptive Title")

    plt.show()

In [None]:
def get_data_for_visual_1():
    """Placeholder: not implemented yet, always raises ``NotImplementedError``.

    Intended steps, per the scaffolding notes: query the SQL database for
    the data needed, optionally putting the result into a pandas dataframe.
    """
    raise NotImplementedError

In [None]:
# Fetch the data for the first visualization, then plot it
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)