Prepare dlt

In [None]:
 !pip install dlt requests
 #!pip install "dlt[bigquery]"

In [None]:
# Step 2: Authenticate with Google Cloud
from google.colab import files
import os

In [None]:
# Upload your service account key file
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload fails below with an opaque IndexError
    raise ValueError("No service account key file was uploaded.")

# Point Google client libraries at the first uploaded file. The path is made
# absolute so it keeps resolving if the working directory changes later
# (assumes Colab saved the upload in the current working directory -- TODO confirm).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath(list(uploaded.keys())[0])

In [None]:
# Step 3: Write the code to load data from a URL pattern into BigQuery
import dlt
import gzip
import csv
import requests
from io import BytesIO

# Define a function to read compressed CSV.GZ files from a public URL
def read_compressed_csv_from_url(url, timeout=(10, 120)):
    """Yield the records of a gzip-compressed CSV file fetched over HTTP.

    The compressed payload is downloaded in full and the connection is
    released; decompression, UTF-8 decoding and CSV parsing then happen
    incrementally while the caller iterates. Only the compressed bytes are
    kept in memory, never the whole decoded text or a list of all its lines.

    Args:
        url: Address of the ``.csv.gz`` file to download.
        timeout: Passed to ``requests.get`` as its ``timeout`` argument
            (``(connect, read)`` seconds) so a stalled server cannot hang
            the load forever.

    Yields:
        dict: One mapping per CSV record, keyed by the file's header row.

    Raises:
        ValueError: If the server answers with a status other than 200.

    Note:
        This is a generator: the request is only sent once iteration starts,
        and decompression or decoding errors surface while iterating rather
        than before the first row.
    """
    # Fetch the file from the public URL; the context manager returns the
    # connection to the pool as soon as the payload has been read.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise ValueError(f"Failed to fetch file from URL: {url}")
        payload = BytesIO(response.content)

    # Decompress and decode lazily. newline='' leaves line-ending handling to
    # the csv module, so quoted fields containing line breaks stay intact and
    # characters that str.splitlines() treats as line breaks (form feed,
    # U+2028, ...) no longer split a record.
    with gzip.open(payload, mode='rt', encoding='utf-8', newline='') as text_file:
        yield from csv.DictReader(text_file)

# Target pipeline: every file is loaded into one BigQuery dataset
pipeline = dlt.pipeline(
    destination="bigquery",
    dataset_name="trips_data_all",  # Replace with your BigQuery dataset name
    pipeline_name="taxi_tripdata_to_bigquery",
)

# URL template; the placeholders are filled, in order, with:
# taxi type, taxi type, year, two-digit month
base_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{}/{}_tripdata_{}-{}.csv.gz"

# Taxi types to load (green, yellow, or fhv); extend the list if needed
taxi_types = ["yellow", "fhv", "green"]

# Inclusive year and month bounds of the files to load
start_year, end_year = 2019, 2020
start_month, end_month = 1, 12

# Walk every (taxi type, year, month) combination and load its file
for taxi_type in taxi_types:
    # fhv data is only loaded for 2019; the other types use the full year range
    years_to_load = [2019] if taxi_type == "fhv" else range(start_year, end_year + 1)

    for year in years_to_load:
        for month in range(start_month, end_month + 1):
            month_str = format(month, "02d")  # two digits: 01, 02, ..., 12
            url = base_url.format(taxi_type, taxi_type, year, month_str)

            try:
                # "append" accumulates the rows of all files in one table per taxi type
                load_info = pipeline.run(
                    read_compressed_csv_from_url(url),
                    write_disposition="append",
                    table_name=f"{taxi_type}_tripdata",
                )
                print(f"Loaded data from {url}: {load_info}")
            except Exception as e:
                # Best effort: report the failure and move on to the next file
                print(f"Failed to load data from {url}: {e}")

# Comments
- If the taxi_type is "fhv", the years_to_load variable is set to [2019] (only 2019).
- For other taxi types (green and yellow), the years_to_load variable is set to range(start_year, end_year + 1) (2019 and 2020).

## Dynamic URL Generation:
The same base_url template is used for every taxi type; only the list of years that the loop iterates over depends on the taxi_type.

## Table Name:
The table_name in the pipeline.run method is dynamically set based on the taxi type:

```python
table_name=f"{taxi_type}_tripdata"
```
This ensures that green, yellow, and fhv trip data are loaded into separate tables (e.g., green_tripdata, yellow_tripdata, and fhv_tripdata).

## Error Handling:
If a URL fails to load (e.g., the file does not exist), the exception is caught and printed, and the loop continues with the next file instead of aborting the whole run.

## Notes:
- Dynamic URL generation: you can modify the URL template and the range of years/months to match your file naming convention.
- Add more robust error handling if needed (e.g., retry logic for failed URLs).

## Performance:
For large numbers of files, consider using parallel processing or batching to optimize performance.

## Schema Consistency:
Ensure all .csv.gz files for each taxi type have the same schema (column names and data types). If they do not, you may need to handle schema mismatches.

Overall, this approach lets you dynamically generate the URLs for green, yellow, and fhv taxi trip data and load each type into its own BigQuery table, with fhv data loaded only for the year 2019.