In [1]:
! dlt --version
! pip install dlt google-cloud-bigquery pandas pyarrow requests

[39mdlt 1.6.1[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator
import pandas as pd
import os

In [None]:
def my_ingest(
    base_url="https://github.com/DataTalksClub/nyc-tlc-data/releases/tag/fhv",
    file_path="fhv_tripdata_{year}-{month}.csv.gz",
    years=[2019],
    months=range(1, 13),
    pipeline_name="nyc_taxi",
    destination_platform="bigquery",
    dataset_name="nyc_taxi_dataset",
    table_name="for_hire_vehicle_trips",
    write_disposition="append",
    loader_file_format="parquet",
):
    def my_paginate(
        base_url: str,
        file_path: str,
    ):
        """
        This source retrieves NYC taxi rides from the NYC Taxi & Limousine Commission's API.
        """
        client = RESTClient(
            base_url=base_url,
            paginator=PageNumberPaginator(base_page=1, total_path=None),
        )

        for page in client.paginate(file_path):
            yield page

    def download_parquet(url):
        """Download and load Parquet data from a URL into a Pandas DataFrame."""
        print(f"Downloading {url}")
        os.system(f"wget {url} -O temp.csv.gz")
        df = pd.read_csv(
            filepath_or_buffer="temp.csv.gz",
            iterator=True,
            chunksize=100000,
            compression="gzip",
        )
        yield df

    # Define the DLTHub pipeline
    pipeline = dlt.pipeline(
        pipeline_name=pipeline_name,
        destination=destination_platform,
        dataset_name=dataset_name,
    )
    # Ingest data into the pipeline

    for year in years:
        for month in months:
            year = str(year)
            month = str(month).zfill(2)
            file_path = file_path.format(year=year, month=month)
            pipeline.run(
                download_parquet(base_url + file_path),
                table_name=table_name,
                write_disposition=write_disposition,
                loader_file_format=loader_file_format,
            )
    os.system("rm temp.csv.gz")

In [None]:
my_ingest(
    base_url="https://github.com/DataTalksClub/nyc-tlc-data/releases/tag/fhv",
    file_path="fhv_tripdata_{year}-{month}.csv.gz",
    years=[2019],
    months=range(1, 13),
    pipeline_name="nyc_taxi",
    destination_platform="bigquery",
    dataset_name="nyc_taxi_dataset",
    table_name="for_hire_vehicle_trips",
    write_disposition="append",
    loader_file_format="parquet",
)

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:23:36--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.184, 3.160.203.173, 3.160.203.53, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  304K 11m16s
    50K .......... .......... .......... .......... ..........  0%  275K 11m51s
   100K .......... .......... .......... .......... ..........  0%  355K 11m7s
   150K .......... .......... .......... .......... ..........  0%  607K 9m45s
   200K .......... .......... .......... .......... ..........  0% 1.14M 8m23s
   250K .......... .......... .......... .......... ..........  0% 1.10M 7m30s
   300K .......... .......... .......... .......... ...

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:24:54--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.81, 3.160.203.184, 3.160.203.53, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  274K 12m31s
    50K .......... .......... .......... .......... ..........  0%  274K 12m31s
   100K .......... .......... .......... .......... ..........  0%  449K 10m53s
   150K .......... .......... .......... .......... ..........  0%  839K 9m11s
   200K .......... .......... .......... .......... ..........  0%  919K 8m5s
   250K .......... .......... .......... .......... ..........  0% 1.12M 7m14s
   300K .......... .......... .......... .......... .....

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:25:59--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.53, 3.160.203.184, 3.160.203.81, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  275K 12m27s
    50K .......... .......... .......... .......... ..........  0%  292K 12m5s
   100K .......... .......... .......... .......... ..........  0%  464K 10m31s
   150K .......... .......... .......... .......... ..........  0%  879K 8m52s
   200K .......... .......... .......... .......... ..........  0% 1.12M 7m41s
   250K .......... .......... .......... .......... ..........  0% 1.49M 6m47s
   300K .......... .......... .......... .......... .....

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:27:12--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.81, 3.160.203.173, 3.160.203.184, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  274K 12m30s
    50K .......... .......... .......... .......... ..........  0%  269K 12m37s
   100K .......... .......... .......... .......... ..........  0%  436K 11m2s
   150K .......... .......... .......... .......... ..........  0%  790K 9m21s
   200K .......... .......... .......... .......... ..........  0%  983K 8m11s
   250K .......... .......... .......... .......... ..........  0% 1.45M 7m12s
   300K .......... .......... .......... .......... ....

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:28:18--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.173, 3.160.203.81, 3.160.203.184, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.173|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  298K 11m31s
    50K .......... .......... .......... .......... ..........  0%  277K 11m57s
   100K .......... .......... .......... .......... ..........  0%  434K 10m36s
   150K .......... .......... .......... .......... ..........  0%  834K 8m58s
   200K .......... .......... .......... .......... ..........  0%  932K 7m55s
   250K .......... .......... .......... .......... ..........  0% 1.34M 7m0s
   300K .......... .......... .......... .......... ...

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:29:24--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.173, 3.160.203.184, 3.160.203.53, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.173|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  274K 12m31s
    50K .......... .......... .......... .......... ..........  0%  293K 12m6s
   100K .......... .......... .......... .......... ..........  0%  513K 10m17s
   150K .......... .......... .......... .......... ..........  0%  841K 8m44s
   200K .......... .......... .......... .......... ..........  0% 1.03M 7m38s
   250K .......... .......... .......... .......... ..........  0% 1.48M 6m44s
   300K .......... .......... .......... .......... ...

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:30:31--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.173, 3.160.203.184, 3.160.203.53, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.173|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  275K 12m28s
    50K .......... .......... .......... .......... ..........  0%  271K 12m34s
   100K .......... .......... .......... .......... ..........  0%  435K 11m0s
   150K .......... .......... .......... .......... ..........  0%  823K 9m17s
   200K .......... .......... .......... .......... ..........  0%  990K 8m7s
   250K .......... .......... .......... .......... ..........  0% 1.06M 7m18s
   300K .......... .......... .......... .......... ....

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:31:47--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.173, 3.160.203.184, 3.160.203.81, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.173|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  217K 15m48s
    50K .......... .......... .......... .......... ..........  0%  290K 13m48s
   100K .......... .......... .......... .......... ..........  0%  429K 11m52s
   150K .......... .......... .......... .......... ..........  0%  857K 9m54s
   200K .......... .......... .......... .......... ..........  0%  922K 8m40s
   250K .......... .......... .......... .......... ..........  0% 1.26M 7m39s
   300K .......... .......... .......... .......... ..

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:32:51--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.53, 3.160.203.173, 3.160.203.184, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  221K 15m32s
    50K .......... .......... .......... .......... ..........  0%  319K 13m9s
   100K .......... .......... .......... .......... ..........  0%  434K 11m24s
   150K .......... .......... .......... .......... ..........  0% 1.20M 9m15s
   200K .......... .......... .......... .......... ..........  0% 1.10M 8m0s
   250K .......... .......... .......... .......... ..........  0%  815K 7m22s
   300K .......... .......... .......... .......... .....

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:34:24--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.53, 3.160.203.184, 3.160.203.173, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  272K 12m35s
    50K .......... .......... .......... .......... ..........  0%  272K 12m35s
   100K .......... .......... .......... .......... ..........  0%  445K 10m57s
   150K .......... .......... .......... .......... ..........  0%  726K 9m24s
   200K .......... .......... .......... .......... ..........  0%  948K 8m14s
   250K .......... .......... .......... .......... ..........  0% 1.47M 7m15s
   300K .......... .......... .......... .......... ...

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:35:30--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.184, 3.160.203.81, 3.160.203.53, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  273K 12m32s
    50K .......... .......... .......... .......... ..........  0%  271K 12m35s
   100K .......... .......... .......... .......... ..........  0%  422K 11m6s
   150K .......... .......... .......... .......... ..........  0%  832K 9m21s
   200K .......... .......... .......... .......... ..........  0%  943K 8m12s
   250K .......... .......... .......... .......... ..........  0% 1.23M 7m17s
   300K .......... .......... .......... .......... ....

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet


--2025-02-21 07:36:39--  https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.184, 3.160.203.53, 3.160.203.81, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210707424 (201M) [application/x-www-form-urlencoded]
Saving to: ‘temp.parquet’

     0K .......... .......... .......... .......... ..........  0%  275K 12m29s
    50K .......... .......... .......... .......... ..........  0%  272K 12m32s
   100K .......... .......... .......... .......... ..........  0%  420K 11m5s
   150K .......... .......... .......... .......... ..........  0%  748K 9m27s
   200K .......... .......... .......... .......... ..........  0%  973K 8m16s
   250K .......... .......... .......... .......... ..........  0% 1.37M 7m18s
   300K .......... .......... .......... .......... ....