# Taxi Data EDA Using dlt
1. Import Libraries
2. Define the API Source
3. Create the dlt pipeline
	- a. Extract the data `extract_info = pipeline.extract(openlibrary_source())`
	- b. Normalize the data `normalize_info = pipeline.normalize()`
	- c. Load the data into tables `load_info = pipeline.load()`
4. Run the entire pipeline `load_info = pipeline.run(openlibrary_source())`

In [2]:
# Step 1 Import Libraries
import dlt
import dlt
from itertools import islice
from dlt.sources.rest_api import rest_api_source
from dlt.sources.rest_api import rest_api_resources
from dlt.sources.rest_api.typing import RESTAPIConfig

## Breaking it down step by step first
- Define Source
- Extract Data
- Normalize Data
- Load Data into DuckDB

In [25]:
# Step 2 - Define API Source
def taxi_demo_pipeline_rest_api_source():
    """Define dlt resources from REST API endpoints."""
    return rest_api_source({
        "client": {
            # Base URL for the REST API
            "base_url": "https://us-central1-dlthub-analytics.cloudfunctions.net/data_engineering_zoomcamp_api",
            # No authentication required
            "auth": None,
            # Pagination using JSON Link header
            "paginator": {
                "type": "json_link",
                "next_url_path": "paging.next",
            },
        },
        "resources": [
            # Define resource for the taxi data endpoint
            {
                "name": "taxi_data_demo",
                "table_name": "taxi_data_demo",
                "endpoint": {
                    "path": "/",
                    "params": {
                        "page": 1,  # Start from page 1
                    },
                },
            },
        ],
        # Set default configuration for all resources
        "resource_defaults": {
            "write_disposition": "append",
        },
    })

In [26]:
# Step 3 - Create the dlt Pipeline
pipeline = dlt.pipeline(
    pipeline_name="taxi_pipeline_demo",
    destination="duckdb",
    dataset_name="taxi_test_data",
)

In [27]:
# Step 4 - Extract data from source
extract_info = pipeline.extract(taxi_demo_pipeline_rest_api_source())

In [28]:
# Reviewing extraction summary data
load_id = extract_info.loads_ids[-1]
m = extract_info.metrics[load_id][0]

print("Resources:", list(m["resource_metrics"].keys()))
print("Tables:", list(m["table_metrics"].keys()))
print("Load ID:", load_id)
print()

for resource, rm in m["resource_metrics"].items():
    print(f"Resource: {resource}")
    print(f"rows extracted: {rm.items_count}")
    print()

Resources: ['taxi_data_demo']
Tables: ['taxi_data_demo']
Load ID: 1771448393.794041

Resource: taxi_data_demo
rows extracted: 1000



In [29]:
# Step 5 Normalize
normalize_info = pipeline.normalize()

  - rate_code
  - mta_tax

Unless type hints are provided, these columns will not be materialized in the destination.
One way to provide type hints is to use the 'columns' argument in the '@dlt.resource' decorator.  For example:

@dlt.resource(columns={'rate_code': {'data_type': 'text'}})

  - rate_code
  - mta_tax

Unless type hints are provided, these columns will not be materialized in the destination.
One way to provide type hints is to use the 'columns' argument in the '@dlt.resource' decorator.  For example:

@dlt.resource(columns={'rate_code': {'data_type': 'text'}})



In [30]:
# Normalize Info
load_id = normalize_info.loads_ids[-1]
m = normalize_info.metrics[load_id][0]

print("Load ID:", load_id)
print()

print("Tables created/updated:")
for table_name, tm in m["table_metrics"].items():
    # skip dlt internal tables to keep it beginner-friendly
    if table_name.startswith("_dlt"):
        continue
    print(f"  - {table_name}: {tm.items_count} rows")

Load ID: 1771448393.794041

Tables created/updated:
  - taxi_data_demo: 1000 rows


In [31]:
# Display schema 
print(pipeline.default_schema.to_pretty_yaml())

version: 4
version_hash: ocQFiF+Y1Dd8HlWZJPKkxoXzZCZJNG7y4o66NEv2y6o=
engine_version: 11
name: rest_api
tables:
  _dlt_version:
    columns:
      version:
        data_type: bigint
        nullable: false
      engine_version:
        data_type: bigint
        nullable: false
      inserted_at:
        data_type: timestamp
        nullable: false
      schema_name:
        data_type: text
        nullable: false
      version_hash:
        data_type: text
        nullable: false
      schema:
        data_type: text
        nullable: false
    write_disposition: skip
    resource: _dlt_version
    description: Created by DLT. Tracks schema updates
  _dlt_loads:
    columns:
      load_id:
        data_type: text
        nullable: false
        precision: 64
      schema_name:
        data_type: text
        nullable: true
      status:
        data_type: bigint
        nullable: false
      inserted_at:
        data_type: timestamp
        nullable: false
      schema_version_hash:
  

In [32]:
# Step 6 Load data into DuckDB
load_info = pipeline.load()

## Step 7 - Running Entire Pipeline End-to-End

In [None]:
# Defining the API Source
# if no argument is provided, `access_token` is read from `.dlt/secrets.toml`
@dlt.source
def taxi_pipeline_rest_api_source():
    """Define dlt resources from REST API endpoints."""
    config: RESTAPIConfig = {
        "client": {
            # TODO set base URL for the REST API
            "base_url": "https://us-central1-dlthub-analytics.cloudfunctions.net/data_engineering_zoomcamp_api",
            # TODO configure the right authentication method or remove
            # "auth": {"type": "bearer", "token": access_token},
            "paginator": {
                "type": "json_link",
                "next_url_path": "paging.next",
        
            },
        },
        "resources": [
            # Define resource for the taxi data endpoint
            {
                "name": "taxi_data",
                "table_name": "taxi_data",
                "endpoint": {
                    "path": "/",
                    "params": {
                        "page": 1,  # Start from page 1
                    },
                },
            },
        ],
        # Set default configuration for all resources
        "resource_defaults": {
            "write_disposition": "append"
        },
    }

    yield from rest_api_resources(config)

In [None]:
# Create Pipeline - Extract, Normalize, Load in DuckDB
pipeline = dlt.pipeline(
    pipeline_name='taxi_pipeline_pipeline',
    destination='duckdb',
    # `refresh="drop_sources"` ensures the data and the state is cleaned
    # on each `pipeline.run()`; remove the argument once you have a
    # working pipeline.
    refresh="drop_sources",
    # show basic progress of resources extracted, normalized files and load-jobs on stdout
    progress="log",
)

In [None]:
# Running the Pipeline
load_info = pipeline.run(taxi_pipeline_rest_api_source())

  - rate_code
  - mta_tax

Unless type hints are provided, these columns will not be materialized in the destination.
One way to provide type hints is to use the 'columns' argument in the '@dlt.resource' decorator.  For example:

@dlt.resource(columns={'rate_code': {'data_type': 'text'}})



LoadInfo(pipeline=<dlt.pipeline(pipeline_name='taxi_pipeline_demo', destination='duckdb', dataset_name='taxi_test_data', default_schema_name='rest_api', schema_names=['rest_api'], first_run=False, dev_mode=False, is_active=True, pipelines_dir='/home/user1129/.dlt/pipelines', working_dir='/home/user1129/.dlt/pipelines/taxi_pipeline_demo')>, metrics={'1771448930.0912588': [{'started_at': DateTime(2026, 2, 18, 21, 8, 53, 24274, tzinfo=Timezone('UTC')), 'finished_at': DateTime(2026, 2, 18, 21, 8, 53, 407893, tzinfo=Timezone('UTC')), 'job_metrics': {'taxi_data.8f943e9065.insert_values.gz': LoadJobMetrics(job_id='taxi_data.8f943e9065.insert_values.gz', file_path='/home/user1129/.dlt/pipelines/taxi_pipeline_demo/load/normalized/1771448930.0912588/started_jobs/taxi_data.8f943e9065.0.insert_values.gz', table_name='taxi_data', started_at=DateTime(2026, 2, 18, 21, 8, 53, 65311, tzinfo=Timezone('UTC')), finished_at=DateTime(2026, 2, 18, 21, 8, 53, 290699, tzinfo=Timezone('UTC')), state='completed'