In [1]:
%%capture
!pip install dlt[duckdb]

# Configuration

Configuration methods:

- Setting credentials with toml files
- Using environment variables; ENVs

## Code without credentials

The primary rate limit for unauthenticated requests is 60 requests per hour. Sooner or later you will face rate limit errors.

In [2]:
import dlt
from dlt.sources.helpers import requests

BASE_URL = "https://api.github.com/repos/dlt-hub/dlt/issues"

def pagination(url):
    while True:
        response = requests.get(url)
        response.raise_for_status()
        yield response.json()

        # Get next page
        if "next" not in response.links:
            break
        url = response.links["next"]["url"]


@dlt.resource(
    table_name="issues",
    write_disposition="replace",
    primary_key="id",
)
def get_issues(
):
    url = f"{BASE_URL}?per_page=100&sort=updated&directions=desc&state=open"
    yield pagination(url)


@dlt.source
def github_source():
    return get_issues


pipeline = dlt.pipeline(
    pipeline_name="github_issues1",
    destination="duckdb",
    dataset_name="github_data",
    dev_mode=True,
)
load_info = pipeline.run(github_source())
print(load_info)

Pipeline github_issues1 load step completed in 1.84 seconds
1 load package(s) were loaded to destination duckdb and into dataset github_data_20240902032104
The duckdb destination used duckdb:////content/github_issues1.duckdb location to store data
Load package 1725247264.3624146 is LOADED and contains no failed jobs


You can use a personal access token to make API requests. This will increase the rate to 5,000 requests per hour.

# Method 1: Set credentials with toml files

### Step 1: Setup the .dlt folder

In [3]:
# check environment to see if .dlt exists
!ls -a

.  ..  .config	github_issues1.duckdb  sample_data


In [4]:
# create the .dlt folder
!mkdir .dlt && ls -a

.  ..  .config	.dlt  github_issues1.duckdb  sample_data


In [5]:
# create the toml files

# toml file 1: secrets.toml
!echo access_token=\"access_token\" > .dlt/secrets.toml


# toml file 2: config.toml
!echo access_token=\"access_token\" > .dlt/config.toml

View the contents of secrets.toml

In [6]:
!cat .dlt/secrets.toml

access_token="access_token"


### Step 2: Use the value saved in the .toml file

Method to access the content of secret.toml and config.toml files in dlt

In [7]:
import dlt
dlt.secrets["access_token"]

ConfigFieldMissingException: Following fields are missing: ['access_token'] in configuration with spec Any
	for field "access_token" config providers and keys were tried in following order:
		In Environment Variables key ACCESS_TOKEN was not found.
WARNING: dlt looks for .dlt folder in your current working directory and your cwd (/content) is different from directory of your pipeline script (/usr/local/lib/python3.10/dist-packages).
If you keep your secret files in the same folder as your pipeline script but run your script from some other folder, secrets/configs will not be found
Please refer to https://dlthub.com/docs/general-usage/credentials for more information


In [8]:
dlt.secrets.value

SecretValue(dlt.secrets.value) awaiting injection

In [9]:
import dlt
from dlt.sources.helpers import requests

BASE_URL = "https://api.github.com/repos/dlt-hub/dlt/issues"

def pagination(url, access_token):
    while True:
        response = requests.get(url, headers={"Authorization": f"Bearer {access_token}"})
        response.raise_for_status()
        yield response.json()

        # Get next page
        if "next" not in response.links:
            break
        url = response.links["next"]["url"]


@dlt.resource(
    table_name="issues",
    write_disposition="replace",
    primary_key="id",
)
def get_issues(
    access_token=dlt.secrets.value
):
    url = f"{BASE_URL}?per_page=100&sort=updated&directions=desc&state=open"
    yield pagination(url, access_token)


@dlt.source
def github_source():
    return get_issues


pipeline = dlt.pipeline(
    pipeline_name="github_issues2",
    destination="duckdb",
    dataset_name="github_data",
    dev_mode=True,
)

load_info = pipeline.run(github_source())
print(load_info)

# using the created access token with the resource and source decorators. This will give error bcos the access token is not authentic

PipelineStepFailed: Pipeline execution failed at stage extract when processing package 1725247884.1978774 with exception:

<class 'dlt.extract.exceptions.ResourceExtractionError'>
In processing pipe get_issues: extraction of resource get_issues in generator pagination caused an exception: 401 Client Error: Unauthorized for url: https://api.github.com/repos/dlt-hub/dlt/issues?per_page=100&sort=updated&directions=desc&state=open

# Method 2: Environment Variables

### Step 1: Create environment variables

In [10]:
# export ACCESS_TOKEN="access_token"
%env ACCESS_TOKEN="access_token"
!echo $ACCESS_TOKEN

env: ACCESS_TOKEN="access_token"
"access_token"


### Step 2: Run the pipeline with env variables

In [11]:
import dlt
from dlt.sources.helpers import requests
from google.colab import userdata
import os

os.environ["ACCESS_TOKEN"] = userdata.get('ACCESS_TOKEN')

BASE_URL = "https://api.github.com/repos/dlt-hub/dlt/issues"

def pagination(url, access_token):
    while True:
        response = requests.get(url, headers={"Authorization": f"Bearer {access_token}"})
        response.raise_for_status()
        yield response.json()

        # Get next page
        if "next" not in response.links:
            break
        url = response.links["next"]["url"]


@dlt.resource(
    table_name="issues",
    write_disposition="replace",
    primary_key="id",
)
def get_issues(
    access_token=dlt.secrets.value
):
    url = f"{BASE_URL}?per_page=100&sort=updated&directions=desc&state=open"
    yield pagination(url, access_token)


@dlt.source
def github_source(access_token=os.getenv("ACCESS_TOKEN")):
    return get_issues(access_token)


pipeline = dlt.pipeline(
    pipeline_name="github_issues",
    destination="duckdb",
    dataset_name="github_data",
)
load_info = pipeline.run(github_source())
print(load_info)

#Ensure you set the access-token as secret in the key section of google colab before running to avoid errors

SecretNotFoundError: Secret ACCESS_TOKEN does not exist.

### Step 2.1.: Running the pipeline by setting dlt.secrets with environment variables

In [12]:
import dlt
from dlt.sources.helpers import requests
import os

dlt.secrets["access_token"] = userdata.get('ACCESS_TOKEN')

BASE_URL = "https://api.github.com/repos/dlt-hub/dlt/issues"

def pagination(url, access_token):
    while True:
        response = requests.get(url, headers={"Authorization": f"Bearer {access_token}"})
        response.raise_for_status()
        yield response.json()

        # Get next page
        if "next" not in response.links:
            break
        url = response.links["next"]["url"]


@dlt.resource(
    table_name="issues",
    write_disposition="replace",
    primary_key="id",
)
def get_issues(
    access_token=dlt.secrets.value
):
    url = f"{BASE_URL}?per_page=100&sort=updated&directions=desc&state=open"
    yield pagination(url, access_token)


@dlt.source
def github_source(access_token=dlt.secrets.value):
    return get_issues(access_token)


pipeline = dlt.pipeline(
    pipeline_name="github_issues",
    destination="duckdb",
    dataset_name="github_data",
)


load_info = pipeline.run(github_source())
print(load_info)


#Ensure you set the access-token as secret in the key section of google colab before running to avoid errors

SecretNotFoundError: Secret ACCESS_TOKEN does not exist.

# Configuration sections

dlt uses a specific naming hierarchy to search for the secrets and configs values. This makes configurations and secrets easy to manage.

For example you set up your secrets.toml file as:

```toml
[sources.github_source]
access_token="access_token"
```

or via ENV:

```py
export SOURCES__GITHUB_SOURCE__access_token=<access_token>
```

or via Python:

```py
dlt.secrets["sources.github_source.access_token"] = userdata.get('ACCESS_TOKEN') # os.getenv("ACCESS_TOKEN")
```

# More about these different methods or *Providers*

- **Environment variables**: At the top of the hierarchy are environment variables. If a value for a specific argument is found in an environment variable, dlt will use it and will not proceed to search in lower-priority providers.

- **Vaults (Airflow/Google/AWS/Azure)**: These are specialized providers that come after environment variables. They can provide configuration values and secrets. However, they typically focus on handling sensitive information.

- **secrets.toml and config.toml files**: These files are used for storing both configuration values and secrets. secrets.toml is dedicated to sensitive information, while config.toml contains non-sensitive configuration data.

- **Default argument values**: These are the values specified in the function's signature. They have the lowest priority in the provider hierarchy.