In [0]:
import boto3
import io
import gzip
import os
from dotenv import load_dotenv

# Load variables from a .env file (via python-dotenv) before reading the environment.
load_dotenv()

# AWS settings are taken from the environment so no secret is written in the notebook.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
# The region falls back to 'eu-central-1' when AWS_REGION is not set.
aws_region = os.environ.get('AWS_REGION', 'eu-central-1')

def get_s3_client(aws_access_key_id, aws_secret_access_key, region_name='eu-central-1'):
    """Build a boto3 S3 client from explicit credentials.

    Args:
        aws_access_key_id: Access key id forwarded to ``boto3.client``.
        aws_secret_access_key: Secret access key forwarded to ``boto3.client``.
        region_name: AWS region for the client; defaults to ``'eu-central-1'``.

    Returns:
        The object returned by ``boto3.client('s3', ...)``.
    """
    connection_options = {
        'aws_access_key_id': aws_access_key_id,
        'aws_secret_access_key': aws_secret_access_key,
        'region_name': region_name,
    }
    return boto3.client('s3', **connection_options)

# Pass the region read from AWS_REGION; without it the client silently used the
# function's hard-coded default region and ignored the environment setting.
s3_client = get_s3_client(aws_access_key_id, aws_secret_access_key, region_name=aws_region)

In [0]:

def read_gz_lines_from_s3(s3_client, bucket, key):
    """Stream a .gz object from S3 and yield its lines as utf-8 strings.

    The response body is decompressed incrementally rather than being read
    into memory in one piece, so large objects do not need to fit in RAM.

    Args:
        s3_client: Object exposing ``get_object(Bucket=..., Key=...)`` that
            returns a mapping whose ``"Body"`` value is a readable binary
            stream with ``read`` and ``close`` methods.
        bucket: Name of the bucket holding the object.
        key: Key of the gzip-compressed object.

    Yields:
        Each line decoded as utf-8 with leading and trailing whitespace removed.
    """
    obj = s3_client.get_object(Bucket=bucket, Key=key)
    body = obj["Body"]
    try:
        # GzipFile only needs a file-like object with read(), so the response
        # body can be consumed directly without buffering it in a BytesIO first.
        with gzip.GzipFile(fileobj=body, mode="rb") as gz_stream:
            for raw_line in gz_stream:
                yield raw_line.decode("utf-8").strip()
    finally:
        # GzipFile does not close a caller-supplied fileobj; release the stream
        # here, including when the consumer stops iterating early.
        body.close()

In [0]:
# A single list_objects_v2 call returns at most 1,000 keys, so walk every page
# with the paginator to avoid silently truncating the listing.
paginator = s3_client.get_paginator('list_objects_v2')
bucket_keys = [
    obj['Key']
    for page in paginator.paginate(Bucket='dest-wikimedia')
    for obj in page.get('Contents', [])  # 'Contents' is absent when a page has no objects
]
print(bucket_keys)

In [0]:
# Printing every line of a full pageviews dump floods the notebook output, so
# only preview the first few lines. Raise PREVIEW_LINES to see more.
PREVIEW_LINES = 20

pageview_lines = read_gz_lines_from_s3(
    s3_client,
    'dest-wikimedia',
    'pageviews/2025/2025-01/pageviews-20250101-000000.gz',
)
for line_number, line in enumerate(pageview_lines, start=1):
    print(line)
    if line_number >= PREVIEW_LINES:
        break
# Close the generator explicitly so it is finalized now rather than at garbage collection.
pageview_lines.close()