In [0]:
import boto3
import io
import gzip
import os
import re
from datetime import datetime
import pyspark.sql.functions as F
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Load variables from a local .env file (if one exists) into the environment
# before reading the AWS settings below.
load_dotenv()

# Credentials and region come from the environment; nothing is hard-coded.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
aws_region = os.getenv('AWS_REGION', 'eu-central-1')

# Later cells call `spark.createDataFrame`, but no cell ever created `spark`.
# getOrCreate() reuses the session a managed environment already provides and
# creates one otherwise, so the notebook also runs on a fresh kernel.
spark = SparkSession.builder.getOrCreate()

def get_s3_client(aws_access_key_id, aws_secret_access_key, region_name='eu-central-1'):
    """Build a boto3 S3 client.

    Args:
        aws_access_key_id: Access key forwarded to boto3.
        aws_secret_access_key: Secret key forwarded to boto3.
        region_name: AWS region forwarded to boto3 (defaults to 'eu-central-1').

    Returns:
        The object returned by ``boto3.client('s3', ...)``.
    """
    client_options = {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
        "region_name": region_name,
    }
    return boto3.client('s3', **client_options)

# Pass the region resolved from AWS_REGION; previously it was read from the
# environment but never forwarded, so the function default was always used.
s3_client = get_s3_client(aws_access_key_id, aws_secret_access_key, region_name=aws_region)

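## Contained data

Format reference: https://wikitech.wikimedia.org/wiki/Data_Platform/Data_Lake/Traffic/Pageviews

Each line of a pageviews file holds four space-separated fields:

| domain_code | page_title | count_views | total_response_size |
|-------------|------------|-------------|---------------------|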

In [0]:

def read_gz_lines_from_s3(s3_client, bucket, key):
    """Yield the lines of a gzipped S3 object as stripped UTF-8 strings.

    The object body is read fully into memory, decompressed, and each line is
    decoded as UTF-8 with surrounding whitespace removed.
    """
    response = s3_client.get_object(Bucket=bucket, Key=key)
    compressed = io.BytesIO(response["Body"].read())
    with gzip.GzipFile(fileobj=compressed, mode="rb") as gz_file:
        yield from (raw.decode("utf-8").strip() for raw in gz_file)

In [0]:
# A single list_objects_v2 call returns at most 1000 keys, so walk every page
# to see the whole bucket.
paginator = s3_client.get_paginator('list_objects_v2')
keys = [
    obj['Key']
    for page in paginator.paginate(Bucket='dest-wikimedia')
    for obj in page.get('Contents', [])
]
print(keys)

In [0]:
# NOTE: parse_line is defined in a later cell of this notebook, so that cell
# has to be executed before this one.
# Keep only the lines that parse_line turns into a row tuple.
parsed_rows = [
    row
    for row in map(
        parse_line,
        read_gz_lines_from_s3(s3_client, 'dest-wikimedia', 'pageviews/2025/2025-01/pageviews-20250101-000000.gz'),
    )
    if row
]

In [0]:
# Column layout for parsed pageviews rows; every column is nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("domain_code", StringType()),
        ("page_title", StringType()),
        ("count_views", IntegerType()),
        ("total_response_size", IntegerType()),
    )
])

# Turn the rows collected in `parsed_rows` into a Spark DataFrame.
df = spark.createDataFrame(parsed_rows, schema=schema)


In [0]:
# Preview the DataFrame (show()'s defaults spelled out: 20 rows, long values truncated).
df.show(n=20, truncate=True)

In [0]:
def read_gz_lines_from_s3(s3_client, bucket, key):
    """Download a .gz object from S3 and yield its lines as stripped UTF-8 text.

    Note: this redefines the same-named function from an earlier cell of this
    notebook with the same behavior.
    """
    payload = s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()
    with gzip.GzipFile(fileobj=io.BytesIO(payload), mode="rb") as stream:
        for raw_line in stream:
            text = raw_line.decode("utf-8")
            yield text.strip()

def extract_timestamp_from_key(key):
    """Return the datetime encoded in a pageviews object key.

    Searches ``key`` for ``pageviews-YYYYMMDD-HHMMSS.gz`` and parses the date
    and time digits. Returns None when the pattern is not found.
    """
    # example key: 'pageviews-20250101-000000.gz'
    found = re.search(r'pageviews-(\d{8})-(\d{6})\.gz', key)
    if found is None:
        return None
    day_digits, clock_digits = found.groups()  # e.g. '20250101', '000000'
    return datetime.strptime(day_digits + clock_digits, '%Y%m%d%H%M%S')
# Earlier draft of parse_line, kept for reference (superseded by the definition below):
# def parse_line(line):
#     # Example: the line is "wikifunctions:main_page 1 0"
#     # You can change the parsing to suit your needs
#     parts = line.split()
#     if len(parts) >= 3:
#         return (parts[0], int(parts[1]), int(parts[2]))
#     else:
#         return None

def parse_line(line):
    """Parse one pageviews line into a row tuple.

    Args:
        line: A text line expected to hold four space-separated fields:
            domain code, page title, view count and response size.

    Returns:
        ``(domain_code, page_title, count_views, total_response_size)`` with
        the two counts converted to int and double quotes removed from the
        domain code, or None when the line starts with ``"" ``, does not have
        exactly four fields, or has a count that is not an integer.
    """
    if line.startswith('"" '):
        return None

    parts = line.split(" ", 3)
    if len(parts) != 4:
        return None

    project, page_title, view_count, response_size = parts
    try:
        views = int(view_count)
        size = int(response_size)
    except ValueError:
        # Treat non-numeric counts (or trailing extra fields) like any other
        # malformed line instead of aborting the whole file.
        return None

    return (project.replace('"', ''), page_title, views, size)
    
# Schema for the row tuples produced by parse_line; all columns are nullable.
# (Same column layout as the schema declared in an earlier cell.)
schema = StructType([
    StructField(name="domain_code", dataType=StringType(), nullable=True),
    StructField(name="page_title", dataType=StringType(), nullable=True),
    StructField(name="count_views", dataType=IntegerType(), nullable=True),
    StructField(name="total_response_size", dataType=IntegerType(), nullable=True),
])


def process_file(s3_client, bucket, key):
    """Load one gzipped pageviews object from S3 into a Spark DataFrame.

    Args:
        s3_client: Client object used to download the file.
        bucket: Name of the S3 bucket.
        key: Object key; its file name is also parsed for the timestamp.

    Returns:
        A DataFrame with the columns of the module-level ``schema`` plus an
        ``event_timestamp`` column holding the timestamp extracted from
        ``key`` (null when the key carries no recognizable timestamp).
    """
    timestamp = extract_timestamp_from_key(key)
    rows = []
    for line in read_gz_lines_from_s3(s3_client, bucket, key):
        if line.startswith('""'):  # skip rows that start with ""
            continue
        parsed = parse_line(line)
        if parsed:
            rows.append(parsed)
    # Build the per-file DataFrame from the parsed rows.
    df = spark.createDataFrame(rows, schema=schema)
    # Attach the file timestamp. The explicit cast keeps the column typed as a
    # timestamp even when `timestamp` is None; F.lit(None) alone would produce
    # an untyped null column.
    df = df.withColumn("event_timestamp", F.lit(timestamp).cast("timestamp"))
    return df

In [0]:
# Run the pipeline on a single object from the bucket.
bucket, key = 'dest-wikimedia', 'pageviews/2025/2025-01/pageviews-20250101-000000.gz'
df_file = process_file(s3_client, bucket=bucket, key=key)

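# Disabled sketch: process several files and union the per-file DataFrames.
# `keys` must hold the list of S3 object keys to process before enabling this.
# df_all = None
# for key in keys:
#     df_file = process_file(s3_client, bucket, key)
#     if df_all is None:
#         df_all = df_file
#     else:
#         df_all = df_all.union(df_file)

# df_all.show()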

In [0]:
# Preview the per-file DataFrame (show()'s defaults spelled out: 20 rows, long values truncated).
df_file.show(n=20, truncate=True)