In [0]:
import boto3
import io
import gzip
import os
import re
from datetime import datetime
import pyspark.sql.functions as F
from dotenv import load_dotenv
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from functools import reduce
from collections import defaultdict

# Load environment variables from a local .env file before anything reads os.getenv.
# NOTE(review): presumably this is where the AWS_* values read further down come from —
# confirm a .env file is actually present on the cluster.
load_dotenv()  

In [0]:
# The pageviews dumps are headerless and space-separated (see the field list in the
# "Contained data" section and the split(" ") parsing used elsewhere in this notebook).
# Reading them with header=true would consume the first record of every file as column
# names, so read every line as data and name/type the four fields explicitly.
df = (
    spark.read
    .option("header", "false")
    .option("sep", " ")
    # An empty quote string turns CSV quoting off, so the `""` domain codes and any
    # quote characters inside page titles are kept as literal text.
    .option("quote", "")
    .schema("domain_code STRING, page_title STRING, count_views INT, total_response_size INT")
    .csv("s3://dest-wikimedia/pageviews/2025/2025-01/*.gz")
)

# Stage the month as parquet under /tmp, replacing any previous run.
df.write.mode("overwrite").parquet("/tmp/wikimedia/pageviews/2025/2025-01/")


In [0]:
# mounts is a method: it has to be called, otherwise display() receives the bound
# method object instead of the list of mount points.
display(dbutils.fs.mounts())

In [0]:
# S3 bucket that holds the pageviews .gz dumps.
bucket = "dest-wikimedia"
# Fully-qualified name of the Delta table the ingest cells write to.
table_name = 'workspace.default.wikimedia_pageviews'

In [0]:
# AWS settings come from the process environment; the two keys are None when unset,
# while the region falls back to eu-central-1.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
aws_region = os.environ.get('AWS_REGION', 'eu-central-1')

def get_s3_client(aws_access_key_id, aws_secret_access_key, region_name='eu-central-1'):
    """Build a boto3 S3 client from explicit credentials.

    Args:
        aws_access_key_id: Access key id handed to boto3.
        aws_secret_access_key: Secret access key handed to boto3.
        region_name: AWS region for the client (defaults to 'eu-central-1').

    Returns:
        The object returned by ``boto3.client('s3', ...)``.
    """
    client_options = {
        'aws_access_key_id': aws_access_key_id,
        'aws_secret_access_key': aws_secret_access_key,
        'region_name': region_name,
    }
    return boto3.client('s3', **client_options)

In [0]:
# Pass the region read from the environment; it was loaded above but never used.
s3_client = get_s3_client(aws_access_key_id, aws_secret_access_key, aws_region)

# A single list_objects_v2 call returns at most 1000 keys, so walk every page and
# gather the entries under "Contents" — the only part of the listing that the later
# cells read. The key is always present here, even for an empty bucket.
response = {"Contents": []}
for page in s3_client.get_paginator("list_objects_v2").paginate(Bucket=bucket):
    response["Contents"].extend(page.get("Contents", []))

In [0]:
import datetime
import calendar

In [0]:
def generate_date_range(start_date, end_date):
    """Yield every day from start_date to end_date, both inclusive, as 'YYYY-MM-DD'.

    Nothing is yielded when start_date is after end_date.
    """
    span_days = (end_date - start_date).days
    for offset in range(span_days + 1):
        yield (start_date + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')

# Input parameters
s3_base_path = "s3://dest-wikimedia/pageviews"
start_date = datetime.date(2025, 1, 1)
end_date = datetime.date(2025, 1, 31)  # Change this for a different range

# Ingest one day at a time: read that day's .gz files, parse them and append the
# result to the Delta table. A failure on one day is reported and the loop moves on.
for day_str in generate_date_range(start_date, end_date):
    print(f"Procesando: {day_str}")
    year, month, day = day_str.split("-")

    try:
        path = f"{s3_base_path}/{year}/{year}-{month}/pageviews-{year}{month}{day}-*.gz"
        raw_df = spark.read.text(path)

        if raw_df.isEmpty():
            print(f"No hay archivos para {day_str}")
            continue

        # Processing: each text line is split on single spaces into its fields.
        fields = F.split(raw_df["value"], " ")
        # The file name carries the snapshot time as pageviews-YYYYMMDD-HHMMSS.gz.
        file_stamp = F.regexp_extract(
            F.col("_metadata.file_path"), r'pageviews-(\d{8}-\d{6})\.gz', 1
        )
        event_ts = F.to_timestamp(file_stamp, "yyyyMMdd-HHmmss")

        parsed_df = raw_df.select(
            fields.getItem(0).alias("domain_code"),
            fields.getItem(1).alias("page_title"),
            fields.getItem(2).cast("int").alias("count_views"),
            fields.getItem(3).cast("int").alias("total_response_size"),
            event_ts.alias("event_timestamp"),
            F.to_date(event_ts).alias("event_date"),
        )

        (
            parsed_df.write
            .format("delta")
            .mode("append")
            .partitionBy("event_date")
            .saveAsTable("workspace.default.wikimedia_pageviews")
        )

        print(f"✔ Día {day_str} procesado")

    except Exception as exc:
        print(f"❌ Error procesando {day_str}: {exc}")

## Contained data

https://wikitech.wikimedia.org/wiki/Data_Platform/Data_Lake/Traffic/Pageviews

Each line of a pageviews file holds four space-separated fields, in this order:

`domain_code` | `page_title` | `count_views` | `total_response_size`

In [0]:
def read_gz_lines_from_s3(s3_client, bucket, key):
    """Yield the lines of a gzip-compressed S3 object as stripped UTF-8 strings.

    The whole object body is read into memory first and then decompressed
    line by line.

    Args:
        s3_client: Object exposing ``get_object(Bucket=..., Key=...)``.
        bucket: Name of the bucket holding the object.
        key: Key of the .gz object.

    Yields:
        Each line decoded as UTF-8, with surrounding whitespace removed.
    """
    payload = s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()
    with gzip.GzipFile(fileobj=io.BytesIO(payload), mode="rb") as gz_stream:
        yield from (raw_line.decode("utf-8").strip() for raw_line in gz_stream)

def extract_timestamp_from_key(key):
    """Parse the snapshot timestamp embedded in a pageviews object key.

    Args:
        key: Object key or path containing a file name such as
            'pageviews-20250101-000000.gz'.

    Returns:
        A ``datetime.datetime`` built from the date and time digits, or None
        when the key does not contain the expected file-name pattern.

    Raises:
        ValueError: If the digits match the pattern but are not a valid
            date/time (propagated from ``strptime``).
    """
    # This notebook binds the name `datetime` twice: first to the class
    # (`from datetime import datetime`) and later to the module (`import datetime`).
    # Once the second import has run, `datetime.strptime` no longer exists, so take
    # the class under a private alias that does not depend on cell order.
    from datetime import datetime as _datetime

    # example path: 'pageviews-20250101-000000.gz'
    match = re.search(r'pageviews-(\d{8})-(\d{6})\.gz', key)
    if match is None:
        return None

    # '20250101' + '000000' -> '20250101000000'
    timestamp_str = match.group(1) + match.group(2)
    return _datetime.strptime(timestamp_str, '%Y%m%d%H%M%S')

def parse_line(line):
    """Parse one pageviews line into a (domain_code, page_title, views, size) tuple.

    Args:
        line: A text line expected to hold four space-separated fields.

    Returns:
        ``(domain_code, page_title, count_views, total_response_size)`` with the two
        counters converted to int and double quotes removed from the domain code,
        or None when the line starts with '"" ', does not split into four fields,
        or has a non-integer counter.
    """
    if line.startswith('"" '):
        return None

    parts = line.split(" ", 3)
    if len(parts) != 4:
        return None

    project, page_title, view_count, response_size = parts
    try:
        # A non-numeric counter (including a last field that still contains spaces
        # because of the split limit) is a malformed line like any other: skip it
        # instead of raising and aborting the caller's whole file.
        return (
            project.replace('"', ''),
            page_title,
            int(view_count),
            int(response_size),
        )
    except ValueError:
        return None
    
# Spark schema for the tuples produced by parse_line: two nullable string columns
# followed by two nullable integer columns.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("domain_code", StringType()),
            ("page_title", StringType()),
            ("count_views", IntegerType()),
            ("total_response_size", IntegerType()),
        )
    ]
)

def process_file(s3_client, bucket, key):
    """Load one .gz pageviews object into a Spark DataFrame.

    Lines starting with '""' and lines that parse_line rejects are dropped. The
    surviving rows are collected on the driver, turned into a DataFrame with the
    module-level ``schema``, and tagged with the timestamp parsed from ``key``
    (None when the key has no timestamp).

    Args:
        s3_client: Client passed through to read_gz_lines_from_s3.
        bucket: Bucket holding the object.
        key: Key of the .gz object.

    Returns:
        A DataFrame with the schema columns plus ``event_timestamp``.
    """
    timestamp = extract_timestamp_from_key(key)

    kept_lines = (
        text
        for text in read_gz_lines_from_s3(s3_client, bucket, key)
        if not text.startswith('""')
    )
    rows = [record for record in map(parse_line, kept_lines) if record]

    return spark.createDataFrame(rows, schema=schema).withColumn(
        "event_timestamp", F.lit(timestamp)
    )

In [0]:
# gz_keys was never defined anywhere in this notebook, so this cell failed with a
# NameError on a fresh kernel. Build it from the bucket listing held in `response`.
gz_keys = [
    obj["Key"]
    for obj in response.get("Contents", [])
    if obj["Key"].endswith(".gz")
]

# Group the keys by the calendar day ('YYYY-MM-DD') encoded in their file name;
# keys without a recognisable timestamp are left out.
keys_by_date = defaultdict(list)

for key in gz_keys:
    ts = extract_timestamp_from_key(key)
    if ts:
        day_str = ts.strftime("%Y-%m-%d")
        keys_by_date[day_str].append(key)


In [0]:
# s3a:// URI for every .gz object in the listing. Using .get() avoids a KeyError when
# the listing carries no "Contents" entry, matching how matching_keys reads it.
file_paths = [
    f"s3a://{bucket}/{obj['Key']}"
    for obj in response.get("Contents", [])
    if obj["Key"].endswith(".gz")
]

In [0]:
# Read every January 2025 .gz file as plain text: one row per line, in column "value".
raw_df = spark.read.format("text").load("s3://dest-wikimedia/pageviews/2025/2025-01/*.gz")

In [0]:
# Turn the raw text lines into typed columns:
#   1. drop lines starting with '""' and lines that do not split into four fields,
#   2. cast the two counters to int,
#   3. rebuild the snapshot timestamp/date from the pageviews-YYYYMMDD-HHMMSS.gz
#      file name that each row came from.
parsed_df = (
    raw_df
    .where(~F.col("value").startswith('""'))
    .withColumn("split", F.split("value", " ", 4))
    .where(F.size(F.col("split")) == 4)
    .select(
        F.col("split").getItem(0).alias("domain_code"),
        F.col("split").getItem(1).alias("page_title"),
        F.col("split").getItem(2).cast("int").alias("count_views"),
        F.col("split").getItem(3).cast("int").alias("total_response_size"),
        F.col("_metadata.file_path").alias("file_path"),
    )
    # Group 1 of the pattern is the date digits, group 2 the time digits.
    .withColumn("date_str", F.regexp_extract("file_path", r"pageviews-(\d{8})-(\d{6})\.gz", 1))
    .withColumn("time_str", F.regexp_extract("file_path", r"pageviews-(\d{8})-(\d{6})\.gz", 2))
    .withColumn(
        "event_timestamp",
        F.to_timestamp(F.concat_ws("", "date_str", "time_str"), "yyyyMMddHHmmss"),
    )
    .withColumn("event_date", F.to_date("event_timestamp"))
    .drop('date_str', 'time_str')
)

In [0]:
# Peek at a single parsed row as a sanity check.
parsed_df.show(n=1)

In [0]:
# Shuffle so that rows sharing an event_date sit in the same Spark partition
# before the partitioned write.
parsed_df = parsed_df.repartition(F.col("event_date"))

In [0]:
# Replace the table contents with the parsed month, partitioned by event_date;
# mergeSchema is enabled on the write.
(
    parsed_df.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .partitionBy("event_date")
    .saveAsTable("workspace.default.wikimedia_pageviews")
)

In [0]:
# Keys of every .gz object returned by the bucket listing.
matching_keys = [
    obj["Key"]
    for obj in response.get("Contents", [])
    if obj["Key"].endswith(".gz")
]

# Ingest file by file: parse each object on the driver and append it to the Delta
# table. A failure on one file is printed and the loop continues with the next key.
for key in matching_keys:
    try:
        print(f"Processing {key}...")

        timestamp = extract_timestamp_from_key(key)
        if not timestamp:
            print(f"Could not extract timestamp from {key}, skipping.")
            continue

        # Keep only the lines parse_line accepts.
        parsed_lines = map(parse_line, read_gz_lines_from_s3(s3_client, bucket, key))
        rows = [record for record in parsed_lines if record]
        if not rows:
            print(f"No valid data found in {key}, skipping.")
            continue

        df = (
            spark.createDataFrame(rows, schema=schema)
            .withColumn("event_timestamp", F.lit(timestamp))
            .withColumn("event_date", F.to_date("event_timestamp"))
        )

        # Save to Delta, partitioned by event_date; coalesce(1) collapses the
        # frame to a single partition before writing.
        (
            df.coalesce(1).write
            .format("delta")
            .mode("append")
            .partitionBy("event_date")
            .saveAsTable(table_name)
        )

        print(f"✅ Successfully written: {key}")

    except Exception as exc:
        print(f"Error processing {key}: {exc}")

In [0]:
# Ingest day by day: load every file of the day with process_file, union the
# non-empty frames and append the result to the Delta table. A failure on one day
# is printed and the loop moves on to the next day.
for day, keys in keys_by_date.items():
    try:
        print(f"Processing day {day} with {len(keys)} files...")

        dfs = []
        for key in keys:
            print(key)
            df = process_file(s3_client, bucket, key)
            if df and df.count() > 0:
                dfs.append(df)

        if not dfs:
            print(f"Day {day} not processed")
        else:
            # Left-to-right union of the day's frames, matching columns by name.
            full_day_df = dfs[0]
            for next_df in dfs[1:]:
                full_day_df = full_day_df.unionByName(next_df)

            full_day_df = full_day_df.withColumn("event_date", F.to_date("event_timestamp"))
            (
                full_day_df.write
                .format("delta")
                .mode("append")
                .partitionBy("event_date")
                .saveAsTable("workspace.default.wikimedia_pageviews")
            )
            print(f"Day {day} successfully processed")

    except Exception as exc:
        print(f"Error on day {day}: {exc}")