In [0]:
import os
import re
from datetime import datetime
import pyspark.sql.functions as F
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from functools import reduce
from collections import defaultdict
# from dotenv import load_dotenv
# load_dotenv()  

## Parse Data

Format reference: https://wikitech.wikimedia.org/wiki/Data_Platform/Data_Lake/Traffic/Pageviews

Fields kept from each input line: domain_code | count_views | total_response_size

In [0]:
# Read every file under the 2025-01 projectviews prefix as plain text; each
# input line becomes one row in the single string column `value`.
raw_df = spark.read.format("text").load(
    "s3://dest-wikimedia/projectviews/2025/2025-01/*"
)

In [0]:
# Turn each raw text line into typed columns and tag it with the timestamp
# encoded in the name of the file it was read from.

# Source file names carry "projectviews-<8 digits>-<6 digits>"; the two digit
# groups are read back below as yyyyMMdd and HHmmss.
file_stamp_pattern = r"projectviews-(\d{8})-(\d{6})"
source_path = F.col("_metadata.file_path")

# At most four space-separated pieces per line; the last piece keeps any remainder.
line_fields = F.split(F.col("value"), " ", 4)

stamp_digits = F.concat_ws(
    "",
    F.regexp_extract(source_path, file_stamp_pattern, 1),
    F.regexp_extract(source_path, file_stamp_pattern, 2),
)
event_ts = F.to_timestamp(stamp_digits, "yyyyMMddHHmmss")

parsed_df = (
    raw_df
    # Skip lines that begin with two double-quote characters.
    .where(~F.col("value").startswith('""'))
    # Keep only lines that split into exactly four pieces.
    .where(F.size(line_fields) == 4)
    .select(
        line_fields[0].alias("domain_code"),
        # The piece at index 1 is not used.
        line_fields[2].cast("int").alias("count_views"),
        line_fields[3].cast("int").alias("total_response_size"),
        event_ts.alias("event_timestamp"),
        F.to_date(event_ts).alias("event_date"),
    )
    # Shuffle rows so that equal event_date values land in the same Spark
    # partition; the table write below partitions by the same column.
    .repartition("event_date")
)

In [0]:
# Store the parsed rows in the Delta table, partitioned on disk by event_date.
# NOTE(review): append mode adds these rows on every execution, so running this
# cell again for the same input files stores them a second time — confirm that
# this is intended before re-running the notebook.
projectviews_writer = (
    parsed_df.write
    .format("delta")
    .mode("append")
    .partitionBy("event_date")
    # Ask Delta to merge the DataFrame schema into the table schema on write.
    .option("mergeSchema", "true")
)
projectviews_writer.saveAsTable("workspace.default.wikimedia_projectviews")