In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window
import pandas as pd
import matplotlib.pyplot as plt

# 4. Exploratory Data Analysis

## Load the data

In [0]:
# Load the `gold_daily_projectviews` table; every later cell in this section starts from `df`.
df = spark.read.table("projectviews.default.gold_daily_projectviews")

In [0]:
# Row count of the loaded table, as a quick sanity check that the read returned data.
df.count()

## 4.1 Engagement KPIs

**DAU / WAU / MAU (Views)** 

In [0]:
def plot_engagement_kpis(df):
    """
    Sum views per day, week and month, then chart the daily and weekly series.

    ``event_date`` is parsed with ``to_date``; weeks and months are bucketed
    with ``date_trunc``. Only the DAU and WAU series are drawn here: the MAU
    frame is computed and returned, but no chart is produced for it.

    Parameters:
    - df: Spark DataFrame with an ``event_date`` column (date or string) and a
      numeric ``count_views`` column.

    Returns:
    - (dau_pd, wau_pd, mau_pd): pandas DataFrames with columns
      (event_date, DAU_views), (week, WAU_views) and (month, MAU_views).
      Each is ordered by its time column, which is converted with
      ``pd.to_datetime``.
    """

    def _views_by(frame, key, metric):
        # Total views per distinct value of `key`, in chronological order.
        return (frame.groupBy(key)
                     .agg(F.sum("count_views").alias(metric))
                     .orderBy(key))

    def _line_chart(pdf, x_col, y_col, series_label, title):
        # One standalone figure per series, with the shared axis labels.
        plt.figure(figsize=(10,5))
        plt.plot(pdf[x_col], pdf[y_col], marker='o', label=series_label)
        plt.title(title)
        plt.xlabel("Date")
        plt.ylabel("Views")
        plt.legend()
        plt.show()

    dated = df.withColumn("event_date", F.to_date("event_date"))

    # Spark-side aggregations at the three granularities.
    daily = _views_by(dated, "event_date", "DAU_views")
    weekly = _views_by(
        dated.withColumn("week", F.date_trunc("week", "event_date")),
        "week", "WAU_views")
    monthly = _views_by(
        dated.withColumn("month", F.date_trunc("month", "event_date")),
        "month", "MAU_views")

    # Collect the (small) aggregates to pandas, then normalise the time columns.
    dau_pd, wau_pd, mau_pd = (sdf.toPandas() for sdf in (daily, weekly, monthly))
    for pdf, time_col in ((dau_pd, "event_date"), (wau_pd, "week"), (mau_pd, "month")):
        pdf[time_col] = pd.to_datetime(pdf[time_col])

    _line_chart(dau_pd, "event_date", "DAU_views", "DAU", "Daily Active Views Trend")
    _line_chart(wau_pd, "week", "WAU_views", "WAU", "Weekly Active Views Trend")

    return dau_pd, wau_pd, mau_pd

In [0]:
# Draw the DAU/WAU charts and keep the three pandas frames for the data-quirks analysis in section 4.2.
dau_pd, wau_pd, mau_pd = plot_engagement_kpis(df)

**Session length (proxy: consecutive active-day streaks per domain)**

In [0]:
# Streaks are evaluated per domain, in chronological order.
w = Window.partitionBy("domain_code").orderBy("event_date")

# Running frame inside one streak group (explicit ROWS frame: one row = one day).
streak_w = (
    Window.partitionBy("domain_code", "streak_group")
    .orderBy("event_date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# NOTE: `df` is deliberately re-assigned; later cells rely on `event_date`
# being a real date column (e.g. for F.sequence in section 4.3).
df = (
    df
    .withColumn("event_date", F.to_date("event_date"))
    # 1 when the domain had at least one view that day, else 0.
    .withColumn("is_active", (F.col("count_views") > 0).cast("int"))
    # Calendar distance to the domain's previous row (null on its first row).
    # Kept as its own column because Spark does not allow a window function
    # nested inside another windowed aggregate.
    .withColumn("_gap_days", F.datediff(F.col("event_date"), F.lag("event_date").over(w)))
    # A streak is broken by an inactive day OR by one or more missing calendar
    # days; previously two rows were treated as consecutive days even when
    # dates in between were absent from the table.
    .withColumn(
        "_streak_break",
        F.when((F.col("is_active") == 0) | (F.col("_gap_days") > 1), 1).otherwise(0),
    )
    # Running count of breaks = id of the streak the row belongs to.
    .withColumn("streak_group", F.sum("_streak_break").over(w))
    # Length so far = number of ACTIVE rows in the group up to this row.
    # row_number() was off by one here: each group after the first begins with
    # the inactive row that opened it, so the first active day was numbered 2.
    .withColumn(
        "streak_len",
        F.when(F.col("is_active") == 1, F.sum("is_active").over(streak_w)).otherwise(0),
    )
    .drop("_gap_days", "_streak_break")
)


# Aggregate: average & median streak length per date
# (inactive rows contribute a streak length of 0 to both statistics).
agg_df = df.groupBy("event_date").agg(
    F.avg("streak_len").alias("avg_streak"),
    F.expr("percentile_approx(streak_len, 0.5)").alias("median_streak")
).orderBy("event_date")

agg_pd = agg_df.toPandas()

# Plot
plt.figure(figsize=(10,5))
plt.plot(agg_pd["event_date"], agg_pd["avg_streak"], label="Average Streak Length")
plt.plot(agg_pd["event_date"], agg_pd["median_streak"], label="Median Streak Length", linestyle="--")
plt.xlabel("Date")
plt.ylabel("Streak Length (days)")
plt.title("Average & Median Streak Length Over Time")
plt.legend()
plt.grid(True)
plt.show()

In [0]:
# Display the per-date average and median streak lengths that were plotted above.
agg_pd

**Content Diversity (Domain Diversity)**

In [0]:
# Total views per day
daily_total = df.groupBy("event_date").agg(
    F.sum("count_views").alias("total_views")
)

df_div = df.join(daily_total, on="event_date", how="left")
df_div = df_div.withColumn(
    "p_d",
    F.col("count_views") / F.col("total_views")
)

# HHI per day
hhi_df = df_div.groupBy("event_date").agg(
    F.sum(F.pow(F.col("p_d"), 2)).alias("HHI"))

# Diversity = 1 - HHI
hhi_df = hhi_df.withColumn(
    "diversity",
    F.lit(1) - F.col("HHI")
    )

hhi_df.orderBy("event_date").show()

## 4.2 Data quirks and mitigations

In [0]:

def plot_engagement_kpis_with_quirks(dau_pd, wau_pd, mau_pd):
    """
    Report and visualise sparsity, spikes and weekday seasonality in the KPIs.

    Detected from ``dau_pd``:
    - zero days: rows whose ``DAU_views`` equals 0;
    - missing days: calendar days between the first and last ``event_date``
      that have no row at all;
    - spikes: rows whose ``DAU_views`` exceeds mean + 3 * standard deviation;
    - seasonality: mean ``DAU_views`` per ``dt.dayofweek`` value (0 = Monday).

    Parameters:
    - dau_pd: pandas DataFrame with datetime ``event_date`` and ``DAU_views``.
    - wau_pd: pandas DataFrame with ``week`` and ``WAU_views``.
    - mau_pd: pandas DataFrame with ``month`` and ``MAU_views``.

    Returns nothing; the findings are printed and one figure is shown with
    DAU, WAU and MAU on a shared axis, zero days in red and spikes in orange.
    """
    day_col = dau_pd["event_date"]
    view_col = dau_pd["DAU_views"]

    # --- 1. Sparsity -------------------------------------------------------
    # Every calendar day spanned by the data, to compare against observed days.
    calendar = pd.date_range(start=day_col.min(), end=day_col.max())
    zero_days_df = dau_pd[view_col == 0]
    absent_days = calendar.difference(day_col)

    quirks = {
        "sparsity_zero_views": zero_days_df["event_date"].tolist(),
        "sparsity_missing_days": absent_days.tolist(),
    }

    print("Days with zero views:", quirks["sparsity_zero_views"])
    print("Days missing from dataset:", quirks["sparsity_missing_days"])

    # --- 2. Spikes ---------------------------------------------------------
    # Three-sigma rule on the daily totals.
    spike_cutoff = view_col.mean() + 3 * view_col.std()
    spikes_df = dau_pd[view_col > spike_cutoff]
    quirks["spikes"] = spikes_df["event_date"].tolist()

    # --- 3. Seasonality hint -----------------------------------------------
    quirks["seasonality"] = (
        dau_pd.groupby(day_col.dt.dayofweek)["DAU_views"].mean().to_dict()
    )

    # --- Plot ----------------------------------------------------------------
    _, ax = plt.subplots(figsize=(14,6))
    for pdf, x_name, y_name, mark, tag in (
        (dau_pd, "event_date", "DAU_views", 'o', "DAU"),
        (wau_pd, "week", "WAU_views", 's', "WAU"),
        (mau_pd, "month", "MAU_views", '^', "MAU"),
    ):
        ax.plot(pdf[x_name], pdf[y_name], marker=mark, label=tag)

    # Overlay the flagged days on top of the lines (zorder keeps them visible).
    ax.scatter(zero_days_df["event_date"], zero_days_df["DAU_views"],
               color='red', label="Zero days", zorder=5)
    ax.scatter(spikes_df["event_date"], spikes_df["DAU_views"],
               color='orange', label="Spikes", zorder=5)

    ax.set_title("Engagement KPIs with Data Quirks Highlighted")
    ax.set_xlabel("Date")
    ax.set_ylabel("Views")
    ax.legend()
    ax.grid(True)
    plt.show()

    # --- Summary -------------------------------------------------------------
    print("\n=== Data Quirks Detected ===")
    print(f"1. Sparsity (zero days): {quirks['sparsity_zero_views']}")
    print(f"   Missing days: {quirks['sparsity_missing_days']}")
    print(f"2. Spikes: {quirks['spikes']}")
    print("3. Seasonality pattern (avg views by weekday):")
    for day, avg in quirks["seasonality"].items():
        print(f"   Day {day} (0=Mon): {avg:.2f}")


In [0]:
# Run the quirk detection on the frames returned by plot_engagement_kpis earlier in the notebook.
plot_engagement_kpis_with_quirks(dau_pd, wau_pd, mau_pd)

**Average Views by Weekday**

In [0]:
def plot_avg_views_by_weekday(df, date_col="event_date", views_col="count_views"):
    """
    Bar-chart the mean of ``views_col`` for each day of the week.

    The mean is taken over the rows of ``df`` that fall on each weekday
    (it is a per-row average, not an average of daily totals).

    Parameters:
    - df: Spark DataFrame containing a date column and a views count column.
    - date_col: Name of the date column in df (default "event_date").
    - views_col: Name of the views count column in df (default "count_views").

    Returns nothing; the chart is shown with matplotlib.
    """
    day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

    # Spark's dayofweek is 1=Sunday ... 7=Saturday; shift it so Monday=0 ... Sunday=6.
    weekday_idx = (F.dayofweek(F.col(date_col)) + 5) % 7

    # Build the index -> name mapping as a single chained when-expression.
    weekday_label = None
    for idx, label in enumerate(day_names):
        matches = F.col("weekday") == idx
        weekday_label = (F.when(matches, label) if weekday_label is None
                         else weekday_label.when(matches, label))

    avg_by_day = (
        df.withColumn("weekday", weekday_idx)
          .withColumn("weekday_name", weekday_label)
          .groupBy("weekday_name")
          .agg(F.avg(views_col).alias("avg_views"))
    )

    # Seven rows at most, so collecting to pandas is cheap.
    pdf = avg_by_day.toPandas()

    # An ordered categorical makes the sort follow the calendar, not the alphabet.
    pdf['weekday_name'] = pd.Categorical(pdf['weekday_name'], categories=day_names, ordered=True)
    pdf = pdf.sort_values('weekday_name')

    _, ax = plt.subplots(figsize=(10,6))
    ax.bar(pdf['weekday_name'], pdf['avg_views'], color='skyblue')
    ax.set_title('Average Views by Weekday')
    ax.set_xlabel('Weekday')
    ax.set_ylabel('Average Views')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# Plot with the default column names (event_date / count_views).
plot_avg_views_by_weekday(df)


**Trend for one domain**

In [0]:
# Domain whose daily trend is plotted; change it here to inspect another domain.
TREND_DOMAIN = 'ab.m.d'

# Filter in Spark BEFORE collecting: calling toPandas() on the whole table pulls
# every domain's rows onto the driver only to discard almost all of them.
df_dom = df.filter(F.col("domain_code") == TREND_DOMAIN).toPandas()

# Ensure event_date is in datetime format (unparseable values become NaT)
df_dom["event_date"] = pd.to_datetime(df_dom["event_date"], errors="coerce")

# Chronological order with a clean 0..n-1 index
df_dom = df_dom.sort_values(by='event_date').reset_index(drop=True)

# Plot with label for legend
plt.figure(figsize=(10, 5))
plt.plot(df_dom["event_date"], df_dom["count_views"], marker='o', label=TREND_DOMAIN)
plt.title(f"Daily Active Views Trend for {TREND_DOMAIN}")
plt.xlabel("Date")
plt.ylabel("Views")
plt.legend()
plt.show()


## 4.3 Missing dates for each domain

In [0]:
# Per-domain chronological window, used to look back at the previous observed date.
w = Window.partitionBy("domain_code").orderBy("event_date")

# Rows that follow a hole: more than one calendar day since the domain's previous row.
gaps_df = (
    df
    .withColumn("prev_date", F.lag(F.col("event_date")).over(w))
    .withColumn("gap_days", F.datediff(F.col("event_date"), F.col("prev_date")))
    .where(F.col("prev_date").isNotNull() & (F.col("gap_days") > 1))
)
print(f'Found {gaps_df.count()} gaps')


# First and last observed date of each domain.
per_dom = df.groupBy("domain_code").agg(
    F.min("event_date").alias("start_date"),
    F.max("event_date").alias("end_date"),
)

# One row per domain and calendar day between that domain's first and last date.
expanded = per_dom.select(
    "domain_code",
    F.explode(
        F.sequence("start_date", "end_date", F.expr("interval 1 day"))
    ).alias("event_date"),
)

# Left-join the observations onto the full calendar: days without a view count
# are flagged as missing and their count is filled with 0.
patched = (
    expanded
    .join(df, on=["domain_code", "event_date"], how="left")
    .withColumn("is_missing", F.col("count_views").isNull())
    .withColumn("count_views", F.coalesce(F.col("count_views"), F.lit(0)).cast("long"))
)

# Per domain: how many days are missing, and which ones (sorted yyyy-MM-dd strings).
missing_summary = (
    patched
    .where(F.col("is_missing"))
    .groupBy("domain_code")
    .agg(
        F.count("event_date").alias("missing_count"),
        F.sort_array(
            F.collect_set(F.date_format("event_date", "yyyy-MM-dd"))
        ).alias("missing_dates"),
    )
)

In [0]:
# Print the per-domain summary of missing dates built in the previous cell.
missing_summary.show()