# Traffic Source Quality (Source / Medium)

This notebook compares acquisition channels by traffic quality, not just volume.

- **Metrics:** sessions, engagement rate, average pages per session
- **Data source:** GA4 BigQuery export (`events_*`)


In [None]:
#@title Setup (run once)
# Environment bootstrap: make the project packages importable, then build the
# client object that the query cell passes to run_query().
import sys
import os

# "google.colab" is only present in sys.modules when running inside Colab.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()
    # Clone the project repo once; a re-run in the same runtime reuses the checkout.
    if not os.path.exists("lla-data"):
        !git clone -q https://github.com/aidoanto/lla-data.git
    repo = os.path.abspath("lla-data")
    # Prepend the checkout to sys.path (guarded so re-running does not add duplicates).
    if repo not in sys.path:
        sys.path.insert(0, repo)
    !pip install -q db-dtypes google-cloud-bigquery kaleido plotly
else:
    # Outside Colab: prepend the parent and grandparent of the current working
    # directory to sys.path — presumably so the project-local imports below
    # resolve when the notebook runs from inside the repo (confirm layout).
    for p in ("..", "../.."):
        ap = os.path.abspath(p)
        if ap not in sys.path:
            sys.path.insert(0, ap)

# These imports must come after the sys.path setup above.
import pandas as pd
import plotly.express as px

import lifeline_theme
from lla_data import config
from lla_data.bq import get_client, run_query

# NOTE(review): project helper; by its name it loads the fonts used by the
# "lifeline" chart template — confirm in lifeline_theme.
lifeline_theme.inject_fonts()

# Shared client handle, reused by every run_query() call in this notebook.
client = get_client()

In [None]:
#@title Parameters
# Look-back window in days: interpolated into the query's _TABLE_SUFFIX date
# filter and shown in both chart titles. Default comes from the project config.
DAYS_BACK = config.DEFAULT_DAYS_BACK #@param {type:"integer"}
# Number of source/medium rows kept for the sessions bar chart (and shown in its title).
TOP_N = config.DEFAULT_TOP_N #@param {type:"integer"}

In [None]:
# Build one row per source/medium with session count, engaged sessions,
# engagement rate and average page views per session.
#
# Attribution is resolved per SESSION, not per event: attribution params are
# not guaranteed to be present on every event of a session, so defaulting
# them per event and grouping by them would count one visit both under its
# real source and under "(direct) / (none)".

# Coerce to int so only a plain number is ever interpolated into the SQL text.
days_back = int(DAYS_BACK)

query = f"""
WITH base AS (
  SELECT
    event_timestamp,
    event_name,
    CONCAT(
      user_pseudo_id,
      '.',
      COALESCE(CAST((
        SELECT ep.value.int_value
        FROM UNNEST(event_params) ep
        WHERE ep.key = 'ga_session_id'
      ) AS STRING), '0')
    ) AS session_key,
    -- Left NULL when the event carries no attribution; defaulted per session below.
    (
      SELECT ep.value.string_value
      FROM UNNEST(event_params) ep
      WHERE ep.key = 'source'
    ) AS source,
    (
      SELECT ep.value.string_value
      FROM UNNEST(event_params) ep
      WHERE ep.key = 'medium'
    ) AS medium,
    -- Read whichever typed slot is populated (the export is believed to store
    -- this flag as a string on some events and an int on others; verify).
    (
      SELECT COALESCE(ep.value.string_value, CAST(ep.value.int_value AS STRING))
      FROM UNNEST(event_params) ep
      WHERE ep.key = 'session_engaged'
    ) AS session_engaged
  FROM `{config.PROJECT_ID}.{config.GA4_DATASET}.events_*`
  -- Both bounds are inclusive, so the window ends on (and includes) today.
  WHERE _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL {days_back} DAY))
    AND FORMAT_DATE('%Y%m%d', CURRENT_DATE())
), session_level AS (
  SELECT
    session_key,
    -- Earliest non-null value within the session; fall back only when the
    -- whole session has none. Source and medium are resolved independently.
    COALESCE(
      ARRAY_AGG(source IGNORE NULLS ORDER BY event_timestamp LIMIT 1)[SAFE_OFFSET(0)],
      '(direct)'
    ) AS source,
    COALESCE(
      ARRAY_AGG(medium IGNORE NULLS ORDER BY event_timestamp LIMIT 1)[SAFE_OFFSET(0)],
      '(none)'
    ) AS medium,
    -- SAFE_CAST: an unexpected non-numeric flag counts as not engaged
    -- instead of failing the whole query.
    COALESCE(MAX(SAFE_CAST(session_engaged AS INT64)), 0) AS engaged_flag,
    COUNTIF(event_name = 'page_view') AS page_views_in_session
  FROM base
  -- CONCAT yields NULL when user_pseudo_id is NULL; such events cannot be
  -- tied to a session and would otherwise collapse into one giant pseudo-session.
  WHERE session_key IS NOT NULL
  GROUP BY session_key
)
SELECT
  source,
  medium,
  COUNT(*) AS sessions,
  SUM(engaged_flag) AS engaged_sessions,
  SAFE_DIVIDE(SUM(engaged_flag), COUNT(*)) AS engagement_rate,
  AVG(page_views_in_session) AS avg_pages_per_session
FROM session_level
GROUP BY source, medium
HAVING sessions >= 20
ORDER BY sessions DESC
"""

df = run_query(client, query)
# Combined label used as the bar-chart axis and the scatter hover name.
df["source_medium"] = df["source"] + " / " + df["medium"]
df.head()

In [None]:
# Keep the TOP_N highest-traffic source/medium rows, then order them ascending
# by sessions for the horizontal bar chart.
top_df = df.nlargest(TOP_N, "sessions").sort_values(by="sessions")

bar_title = f"Top {TOP_N} Source/Medium by Sessions (Last {DAYS_BACK} Days)"
bar_labels = {"source_medium": "Source / Medium", "sessions": "Sessions"}

fig = px.bar(
    data_frame=top_df,
    x="sessions",
    y="source_medium",
    orientation="h",
    template="lifeline",
    title=bar_title,
    labels=bar_labels,
)
# Project helper applied to the figure before it is displayed.
lifeline_theme.add_lifeline_logo(fig)
fig.show()

In [None]:
# Only source/medium rows with at least this many sessions are plotted —
# presumably to keep low-volume channels with noisy rates off the map.
min_sessions_for_map = 50
has_enough_sessions = df["sessions"] >= min_sessions_for_map
quality_df = df.loc[has_enough_sessions].copy()

scatter_title = f"Acquisition Quality Map (Last {DAYS_BACK} Days)"
scatter_labels = {
    "engagement_rate": "Engagement Rate",
    "avg_pages_per_session": "Average Pages per Session",
}

# One bubble per source/medium: position shows quality, bubble size shows volume.
fig = px.scatter(
    data_frame=quality_df,
    x="engagement_rate",
    y="avg_pages_per_session",
    size="sessions",
    color="source",
    hover_name="source_medium",
    template="lifeline",
    title=scatter_title,
    labels=scatter_labels,
)
# engagement_rate is a ratio (engaged sessions / sessions); show it as a whole percent.
fig.update_xaxes(tickformat=".0%")
lifeline_theme.add_lifeline_logo(fig)
fig.show()