In [None]:
!pip -q install -U "grpcio>=1.71.2" "grpcio-status>=1.71.2"
!pip -q install -U apache-beam crcmod

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.transforms.window import FixedWindows
from apache_beam.transforms.trigger import AfterWatermark, AfterProcessingTime, AccumulationMode
from apache_beam.testing.test_stream import TestStream
import json
from datetime import datetime, timezone

In [None]:
# Run mode: "stream" runs the TestStream pipeline; any other value runs the batch pipeline.
MODE = "stream"
# Size, in seconds, of the fixed event-time windows used for the per-user aggregation.
WINDOW_SIZE_SECS = 60
# Value, in seconds, passed as `allowed_lateness` when windowing the events.
ALLOWED_LATENESS_SECS = 120

def make_event(user_id, event_type, amount, event_time_epoch_s):
    """Build one event record as a plain dict.

    Args:
        user_id: Stored unchanged under "user_id".
        event_type: Stored unchanged under "event_type".
        amount: Coerced with ``float()`` and stored under "amount".
        event_time_epoch_s: Coerced with ``int()`` and stored under "event_time".

    Returns:
        A dict with the keys "user_id", "event_type", "amount", "event_time"
        (in that order).
    """
    event = dict(user_id=user_id, event_type=event_type)
    event["amount"] = float(amount)
    event["event_time"] = int(event_time_epoch_s)
    return event

# Anchor the demo at the start of the current fixed window instead of at the raw
# "now" second. Fixed windows are aligned to multiples of the window size, so an
# unaligned t0 would spread the `t0 + offset` events over different windows (and
# change which element counts as late) depending on the second this cell is run.
_now_s = int(datetime.now(timezone.utc).timestamp())
t0 = _now_s - _now_s % WINDOW_SIZE_SECS
# Keep `base` consistent with t0: int(base.timestamp()) == t0 still holds.
base = datetime.fromtimestamp(t0, tz=timezone.utc)

# Sample events, written as (user_id, event_type, amount, seconds after t0).
# The list is not sorted by event time: the last entry (t0 + 50) is older than
# the three entries before it.
BATCH_EVENTS = [
    make_event(user_id, event_type, amount, t0 + offset_s)
    for user_id, event_type, amount, offset_s in (
        ("u1", "purchase", 20, 5),
        ("u1", "purchase", 15, 20),
        ("u2", "purchase", 8, 35),
        ("u1", "refund", -5, 62),
        ("u2", "purchase", 12, 70),
        ("u3", "purchase", 9, 75),
        ("u2", "purchase", 3, 50),
    )
]

In [None]:
def format_joined_record(kv):
    """Flatten a ``(user_id, grouped)`` pair into one result dict.

    ``grouped`` must map "count" and "sum_amount" to sequences. Only the first
    item of each sequence is used; an empty sequence yields 0 (count) or 0.0
    (sum_amount).

    Returns:
        A dict with the keys "user_id", "count" (int) and "sum_amount" (float).
    """
    user_id, grouped = kv
    record = {"user_id": user_id}

    counts = grouped["count"]
    if counts:
        record["count"] = int(counts[0])
    else:
        record["count"] = 0

    totals = grouped["sum_amount"]
    if totals:
        record["sum_amount"] = float(totals[0])
    else:
        record["sum_amount"] = 0.0

    return record

class _CountAndSumFn(beam.CombineFn):
    """Combine a stream of amounts into a single ``(count, total)`` pair."""

    def create_accumulator(self):
        return (0, 0.0)

    def add_input(self, accumulator, amount):
        count, total = accumulator
        return (count + 1, total + amount)

    def merge_accumulators(self, accumulators):
        count, total = 0, 0.0
        for part_count, part_total in accumulators:
            count += part_count
            total += part_total
        return (count, total)

    def extract_output(self, accumulator):
        return accumulator


class WindowedUserAgg(beam.PTransform):
    """Per-user event count and amount sum over fixed event-time windows.

    Input: a PCollection of event dicts carrying "user_id", "amount" and
    "event_time" (epoch seconds).
    Output: one dict per user, window and trigger firing with the keys
    "user_id", "count" (int) and "sum_amount" (float).

    Count and sum are produced by ONE CombinePerKey. With ACCUMULATING mode and
    early/late firings, computing them in two separate combines and re-joining
    with CoGroupByKey lets the join collect several panes per key, so reading
    the first joined value can report a stale pane and pair a count with a sum
    from a different firing.
    """

    def expand(self, pcoll):
        # Move every element onto its own event time so windowing is event-time based.
        stamped = pcoll | "StampEventTime" >> beam.Map(
            lambda e: beam.window.TimestampedValue(e, e["event_time"])
        )
        windowed = stamped | "FixedWindows" >> beam.WindowInto(
            FixedWindows(WINDOW_SIZE_SECS),
            allowed_lateness=ALLOWED_LATENESS_SECS,
            trigger=AfterWatermark(
                early=AfterProcessingTime(10),
                late=AfterProcessingTime(10),
            ),
            accumulation_mode=AccumulationMode.ACCUMULATING,
        )
        keyed = windowed | "KeyByUser" >> beam.Map(lambda e: (e["user_id"], e["amount"]))
        return (
            keyed
            | "CountAndSumPerUser" >> beam.CombinePerKey(_CountAndSumFn())
            | "FormatRecord" >> beam.Map(
                lambda kv: {
                    "user_id": kv[0],
                    "count": int(kv[1][0]),
                    "sum_amount": float(kv[1][1]),
                }
            )
        )

In [None]:
class AddWindowInfo(beam.DoFn):
    """DoFn that annotates each element with its window bounds and pane metadata."""

    def process(self, element, window=beam.DoFn.WindowParam, pane_info=beam.DoFn.PaneInfoParam):
        """Yield a copy of ``element`` extended with window and pane fields.

        Added keys: "window_start_utc" / "window_end_utc" (the window bounds
        rendered as HH:MM:SS in UTC), "pane_timing" (``str()`` of the pane's
        timing value), "pane_is_first" and "pane_is_last".
        """
        start_s, end_s = float(window.start), float(window.end)

        def as_utc_clock(epoch_s):
            # Time of day only; the date part is intentionally not rendered.
            return datetime.fromtimestamp(epoch_s, tz=timezone.utc).strftime("%H:%M:%S")

        annotated = {**element}
        annotated.update(
            window_start_utc=as_utc_clock(start_s),
            window_end_utc=as_utc_clock(end_s),
            pane_timing=str(pane_info.timing),
            pane_is_first=pane_info.is_first,
            pane_is_last=pane_info.is_last,
        )
        yield annotated

def build_test_stream():
    """Return a TestStream that replays the sample events in three batches.

    Sequence: watermark to t0, first batch (offsets 5/20/35), processing time
    +5, watermark to t0 + 61, second batch (offsets 62/70/75), processing time
    +5, a single element at offset 50, watermark to t0 + 121, then watermark to
    infinity.
    """

    def stamped(user_id, event_type, amount, offset_s):
        # The element timestamp and the event's own "event_time" are the same instant.
        event_ts = t0 + offset_s
        return beam.window.TimestampedValue(
            make_event(user_id, event_type, amount, event_ts), event_ts
        )

    first_batch = [
        stamped("u1", "purchase", 20, 5),
        stamped("u1", "purchase", 15, 20),
        stamped("u2", "purchase", 8, 35),
    ]
    second_batch = [
        stamped("u1", "refund", -5, 62),
        stamped("u2", "purchase", 12, 70),
        stamped("u3", "purchase", 9, 75),
    ]
    # Added after the watermark has already moved to t0 + 61.
    trailing_batch = [stamped("u2", "purchase", 3, 50)]

    return (
        TestStream()
        .advance_watermark_to(t0)
        .add_elements(first_batch)
        .advance_processing_time(5)
        .advance_watermark_to(t0 + 61)
        .add_elements(second_batch)
        .advance_processing_time(5)
        .add_elements(trailing_batch)
        .advance_watermark_to(t0 + 121)
        .advance_watermark_to_infinity()
    )

In [3]:
def run_batch():
    """Run the aggregation over BATCH_EVENTS and print each result as a JSON line."""
    pipeline_options = PipelineOptions([])
    with beam.Pipeline(options=pipeline_options) as pipeline:
        events = pipeline | beam.Create(BATCH_EVENTS)
        aggregated = events | WindowedUserAgg()
        annotated = aggregated | beam.ParDo(AddWindowInfo())
        json_lines = annotated | beam.Map(json.dumps)
        # Terminal step: printing is the only sink of this pipeline.
        json_lines | beam.Map(print)

def run_stream():
    """Run the aggregation over the TestStream in streaming mode and print JSON lines."""
    pipeline_options = PipelineOptions([])
    standard_options = pipeline_options.view_as(StandardOptions)
    standard_options.streaming = True
    with beam.Pipeline(options=pipeline_options) as pipeline:
        events = pipeline | build_test_stream()
        aggregated = events | WindowedUserAgg()
        annotated = aggregated | beam.ParDo(AddWindowInfo())
        json_lines = annotated | beam.Map(json.dumps)
        # Terminal step: printing is the only sink of this pipeline.
        json_lines | beam.Map(print)

# Dispatch on MODE: "stream" runs the streaming pipeline, anything else runs batch.
if MODE == "stream":
    run_stream()
else:
    run_batch()

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.70.0 requires grpcio!=1.48.0,!=1.59.*,!=1.60.*,!=1.61.*,!=1.62.0,!=1.62.1,<1.66.0,<2,>=1.33.1; python_version <= "3.12", but you have grpcio 1.76.0 which is incompatible.
tensorflow 2.19.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3, but you have protobuf 6.33.2 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-status 1.76.0 requires grpcio>=1.76.0, but you have grpcio 1.65.5 which is incompatible.
tensorflow 2.19.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3, but you have protobuf 6.33.2 which is incompatible.[0m[31m
[0m=== Running