In [1]:
# | default_exp kafka_server

In [2]:
# | export


import asyncio
from datetime import datetime, timedelta
from enum import Enum
from os import environ
from typing import *

import numpy as np
import pandas as pd
from airt.logger import get_logger
from fastkafka import FastKafka
from pydantic import BaseModel, EmailStr, Field, HttpUrl, NonNegativeInt, validator

import airt_service
from airt_service.confluent import aio_kafka_config
from airt_service.data.clickhouse import get_all_person_ids_for_account_ids, get_count_for_account_ids

In [3]:
import contextlib
import unittest

import asyncer

import pytest
from fastkafka.testing import Tester
from pytest import MonkeyPatch

from airt_service.db.models import create_user_for_testing

In [4]:
# | exporti

# Logger for this module, obtained from airt's `get_logger` using the module's `__name__`.
logger = get_logger(__name__)

In [5]:
def heading(title: Optional[str] = None, width: int = 160) -> None:
    """
    Print a banner made of asterisks, optionally with a centered title

    The banner consists of an empty line, a full row of ``*`` and a bordered
    blank row. When a title is given, a bordered row with the centered title
    and one more bordered blank row follow.

    Args:
        title: Text to show in the banner; nothing extra is printed if empty or None
        width: Total width of every banner row in characters
    """
    border = "*" * 3
    inner_width = width - 6
    blank_row = border + " " * inner_width + border

    print()
    print("*" * width)
    print(blank_row)
    if title:
        # str.center distributes odd padding over both sides, so the title row
        # is always exactly as wide as the other rows and the borders line up.
        print(border + title.center(inner_width) + border)
        print(blank_row)

In [6]:
# | export


def get_count_from_training_data_ch_table(
    account_ids: List[Union[int, str]]
) -> pd.DataFrame:
    """
    Fetch row counts for the given account ids from the clickhouse table

    The connection settings are read from the ``KAFKA_CH_*`` environment
    variables; a missing variable raises ``KeyError``.

    Args:
        account_ids: List of account ids to count rows for

    Returns:
        The dataframe returned by ``get_count_for_account_ids`` for the given account ids
    """
    # Settings are looked up in the same order as they are passed on.
    connection_settings = dict(
        username=environ["KAFKA_CH_USERNAME"],
        password=environ["KAFKA_CH_PASSWORD"],
        host=environ["KAFKA_CH_HOST"],
        port=int(environ["KAFKA_CH_PORT"]),
        database=environ["KAFKA_CH_DATABASE"],
        table=environ["KAFKA_CH_TABLE"],
        protocol=environ["KAFKA_CH_PROTOCOL"],
    )
    return get_count_for_account_ids(account_ids=account_ids, **connection_settings)

In [7]:
@contextlib.contextmanager
def patch_get_count_from_training_data_ch_table():
    """
    Temporarily replace `get_count_from_training_data_ch_table` in `__main__` with a stub

    While the context is active, the stub returns a dataframe indexed by
    "AccountId" with a "curr_count" of 999 and the current UTC time in
    "curr_check_on" for every requested account id.
    """

    def _fixed_count(account_ids):
        n_accounts = len(account_ids)
        frame = pd.DataFrame(
            {
                "curr_count": [999] * n_accounts,
                "AccountId": account_ids,
                "curr_check_on": [datetime.utcnow()] * n_accounts,
            }
        )
        return frame.set_index("AccountId")

    with MonkeyPatch.context() as monkeypatch:
        monkeypatch.setattr(
            "__main__.get_count_from_training_data_ch_table", _fixed_count
        )
        yield


# Sanity check of the patch: inside the context the stubbed function is used,
# so the count reported for the single requested account is 999.
with patch_get_count_from_training_data_ch_table():
    actual = get_count_from_training_data_ch_table(account_ids=[500])
    display(actual)
    assert actual.iloc[0]["curr_count"] == 999, actual

Unnamed: 0_level_0,curr_count,curr_check_on
AccountId,Unnamed: 1_level_1,Unnamed: 2_level_1
500,999,2023-04-30 22:49:05.378545


In [8]:
def get_count_from_training_data_ch_table_monkey_patch(
    account_id: int,
    *,
    total: int = 10_000,
    step: int = 1000,
    curr_check_shift: Optional[timedelta] = None,
):
    """
    Build a stand-in for `get_count_from_training_data_ch_table` simulating data ingestion

    Every call of the returned function yields the next snapshot: the reported
    count starts at 0 and grows by `step` while it is below `total`; after that
    every call reports exactly `total`.

    Args:
        account_id: Account id used as the index of every returned dataframe
        total: Final count reported once the simulated ingestion is complete
        step: Increment of the count between two consecutive calls
        curr_check_shift: Offset added to the current UTC time stored in
            "curr_check_on"; no offset if None

    Returns:
        A function accepting a list of account ids (ignored) and returning a
        single-row dataframe indexed by "AccountId" with the columns
        "curr_count" and "curr_check_on"
    """

    def _snapshot(count: int) -> pd.DataFrame:
        # The timestamp is taken when the snapshot is requested, not when the
        # stub is created.
        shift = timedelta(seconds=0) if curr_check_shift is None else curr_check_shift
        return pd.DataFrame(
            {
                "curr_count": [count],
                "AccountId": [account_id],
                "curr_check_on": datetime.utcnow() + shift,
            }
        ).set_index("AccountId")

    def _snapshots():
        # Growing phase: 0, step, 2 * step, ... (strictly below total).
        for partial_count in range(0, total, step):
            yield _snapshot(partial_count)

        # Ingestion finished: keep reporting the final count forever.
        while True:
            yield _snapshot(total)

    ix = _snapshots()

    def _get_count_from_training_data_ch_table_monkey_patch(
        account_ids: List[int], ix=ix
    ):
        return next(ix)

    return _get_count_from_training_data_ch_table_monkey_patch

In [9]:
# With the default total (10_000) and step (1000) the stub reports
# 0, 1000, ..., 9000 and then stays at 10_000.
f = get_count_from_training_data_ch_table_monkey_patch(
    12345, curr_check_shift=timedelta(seconds=10)
)

# The list of account ids passed to the stub is ignored, hence the empty list.
df = pd.concat([f([]) for _ in range(13)])
display(df["curr_count"].to_frame())

assert df["curr_count"].to_list() == [1000 * i for i in range(10)] + [10_000] * 3

Unnamed: 0_level_0,curr_count
AccountId,Unnamed: 1_level_1
12345,0
12345,1000
12345,2000
12345,3000
12345,4000
12345,5000
12345,6000
12345,7000
12345,8000
12345,9000


In [10]:
# A total that is not a multiple of the step: the counts grow by `step` while
# below `total` (0, 1500, 3000, 4500) and then report `total` itself (4501).
f = get_count_from_training_data_ch_table_monkey_patch(
    12345, curr_check_shift=timedelta(seconds=10), total=4501, step=1500
)

df = pd.concat([f([]) for _ in range(6)])
display(df["curr_count"].to_frame())

assert df["curr_count"].to_list() == [1500 * i for i in range(4)] + [4501] * 2

Unnamed: 0_level_0,curr_count
AccountId,Unnamed: 1_level_1
12345,0
12345,1500
12345,3000
12345,4500
12345,4501
12345,4501


In [11]:
# Create the "infobip" test user; "infobip" is also the default `username`
# of the handler registration functions defined in this notebook.
# NOTE(review): presumably needed by later cells - confirm which ones rely on it.
create_user_for_testing(username="infobip")

'infobip'

In [12]:
@contextlib.contextmanager
def monkeypatch_clickhouse(
    account_id: int,
    curr_check_shift: Optional[timedelta] = None,
    total: int = 10_000,
    step: int = 1_000,
) -> Iterator[None]:
    """
    Temporarily replace the clickhouse accessors in `__main__` with in-memory stubs

    While the context is active:

    - `get_count_from_training_data_ch_table` is replaced with the stub built by
      `get_count_from_training_data_ch_table_monkey_patch`, simulating ingestion
    - `get_all_person_ids_for_account_ids` is replaced with a stub returning seven
      pseudo-random person ids (generator seeded with 42) indexed by "AccountId"

    Args:
        account_id: Account id reported by the count stub
        curr_check_shift: Offset added to the timestamps produced by the count stub
        total: Final count reported by the count stub
        step: Increment of the reported count between two calls of the count stub
    """
    with MonkeyPatch.context() as monkeypatch:
        monkeypatch.setattr(
            "__main__.get_count_from_training_data_ch_table",
            get_count_from_training_data_ch_table_monkey_patch(
                account_id=account_id,
                curr_check_shift=curr_check_shift,
                total=total,
                step=step,
            ),
        )

        # One generator shared by all calls of the stub: repeated calls inside
        # the same context return different person ids.
        rng = np.random.default_rng(42)
        monkeypatch.setattr(
            "__main__.get_all_person_ids_for_account_ids",
            # The lambda's own argument (not the outer `account_id`) fills the
            # "AccountId" column.
            lambda account_id: pd.DataFrame(
                {
                    "PersonId": rng.integers(low=1_000_000, high=10_000_000, size=7),
                    "AccountId": [account_id] * 7,
                }
            ).set_index("AccountId"),
        )
        yield

In [13]:
# Inside the context both clickhouse accessors are stubbed: three consecutive
# count queries and one person id query are collected here.
with monkeypatch_clickhouse(account_id=12345):
    dfx = [get_count_from_training_data_ch_table([12345]) for _ in range(3)]
    person_ids = get_all_person_ids_for_account_ids(12345)

# Show the collected frames once the patches have been reverted.
for df in dfx:
    display(df["curr_count"].to_frame())
person_ids.style.format("{:,d}")

Unnamed: 0_level_0,curr_count
AccountId,Unnamed: 1_level_1
12345,0


Unnamed: 0_level_0,curr_count
AccountId,Unnamed: 1_level_1
12345,1000


Unnamed: 0_level_0,curr_count
AccountId,Unnamed: 1_level_1
12345,2000


Unnamed: 0_level_0,PersonId
AccountId,Unnamed: 1_level_1
12345,1803258
12345,7965604
12345,6891143
12345,4949905
12345,4897137
12345,8727381
12345,1773510


In [14]:
# | export


def json_datetime_sec_encoder(dt: datetime) -> str:
    """
    Encode a datetime as a string with second precision

    Args:
        dt: The datetime to encode

    Returns:
        The datetime formatted as "YYYY-MM-DDTHH:MM:SS"; microseconds are dropped
    """
    seconds_precision_format = "%Y-%m-%dT%H:%M:%S"
    # format() on a datetime with a non-empty spec delegates to strftime.
    return format(dt, seconds_precision_format)

In [15]:
# The encoder must drop the microseconds and keep second precision.
dt = datetime.fromisoformat("2023-01-01T12:34:56.789012")
expected = "2023-01-01T12:34:56"
actual = json_datetime_sec_encoder(dt)
assert actual == expected

In [16]:
# | export


# Supported model types. Mixing in `str` makes the members behave as plain
# strings, so they compare equal to their values (e.g. "churn").
class ModelType(str, Enum):
    churn = "churn"
    propensity_to_buy = "propensity_to_buy"


# Request to train a model for one account. It is the message type consumed
# by the `on_start_training` handler registered in `add_process_start_training`.
class ModelTrainingRequest(BaseModel):
    # Identification of the account and, optionally, the application and model.
    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ApplicationId: Optional[str] = Field(
        default=None,
        example="TestApplicationId",
        description="Id of the application in case there is more than one for the AccountId",
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )
    model_type: ModelType = Field(
        ..., description="Model type, only 'churn' is supported right now"
    )
    # Used by `on_start_training` as the record count at which ingestion is
    # considered complete.
    total_no_of_records: NonNegativeInt = Field(
        ...,
        example=1_000_000,
        description="approximate total number of records (rows) to be ingested",
    )

In [17]:
# JSON round trip of ModelTrainingRequest: optional ids default to null and
# the serialized message can be parsed back into an equal object.
# Only declared fields are passed; `OccurredTime` belongs to EventData, not to
# this model, and never appeared in the serialized output.
model_training_request = ModelTrainingRequest(
    AccountId=12345,
    model_type="churn",
    total_no_of_records=1000,
)

expected = '{"AccountId": 12345, "ApplicationId": null, "ModelId": null, "model_type": "churn", "total_no_of_records": 1000}'
actual = model_training_request.json()

assert actual == expected, actual

parsed = ModelTrainingRequest.parse_raw(actual)
assert parsed == model_training_request

In [18]:
# | export


# Schema of an event message; each instance carries one event (definition,
# occurrence time and person) of one account.
class EventData(BaseModel):
    """
    A sequence of events for a fixed account_id
    """

    # Identification of the account and, optionally, the application and model.
    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ApplicationId: Optional[str] = Field(
        default=None,
        example="TestApplicationId",
        description="Id of the application in case there is more than one for the AccountId",
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )

    # The event itself; an empty event name is rejected (min_length=1).
    DefinitionId: str = Field(
        ...,
        example="appLaunch",
        description="name of the event",
        min_length=1,
    )
    OccurredTime: datetime = Field(
        ...,
        example="2021-03-28T00:34:08",
        description="local time of the event",
    )
    OccurredTimeTicks: NonNegativeInt = Field(
        ...,
        example=1616891648496,
        description="local time of the event as the number of ticks",
    )
    PersonId: NonNegativeInt = Field(
        ..., example=12345678, description="ID of a person"
    )


# Same fields as EventData under a separate type name.
# NOTE(review): presumably used for realtime (as opposed to training) events - confirm.
class RealtimeData(EventData):
    pass

In [19]:
# JSON round trip of EventData: the timestamp given as a string is parsed to
# a datetime and serialized back to the same ISO string.
event_data = EventData(
    AccountId=12345,
    DefinitionId="BigButton",
    PersonId=123456789,
    OccurredTime="2021-03-28T00:34:08",
    OccurredTimeTicks=1616891648496,
)

expected = '{"AccountId": 12345, "ApplicationId": null, "ModelId": null, "DefinitionId": "BigButton", "OccurredTime": "2021-03-28T00:34:08", "OccurredTimeTicks": 1616891648496, "PersonId": 123456789}'
actual = event_data.json()

assert actual == expected

parsed = EventData.parse_raw(actual)
assert parsed == event_data

In [20]:
# | export


# Progress report of data ingestion for one account. Produced to the
# "<username>_training_data_status" topic by `add_process_start_training`
# whenever the observed record count changes.
class TrainingDataStatus(BaseModel):
    # Identification of the account and, optionally, the application and model.
    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ApplicationId: Optional[str] = Field(
        default=None,
        example="TestApplicationId",
        description="Id of the application in case there is more than one for the AccountId",
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )

    # Ingested so far vs. the total announced in the training request.
    no_of_records: NonNegativeInt = Field(
        ...,
        example=12_345,
        description="number of records (rows) ingested",
    )
    total_no_of_records: NonNegativeInt = Field(
        ...,
        example=1_000_000,
        description="total number of records (rows) to be ingested",
    )

In [21]:
# JSON round trip of TrainingDataStatus with the optional ids left unset.
training_data_status = TrainingDataStatus(
    AccountId=12345,
    no_of_records=23,
    total_no_of_records=54,
)

expected = '{"AccountId": 12345, "ApplicationId": null, "ModelId": null, "no_of_records": 23, "total_no_of_records": 54}'
actual = training_data_status.json()

assert actual == expected

parsed = TrainingDataStatus.parse_raw(actual)
assert parsed == training_data_status

In [22]:
# | export


# Signal that model training should start for one account. Produced to the
# "<username>_training_model_start" topic by `add_process_start_training` and
# consumed by the handler registered in `add_process_training_model_start`.
class TrainingModelStart(BaseModel):
    # Identification of the account and, optionally, the application and model.
    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ApplicationId: Optional[str] = Field(
        default=None,
        example="TestApplicationId",
        description="Id of the application in case there is more than one for the AccountId",
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )
    model_type: ModelType = Field(
        ..., description="Model type, only 'churn' is supported right now"
    )
    # The last record count observed before the training was triggered.
    no_of_records: NonNegativeInt = Field(
        ...,
        example=1_000_000,
        description="number of records (rows) in the DB used for training",
    )

In [23]:
# JSON round trip of TrainingModelStart; the enum is serialized as its string value.
training_model_start = TrainingModelStart(
    AccountId=12345,
    model_type="churn",
    no_of_records=100,
)

expected = '{"AccountId": 12345, "ApplicationId": null, "ModelId": null, "model_type": "churn", "no_of_records": 100}'
actual = training_model_start.json()

assert actual == expected, actual

parsed = TrainingModelStart.parse_raw(actual)
assert parsed == training_model_start

In [24]:
# | export


class Tracker:
    """
    Tracks a growing count and decides when to stop waiting for it

    Tracking is finished once the count reaches the limit, or once the count
    has not changed for more than `timeout` seconds. Before the first update
    the tracker is never finished.
    """

    def __init__(self, *, limit: int, timeout: int):
        """
        Args:
            limit: Count at which tracking is considered finished
            timeout: Number of seconds without a change of the count after
                which tracking is considered finished
        """
        self._limit = limit
        self._timeout = timeout
        self._count: Optional[int] = None
        self._last_updated: Optional[datetime] = None

    def update(self, count: int) -> bool:
        """
        Record the latest observed count

        Args:
            count: The latest observed count

        Returns:
            True if the count differs from the previously recorded one, False otherwise
        """
        changed = self._count != count
        if changed:
            # Only a changed count restarts the timeout clock.
            self._count = count
            self._last_updated = datetime.now()
        return changed

    def finished(self) -> bool:
        """
        Check whether tracking is finished

        Returns:
            True if the limit was reached or the count has been unchanged for
            longer than the timeout, False otherwise
        """
        if self._count is None:
            # Nothing observed yet.
            return False

        if self._count >= self._limit:
            return True

        idle_time = datetime.now() - self._last_updated  # type: ignore
        return idle_time > timedelta(seconds=self._timeout)

In [25]:
# Finishing by reaching the limit.
tracker = Tracker(limit=10, timeout=5)

# Nothing observed yet.
assert not tracker.finished()

# update() reports True only when the count actually changes.
assert tracker.update(9)
assert not tracker.update(9)

assert not tracker.finished()

assert tracker.update(10)

assert tracker.finished()

In [26]:
# Finishing by timeout: the count stays below the limit but does not change
# for longer than the one second timeout.
tracker = Tracker(limit=10, timeout=1)

assert not tracker.finished()

tracker.update(9)

assert not tracker.finished()

await asyncio.sleep(1.1)

assert tracker.finished()

In [27]:
# | export


def add_process_start_training(
    app: FastKafka,
    *,
    username: str = "infobip",
    stop_on_no_change_interval: int = 60,
    sleep_interval: int = 5,
) -> None:
    """
    Register the handlers processing training requests on a FastKafka application

    Two producers, for the "<username>_training_data_status" and
    "<username>_training_model_start" topics, are registered and attached to the
    application as `app.to_training_data_status` and `app.to_training_model_start`.
    A consumer of the "<username>_start_training" topic is registered as well: for
    every request it polls the record count of the account, publishes a
    TrainingDataStatus each time the count changes and, once the count reaches
    the requested total or stops changing, publishes a TrainingModelStart.

    Args:
        app: The application to register the handlers on
        username: Prefix of the topic names
        stop_on_no_change_interval: Number of seconds without a change of the record
            count after which waiting for more data stops
        sleep_interval: Number of seconds to sleep between two polls of the record count
    """
    #     app.produces(
    #         topic=f"{username}_training_data_status",
    #         msg_type=TrainingDataStatus,
    #         encoding="avro",
    #     )

    # Producer: the returned message is what gets published to the topic.
    @app.produces(topic=f"{username}_training_data_status") # type: ignore
    async def to_training_data_status(
        training_data_status: TrainingDataStatus,
    ) -> TrainingDataStatus:
        print(f"to_training_data_status({training_data_status})")
        return training_data_status

    # Producer: the returned message is what gets published to the topic.
    @app.produces(topic=f"{username}_training_model_start") # type: ignore
    async def to_training_model_start(
        training_model_start: TrainingModelStart,
    ) -> TrainingModelStart:
        print(f"to_training_model_start({training_model_start})")
        return training_model_start

    # Expose the decorated producers on the app; the consumer below calls them
    # through these attributes.
    app.to_training_data_status = to_training_data_status
    app.to_training_model_start = to_training_model_start

    @app.consumes(topic=f"{username}_start_training") # type: ignore
    async def on_start_training(msg: ModelTrainingRequest, app=app) -> None:
        print(f"on_start_training({msg}) starting...")

        account_ids = [msg.AccountId]
        total_no_of_records = msg.total_no_of_records

        tracker = Tracker(limit=total_no_of_records, timeout=stop_on_no_change_interval)

        # A fresh tracker is never finished, so the loop body runs at least
        # once and `curr_count` is always bound after the loop.
        while not tracker.finished():
            # The first row of the result is unpacked into the count and one
            # more value that is ignored.
            # NOTE(review): this call is synchronous inside an async handler - confirm
            # that blocking the event loop for its duration is acceptable.
            curr_count, _ = get_count_from_training_data_ch_table(
                account_ids=account_ids # type: ignore
            ).iloc[0, :]

            print(f"{curr_count=}")

            # Publish a status only when the count has changed since the last poll.
            if tracker.update(curr_count):
                training_data_status = TrainingDataStatus(
                    no_of_records=curr_count, **msg.dict()
                )
                await app.to_training_data_status(training_data_status)

            await asyncio.sleep(sleep_interval)

        # trigger model training start
        training_model_start = TrainingModelStart(
            no_of_records=curr_count, **msg.dict()
        )
        await app.to_training_model_start(training_model_start)

        print(f"on_start_training({msg}) finished.")

In [28]:
kafka_brokers = {
    "localhost": {
        "url": "localhost",
        "description": "localhost kafka broker",
        "port": "9092",
    }
}

app = FastKafka(kafka_brokers=kafka_brokers)

add_process_start_training(app, stop_on_no_change_interval=3, sleep_interval=1)

tester = Tester(app)

total_no_of_records = 5000

with monkeypatch_clickhouse(
    account_id=12345,
    total=total_no_of_records,
    step=1500,
):
    async with tester:
        model_training_request = ModelTrainingRequest(
            AccountId=12345,
            OccurredTime="2021-03-28T00:34:08",
            model_type="churn",
            total_no_of_records=total_no_of_records + 123,
        )
        heading(f"tester.to_infobip_start_training({model_training_request})")

        await tester.to_infobip_start_training(model_training_request)

        heading("tester.awaited_mocks.on_infobip_training_data_status.assert_called()")

        await tester.awaited_mocks.on_infobip_training_data_status.assert_called(
            timeout=10
        )

        heading("tester.awaited_mocks.on_infobip_training_model_start.assert_called()")
        await tester.awaited_mocks.on_infobip_training_model_start.assert_called(
            timeout=20
        )

        heading("check final training_data_status")

        training_data_statuses = (
            tester.awaited_mocks.on_infobip_training_data_status._o.await_args_list
        )
        final_training_data_status = training_data_statuses[-1]

        assert len(training_data_statuses) == 5, training_data_statuses
        expected = unittest.mock.call(
            TrainingDataStatus(
                AccountId=12345,
                no_of_records=total_no_of_records,
                total_no_of_records=total_no_of_records + 123,
            )
        )
        assert final_training_data_status == expected, training_data_statuses

print("ok")

[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._start() called
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._patch_consumers_and_producers(): Patching consumers and producers!
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker starting
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_lo

[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() finished.
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched stop() called
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched stop() called
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._stop() called
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker stopping
ok


In [29]:
# | export


# Progress report of model training for one account. Produced to the
# "<username>_training_model_status" topic by `add_process_training_model_start`.
class TrainingModelStatus(BaseModel):
    # Identification of the account and, optionally, the application and model.
    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ApplicationId: Optional[str] = Field(
        default=None,
        example="TestApplicationId",
        description="Id of the application in case there is more than one for the AccountId",
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )

    # The producer iterates over range(total_no_of_steps), so steps are
    # numbered from 0 to total_no_of_steps - 1.
    current_step: NonNegativeInt = Field(
        ...,
        example=0,
        description="zero-based index of the training step currently being executed",
    )
    # Reported as a fraction (0.0, 0.2, ..., 1.0), not as a value out of 100.
    current_step_percentage: float = Field(
        ...,
        example=0.21,
        description="the fraction of the current step completed (e.g. 0.21 for 21%)",
    )
    total_no_of_steps: NonNegativeInt = Field(
        ...,
        example=20,
        description="total number of steps for training the model",
    )

In [30]:
# JSON round trip of TrainingModelStatus with the optional ids left unset.
training_model_status = TrainingModelStatus(
    AccountId=12345,
    current_step=1,
    current_step_percentage=0.21,
    total_no_of_steps=20,
)

expected = '{"AccountId": 12345, "ApplicationId": null, "ModelId": null, "current_step": 1, "current_step_percentage": 0.21, "total_no_of_steps": 20}'
actual = training_model_status.json()

# On failure the assertion message shows the actual JSON.
assert actual == expected, actual

parsed = TrainingModelStatus.parse_raw(actual)
assert parsed == training_model_status

In [31]:
# | export


class ModelMetrics(BaseModel):
    """The standard metrics for classification models.

    The most important metric is AUC for unbalanced classes such as churn. Metrics such as
    accuracy are not very useful since they are easily maximized by outputting the most common
    class all the time.
    """

    # Identification of the account and, optionally, the application and model.
    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ApplicationId: Optional[str] = Field(
        default=None,
        example="TestApplicationId",
        description="Id of the application in case there is more than one for the AccountId",
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )

    timestamp: datetime = Field(
        ...,
        example="2021-03-28T00:34:08",
        description="UTC time when the model was trained",
    )
    model_type: ModelType = Field(
        ...,
        example="churn",
        description="Name of the model used (churn, propensity to buy)",
    )

    # All metrics are validated to lie in the closed interval [0.0, 1.0].
    auc: float = Field(
        ..., example=0.91, description="Area under ROC curve", ge=0.0, le=1.0
    )
    f1: float = Field(..., example=0.89, description="F-1 score", ge=0.0, le=1.0)
    # The misspelled field name is part of the message format and is kept for
    # compatibility; only the human readable description is corrected.
    precission: float = Field(
        ..., example=0.84, description="precision", ge=0.0, le=1.0
    )
    recall: float = Field(..., example=0.82, description="recall", ge=0.0, le=1.0)
    accuracy: float = Field(..., example=0.82, description="accuracy", ge=0.0, le=1.0)

    class Config:
        # Serialize datetimes with second precision.
        json_encoders = {
            datetime: json_datetime_sec_encoder,
        }

In [32]:
# JSON round trip of ModelMetrics; the timestamp is serialized with second
# precision by the model's custom datetime encoder.
model_metrics = ModelMetrics(
    AccountId=12345,
    timestamp="2021-03-28T00:34:08",
    model_type="churn",
    auc=0.95,
    recall=0.94,
    precission=0.98,
    accuracy=0.99,
    f1=2 * 0.94 * 0.98 / (0.94 + 0.98),
)

expected = '{"AccountId": 12345, "ApplicationId": null, "ModelId": null, "timestamp": "2021-03-28T00:34:08", "model_type": "churn", "auc": 0.95, "f1": 0.9595833333333332, "precission": 0.98, "recall": 0.94, "accuracy": 0.99}'
actual = model_metrics.json()

# On failure the assertion message shows the actual JSON.
assert actual == expected, actual

parsed = ModelMetrics.parse_raw(actual)
assert parsed == model_metrics

In [33]:
# | export


def add_process_training_model_start(
    app: FastKafka,
    *,
    username: str = "infobip",
    total_no_of_steps: int = 10,
    substep_interval: Union[int, float] = 2,
) -> None:
    """
    Register the handlers simulating model training on a FastKafka application

    Two producers, for the "<username>_training_model_status" and
    "<username>_model_metrics" topics, are registered and attached to the
    application as `app.to_training_model_status` and `app.to_model_metrics`.
    A consumer of the "<username>_training_model_start" topic is registered as
    well: for every message it publishes five TrainingModelStatus messages per
    step, sleeping between them, and finally publishes a ModelMetrics message
    with fixed metric values. No model is actually trained here.

    Args:
        app: The application to register the handlers on
        username: Prefix of the topic names
        total_no_of_steps: Number of training steps to report
        substep_interval: Number of seconds to sleep after every published status
    """

    # Producer: the returned message is what gets published to the topic.
    @app.produces(topic=f"{username}_training_model_status")  # type: ignore
    async def to_training_model_status(
        training_model_status: TrainingModelStatus,
    ) -> TrainingModelStatus:
        print(f"to_training_model_status({training_model_status})")
        return training_model_status

    # Producer: the returned message is what gets published to the topic.
    @app.produces(topic=f"{username}_model_metrics")  # type: ignore
    async def to_model_metrics(
        model_metrics: ModelMetrics,
    ) -> ModelMetrics:
        print(f"to_model_metrics({model_metrics})")
        return model_metrics

    # Expose the decorated producers on the app; the consumer below calls them
    # through these attributes.
    app.to_training_model_status = to_training_model_status
    app.to_model_metrics = to_model_metrics

    @app.consumes(topic=f"{username}_training_model_start")  # type: ignore
    async def on_training_model_start(msg: TrainingModelStart) -> None:
        print(f"on_training_model_start({msg}) starting...")

        AccountId = msg.AccountId
        ApplicationId = msg.ApplicationId
        ModelId = msg.ModelId
        model_type = msg.model_type

        for current_step in range(total_no_of_steps):
            # Progress within a step is reported as a fraction of the step.
            for current_step_percentage in [0.0, 0.2, 0.5, 0.75, 1.0]:
                training_model_status = TrainingModelStatus(
                    AccountId=AccountId,
                    ApplicationId=ApplicationId,
                    ModelId=ModelId,
                    current_step=current_step,
                    current_step_percentage=current_step_percentage,
                    total_no_of_steps=total_no_of_steps,
                )
                await app.to_training_model_status(training_model_status)

                await asyncio.sleep(substep_interval)

        # Fixed metric values standing in for the results of a real training.
        recall = 0.944
        precision = 0.983
        model_metrics = ModelMetrics(
            AccountId=AccountId,
            ApplicationId=ApplicationId,
            ModelId=ModelId,
            model_type=model_type,
            # ModelMetrics.timestamp is documented as UTC time, so local time
            # must not be used here.
            timestamp=datetime.utcnow(),
            auc=0.951,
            recall=recall,
            precission=precision,
            accuracy=0.992,
            # F-1 is the harmonic mean of precision and recall; rounded to
            # three decimals as a float instead of relying on string coercion.
            f1=round(2 * recall * precision / (recall + precision), 3),
        )
        await app.to_model_metrics(model_metrics)

        print(f"on_training_model_start({msg}) finished.")

In [34]:
app = FastKafka(kafka_brokers=kafka_brokers)

add_process_start_training(app)
total_no_of_steps = 10
add_process_training_model_start(
    app, total_no_of_steps=total_no_of_steps, substep_interval=0.01
)

tester = Tester(app)

async with tester:
    training_model_start = TrainingModelStart(
        AccountId=12345, no_of_records=1_000, model_type="churn"
    )

    heading(f"tester.to_infobip_training_model_start({training_model_start})")

    await tester.to_infobip_training_model_start(training_model_start)

    heading(f"tester.awaited_mocks.on_infobip_training_model_status.assert_called()")

    await tester.awaited_mocks.on_infobip_training_model_status.assert_called(
        timeout=10
    )

    heading(f"tester.awaited_mocks.on_infobip_training_model_status.assert_called()")

    await tester.awaited_mocks.on_infobip_model_metrics.assert_called(timeout=10)

    heading("check final training_model_start")

    training_model_status = (
        tester.awaited_mocks.on_infobip_training_model_status._o.await_args_list
    )
    final_training_model_status = training_model_status[-1]

    assert len(training_model_status) == 5 * total_no_of_steps, training_model_status
    expected = unittest.mock.call(
        TrainingModelStatus(
            AccountId=12345,
            current_step=total_no_of_steps - 1,
            current_step_percentage=1.0,
            total_no_of_steps=total_no_of_steps,
        )
    )
    assert final_training_model_status == expected, training_model_status

print("ok")

[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._start() called
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._patch_consumers_and_producers(): Patching consumers and producers!
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker starting
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer 

to_training_model_status(AccountId=12345 ApplicationId=None ModelId=None current_step=9 current_step_percentage=0.0 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ApplicationId=None ModelId=None current_step=9 current_step_percentage=0.2 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ApplicationId=None ModelId=None current_step=9 current_step_percentage=0.5 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ApplicationId=None ModelId=None current_step=9 current_step_percentage=0.75 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ApplicationId=None ModelId=None current_step=9 current_step_percentage=1.0 total_no_of_steps=10)
to_model_metrics(AccountId=12345 ApplicationId=None ModelId=None timestamp=datetime.datetime(2023, 4, 30, 22, 49, 22, 531668) model_type=<ModelType.churn: 'churn'> auc=0.951 f1=0.963 precission=0.983 recall=0.944 accuracy=0.992)
on_training_model_start(AccountId=12345 ApplicationId=None ModelId=None model_ty

In [35]:
# | export


# Prediction score for one person of one account. Produced to the
# "<username>_prediction" topic by `add_predictions`.
class Prediction(BaseModel):
    # Identification of the account and, optionally, the application and model.
    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ApplicationId: Optional[str] = Field(
        default=None,
        example="TestApplicationId",
        description="Id of the application in case there is more than one for the AccountId",
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )

    PersonId: NonNegativeInt = Field(
        ..., example=12345678, description="ID of a person"
    )
    prediction_time: datetime = Field(
        ...,
        example="2021-03-28T00:34:08",
        description="UTC time of prediction",
    )
    model_type: ModelType = Field(
        ...,
        example="churn",
        description="Name of the model used (churn, propensity to buy)",
    )
    # Validated to lie in the closed interval [0.0, 1.0].
    score: float = Field(
        ...,
        example=0.4321,
        description="Prediction score (e.g. the probability of churn in the next 28 days)",
        ge=0.0,
        le=1.0,
    )

    class Config:
        # Serialize datetimes with second precision.
        json_encoders = {
            datetime: json_datetime_sec_encoder,
        }

In [36]:
# JSON round trip of Prediction; the prediction time is serialized with
# second precision by the model's custom datetime encoder.
prediction = Prediction(
    AccountId=12345,
    PersonId=123456789,
    prediction_time="2021-03-28T00:34:08",
    model_type="churn",
    score=0.873,
)

expected = '{"AccountId": 12345, "ApplicationId": null, "ModelId": null, "PersonId": 123456789, "prediction_time": "2021-03-28T00:34:08", "model_type": "churn", "score": 0.873}'
actual = prediction.json()

# On failure the assertion message shows the actual JSON.
assert actual == expected, actual

parsed = Prediction.parse_raw(actual)
assert parsed == prediction

In [37]:
# | export


def add_predictions(
    app: FastKafka,
    *,
    username: str = "infobip",
) -> None:
    """Register the prediction producer and the model-metrics consumer on the app

    A producer for the ``{username}_prediction`` topic is registered and also
    attached to the application as ``app.to_prediction``. A consumer for the
    ``{username}_model_metrics`` topic is registered which, for every message
    received, publishes one ``Prediction`` per person id of the account.

    Note:
        Scores are drawn from a random generator that is re-created with the
        fixed seed 42 for every consumed message, so they are placeholder
        values and the same sequence is produced for each message.

    Args:
        app: FastKafka application to register the handlers on
        username: prefix of the topic names used by the handlers
    """
    # Function-scope import: the module only imports ``datetime`` and
    # ``timedelta`` from the ``datetime`` package at the top of the file.
    from datetime import timezone

    @app.produces(topic=f"{username}_prediction")  # type: ignore
    async def to_prediction(
        prediction: Prediction,
    ) -> Prediction:
        return prediction

    app.to_prediction = to_prediction

    @app.consumes(topic=f"{username}_model_metrics")  # type: ignore
    async def on_model_metrics(msg: ModelMetrics) -> None:
        print(f"on_model_metrics({msg}) starting...")

        AccountId = msg.AccountId
        ApplicationId = msg.ApplicationId
        ModelId = msg.ModelId
        model_type = msg.model_type
        # Prediction.prediction_time is documented as UTC, so take the current
        # time in UTC instead of the server's local time. The tzinfo is dropped
        # to keep the value naive, as it was before this fix.
        prediction_time = datetime.now(timezone.utc).replace(tzinfo=None)

        person_ids = get_all_person_ids_for_account_ids([AccountId])["PersonId"]

        # Fixed seed: see the note in the docstring.
        rng = np.random.default_rng(42)

        # All predictions triggered by one message share the same timestamp.
        for PersonId in person_ids:
            prediction = Prediction(
                AccountId=AccountId,
                ApplicationId=ApplicationId,
                ModelId=ModelId,
                model_type=model_type,
                prediction_time=prediction_time,
                PersonId=PersonId,
                score=rng.uniform(),
            )
            await to_prediction(prediction)

        print(f"on_model_metrics({msg}) finished.")

In [38]:
# Test cell for add_predictions: publish one ModelMetrics message and wait
# until a message shows up on the prediction topic.
kafka_brokers = {
    "localhost": {
        "url": "localhost",
        "description": "localhost kafka broker",
        "port": "9092",
    }
}

app = FastKafka(kafka_brokers=kafka_brokers)

# Registers the handlers with the default "infobip" topic prefix, which is
# why the tester attributes below carry the "infobip_" prefix.
add_predictions(app)

tester = Tester(app)

total_no_of_records = 5000

# monkeypatch_clickhouse is defined earlier in this notebook.
# NOTE(review): presumably it stubs the ClickHouse lookups for the given
# account so that no real database is needed - confirm against its definition.
with monkeypatch_clickhouse(
    account_id=12345,
    total=total_no_of_records,
    step=1500,
):
    async with tester:
        model_metrics = ModelMetrics(
            AccountId=12345,
            timestamp="2021-03-28T00:34:08",
            model_type="churn",
            auc=0.95,
            recall=0.94,
            precission=0.98,
            accuracy=0.99,
            # F1 computed from the recall and precision values given above.
            f1=2 * 0.94 * 0.98 / (0.94 + 0.98),
        )

        heading(f"tester.to_infobip_model_metrics({model_metrics})")

        # Publish to the topic consumed by on_model_metrics.
        await tester.to_infobip_model_metrics(model_metrics)

        heading("tester.awaited_mocks.on_infobip_prediction.assert_called()")

        # Wait for the tester-side mock on the prediction topic to be called,
        # with a timeout of 10.
        await tester.awaited_mocks.on_infobip_prediction.assert_called(timeout=10)

print("ok")

[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._start() called
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._patch_consumers_and_producers(): Patching consumers and producers!
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker starting
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() starting...
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer created using the following parameters: {'bootstrap_servers': 'localhost:9092', 'auto_offset_reset': 'earliest', '

In [39]:
# | export


def _construct_kafka_brokers() -> Dict[str, Dict[str, Any]]:
    """Build the broker descriptions passed to FastKafka as ``kafka_brokers``

    The "staging" and "production" entries are fixed. A "dev" entry is added
    when the host configured in ``aio_kafka_config["bootstrap_servers"]``
    differs from both of them.

    The bootstrap value may be ``host:port``, a bare ``host`` (the port then
    defaults to "9092") or a comma-separated list of servers, in which case
    only the first server is used for the "dev" entry.

    Returns:
        A dictionary mapping broker names to their descriptions
    """
    # Only the first server of a comma-separated list describes the dev broker.
    first_server = aio_kafka_config["bootstrap_servers"].split(",")[0].strip()

    # Split on the last colon only; a bare host name has no separator and
    # falls back to the default Kafka port instead of failing on unpacking.
    url, separator, port = first_server.rpartition(":")
    if not separator:
        url, port = first_server, "9092"

    kafka_brokers: Dict[str, Dict[str, Any]] = {
        "staging": {
            "url": "pkc-1wvvj.westeurope.azure.confluent.cloud",
            "description": "Staging Kafka broker",
            "port": 9092,
            "protocol": "kafka-secure",
            "security": {"type": "plain"},
        },
        "production": {
            "url": "pkc-1wvvj.westeurope.azure.confluent.cloud",
            "description": "Production Kafka broker",
            "port": 9092,
            "protocol": "kafka-secure",
            "security": {"type": "plain"},
        },
    }

    known_urls = (
        kafka_brokers["staging"]["url"],
        kafka_brokers["production"]["url"],
    )
    if url not in known_urls:
        # The port is kept as a string, exactly as parsed from the config.
        kafka_brokers["dev"] = {
            "url": url,
            "description": "Development Kafka broker",
            "port": port,
        }

    return kafka_brokers

In [40]:
# Display the broker descriptions built from the current aio_kafka_config.
_construct_kafka_brokers()

{'staging': {'url': 'pkc-1wvvj.westeurope.azure.confluent.cloud',
  'description': 'Staging Kafka broker',
  'port': 9092,
  'protocol': 'kafka-secure',
  'security': {'type': 'plain'}},
 'production': {'url': 'pkc-1wvvj.westeurope.azure.confluent.cloud',
  'description': 'Production Kafka broker',
  'port': 9092,
  'protocol': 'kafka-secure',
  'security': {'type': 'plain'}},
 'dev': {'url': 'davor-redpanda',
  'description': 'Development Kafka broker',
  'port': '9092'}}

In [41]:
# | export


def create_fastkafka_application(
    start_process_for_username: Optional[str] = "infobip",
) -> FastKafka:
    """Assemble the FastKafka application exposing the airt service kafka api

    Args:
        start_process_for_username: described as the prefix for topics used.
            NOTE(review): the value is not referenced anywhere in this
            function; every handler is registered with its own defaults.

    Returns:
        A FastKafka application with the start-training, training-model-start
        and prediction handlers registered
    """
    # Every client setting except the bootstrap servers is forwarded to
    # FastKafka as a keyword argument; the brokers are described separately.
    excluded = {"bootstrap_servers"}
    client_settings = {
        name: aio_kafka_config[name]
        for name in aio_kafka_config.keys()
        if name not in excluded
    }

    service = FastKafka(
        title="airt service kafka api",
        description="kafka api for airt service",
        kafka_brokers=_construct_kafka_brokers(),
        version=airt_service.__version__,
        contact={"name": "airt.ai", "url": "https://airt.ai", "email": "info@airt.ai"},
        #         auto_offset_reset="earliest",
        **client_settings,
    )

    # Registration order is kept: start training, training model start,
    # then predictions.
    for register in (
        add_process_start_training,
        add_process_training_model_start,
        add_predictions,
    ):
        register(service)

    return service

In [42]:
# # | notest
# # | eval: false

# End-to-end test cell: publish a start-training request to the fully
# assembled application and wait until a message shows up on the prediction
# topic.
app = create_fastkafka_application()

tester = Tester(app)

total_no_of_records = 5000

# monkeypatch_clickhouse is defined earlier in this notebook.
# NOTE(review): presumably it stubs the ClickHouse lookups for the given
# account so that no real database is needed - confirm against its definition.
with monkeypatch_clickhouse(
    account_id=12345,
    total=total_no_of_records,
    step=1500,
):
    async with tester:
        # The request announces 123 more records than the total passed to
        # monkeypatch_clickhouse above.
        model_training_request = ModelTrainingRequest(
            AccountId=12345,
            OccurredTime="2021-03-28T00:34:08",
            model_type="churn",
            total_no_of_records=total_no_of_records + 123,
        )
        heading(f"tester.to_infobip_start_training({model_training_request})")

        await tester.to_infobip_start_training(model_training_request)

        heading("tester.awaited_mocks.on_infobip_prediction.assert_called()")

        # Much larger timeout than in the add_predictions-only cell (500 vs 10).
        # NOTE(review): presumably because the whole training pipeline has to
        # run before a prediction is produced - confirm.
        await tester.awaited_mocks.on_infobip_prediction.assert_called(timeout=500)

print("ok")

[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._start() called
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._patch_consumers_and_producers(): Patching consumers and producers!
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker starting
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer 

[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer started.
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched subscribe() called
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer.subscribe(), subscribing to: ['infobip_prediction']
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer subscribed.

****************************************************************************************************************************************************************
***                                                                                                                                                          ***
***     tester.to_infobip_start_training(AccountId=12345 ApplicationId=None ModelId=None model_type=<ModelType.churn: 'churn'> total_no_of_records=5123)     ***
***                                                                                                                   

[INFO] fastkafka._components.aiokafka_consumer_loop: _aiokafka_consumer_loop(): Consumer loop shutting down, waiting for send_stream to drain...
[INFO] fastkafka._components.aiokafka_consumer_loop: _aiokafka_consumer_loop(): Consumer loop shutting down, waiting for send_stream to drain...
[INFO] fastkafka._components.aiokafka_consumer_loop: _aiokafka_consumer_loop(): Consumer loop shutting down, waiting for send_stream to drain...
[INFO] fastkafka._components.aiokafka_consumer_loop: _aiokafka_consumer_loop(): Consumer loop shutting down, waiting for send_stream to drain...
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched stop() called
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer stopped.
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() finished.
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched stop() called
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consume