In [1]:
# | default_exp kafka_server

In [2]:
# | export


import asyncio
import logging
from datetime import datetime, timedelta
from enum import Enum
from os import environ
from typing import *

import numpy as np
import pandas as pd
from airt.logger import get_logger, supress_timestamps
from fastkafka import FastKafka, KafkaEvent
from pydantic import (
    BaseModel,
    EmailStr,
    Field,
    HttpUrl,
    NonNegativeInt,
    root_validator,
    validator,
)

import airt_service
from airt_service.confluent import aio_kafka_config
from airt_service.data.clickhouse import (
    get_all_person_ids_for_account_id,
    get_count_for_account_id,
)

23-05-15 07:53:09.680 [INFO] matplotlib.font_manager: generated new fontManager


In [3]:
import contextlib
import functools
import importlib
import unittest
from inspect import signature

import asyncer
import fastkafka
import pytest
from fastkafka.testing import Tester
from pytest import MonkeyPatch

from airt_service.db.models import create_user_for_testing

In [4]:
# | exporti

# Module-level logger for the exported module.
# NOTE(review): `supress_timestamps(False)` presumably keeps timestamps in log
# records (airt.logger helper) — confirm against its documentation.
supress_timestamps(False)
logger = get_logger(__name__)

In [5]:
# Notebook-only override (this cell carries no export directive): re-create the
# logger after calling `supress_timestamps(True)`.
# NOTE(review): presumably done to keep captured cell output stable — confirm.
supress_timestamps(True)
logger = get_logger(__name__)

In [6]:
def heading(title: Optional[str] = None, width: int = 160) -> None:
    """Print a banner of asterisks, optionally with a centred title.

    The banner consists of an empty line, a full row of ``width`` asterisks and
    a blank framed row. When ``title`` is given, a framed row with the centred
    title and another blank framed row follow.

    Args:
        title: Text to show inside the banner; nothing extra is printed when it
            is ``None`` or empty.
        width: Total width of every printed row, in characters.
    """
    border = "*" * 3
    inner_width = width - 2 * len(border)
    blank_row = border + " " * inner_width + border

    print()
    print("*" * width)
    print(blank_row)
    if title:
        # Split the padding so that the row is exactly `width` wide even when
        # the amount of padding is odd (the extra space goes to the right).
        padding = max(inner_width - len(title), 0)
        left = padding // 2
        right = padding - left
        print(border + " " * left + title + " " * right + border)
        print(blank_row)

In [7]:
def get_count_for_account_id_monkey_patch(
    *,
    total: int = 10_000,
    step: int = 1000,
    curr_check_shift: Optional[timedelta] = None,
) -> Callable[
    [int, Optional[Union[int, str]], Optional[Union[int, str]]],
    Tuple[Optional[int], Optional[datetime]],
]:
    """Build a stand-in for ``get_count_for_account_id`` used in tests.

    The returned callable keeps one independent sequence per
    ``(account_id, model_id)`` pair. Each call returns the next element of that
    sequence:

    1. ``(None, None)`` on the first call,
    2. ``(count, timestamp)`` for ``count`` in ``range(0, total, step)``,
    3. ``(total, timestamp)`` on every call after that.

    ``timestamp`` is ``datetime.utcnow()`` shifted by ``curr_check_shift``.

    Args:
        total: Final count reported once the ramp-up is exhausted.
        step: Increment between consecutive counts during the ramp-up.
        curr_check_shift: Offset added to the current UTC time; defaults to
            minus three seconds when ``None``.

    Returns:
        A callable taking ``account_id`` and ``model_id``.
    """

    def _simulated_counts() -> Iterator[Tuple[Optional[int], Optional[datetime]]]:
        # Resolve the default shift lazily, once per sequence.
        shift = timedelta(seconds=-3) if curr_check_shift is None else curr_check_shift

        # First poll: nothing has been ingested yet.
        yield (None, None)

        for partial_count in range(0, total, step):
            yield (partial_count, datetime.utcnow() + shift)

        # Ingestion is complete: keep reporting the total forever.
        while True:
            yield (total, datetime.utcnow() + shift)

    sequences = {}

    def _next_simulated_count(
        account_id: int,
        model_id: Optional[Union[int, str]],
        d=sequences,
    ):
        pair = (account_id, model_id)
        try:
            sequence = d[pair]
        except KeyError:
            # First request for this pair: start a fresh sequence.
            sequence = d[pair] = _simulated_counts()

        return next(sequence)

    return _next_simulated_count

In [8]:
# Default ramp-up: first call yields (None, None), then 0, 1000, ..., 9000,
# then 10_000 forever.
f = get_count_for_account_id_monkey_patch(curr_check_shift=timedelta(seconds=10))

count, timestamp = f(12345, None)
assert count is None, count
assert timestamp is None, timestamp

df = pd.DataFrame(
    [f(12345, None) for _ in range(13)], columns=["curr_count", "timestamp"]
)
display(df["curr_count"].to_frame())

assert df["curr_count"].to_list() == [1000 * i for i in range(10)] + [10_000] * 3

Unnamed: 0,curr_count
0,0
1,1000
2,2000
3,3000
4,4000
5,5000
6,6000
7,7000
8,8000
9,9000


In [9]:
# `total` is not a multiple of `step`: the ramp stops at 4500 and then reports
# the exact total (4501).
f = get_count_for_account_id_monkey_patch(
    curr_check_shift=timedelta(seconds=10), total=4501, step=1500
)

count, timestamp = f(12345, None)
assert count is None, count
assert timestamp is None, timestamp

df = pd.DataFrame(
    [f(12345, None) for _ in range(6)], columns=["curr_count", "timestamp"]
)
display(df["curr_count"].to_frame())

assert df["curr_count"].to_list() == [1500 * i for i in range(4)] + [4501] * 2

Unnamed: 0,curr_count
0,0
1,1500
2,3000
3,4500
4,4501
5,4501


In [10]:
# Create the "infobip" test user; the captured output shows the call returning
# the username.
# NOTE(review): presumably a prerequisite for the Kafka tests below — confirm.
create_user_for_testing(username="infobip")

'infobip'

In [11]:
@contextlib.contextmanager
def monkeypatch_clickhouse(
    curr_check_shift: Optional[timedelta] = None,
    total: int = 10_000,
    step: int = 1_000,
) -> Iterator[None]:
    """Temporarily replace the ClickHouse helpers with in-memory fakes.

    While the context is active:

    * ``get_count_for_account_id`` is replaced by the callable built by
      ``get_count_for_account_id_monkey_patch`` with the given arguments;
    * ``get_all_person_ids_for_account_id`` is replaced by a function returning
      a ``pd.Series`` named ``"PersonId"`` holding seven pseudo-random integers
      drawn from a generator seeded with 42.

    The patches target the ``__main__`` module, i.e. the names as seen from the
    notebook namespace; they are undone when the context exits.

    Args:
        curr_check_shift: Forwarded to ``get_count_for_account_id_monkey_patch``.
        total: Forwarded to ``get_count_for_account_id_monkey_patch``.
        step: Forwarded to ``get_count_for_account_id_monkey_patch``.
    """
    with MonkeyPatch.context() as monkeypatch:
        monkeypatch.setattr(
            "__main__.get_count_for_account_id",
            get_count_for_account_id_monkey_patch(
                curr_check_shift=curr_check_shift,
                total=total,
                step=step,
            ),
        )

        # One generator per context, so successive calls return different ids
        # while the whole sequence stays reproducible.
        rng = np.random.default_rng(42)
        monkeypatch.setattr(
            "__main__.get_all_person_ids_for_account_id",
            lambda account_id, model_id: pd.Series(
                rng.integers(low=1_000_000, high=10_000_000, size=7), name="PersonId"
            ),
        )
        yield

In [12]:
with monkeypatch_clickhouse():
    # The first call of the fake returns (None, None); discard it.
    get_count_for_account_id(account_id=12345, model_id=None)
    xs = [get_count_for_account_id(account_id=12345, model_id=None) for _ in range(13)]

    df = pd.DataFrame(xs, columns=["curr_count", "timestamp"])
    display(df["curr_count"].to_frame())

    person_ids = get_all_person_ids_for_account_id(account_id=12345, model_id=None)
    display(person_ids.to_frame())

Unnamed: 0,curr_count
0,0
1,1000
2,2000
3,3000
4,4000
5,5000
6,6000
7,7000
8,8000
9,9000


Unnamed: 0,PersonId
0,1803258
1,7965604
2,6891143
3,4949905
4,4897137
5,8727381
6,1773510


In [13]:
# | export


def json_datetime_sec_encoder(dt: datetime) -> str:
    """Render ``dt`` as ``YYYY-MM-DDTHH:MM:SS``, dropping sub-second precision.

    Args:
        dt: The datetime to encode.

    Returns:
        The formatted timestamp string.
    """
    return f"{dt:%Y-%m-%dT%H:%M:%S}"

In [14]:
# The encoder must drop the microseconds.
dt = datetime.fromisoformat("2023-01-01T12:34:56.789012")
expected = "2023-01-01T12:34:56"
actual = json_datetime_sec_encoder(dt)
assert actual == expected

In [15]:
# | export


class LogMessage(BaseModel):
    """
    Info, error and warning messages

    ``timestamp`` defaults to the time of validation (``datetime.now()``) when
    it is not supplied. Datetimes are serialised to JSON with second precision.
    """

    level: NonNegativeInt = Field(10, example=10, description="level of the message")
    timestamp: datetime = Field(None, description="timestamp")
    message: str = Field(..., example="something went wrong", description="message")

    original_message: Optional[BaseModel] = Field(...)

    # `skip_on_failure=True`: when a field fails validation it is absent from
    # `values`; running this validator anyway would raise a bare KeyError on
    # `values["timestamp"]` instead of the expected ValidationError.
    @root_validator(skip_on_failure=True)
    def number_validator(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Fill in ``timestamp`` with the current time when it was not given."""
        if values["timestamp"] is None:
            values["timestamp"] = datetime.now()

        return values

    class Config:
        json_encoders = {
            datetime: json_datetime_sec_encoder,
        }
        validate_assignment = True

In [16]:
# Minimal payload model used as the `original_message` of a LogMessage.
class SomeMessage(BaseModel):
    a: int = 12
    b: str = "hello"


original_message = SomeMessage()

msg = LogMessage(
    level=logging.INFO,
    timestamp="2021-03-28T00:34:08",
    message="something went wrong",
    original_message=original_message,
)

# The JSON form keeps the field order of the model and embeds the payload.
actual = msg.json()
expected = '{"level": 20, "timestamp": "2021-03-28T00:34:08", "message": "something went wrong", "original_message": {"a": 12, "b": "hello"}}'

assert actual == expected

In [17]:
# Without an explicit timestamp the root validator must fill one in.
msg = LogMessage(
    message="something went wrong",
    original_message=original_message,
)
assert msg.timestamp is not None

In [18]:
# | export
def add_logging(
    app: FastKafka,
    *,
    username: str = "infobip",
) -> None:
    """Attach Kafka-backed logging helpers to ``app``.

    Registers a producer for the ``{username}_logger`` topic and sets the
    coroutine functions ``app.log``, ``app.info``, ``app.warning`` and
    ``app.error`` on the application. Each helper wraps a text message and the
    message being processed into a ``LogMessage``, produces it to the logger
    topic and also writes it to the module logger.

    Args:
        app: The FastKafka application to extend.
        username: Prefix of the logger topic name.
    """

    @app.produces(topic=f"{username}_logger")  # type: ignore
    async def to_logger(
        msg: LogMessage,
        key: Optional[Union[bytes, str]] = None,
    ) -> LogMessage:
        # NOTE: `key` is accepted for interface stability but is not attached
        # to the produced message.
        print(f"to_logger({msg})")
        return msg

    async def log(
        org_msg: BaseModel,
        msg: str,
        key: Optional[Union[bytes, str]] = None,
        *,
        level: int,
        app: FastKafka = app,
    ) -> None:
        """Produce ``msg`` about ``org_msg`` to the logger topic at ``level``."""
        # Use the caller-supplied level; it was previously hard-coded to 10,
        # which made warnings and errors indistinguishable from info messages.
        log_msg = LogMessage(message=msg, level=level, original_message=org_msg)
        await app.to_logger(log_msg)
        logger.info(f"{msg} while processing {org_msg}.")

    # NOTE(review): the levels below (10/20/30) are one step lower than the
    # stdlib `logging` constants (INFO=20, WARNING=30, ERROR=40) — confirm
    # which scale the consumers of the logger topic expect.
    async def info(
        org_msg: BaseModel,
        msg: str,
        key: Optional[Union[bytes, str]] = None,
        *,
        app: FastKafka = app,
    ) -> None:
        """Log ``msg`` about ``org_msg`` with level 10."""
        await app.log(org_msg, msg=msg, key=key, level=10)

    async def warning(
        org_msg: BaseModel,
        msg: str,
        key: Optional[Union[bytes, str]] = None,
        *,
        app: FastKafka = app,
    ) -> None:
        """Log ``msg`` about ``org_msg`` with level 20."""
        await app.log(org_msg, msg=msg, key=key, level=20)

    async def error(
        org_msg: BaseModel,
        msg: str,
        key: Optional[Union[bytes, str]] = None,
        *,
        app: FastKafka = app,
    ) -> None:
        """Log ``msg`` about ``org_msg`` with level 30."""
        await app.log(org_msg, msg=msg, key=key, level=30)

    # `to_logger` is not assigned here; it is reached as `app.to_logger`
    # (presumably registered by the `produces` decorator — confirm).
    app.log = log
    app.info = info
    app.warning = warning
    app.error = error

In [19]:
# Broker description handed to FastKafka; the Tester below runs against an
# in-memory broker (see the captured log output).
kafka_brokers = {
    "localhost": {
        "url": "localhost",
        "description": "localhost kafka broker",
        "port": "9092",
    }
}

app = FastKafka(kafka_brokers=kafka_brokers)

add_logging(app)

tester = Tester(app)


# Re-declared here so that this cell does not depend on the earlier test cell.
class SomeMessage(BaseModel):
    a: int = 12
    b: str = "hello"


original_message = SomeMessage()

msg = LogMessage(
    level=logging.INFO,
    timestamp="2021-03-28T00:34:08",
    message="something went wrong",
    original_message=original_message,
)

# Producing directly through the registered producer reaches the logger topic.
async with tester:
    heading(f"app.to_logger({msg})")
    msg = await app.to_logger(msg)

    await tester.awaited_mocks.on_infobip_logger.assert_awaited(timeout=5)


# The `info` helper builds the LogMessage itself and reaches the same topic.
async with tester:
    heading(f"app.info({original_message})")
    msg = await app.info(original_message, "something went wrong again")

    await tester.awaited_mocks.on_infobip_logger.assert_awaited(timeout=5)

print("ok")

[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._start() called
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._patch_consumers_and_producers(): Patching consumers and producers!
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker starting
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() starting...
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer created using the following parameters: {'bootstrap_servers': 'localhost:9092', 'auto_offset_reset': 'earliest', 'max_poll_records': 100}
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched start() called()
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer started.
[INFO] fastk

In [20]:
# | export


class ModelType(str, Enum):
    """Kinds of models that can be requested; members are also plain strings."""

    churn = "churn"
    propensity_to_buy = "propensity_to_buy"


class ModelTrainingRequest(BaseModel):
    """Request to start ingesting training data and then train a model."""

    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )
    model_type: ModelType = Field(
        ..., description="Model type, only 'churn' is supported right now"
    )
    total_no_of_records: NonNegativeInt = Field(
        ...,
        example=1_000_000,
        description="approximate total number of records (rows) to be ingested",
    )

In [21]:
# ModelTrainingRequest declares no `OccurredTime` field, so it is not passed.
model_training_request = ModelTrainingRequest(
    AccountId=12345,
    model_type="churn",
    total_no_of_records=1000,
)

expected = '{"AccountId": 12345, "ModelId": null, "model_type": "churn", "total_no_of_records": 1000}'
actual = model_training_request.json()

assert actual == expected, actual

# Round trip: parsing the JSON must give back an equal model.
parsed = ModelTrainingRequest.parse_raw(actual)
assert parsed == model_training_request

In [22]:
# | export


class EventData(BaseModel):
    """
    A single event recorded for a person within an account
    """

    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ApplicationId: Optional[str] = Field(
        default=None,
        example="TestApplicationId",
        description="Id of the application in case there is more than one for the AccountId",
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )

    DefinitionId: str = Field(
        ...,
        example="appLaunch",
        description="name of the event",
        min_length=1,
    )
    OccurredTime: datetime = Field(
        ...,
        example="2021-03-28T00:34:08",
        description="local time of the event",
    )
    OccurredTimeTicks: NonNegativeInt = Field(
        ...,
        example=1616891648496,
        description="local time of the event as the number of ticks",
    )
    PersonId: NonNegativeInt = Field(
        ..., example=12345678, description="ID of a person"
    )


class RealtimeData(EventData):
    """Event with the same fields as ``EventData``, declared as a distinct type."""

    pass

In [23]:
# Optional fields (ApplicationId, ModelId) are omitted and must serialise as null.
event_data = EventData(
    AccountId=12345,
    DefinitionId="BigButton",
    PersonId=123456789,
    OccurredTime="2021-03-28T00:34:08",
    OccurredTimeTicks=1616891648496,
)

expected = '{"AccountId": 12345, "ApplicationId": null, "ModelId": null, "DefinitionId": "BigButton", "OccurredTime": "2021-03-28T00:34:08", "OccurredTimeTicks": 1616891648496, "PersonId": 123456789}'
actual = event_data.json()

assert actual == expected, actual

# Round trip: parsing the JSON must give back an equal model.
parsed = EventData.parse_raw(actual)
assert parsed == event_data

In [24]:
# | export


class TrainingDataStatus(BaseModel):
    """Progress report of training-data ingestion for an account/model pair."""

    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )

    no_of_records: NonNegativeInt = Field(
        ...,
        example=12_345,
        description="number of records (rows) ingested",
    )
    total_no_of_records: NonNegativeInt = Field(
        ...,
        example=1_000_000,
        description="total number of records (rows) to be ingested",
    )

In [25]:
# ModelId is omitted and must serialise as null.
training_data_status = TrainingDataStatus(
    AccountId=12345,
    no_of_records=23,
    total_no_of_records=54,
)

expected = '{"AccountId": 12345, "ModelId": null, "no_of_records": 23, "total_no_of_records": 54}'
actual = training_data_status.json()

assert actual == expected, actual

# Round trip: parsing the JSON must give back an equal model.
parsed = TrainingDataStatus.parse_raw(actual)
assert parsed == training_data_status

In [26]:
# | export


class TrainingModelStart(BaseModel):
    """Signal that model training should start for an account/model pair."""

    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )
    model_type: ModelType = Field(
        ..., description="Model type, only 'churn' is supported right now"
    )
    no_of_records: NonNegativeInt = Field(
        ...,
        example=1_000_000,
        description="number of records (rows) in the DB used for training",
    )

In [27]:
# ModelId is omitted and must serialise as null.
training_model_start = TrainingModelStart(
    AccountId=12345,
    model_type="churn",
    no_of_records=100,
)

expected = (
    '{"AccountId": 12345, "ModelId": null, "model_type": "churn", "no_of_records": 100}'
)
actual = training_model_start.json()

assert actual == expected, actual

# Round trip: parsing the JSON must give back an equal model.
parsed = TrainingModelStart.parse_raw(actual)
assert parsed == training_model_start

In [28]:
# | export


def get_key(msg: "BaseModel", attrs: Optional[List[str]] = None) -> bytes:
    """Build a message key from selected attributes of ``msg``.

    Each attribute present on ``msg`` contributes ``name='value'``; the parts
    are joined with ``", "`` and encoded as UTF-8. Attributes that ``msg`` does
    not have are skipped, so they leave no empty entries or dangling separators
    in the key.

    Args:
        msg: Object to read the attributes from.
        attrs: Attribute names to include, in order; defaults to
            ``["AccountId", "ModelId"]``.

    Returns:
        The encoded key; empty when none of the attributes is present.
    """
    if attrs is None:
        attrs = ["AccountId", "ModelId"]

    parts = [f"{attr}='{getattr(msg, attr)}'" for attr in attrs if hasattr(msg, attr)]

    return ", ".join(parts).encode("utf-8")

In [29]:
training_model_start = TrainingModelStart(
    AccountId=12345,
    model_type="churn",
    no_of_records=100,
)

# Default attributes: AccountId and ModelId (the latter is None here).
actual = get_key(training_model_start)
assert actual == b"AccountId='12345', ModelId='None'", actual

# Explicit attribute list.
actual = get_key(training_model_start, ["model_type"])
assert actual == b"model_type='churn'", actual

In [30]:
# | export


class Tracker:
    """Track a growing record count and decide when to stop waiting for it.

    Args:
        limit: Count at which tracking is considered finished.
        timeout: Seconds without a change of the count after which tracking is
            considered finished.
        abort_after: Seconds after creation after which tracking is aborted if
            no count has been recorded at all.
    """

    def __init__(self, *, limit: int, timeout: int, abort_after: int):
        self._limit = limit
        self._timeout = timeout
        self._abort_after = abort_after
        # No count has been observed yet.
        self._count: Optional[int] = None
        self._last_updated: Optional[datetime] = None
        self._started_at: datetime = datetime.now()

    def update(self, count: int) -> bool:
        """Record ``count``; return ``True`` only if it differs from the last one."""
        if self._count != count:
            self._count = count
            self._last_updated = datetime.now()
            return True
        return False

    def finished(self) -> bool:
        """Return whether the limit was reached, the count stalled, or tracking aborted."""
        if self._count is None:
            # Nothing recorded yet: only the abort deadline can end tracking.
            return self.aborted()

        if self._count >= self._limit:
            return True

        idle_time = datetime.now() - self._last_updated  # type: ignore
        return idle_time > timedelta(seconds=self._timeout)

    def aborted(self) -> bool:
        """Return whether no count was ever recorded within ``abort_after`` seconds."""
        if self._count is not None:
            return False

        waited = datetime.now() - self._started_at
        return waited > timedelta(seconds=self._abort_after)

In [31]:
# Tracking finishes as soon as the count reaches the limit.
tracker = Tracker(limit=10, timeout=5, abort_after=10)

assert not tracker.finished()

# `update` reports True only when the count actually changes.
assert tracker.update(9)
assert not tracker.update(9)

assert not tracker.finished()

assert tracker.update(10)

assert tracker.finished()

In [32]:
# Tracking finishes when the count has not changed for `timeout` seconds,
# even though the limit was never reached.
tracker = Tracker(limit=10, timeout=1, abort_after=10)

assert not tracker.finished()

tracker.update(9)

assert not tracker.finished()

await asyncio.sleep(1.1)

assert tracker.finished()

In [33]:
# With no count ever recorded, `timeout` alone does not end tracking; only
# exceeding `abort_after` does, and it is then reported as aborted.
tracker = Tracker(limit=10, timeout=1, abort_after=2)
await asyncio.sleep(1.1)

assert not tracker.finished()
assert not tracker.aborted()

await asyncio.sleep(1.1)

assert tracker.finished()
assert tracker.aborted()

In [34]:
# | export


def add_process_start_training_data(
    app: FastKafka,
    *,
    username: str = "infobip",
    stop_on_no_change_interval: int = 60,
    abort_on_no_change_interval: int = 120,
    sleep_interval: int = 5,
) -> None:
    """Register the handler that monitors training-data ingestion on ``app``.

    The handler consumes ``{username}_start_training_data``. It polls
    ``get_count_for_account_id`` every ``sleep_interval`` seconds, produces a
    ``TrainingDataStatus`` to ``{username}_training_data_status`` whenever the
    count changes, and finally either logs an error (no data ever arrived) or
    produces a ``TrainingModelStart`` to ``{username}_training_model_start``.

    It relies on ``app.info``, ``app.warning`` and ``app.error`` being set on
    ``app`` beforehand.

    Args:
        app: The FastKafka application to extend.
        username: Prefix of all topic names.
        stop_on_no_change_interval: Seconds without a change of the count after
            which ingestion is considered complete.
        abort_on_no_change_interval: Seconds to wait for the first count before
            giving up.
        sleep_interval: Seconds to sleep between polls.
    """

    @app.produces(topic=f"{username}_training_data_status")  # type: ignore
    async def to_training_data_status(
        training_data_status: TrainingDataStatus,
    ) -> TrainingDataStatus:
        print(f"to_training_data_status({training_data_status})")
        return training_data_status

    @app.produces(topic=f"{username}_training_model_start")  # type: ignore
    async def to_training_model_start(
        training_model_start: TrainingModelStart,
    ) -> TrainingModelStart:
        print(f"to_training_model_start({training_model_start})")
        return training_model_start

    @app.consumes(topic=f"{username}_start_training_data")  # type: ignore
    async def on_start_training_data(
        msg: ModelTrainingRequest, app: FastKafka = app
    ) -> None:
        await app.info(msg, "on_start_training_data() starting...")

        account_id = msg.AccountId
        model_id = msg.ModelId
        total_no_of_records = msg.total_no_of_records

        tracker = Tracker(
            limit=total_no_of_records,
            timeout=stop_on_no_change_interval,
            abort_after=abort_on_no_change_interval,
        )

        # Last count actually received. Kept separately from the latest poll
        # result so that an empty poll after data has arrived cannot put None
        # into the TrainingModelStart message.
        last_count: Optional[int] = None

        while not tracker.finished():
            curr_count, _ = get_count_for_account_id(
                account_id=account_id,
                model_id=model_id,
            )
            if curr_count is not None:
                last_count = curr_count
                # Only report when the count has changed since the last poll.
                if tracker.update(curr_count):
                    training_data_status = TrainingDataStatus(
                        no_of_records=curr_count, **msg.dict()
                    )
                    await app.to_training_data_status(training_data_status)
            else:
                await app.warning(
                    msg,
                    "on_start_training_data(): no data yet received in the database.",
                )

            await asyncio.sleep(sleep_interval)

        if tracker.aborted():
            await app.error(msg, "on_start_training_data(): data retrieval aborted!")
        else:
            # trigger model training start
            training_model_start = TrainingModelStart(
                no_of_records=last_count, **msg.dict()
            )
            await app.to_training_model_start(training_model_start)

            await app.info(msg, "on_start_training_data(): finished")

In [35]:
kafka_brokers = {
    "localhost": {
        "url": "localhost",
        "description": "localhost kafka broker",
        "port": "9092",
    }
}

app = FastKafka(kafka_brokers=kafka_brokers)

add_logging(app)
# Short intervals keep the test fast: poll every second, stop after three
# seconds without a change.
add_process_start_training_data(
    app, stop_on_no_change_interval=3, sleep_interval=1, abort_on_no_change_interval=100
)

tester = Tester(app)

total_no_of_records = 5000

# The fake ClickHouse reports 0, 1500, 3000, 4500 and then 5000 forever.
with monkeypatch_clickhouse(
    total=total_no_of_records,
    step=1500,
):
    async with tester:
        # The requested total (5123) is never reached, so the handler has to
        # finish through the no-change timeout.
        # NOTE(review): ModelTrainingRequest declares no `OccurredTime` field;
        # the argument below looks like a leftover — confirm and remove.
        model_training_request = ModelTrainingRequest(
            AccountId=12345,
            OccurredTime="2021-03-28T00:34:08",
            model_type="churn",
            total_no_of_records=total_no_of_records + 123,
        )
        heading(f"tester.to_infobip_start_training_data({model_training_request})")

        await tester.to_infobip_start_training_data(model_training_request)

        heading("tester.awaited_mocks.on_infobip_training_data_status.assert_called()")

        await tester.awaited_mocks.on_infobip_training_data_status.assert_called(
            timeout=10
        )

        heading("tester.awaited_mocks.on_infobip_training_model_start.assert_called()")
        await tester.awaited_mocks.on_infobip_training_model_start.assert_called(
            timeout=20
        )

        heading("check final training_data_status")

        training_data_statuses = (
            tester.awaited_mocks.on_infobip_training_data_status._o.await_args_list
        )
        final_training_data_status = training_data_statuses[-1]

        # One status per distinct count: 0, 1500, 3000, 4500, 5000.
        assert len(training_data_statuses) == 5, training_data_statuses
        expected = unittest.mock.call(
            TrainingDataStatus(
                AccountId=12345,
                no_of_records=total_no_of_records,
                total_no_of_records=total_no_of_records + 123,
            )
        )
        assert final_training_data_status == expected, training_data_statuses

print("ok")

[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._start() called
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._patch_consumers_and_producers(): Patching consumers and producers!
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker starting
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer 


****************************************************************************************************************************************************************
***                                                                                                                                                          ***
***                                                             check final training_data_status                                                             ***
***                                                                                                                                                          ***
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched stop() called
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer stopped.
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() finished.
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched stop() called
[INF

In [36]:
# | export


class TrainingModelStatus(BaseModel):
    """Progress report of model training for an account/model pair."""

    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )

    # The description used to read "number of records (rows) ingested", a
    # copy-paste from the ingestion status model.
    current_step: NonNegativeInt = Field(
        ...,
        example=0,
        description="current step of the model training",
    )
    current_step_percentage: float = Field(
        ...,
        example=0.21,
        description="the percentage of the current step completed",
    )
    total_no_of_steps: NonNegativeInt = Field(
        ...,
        example=20,
        description="total number of steps for training the model",
    )

In [37]:
# ModelId is omitted and must serialise as null.
training_model_status = TrainingModelStatus(
    AccountId=12345,
    current_step=1,
    current_step_percentage=0.21,
    total_no_of_steps=20,
)

expected = '{"AccountId": 12345, "ModelId": null, "current_step": 1, "current_step_percentage": 0.21, "total_no_of_steps": 20}'
actual = training_model_status.json()

assert actual == expected, actual

# Round trip: parsing the JSON must give back an equal model.
parsed = TrainingModelStatus.parse_raw(actual)
assert parsed == training_model_status

In [38]:
# | export


class ModelMetrics(BaseModel):
    """The standard metrics for classification models.

    The most important metric for unbalanced classes such as churn is AUC. Metrics such as
    accuracy are not very useful there since they are easily maximized by always outputting
    the most common class.
    """

    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )

    timestamp: datetime = Field(
        ...,
        example="2021-03-28T00:34:08",
        description="UTC time when the model was trained",
    )
    model_type: ModelType = Field(
        ...,
        example="churn",
        description="Name of the model used (churn, propensity to buy)",
    )

    auc: float = Field(
        ..., example=0.91, description="Area under ROC curve", ge=0.0, le=1.0
    )
    f1: float = Field(..., example=0.89, description="F-1 score", ge=0.0, le=1.0)
    # NOTE: "precission" is misspelled, but the field name is part of the
    # serialised message, so it cannot be renamed without breaking consumers.
    precission: float = Field(
        ..., example=0.84, description="precission", ge=0.0, le=1.0
    )
    recall: float = Field(..., example=0.82, description="recall", ge=0.0, le=1.0)
    accuracy: float = Field(..., example=0.82, description="accuracy", ge=0.0, le=1.0)

    class Config:
        # Serialise datetimes with second precision.
        json_encoders = {
            datetime: json_datetime_sec_encoder,
        }

In [39]:
model_metrics = ModelMetrics(
    AccountId=12345,
    timestamp="2021-03-28T00:34:08",
    model_type="churn",
    auc=0.95,
    recall=0.94,
    precission=0.98,
    accuracy=0.99,
    # F1 is the harmonic mean of precision and recall.
    f1=2 * 0.94 * 0.98 / (0.94 + 0.98),
)

expected = '{"AccountId": 12345, "ModelId": null, "timestamp": "2021-03-28T00:34:08", "model_type": "churn", "auc": 0.95, "f1": 0.9595833333333332, "precission": 0.98, "recall": 0.94, "accuracy": 0.99}'
actual = model_metrics.json()

assert actual == expected, actual

# Round trip: parsing the JSON must give back an equal model.
parsed = ModelMetrics.parse_raw(actual)
assert parsed == model_metrics

In [40]:
# | export


def add_process_training_model_start(
    app: FastKafka,
    *,
    username: str = "infobip",
    total_no_of_steps: int = 10,
    substep_interval: Union[int, float] = 2,
) -> None:
    """Register the handler that reports model-training progress on ``app``.

    The handler consumes ``{username}_training_model_start``. No model is
    trained here: for each of ``total_no_of_steps`` steps it produces five
    ``TrainingModelStatus`` messages to ``{username}_training_model_status``,
    sleeping ``substep_interval`` seconds after each, and then produces a
    ``ModelMetrics`` message with hard-coded metric values to
    ``{username}_model_metrics``.

    It relies on ``app.info`` being set on ``app`` beforehand.

    Args:
        app: The FastKafka application to extend.
        username: Prefix of all topic names.
        total_no_of_steps: Number of steps to report.
        substep_interval: Seconds to sleep after each status message.
    """
    # Completion fractions reported within every step.
    substep_fractions = [0.0, 0.2, 0.5, 0.75, 1.0]

    @app.produces(topic=f"{username}_training_model_status")  # type: ignore
    async def to_training_model_status(
        training_model_status: TrainingModelStatus,
    ) -> TrainingModelStatus:
        print(f"to_training_model_status({training_model_status})")
        return training_model_status

    @app.produces(topic=f"{username}_model_metrics")  # type: ignore
    async def to_model_metrics(
        model_metrics: ModelMetrics,
    ) -> ModelMetrics:
        print(f"to_model_metrics({model_metrics})")
        return model_metrics

    @app.consumes(topic=f"{username}_training_model_start")  # type: ignore
    async def on_training_model_start(
        msg: TrainingModelStart, app: FastKafka = app
    ) -> None:
        await app.info(msg, "on_training_model_start() starting")

        AccountId = msg.AccountId
        ModelId = msg.ModelId
        model_type = msg.model_type

        for current_step in range(total_no_of_steps):
            for current_step_percentage in substep_fractions:
                training_model_status = TrainingModelStatus(
                    AccountId=AccountId,
                    ModelId=ModelId,
                    current_step=current_step,
                    current_step_percentage=current_step_percentage,
                    total_no_of_steps=total_no_of_steps,
                )
                await app.to_training_model_status(training_model_status)

                await asyncio.sleep(substep_interval)

        recall = 0.944
        precission = 0.983
        model_metrics = ModelMetrics(
            AccountId=AccountId,
            ModelId=ModelId,
            model_type=model_type,
            timestamp=datetime.now(),
            auc=0.951,
            recall=recall,
            precission=precission,
            accuracy=0.992,
            # Harmonic mean of precision and recall, rounded to three decimals
            # and passed as a float rather than as a formatted string.
            f1=round(2 * recall * precission / (recall + precission), 3),
        )
        await app.to_model_metrics(model_metrics)

        await app.info(msg, "on_training_model_start() finished")

In [41]:
app = FastKafka(kafka_brokers=kafka_brokers)

add_logging(app)
add_process_start_training_data(app)
total_no_of_steps = 10
add_process_training_model_start(
    app, total_no_of_steps=total_no_of_steps, substep_interval=0.01
)

tester = Tester(app)

async with tester:
    training_model_start = TrainingModelStart(
        AccountId=12345, no_of_records=1_000, model_type="churn"
    )

    heading(f"tester.to_infobip_training_model_start({training_model_start})")

    await tester.to_infobip_training_model_start(training_model_start)

    heading(f"tester.awaited_mocks.on_infobip_training_model_status.assert_called()")

    await tester.awaited_mocks.on_infobip_training_model_status.assert_called(
        timeout=10
    )

    heading(f"tester.awaited_mocks.on_infobip_training_model_status.assert_called()")

    await tester.awaited_mocks.on_infobip_model_metrics.assert_called(timeout=10)

    heading("check final training_model_start")

    training_model_status = (
        tester.awaited_mocks.on_infobip_training_model_status._o.await_args_list
    )
    final_training_model_status = training_model_status[-1]

    assert len(training_model_status) == 5 * total_no_of_steps, training_model_status
    expected = unittest.mock.call(
        TrainingModelStatus(
            AccountId=12345,
            current_step=total_no_of_steps - 1,
            current_step_percentage=1.0,
            total_no_of_steps=total_no_of_steps,
        )
    )
    assert final_training_model_status == expected, training_model_status

print("ok")

[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._start() called
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._patch_consumers_and_producers(): Patching consumers and producers!
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker starting
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer 

to_training_model_status(AccountId=12345 ModelId=None current_step=0 current_step_percentage=0.0 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=0 current_step_percentage=0.2 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=0 current_step_percentage=0.5 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=0 current_step_percentage=0.75 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=0 current_step_percentage=1.0 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=1 current_step_percentage=0.0 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=1 current_step_percentage=0.2 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=1 current_step_percentage=0.5 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelI

[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer stopped.
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() finished.
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched stop() called
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer stopped.
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() finished.
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched stop() called
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer stopped.
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() finished.
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched stop() called
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer stopped.
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() finished.
[INFO] fast

In [42]:
# | export


# Message schema for a single prediction: one score for one person of one account.
# Documented with `#` comments rather than a class docstring so that the schema
# generated from this model is left exactly as it was.
class Prediction(BaseModel):
    # Required, non-negative account identifier.
    AccountId: NonNegativeInt = Field(
        ..., example=202020, description="ID of an account"
    )
    # Optional user-supplied model identifier; serialized as null when not given.
    ModelId: Optional[str] = Field(
        default=None,
        example="ChurnModelForDrivers",
        description="User supplied ID of the model trained",
    )

    # Required, non-negative identifier of the person the score refers to.
    PersonId: NonNegativeInt = Field(
        ..., example=12345678, description="ID of a person"
    )
    # Required timestamp of the prediction; the field description declares it as UTC.
    prediction_time: datetime = Field(
        ...,
        example="2021-03-28T00:34:08",
        description="UTC time of prediction",
    )
    # Required model kind; `ModelType` is declared elsewhere in this file.
    model_type: ModelType = Field(
        ...,
        example="churn",
        description="Name of the model used (churn, propensity to buy)",
    )
    # Required score, validated to lie in the closed interval [0.0, 1.0].
    score: float = Field(
        ...,
        example=0.4321,
        description="Prediction score (e.g. the probability of churn in the next 28 days)",
        ge=0.0,
        le=1.0,
    )

    class Config:
        # Serialize datetimes with `json_datetime_sec_encoder` (declared elsewhere in
        # this file; presumably second resolution, judging by its name -- TODO confirm).
        json_encoders = {
            datetime: json_datetime_sec_encoder,
        }

In [43]:
# Round-trip check for Prediction: build a message, compare its JSON form with the
# expected literal (ModelId is omitted, so it must serialize as null), then parse the
# JSON back and verify that it equals the original object.
prediction = Prediction(
    AccountId=12345,
    PersonId=123456789,
    prediction_time="2021-03-28T00:34:08",
    model_type="churn",
    score=0.873,
)

expected = '{"AccountId": 12345, "ModelId": null, "PersonId": 123456789, "prediction_time": "2021-03-28T00:34:08", "model_type": "churn", "score": 0.873}'
actual = prediction.json()

# Show the offending value when a check fails.
assert actual == expected, actual

parsed = Prediction.parse_raw(actual)
assert parsed == prediction, parsed

In [44]:
# | export


def add_predictions(
    app: FastKafka,
    *,
    username: str = "infobip",
) -> None:
    """Register the prediction producer and the model-metrics consumer on ``app``.

    A producer for the ``{username}_prediction`` topic is registered and also exposed
    as ``app.to_prediction``. A consumer for the ``{username}_model_metrics`` topic is
    registered which, for every message received, sends one ``Prediction`` per person
    id returned by ``get_all_person_ids_for_account_id``.

    The scores are drawn from a NumPy generator seeded with 42 for every message,
    i.e. they are placeholder values; no trained model is consulted here.

    Args:
        app: FastKafka application the handlers are attached to
        username: prefix of the topic names
    """
    # Local import: the module-level import only brings in `datetime` and `timedelta`.
    from datetime import timezone

    @app.produces(topic=f"{username}_prediction")  # type: ignore
    async def to_prediction(
        prediction: Prediction,
    ) -> Prediction:
        return prediction

    app.to_prediction = to_prediction

    @app.consumes(topic=f"{username}_model_metrics")  # type: ignore
    async def on_model_metrics(msg: ModelMetrics, app: FastKafka = app) -> None:
        await app.info(msg, "on_model_metrics() starting")

        account_id = msg.AccountId
        model_id = msg.ModelId
        model_type = msg.model_type
        # `Prediction.prediction_time` is documented as UTC, so take the UTC wall clock
        # instead of the local one. The value is kept naive (tzinfo stripped) so that
        # it has the same shape as the naive datetimes used before.
        prediction_time = datetime.now(timezone.utc).replace(tzinfo=None)

        person_ids = get_all_person_ids_for_account_id(
            account_id=account_id, model_id=model_id
        )
        n_person_ids = len(person_ids)

        # Fixed seed: every message yields the same sequence of scores.
        rng = np.random.default_rng(42)

        await app.info(msg, f"Sending predictions for {n_person_ids:,d} PersonIds")
        # Only used to report the elapsed time in the log message below.
        t0 = datetime.now()
        for person_id in person_ids:
            prediction = Prediction(
                AccountId=account_id,
                ModelId=model_id,
                model_type=model_type,
                prediction_time=prediction_time,
                PersonId=person_id,
                score=rng.uniform(),
            )
            await to_prediction(prediction)
        await app.info(
            msg,
            f"Sending predictions for {n_person_ids:,d} PersonIds finished in {datetime.now()-t0}.",
        )

        await app.info(msg, "on_model_metrics() finished")

In [45]:
# Test for add_predictions(): publish one ModelMetrics message and wait until at least
# one prediction has been received on the mirrored prediction topic.
kafka_brokers = {
    "localhost": {
        "url": "localhost",
        "description": "localhost kafka broker",
        "port": "9092",
    }
}

app = FastKafka(kafka_brokers=kafka_brokers)

# Only the handlers needed for this test are registered.
add_logging(app)
add_predictions(app)

# The captured log output of this cell shows the Tester running on fastkafka's
# InMemoryBroker, so no real broker is contacted.
tester = Tester(app)

total_no_of_records = 5000

# `monkeypatch_clickhouse` is defined elsewhere in this notebook; presumably it
# replaces the ClickHouse lookups with fakes driven by `total`/`step` -- TODO confirm.
with monkeypatch_clickhouse(
    total=total_no_of_records,
    step=1500,
):
    async with tester:
        # NOTE(review): `precission` is presumably spelled the way the field is
        # declared on ModelMetrics (not visible here) -- confirm before renaming.
        model_metrics = ModelMetrics(
            AccountId=12345,
            timestamp="2021-03-28T00:34:08",
            model_type="churn",
            auc=0.95,
            recall=0.94,
            precission=0.98,
            accuracy=0.99,
            f1=2 * 0.94 * 0.98 / (0.94 + 0.98),
        )

        heading(f"tester.to_infobip_model_metrics({model_metrics})")

        # Publishing the metrics triggers on_model_metrics() in the app under test.
        await tester.to_infobip_model_metrics(model_metrics)

        heading("tester.awaited_mocks.on_infobip_prediction.assert_called()")

        # Wait (up to 10) for the tester's mirror consumer of the prediction topic.
        await tester.awaited_mocks.on_infobip_prediction.assert_called(timeout=10)

print("ok")

[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._start() called
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._patch_consumers_and_producers(): Patching consumers and producers!
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker starting
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': 'localhost:9092'}'
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched start() called()
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_lo

[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker._stop() called
[INFO] fastkafka._testing.in_memory_broker: InMemoryBroker stopping
ok


In [51]:
# | export


def _construct_kafka_brokers() -> Dict[str, Dict[str, Any]]:
    """Build the broker descriptions passed to FastKafka.

    The result always contains the fixed ``staging`` and ``production`` entries. When
    the host configured in ``aio_kafka_config["bootstrap_servers"]`` is neither of
    those two, a ``dev`` entry pointing at the configured host is added as well.

    Only the first entry of a comma-separated ``bootstrap_servers`` value is used, the
    host and port are separated at the last ``:``, and the port falls back to
    ``"9092"`` when the entry has none.

    Returns:
        Mapping from broker name to its description
    """
    first_server = aio_kafka_config["bootstrap_servers"].split(",")[0].strip()
    url, separator, port = first_server.rpartition(":")
    if not separator:
        # No port given: rpartition() put the whole entry into the last element.
        url, port = first_server, "9092"

    kafka_brokers: Dict[str, Dict[str, Any]] = {
        "staging": {
            "url": "kafka.staging.airt.ai",
            "description": "Staging Kafka broker",
            "port": 9092,
            "protocol": "kafka-secure",
            "security": {"type": "scramSha256"},
        },
        "production": {
            "url": "pkc-1wvvj.westeurope.azure.confluent.cloud",
            "description": "Production Kafka broker",
            "port": 9092,
            "protocol": "kafka-secure",
            "security": {"type": "plain"},
        },
    }

    known_urls = (kafka_brokers["staging"]["url"], kafka_brokers["production"]["url"])
    if url not in known_urls:
        # The port stays a string here, exactly as it was parsed from the configuration.
        kafka_brokers["dev"] = {
            "url": url,
            "description": "Development Kafka broker",
            "port": port,
        }

    return kafka_brokers

In [52]:
# Display the broker map built from the configuration of the current environment.
_construct_kafka_brokers()

{'staging': {'url': 'kafka.staging.airt.ai',
  'description': 'Staging Kafka broker',
  'port': 9092,
  'protocol': 'kafka-secure',
  'security': {'type': 'scramSha256'}},
 'production': {'url': 'pkc-1wvvj.westeurope.azure.confluent.cloud',
  'description': 'Production Kafka broker',
  'port': 9092,
  'protocol': 'kafka-secure',
  'security': {'type': 'plain'}},
 'dev': {'url': 'davor-redpanda',
  'description': 'Development Kafka broker',
  'port': '9092'}}

In [53]:
# | export


def create_fastkafka_application(
    start_process_for_username: Optional[str] = "infobip",
) -> FastKafka:
    """Create the FastKafka application of the airt service.

    The brokers come from ``_construct_kafka_brokers()``. Every entry of
    ``aio_kafka_config`` except ``bootstrap_servers`` is forwarded to ``FastKafka`` as
    a keyword argument. Afterwards the handlers are registered with ``add_logging``,
    ``add_process_start_training_data``, ``add_process_training_model_start`` and
    ``add_predictions``.

    Args:
        start_process_for_username: intended as the prefix for topics used.
            NOTE(review): the value is currently not forwarded to any of the
            ``add_*`` helpers, so each of them uses its own default -- confirm
            whether it should be passed on.

    Returns:
        A FastKafka application
    """

    kafka_brokers = _construct_kafka_brokers()

    # The broker address is described by `kafka_brokers`, so it is not forwarded as a
    # plain client setting. Filtering in key order keeps the logged config stable.
    exclude_keys = {"bootstrap_servers"}
    kafka_config = {
        key: aio_kafka_config[key]
        for key in aio_kafka_config.keys()
        if key not in exclude_keys
    }

    logger.info(f"create_fastkafka_application(): {kafka_config=}")

    # global description
    version = airt_service.__version__
    contact = {"name": "airt.ai", "url": "https://airt.ai", "email": "info@airt.ai"}

    # The remaining `aio_kafka_config` entries are passed through unchanged.
    app = FastKafka(
        title="airt service kafka api",
        description="kafka api for airt service",
        kafka_brokers=kafka_brokers,
        version=version,
        contact=contact,
        enable_idempotence=True,
        request_timeout_ms=120_000,
        max_batch_size=120_000,
        **kafka_config,
    )

    add_logging(app)
    add_process_start_training_data(app)
    add_process_training_model_start(app)
    add_predictions(app)

    return app

In [54]:
def import_symbol(s: str) -> Any:
    """Resolve a fully qualified dotted name to the object it refers to.

    The longest importable prefix of ``s`` is imported as a module and the remaining
    components are looked up on it as (possibly nested) attributes, so both
    ``"pkg.module.function"`` and ``"pkg.module.Class.method"`` work. When the whole
    name is importable, the module itself is returned.

    Args:
        s: dotted name, e.g. ``"os.path.join"``

    Returns:
        The module, class, function or other attribute named by ``s``

    Raises:
        ValueError: if not even the first component of ``s`` can be imported
        AttributeError: if an attribute after the module part does not exist
    """
    parts = s.split(".")

    module = None
    n_module_parts = 0
    for end in range(1, len(parts) + 1):
        try:
            module = importlib.import_module(".".join(parts[:end]))
        except ImportError:
            # The first component that is not a module starts the attribute path.
            break
        n_module_parts = end

    if module is None:
        raise ValueError(f"Cannot import any module prefix of '{s}'")

    # Walk the attribute path one component at a time; getattr() does not accept dots.
    return functools.reduce(getattr, parts[n_module_parts:], module)


@contextlib.contextmanager
def print_params(fqn: str):
    """Temporarily trace every call of the function named by ``fqn``.

    While the context is active, the function is replaced by a wrapper that prints
    the call with its arguments, runs the original, prints the returned value and
    hands it back. The patch is undone when the context exits.

    Args:
        fqn: fully qualified name of the function; it must be identical to the
            function's own ``__module__`` + ``__qualname__``
    """
    target = import_symbol(fqn)
    qualified_name = f"{target.__module__}.{target.__qualname__}"
    assert fqn == qualified_name, qualified_name

    @functools.wraps(target)
    def _traced(*args, **kwargs):
        positional = [str(arg) for arg in args]
        keyword = [str(name) + "=" + str(value) for name, value in kwargs.items()]
        call = f"{qualified_name}({', '.join(positional + keyword)})"

        print(call)
        result = target(*args, **kwargs)
        print(f"{call} returned {result}")

        return result

    with MonkeyPatch.context() as patcher:
        patcher.setattr(fqn, _traced)
        yield


# Smoke test of print_params(): the call and its return value are printed below.
with print_params("fastkafka._application.app._get_kafka_config"):
    fastkafka._application.app._get_kafka_config(client_id="whatever")

fastkafka._application.app._get_kafka_config(client_id=whatever)
fastkafka._application.app._get_kafka_config(client_id=whatever) returned {'client_id': 'whatever', 'bootstrap_servers': 'localhost:9092', 'auto_offset_reset': 'earliest', 'max_poll_records': 100}


In [55]:
# End-to-end test of the full application: send a training request and wait until a
# prediction arrives. `_get_kafka_config` is traced so that the configuration handed
# to fastkafka is visible in the cell output.
with print_params("fastkafka._application.app._get_kafka_config"):
    app = create_fastkafka_application()

    tester = Tester(app)

    total_no_of_records = 5000

    print("Starting test...")
    # `monkeypatch_clickhouse` is defined elsewhere in this notebook; presumably it
    # replaces the ClickHouse lookups with fakes driven by `total`/`step` -- TODO confirm.
    with monkeypatch_clickhouse(
        total=total_no_of_records,
        step=1500,
    ):
        async with tester:
            # The request announces 123 records more than the patched total.
            model_training_request = ModelTrainingRequest(
                AccountId=12345,
                OccurredTime="2021-03-28T00:34:08",
                model_type="churn",
                total_no_of_records=total_no_of_records + 123,
            )
            heading(f"tester.to_infobip_start_training_data({model_training_request})")

            await tester.to_infobip_start_training_data(model_training_request)

            heading("tester.awaited_mocks.on_infobip_prediction.assert_called()")

            # Larger timeout than in the add_predictions() test above (500 vs 10).
            await tester.awaited_mocks.on_infobip_prediction.assert_called(timeout=500)

print("ok")

[INFO] __main__: create_fastkafka_application(): kafka_config={'auto_offset_reset': 'earliest', 'group_id': 'davor-redpanda:9092_group'}
fastkafka._application.app._get_kafka_config(enable_idempotence=True, request_timeout_ms=120000, max_batch_size=120000, auto_offset_reset=earliest, group_id=davor-redpanda:9092_group)
fastkafka._application.app._get_kafka_config(enable_idempotence=True, request_timeout_ms=120000, max_batch_size=120000, auto_offset_reset=earliest, group_id=davor-redpanda:9092_group) returned {'enable_idempotence': True, 'request_timeout_ms': 120000, 'max_batch_size': 120000, 'auto_offset_reset': 'earliest', 'group_id': 'davor-redpanda:9092_group', 'bootstrap_servers': 'localhost:9092', 'max_poll_records': 100}
fastkafka._application.app._get_kafka_config()
fastkafka._application.app._get_kafka_config() returned {'bootstrap_servers': 'localhost:9092', 'auto_offset_reset': 'earliest', 'max_poll_records': 100}
Starting test...
[INFO] fastkafka._testing.in_memory_broker: I

[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched start() called()
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer started.
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched subscribe() called
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer.subscribe(), subscribing to: ['infobip_training_model_start']
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer subscribed.
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() starting...
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer created using the following parameters: {'bootstrap_servers': 'localhost:9092', 'auto_offset_reset': 'earliest', 'max_poll_records': 100}
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched start() called()
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer started.
[INFO] 

to_training_model_status(AccountId=12345 ModelId=None current_step=4 current_step_percentage=0.2 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=4 current_step_percentage=0.5 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=4 current_step_percentage=0.75 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=4 current_step_percentage=1.0 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=5 current_step_percentage=0.0 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=5 current_step_percentage=0.2 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=5 current_step_percentage=0.5 total_no_of_steps=10)
to_training_model_status(AccountId=12345 ModelId=None current_step=5 current_step_percentage=0.75 total_no_of_steps=10)
to_training_model_status(AccountId=12345 Model

[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched stop() called
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched stop() called
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer stopped.
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() finished.
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched stop() called
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer stopped.
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() finished.
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaConsumer patched stop() called
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer stopped.
[INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() finished.
[INFO] fastkafka._testing.in_memory_broker: AIOKafkaProducer patched stop() called
[INFO] fastkafka._testing.in_m