# Training Status Process
> Process to handle training data stream

In [1]:
# | default_exp training_status_process

In [2]:
# | export

import asyncio
import random
import traceback
from contextlib import contextmanager
from datetime import datetime, timedelta
from os import environ
from time import sleep
from typing import *

import numpy as np
import pandas as pd
from airt.logger import get_logger
from airt.patching import patch
from asyncer import asyncify, create_task_group
from fastapi import FastAPI
from fastcore.meta import delegates
from fastkafka import FastKafka
from sqlalchemy import create_engine as sqlalchemy_create_engine
from sqlalchemy.engine import Engine
from sqlalchemy.exc import NoResultFound
from sqlmodel import Session, func, select

import airt_service
from airt_service.data.clickhouse import get_count_for_account_ids
from airt_service.db.models import (
    TrainingStreamStatus,
    User,
    create_connection_string,
    get_db_params_from_env_vars,
    get_engine,
    get_session_with_context,
)
from airt_service.users import User

23-03-30 13:20:30.430 [INFO] airt.executor.subcommand: Module loaded.


In [3]:
import json
import threading
from datetime import datetime
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, call

import pytest
import uvicorn
from _pytest.monkeypatch import MonkeyPatch
from confluent_kafka import Consumer, Producer
from fastkafka.testing import Tester

from airt_service.confluent import confluent_kafka_config, create_topics_for_user
from airt_service.db.models import create_user_for_testing
from airt_service.helpers import set_env_variable_context
from airt_service.sanitizer import sanitized_print
from airt_service.server import (
    EventData,
    ModelTrainingRequest,
    TrainingDataStatus,
    create_ws_server,
)
from airt_service.uvicorn_helpers import run_uvicorn

In [4]:
test_username = create_user_for_testing()
display(test_username)

'hlzmyvhrib'

In [5]:
# | exporti

logger = get_logger(__name__)

In [6]:
def create_test_update_table() -> Tuple[pd.DataFrame, User]:
    """Build a two-row update table owned by a freshly created test user

    Returns:
        A tuple of the update table (indexed by ``account_id``) and the user
        object whose id is stored in the table's ``user_id`` column
    """
    throwaway_username = create_user_for_testing()
    user_lookup = select(User).where(User.username == throwaway_username)

    with get_session_with_context() as session:
        user = session.exec(user_lookup).one()

    # One "upload" row without an application id and one "end" row with one
    rows = {
        "account_id": [666, 999],
        "application_id": [None, "23"],
        "model_id": ["ChurnModelForDrivers", "Whatever"],
        "total": [1000, 1000],
        "user_id": [user.id] * 2,
        "model_type": ["churn", "churn"],
        "count": [10, 670],
        "event": ["upload", "end"],
    }
    table = pd.DataFrame(rows).set_index("account_id")

    return table, user


update_table, user = create_test_update_table()
update_table

Unnamed: 0_level_0,application_id,model_id,total,user_id,model_type,count,event
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
666,,ChurnModelForDrivers,1000,76,churn,10,upload
999,23.0,Whatever,1000,76,churn,670,end


In [7]:
# | export


def update_mysql(
    update_table: pd.DataFrame,
) -> None:
    """
    Store one TrainingStreamStatus row for every row of the update table

    Args:
        update_table: Table of new events to persist. Its index is turned back
            into a regular column and each row is passed as keyword arguments to
            TrainingStreamStatus, so the column names (index name included) must
            match the fields of that model.
    """
    if len(update_table) == 0:
        # Nothing to persist; do not open a session just to commit nothing
        return

    training_events = [
        TrainingStreamStatus(**kwargs)  # type: ignore
        for kwargs in update_table.reset_index().to_dict(orient="records")
    ]

    with get_session_with_context() as session:
        # All events are added and committed in a single transaction
        session.add_all(training_events)
        session.commit()

In [8]:
# Test: rows written by update_mysql can be read back unchanged.
update_table, user = create_test_update_table()

update_mysql(update_table=update_table)

# Read back every event stored for the freshly created user, newest first
with get_session_with_context() as session:
    most_recent_events = session.exec(
        select(TrainingStreamStatus)
        .where(TrainingStreamStatus.user == user)
        .order_by(TrainingStreamStatus.id.desc())
    ).all()

display(most_recent_events)

# Rows and columns are sorted on both sides so the comparison ignores ordering
expected = update_table.sort_index().reindex(sorted(update_table.columns), axis=1)

# id, uuid and created are not part of the update table, so they are dropped
# before comparing
actual = (
    pd.DataFrame([e.dict() for e in most_recent_events])
    .set_index("account_id")
    .drop(columns=["id", "uuid", "created"])
    .sort_index()
    .reindex(sorted(update_table.columns), axis=1)
)
pd.testing.assert_frame_equal(actual, expected)
# The missing application id must come back as None
np.testing.assert_array_equal(actual["application_id"], (None, "23"))

[TrainingStreamStatus(event=<TrainingEvent.end: 'end'>, account_id=999, model_id='Whatever', count=670, total=1000, user_id=77, id=212, uuid=UUID('f9f25b4c-5c10-4c4a-b96e-65e12936c9ee'), application_id='23', model_type='churn', created=datetime.datetime(2023, 3, 30, 13, 20, 32)),
 TrainingStreamStatus(event=<TrainingEvent.upload: 'upload'>, account_id=666, model_id='ChurnModelForDrivers', count=10, total=1000, user_id=77, id=211, uuid=UUID('1b4f1927-6881-4dc3-9aaa-ac4e621db193'), application_id=None, model_type='churn', created=datetime.datetime(2023, 3, 30, 13, 20, 32))]

In [9]:
def get_mysql_test_table() -> pd.DataFrame:
    """Build a fixture shaped like the output of get_recent_events_for_user

    Returns:
        A frame indexed by ``AccountId`` with three accounts: 666 (just
        started), 999 (last event created 60 seconds ago) and 1000 (previous
        count already equal to the total)
    """
    account_ids = (666, 999, 1000)

    def per_account(*values: Any) -> Dict[int, Any]:
        # Pair each positional value with the account id at the same position
        return dict(zip(account_ids, values))

    columns = {
        "application_id": per_account(np.nan, "23", "some app"),
        "model_id": per_account("ChurnModelForDrivers", "Whatever", "CoolModel"),
        "event": per_account("start", "upload", "upload"),
        "id": per_account(33, 66, 1000),
        "uuid": per_account(
            "b465060fa1da4af8b9d597ec3c8f8e07",
            "9999990fa1da4af8b9d597ec3c999999",
            "0" * 16,
        ),
        "prev_count": per_account(0, 670, 1_000_000),
        "total": per_account(1000, 1000, 1_000_000),
        "created": per_account(
            datetime.utcnow() - timedelta(seconds=1),
            datetime.utcnow() - timedelta(seconds=60),
            datetime.utcnow() - timedelta(seconds=1),
        ),
        "user_id": per_account(18, 18, 18),
        "model_type": per_account("churn", "churn", "churn"),
    }

    # The account ids become the index; only the index name has to be set
    return pd.DataFrame(columns).rename_axis("AccountId")


get_mysql_test_table()

Unnamed: 0_level_0,application_id,model_id,event,id,uuid,prev_count,total,created,user_id,model_type
AccountId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
666,,ChurnModelForDrivers,start,33,b465060fa1da4af8b9d597ec3c8f8e07,0,1000,2023-03-30 13:20:30.654478,18,churn
999,23,Whatever,upload,66,9999990fa1da4af8b9d597ec3c999999,670,1000,2023-03-30 13:19:31.654480,18,churn
1000,some app,CoolModel,upload,1000,0000000000000000,1000000,1000000,2023-03-30 13:20:30.654481,18,churn


In [10]:
def get_clickhouse_test_table() -> pd.DataFrame:
    """Build a fixture shaped like the per-account counts read from ClickHouse

    Returns:
        A frame indexed by ``AccountId`` with the current row count per account
        and one shared ``curr_check_on`` timestamp
    """
    account_ids = [666, 999, 1000]
    # A single timestamp is taken once and reused for every row
    checked_at = datetime.utcnow()

    return pd.DataFrame(
        {
            "curr_count": [10, 670, 1_000_000],
            "curr_check_on": [checked_at] * len(account_ids),
        },
        index=pd.Index(account_ids, name="AccountId"),
    )


get_clickhouse_test_table()

Unnamed: 0_level_0,curr_count,curr_check_on
AccountId,Unnamed: 1_level_1,Unnamed: 2_level_1
666,10,2023-03-30 13:20:31.666331
999,670,2023-03-30 13:20:31.666331
1000,1000000,2023-03-30 13:20:31.666331


In [11]:
# | export


@contextmanager
def create_sqlalchemy_engine(
    url: str, **kwargs: Any
) -> Generator[Engine, None, None]:
    """Create a SQLAlchemy engine and dispose of it when the context exits

    Args:
        url: Database URL passed to sqlalchemy's create_engine
        **kwargs: Extra keyword arguments forwarded unchanged to create_engine

    Yields:
        The created engine; it is disposed even if the body of the ``with``
        block raises
    """
    sqlalchemy_engine = sqlalchemy_create_engine(url, **kwargs)
    try:
        yield sqlalchemy_engine
    finally:
        sqlalchemy_engine.dispose()


def get_recent_events_for_user(user: User) -> pd.DataFrame:
    """
    Get the latest still-open event of every training stream of a user

    The whole ``trainingstreamstatus`` table is loaded and then filtered in
    pandas. Streams are identified by (account_id, application_id, model_id).

    Args:
        user: user object whose events are looked up

    Returns:
        A dataframe indexed by ``AccountId`` (sorted ascending) holding, per
        stream, the entry with the highest id; streams whose latest event is
        ``end`` are excluded and ``count`` is renamed to ``prev_count``
    """
    db_params = get_db_params_from_env_vars()
    conn_str = create_connection_string(**db_params)  # type: ignore

    with create_sqlalchemy_engine(conn_str) as engine:
        all_events = pd.read_sql_table(table_name="trainingstreamstatus", con=engine)

    # Keep only this user's rows, newest (highest id) first
    belongs_to_user = all_events["user_id"] == user.id
    newest_first = all_events.loc[belongs_to_user].sort_values("id", ascending=False)

    # dropna=False keeps streams whose application_id (or model_id) is missing
    stream_keys = ["account_id", "application_id", "model_id"]
    latest_per_stream = newest_first.groupby(
        by=stream_keys, as_index=False, dropna=False
    ).first()

    latest_per_stream = latest_per_stream.rename(
        columns={"count": "prev_count", "account_id": "AccountId"}
    ).set_index("AccountId")

    # Streams that already ended need no further tracking
    still_open = latest_per_stream["event"] != "end"
    return latest_per_stream.loc[still_open].sort_values("AccountId", ascending=True)

In [12]:
# Test: get_recent_events_for_user returns only the latest non-"end" events.
# NOTE(review): `end_count` and the `session` opened below are not referenced
# anywhere else in this cell.
end_count = 1_000_000

with get_session_with_context() as session:
    update_table, user = create_test_update_table()
    display(update_table)
    # A freshly created user has no events yet
    recent_event_for_user = get_recent_events_for_user(user=user)
    assert recent_event_for_user.empty, recent_event_for_user

    update_mysql(update_table=update_table)

    # Of the two stored rows only account 666 ("upload") remains; account 999
    # is filtered out because its event is "end"
    actual = get_recent_events_for_user(user=user)
    display(actual)
    assert len(actual) == 1
    assert (actual["event"] == "upload").all()
    assert (actual["user_id"] == user.id).all()
    assert (actual.index == 666).all()

Unnamed: 0_level_0,application_id,model_id,total,user_id,model_type,count,event
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
666,,ChurnModelForDrivers,1000,78,churn,10,upload
999,23.0,Whatever,1000,78,churn,670,end


Unnamed: 0_level_0,application_id,model_id,event,id,uuid,prev_count,total,created,user_id,model_type
AccountId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
666,,ChurnModelForDrivers,upload,213,d1001f2a41d24be1a38dfabd0c261824,10,1000,2023-03-30 13:20:32,78,churn


In [13]:
# | export


def get_count_from_training_data_ch_table(
    account_ids: List[Union[int, str]]
) -> pd.DataFrame:
    """
    Count the rows stored in the clickhouse table for the given account ids

    The connection settings are read from the ``KAFKA_CH_*`` environment
    variables; a missing variable raises ``KeyError``.

    Args:
        account_ids: List of account_ids to get count

    Returns:
        The result of get_count_for_account_ids for the given account ids
    """
    connection_settings = dict(
        username=environ["KAFKA_CH_USERNAME"],
        password=environ["KAFKA_CH_PASSWORD"],
        host=environ["KAFKA_CH_HOST"],
        port=int(environ["KAFKA_CH_PORT"]),
        database=environ["KAFKA_CH_DATABASE"],
        table=environ["KAFKA_CH_TABLE"],
        protocol=environ["KAFKA_CH_PROTOCOL"],
    )
    return get_count_for_account_ids(account_ids=account_ids, **connection_settings)

In [14]:
@contextmanager
def patch_get_count_from_training_data_ch_table():
    """Temporarily replace the clickhouse count lookup with a fixed fake

    While the context is active, ``__main__.get_count_from_training_data_ch_table``
    returns a frame indexed by ``AccountId`` that reports a count of 999 for
    every requested account id.
    """

    def _fake_counts(account_ids):
        n_accounts = len(account_ids)
        fake = pd.DataFrame(
            {
                "curr_count": [999] * n_accounts,
                "AccountId": account_ids,
                "curr_check_on": [datetime.utcnow()] * n_accounts,
            }
        )
        return fake.set_index("AccountId")

    with MonkeyPatch.context() as monkeypatch:
        monkeypatch.setattr(
            "__main__.get_count_from_training_data_ch_table", _fake_counts
        )
        yield


# Test: inside the patch the count lookup returns the fake value 999
with patch_get_count_from_training_data_ch_table():
    actual = get_count_from_training_data_ch_table(account_ids=[500])
    display(actual)
    assert actual.iloc[0]["curr_count"] == 999, actual

Unnamed: 0_level_0,curr_count,curr_check_on
AccountId,Unnamed: 1_level_1,Unnamed: 2_level_1
500,999,2023-03-30 13:20:32.015994


In [15]:
# | export


def get_user(username: str) -> User:
    """Look up the user with the given username

    Args:
        username: Username as a string

    Returns:
        The user object; the lookup uses ``.one()``, so exactly one matching
        row is expected
    """
    by_username = select(User).where(User.username == username)

    with get_session_with_context() as session:
        found: User = session.exec(by_username).one()

    return found

In [16]:
actual = get_user(username=test_username)
assert actual.username == test_username

In [17]:
# | export


def get_new_update_table(
    recent_events_df: pd.DataFrame, ch_df: pd.DataFrame, end_timedelta: int
) -> pd.DataFrame:
    """Derive the new events to record from the last events and current counts

    A stream gets an ``upload`` event when its current count is larger than the
    previously recorded one. Otherwise it gets an ``end`` event once more than
    ``end_timedelta`` seconds lie between the creation of its last event and
    the time of the current check. Streams matching neither rule are left out.

    Args:
        recent_events_df: Latest events per stream, indexed by ``AccountId``,
            with at least ``prev_count`` and ``created`` columns
        ch_df: Current counts, indexed by ``AccountId``, with ``curr_count``
            and ``curr_check_on`` columns
        end_timedelta: Idle time in seconds after which a stream is closed

    Returns:
        A dataframe indexed by ``account_id`` with the new ``count`` and
        ``event`` per stream; missing values are replaced with None
    """
    joined = recent_events_df.merge(right=ch_df, how="left", on="AccountId")

    has_new_rows = joined["curr_count"] > joined["prev_count"]
    idle_time = pd.to_datetime(joined["curr_check_on"]) - pd.to_datetime(
        joined["created"]
    )
    idle_too_long = idle_time > timedelta(seconds=end_timedelta)

    # Default every selected stream to "end", then mark the growing ones
    selected = joined[has_new_rows | idle_too_long].assign(action="end")
    grew = selected["curr_count"] > selected["prev_count"]
    selected.loc[grew, "action"] = "upload"

    # Bookkeeping columns of the previous event are not part of the new event
    obsolete_columns = [
        "event",
        "id",
        "uuid",
        "prev_count",
        "created",
        "curr_check_on",
    ]
    result = (
        selected.drop(columns=obsolete_columns)
        .rename(columns={"curr_count": "count", "action": "event"})
        .astype({"count": "int"})
        .rename_axis("account_id")
    )

    return result.replace({np.nan: None})

In [18]:
# Test: get_new_update_table on the MySQL and ClickHouse fixtures.
recent_events_df = get_mysql_test_table()
ch_df = get_clickhouse_test_table()
display(recent_events_df)
display(ch_df)

update_table = get_new_update_table(recent_events_df, ch_df, end_timedelta=30)
display(update_table)
# Account 1000 is neither growing nor idle for more than 30 seconds, so only
# 666 (count grew -> "upload") and 999 (idle for 60 seconds -> "end") remain
assert update_table.shape == (2, 7), update_table.shape
np.testing.assert_array_equal(update_table.index, (666, 999))
assert update_table.index.name == "account_id"
np.testing.assert_array_equal(update_table["event"], ("upload", "end"))
np.testing.assert_array_equal(update_table["count"], (10, 670))
# The missing application id is checked twice: once after filling it with a
# placeholder and once as the literal None produced by the function
np.testing.assert_array_equal(
    update_table["application_id"].fillna("nan"), ("nan", "23")
)
np.testing.assert_array_equal(update_table["application_id"], (None, "23"))

Unnamed: 0_level_0,application_id,model_id,event,id,uuid,prev_count,total,created,user_id,model_type
AccountId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
666,,ChurnModelForDrivers,start,33,b465060fa1da4af8b9d597ec3c8f8e07,0,1000,2023-03-30 13:20:31.060036,18,churn
999,23,Whatever,upload,66,9999990fa1da4af8b9d597ec3c999999,670,1000,2023-03-30 13:19:32.060038,18,churn
1000,some app,CoolModel,upload,1000,0000000000000000,1000000,1000000,2023-03-30 13:20:31.060039,18,churn


Unnamed: 0_level_0,curr_count,curr_check_on
AccountId,Unnamed: 1_level_1,Unnamed: 2_level_1
666,10,2023-03-30 13:20:32.062145
999,670,2023-03-30 13:20:32.062145
1000,1000000,2023-03-30 13:20:32.062145


Unnamed: 0_level_0,application_id,model_id,total,user_id,model_type,count,event
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
666,,ChurnModelForDrivers,1000,18,churn,10,upload
999,23.0,Whatever,1000,18,churn,670,end


In [19]:
# | export


async def update_kafka(update_table: pd.DataFrame, kafka_app: FastKafka) -> None:
    """Publish start-training requests and status updates for an update table

    A start-training message is scheduled for every row whose event is ``end``
    or whose count has reached its total; a status message is scheduled for
    every row. All sends run in one task group, which is awaited on exit.

    NOTE(review): a row is selected for start-training both when its count
    reaches the total and when its event is ``end``, so the same stream may
    presumably trigger start-training in two different calls - confirm that
    the consumer tolerates this.

    Args:
        update_table: New events indexed by ``account_id`` with ``event``,
            ``count``, ``total``, ``model_type`` and ``user_id`` columns
        kafka_app: Application object providing the
            ``to_infobip_training_data_status`` and ``to_infobip_start_training``
            producers
    """

    def _as_messages(
        table: pd.DataFrame, unused_columns: List[str], renames: Dict[str, str]
    ) -> List[Dict[str, Any]]:
        # One keyword-argument dict per row, with the index as a regular field
        return (
            table.drop(columns=unused_columns)
            .rename(columns=renames)
            .reset_index()
            .to_dict(orient="records")
        )

    async with create_task_group() as task_group:
        send_status = task_group.soonify(kafka_app.to_infobip_training_data_status)
        send_start_training = task_group.soonify(kafka_app.to_infobip_start_training)

        # start training when necessary
        has_ended = update_table["event"] == "end"
        is_complete = update_table["count"] >= update_table["total"]
        start_training_msgs = _as_messages(
            update_table[has_ended | is_complete],
            ["model_type", "user_id", "event", "total"],
            dict(count="no_of_records"),
        )
        for msg in start_training_msgs:
            send_start_training(**msg)  # type: ignore

        # send status
        status_msgs = _as_messages(
            update_table,
            ["model_type", "user_id", "event"],
            dict(count="no_of_records", total="total_no_of_records"),
        )
        for msg in status_msgs:
            send_status(**msg)  # type: ignore

In [20]:
# Test: update_kafka sends one status message per row and a start-training
# message only for rows that are ready.
update_table, _ = create_test_update_table()

# The Kafka application is replaced by a mock whose producers record the calls
kafka_app = MagicMock()
kafka_app.to_infobip_training_data_status = AsyncMock()
kafka_app.to_infobip_start_training = AsyncMock()

# Both rows of the update table are reported as status
expected_infobip_training_data_status = [
    call(
        account_id=666,
        application_id=None,
        model_id="ChurnModelForDrivers",
        total_no_of_records=1000,
        no_of_records=10,
    ),
    call(
        account_id=999,
        application_id="23",
        model_id="Whatever",
        total_no_of_records=1000,
        no_of_records=670,
    ),
]

# Only account 999 has an "end" event; account 666 is still uploading and
# below its total
expected_infobip_start_training = [
    call(
        account_id=999,
        application_id="23",
        model_id="Whatever",
        no_of_records=670,
    ),
]

await update_kafka(update_table, kafka_app=kafka_app)

assert kafka_app.to_infobip_training_data_status.call_count == 2
assert kafka_app.to_infobip_training_data_status.call_args_list == expected_infobip_training_data_status

assert kafka_app.to_infobip_start_training.call_count == 1
assert kafka_app.to_infobip_start_training.call_args_list == expected_infobip_start_training, kafka_app.to_infobip_start_training.call_args_list

print("ok")

ok


In [21]:
def exit_after(timeout: int):
    """Create a predicate that turns True once ``timeout`` seconds have passed

    Args:
        timeout: Number of seconds, counted from the moment this function is
            called

    Returns:
        A callable returning True when more than ``timeout`` seconds have
        elapsed since the start time, False otherwise
    """
    started_at = datetime.now()

    # Start time and timeout are bound as defaults when the predicate is built
    def _has_expired(t0: datetime = started_at, timeout: int = timeout) -> bool:
        elapsed = datetime.now() - t0
        return elapsed > timedelta(seconds=timeout)

    return _has_expired


# Test: the predicate is False right after creation and True once the
# one-second timeout has passed
should_exit_f = exit_after(1)
assert not should_exit_f()
sleep(2)
assert should_exit_f()

In [22]:
# | export


async def process_training_status(
    username: str,
    fast_kafka_api_app: FastKafka,
    *,
    should_exit_f: Optional[Callable[[], bool]] = None,
    sleep_min: int = 5,
    sleep_max: int = 20,
    end_timedelta: int = 120,
) -> None:
    """
    Loop that keeps track of training_data uploads from a user

    Each iteration loads the user's latest open events, reads the current row
    counts from clickhouse, derives the new events and, if there are any,
    publishes them to Kafka and stores them in MySQL concurrently. Any error
    raised during an iteration is logged and the loop carries on.

    Args:
        username: username of user to track training data uploads
        fast_kafka_api_app: application object used to publish the Kafka messages
        should_exit_f: optional predicate checked before every iteration; the
            loop stops once it returns True and runs forever when it is None
        sleep_min: lower bound in seconds of the random pause between iterations
        sleep_max: upper bound in seconds of the random pause between iterations
        end_timedelta: idle time in seconds after which a stream gets an
            ``end`` event
    """

    while should_exit_f is None or not should_exit_f():
        # moved here to allow for dynamic mocking up underlying functions
        async_get_user = asyncify(get_user)
        async_get_recent_events_for_user = asyncify(get_recent_events_for_user)
        async_get_count_from_training_data_ch_table = asyncify(
            get_count_from_training_data_ch_table
        )
        async_update_mysql = asyncify(update_mysql)

        try:
            user = await async_get_user(username)
            recent_events_df = await async_get_recent_events_for_user(user=user)
            if not recent_events_df.empty:
                ch_df = await async_get_count_from_training_data_ch_table(
                    account_ids=recent_events_df.index.tolist()
                )
                update_table = get_new_update_table(
                    recent_events_df=recent_events_df,
                    ch_df=ch_df,
                    end_timedelta=end_timedelta,
                )
                # Skip the Kafka and MySQL round trips when nothing changed
                if not update_table.empty:
                    async with create_task_group() as tg:
                        tg.soonify(update_kafka)(
                            update_table=update_table, kafka_app=fast_kafka_api_app
                        )
                        tg.soonify(async_update_mysql)(update_table=update_table)

        except Exception as e:
            # Deliberately broad: one failed iteration must not stop the loop.
            # Logged at error level so it is not lost among info messages.
            logger.error(
                f"Error in process_training_status - {e}, {traceback.format_exc()}"
            )

        await asyncio.sleep(random.randint(sleep_min, sleep_max))  # nosec B311

In [23]:
# Test: run the processing loop for about ten seconds against a mocked Kafka
# app and a patched clickhouse count (always 999).
username = create_user_for_testing()
kafka_app = MagicMock()
kafka_app.to_infobip_training_data_status = AsyncMock()
kafka_app.to_infobip_start_training = AsyncMock()

msg_count = 1000
account_id = 9000

# Seed the stream with a "start" event so the loop has something to track
with get_session_with_context() as session:
    user = session.exec(select(User).where(User.username == username)).one()
    test_start_event = TrainingStreamStatus(
        account_id=account_id,
        model_id="ChurnModelForDrivers",
        model_type="churn",
        event="start",
        count=0,
        total=msg_count,
        user=user,
    )
    session.add(test_start_event)
    session.commit()


# Short sleeps and a 5 second end_timedelta let both the "upload" and the
# "end" transition happen before the loop exits
with patch_get_count_from_training_data_ch_table():
    await process_training_status(
        username=username,
        fast_kafka_api_app=kafka_app,
        should_exit_f=exit_after(10),
        sleep_min=1,
        sleep_max=2,
        end_timedelta=5,
    )

with get_session_with_context() as session:
    user = session.exec(select(User).where(User.username == username)).one()

    display(f"All events for account id {account_id}")
    all_events = session.exec(
        select(TrainingStreamStatus)
        .where(TrainingStreamStatus.user == user)
        .where(TrainingStreamStatus.account_id == account_id)
        .order_by(TrainingStreamStatus.id.asc())
    ).all()
    display(all_events)

    # The stream must have been closed with the patched count
    assert all_events[-1].event == "end", all_events[-1]
    assert all_events[-1].count == 999, all_events[-1]


# One status message for the "upload" event and one for the "end" event
assert kafka_app.to_infobip_training_data_status.call_count == 2

expected = [
    call(
        account_id=9000,
        application_id=None,
        model_id="ChurnModelForDrivers",
        total_no_of_records=1000,
        no_of_records=999,
    ),
    call(
        account_id=9000,
        application_id=None,
        model_id="ChurnModelForDrivers",
        total_no_of_records=1000,
        no_of_records=999,
    ),
]

assert kafka_app.to_infobip_training_data_status.call_args_list == expected

'All events for account id 9000'

[TrainingStreamStatus(event=<TrainingEvent.start: 'start'>, account_id=9000, model_id='ChurnModelForDrivers', count=0, total=1000, user_id=80, id=215, uuid=UUID('c428eee9-2d64-469c-9668-99acc14b35d1'), application_id=None, model_type='churn', created=datetime.datetime(2023, 3, 30, 13, 20, 35)),
 TrainingStreamStatus(event=<TrainingEvent.upload: 'upload'>, account_id=9000, model_id='ChurnModelForDrivers', count=999, total=1000, user_id=80, id=216, uuid=UUID('f4e84a62-5f13-4333-b8c0-c8c70c1d0f0f'), application_id=None, model_type='churn', created=datetime.datetime(2023, 3, 30, 13, 20, 35)),
 TrainingStreamStatus(event=<TrainingEvent.end: 'end'>, account_id=9000, model_id='ChurnModelForDrivers', count=999, total=1000, user_id=80, id=217, uuid=UUID('371d3544-622c-4b5b-93f8-a002c6d2510a'), application_id=None, model_type='churn', created=datetime.datetime(2023, 3, 30, 13, 20, 42))]

In [24]:
# Integration tests

definitions = [
    "appLaunch",
    "sign_in",
    "sign_out",
    "add_to_cart",
    "purchase",
    "custom_event_1",
    "custom_event_2",
    "custom_event_3",
]


# applications = ["DriverApp", "PUBG", "COD"]
applications = ["DriverApp"]


def generate_n_rows_for_training_data(n: int, seed: int = 42) -> List[Dict[str, Any]]:
    """Generate ``n`` pseudo-random training data events for account 6000

    Args:
        n: Number of events to generate
        seed: Seed of the random number generator; the same seed and ``n``
            give the same events

    Returns:
        A list of ``n`` JSON-compatible dicts with the keys AccountId,
        ApplicationId, ModelId, DefinitionId, OccurredTimeTicks, OccurredTime
        and PersonId
    """
    rng = np.random.default_rng(seed=seed)
    #     account_id = rng.choice([4000, 5000, 500], size=n)
    account_id = 6000
    definition_id = rng.choice(definitions, size=n)
    application_id = rng.choice(applications, size=n)
    model_id = rng.choice(["ChurnModelForDrivers", None], size=n)
    # Event times in milliseconds, drawn between 2022-01-01 and 2022-11-01
    occurred_time_ticks = rng.integers(
        datetime(year=2022, month=1, day=1).timestamp() * 1000,
        datetime(year=2022, month=11, day=1).timestamp() * 1000,
        size=n,
    )
    occurred_time = pd.to_datetime(occurred_time_ticks, unit="ms").strftime(
        "%Y-%m-%dT%H:%M:%S.%f"
    )
    # Roughly ten events per person. `n // 10` is 0 for n < 10, which is not a
    # valid upper bound for `integers`, so at least one person id is allowed.
    person_id = rng.integers(max(n // 10, 1), size=n)

    df = pd.DataFrame(
        {
            "AccountId": account_id,
            "ApplicationId": application_id,
            "ModelId": model_id,
            "DefinitionId": definition_id,
            "OccurredTimeTicks": occurred_time_ticks,
            "OccurredTime": occurred_time,
            "PersonId": person_id,
        }
    )
    # Round trip through JSON so that only plain Python values are returned
    return json.loads(df.to_json(orient="records"))


generate_n_rows_for_training_data(100)[-1]

{'AccountId': 6000,
 'ApplicationId': 'DriverApp',
 'ModelId': None,
 'DefinitionId': 'sign_in',
 'OccurredTimeTicks': 1649146037462,
 'OccurredTime': '2022-04-05T08:07:17.462000',
 'PersonId': 4}

In [26]:
# test_username = "infobip"


async def test_process_training_status(tester):
    """Send training data through the tester and wait for the status message

    Args:
        tester: fastkafka Tester wrapping the application under test
    """
    with get_session_with_context() as session:
        user = session.exec(select(User).where(User.username == test_username)).one()

        msg_count = 1000
        account_id = 6000

        # Register the stream with a "start" event so the background loop
        # picks it up
        test_start_event = TrainingStreamStatus(
            account_id=account_id,
            application_id="DriverApp",
            model_id="ChurnModelForDrivers",
            model_type="churn",
            event="start",
            count=0,
            total=msg_count,
            user=user,
        )
        session.add(test_start_event)
        session.commit()

        training_data = generate_n_rows_for_training_data(msg_count, seed=999)
        for i in range(msg_count):
            await tester.to_None_training_data(EventData(**training_data[i]))

    # NOTE(review): the recorded run of this notebook failed here with
    # "AttributeError: 'TesterMocks' object has no attribute
    # 'on_None_training_data_status'". update_kafka publishes through
    # kafka_app.to_infobip_training_data_status, so the mirrored mock is
    # presumably named on_infobip_training_data_status - verify against the
    # fastkafka Tester before changing it.
    await tester.awaited_mocks.on_None_training_data_status.assert_awaited_with(
        TrainingDataStatus(
            AccountId=account_id,
            ApplicationId="DriverApp",
            ModelId="ChurnModelForDrivers",
            no_of_records=999,
            total_no_of_records=msg_count,
        ),
        timeout=5 * 60,
    )

    # Show everything that was recorded for the account during the run
    with get_session_with_context() as session:
        user = session.exec(select(User).where(User.username == test_username)).one()

        display(f"All events for account id {account_id}")
        all_events = session.exec(
            select(TrainingStreamStatus)
            .where(TrainingStreamStatus.user == user)
            .where(TrainingStreamStatus.account_id == account_id)
        )
        display([e for e in all_events])


# Integration run: start the web service with the processing loop as a
# background task and drive it through the fastkafka Tester.
display(f"{test_username=}")
create_topics_for_user(username=test_username)
with set_env_variable_context(variable="JOB_EXECUTOR", value="fastapi"):
    with MonkeyPatch.context() as monkeypatch:
        # The clickhouse lookup is replaced by a fake that always reports 999
        # rows for account 6000, whatever account ids are requested
        monkeypatch.setattr(
            "__main__.get_count_from_training_data_ch_table",
            lambda account_ids: pd.DataFrame(
                {
                    "curr_count": [999],
                    "AccountId": 6000,
                    "curr_check_on": [datetime.utcnow()],
                }
            ).set_index("AccountId"),
        )
        app, fast_kafka_api_app = create_ws_server(
            assets_path=Path("../assets"), start_process_for_username=None
        )

        # The loop is registered explicitly here with the test user
        @fast_kafka_api_app.run_in_background()
        async def startup_event():
            await process_training_status(
                username=test_username,
                fast_kafka_api_app=fast_kafka_api_app,
                end_timedelta=30,
            )

        # NOTE(review): `config` is not used anywhere else in this cell
        config = uvicorn.Config(app, host="0.0.0.0", port=6010, log_level="debug")

        async with Tester(fast_kafka_api_app) as tester:
            # Server started.
            sanitized_print("server started")
            await test_process_training_status(tester)

        sanitized_print("server stopped")
        # Server stopped.

"test_username='hlzmyvhrib'"

23-03-30 13:22:44.925 [INFO] fastkafka._application.app: run_in_background() : Adding function 'startup_event' as background task
23-03-30 13:22:44.926 [INFO] fastkafka._application.app: run_in_background() : Adding function 'startup_event' as background task


%4|1680182564.784|CONFWARN|rdkafka#producer-2| [thrd:app]: Configuration property group.id is a consumer property and will be ignored by this producer instance
%4|1680182564.784|CONFWARN|rdkafka#producer-2| [thrd:app]: Configuration property auto.offset.reset is a consumer property and will be ignored by this producer instance


23-03-30 13:22:44.929 [INFO] fastkafka._components.test_dependencies: Java is already installed.
23-03-30 13:22:44.930 [INFO] fastkafka._components.test_dependencies: Kafka is installed.
23-03-30 13:22:44.931 [INFO] fastkafka._testing.local_broker: Starting zookeeper...
23-03-30 13:22:45.681 [INFO] fastkafka._testing.local_broker: Starting kafka...
23-03-30 13:22:47.582 [INFO] fastkafka._testing.local_broker: Local Kafka broker up and running on 127.0.0.1:9092
23-03-30 13:22:49.511 [INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': '127.0.0.1:9092'}'
23-03-30 13:22:49.525 [INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': '127.0.0.1:9092'}'
23-03-30 13:22:49.532 [INFO] fastkafka._application.app: _create_producer() : created producer using the config: '{'bootstrap_servers': '127.0.0.1:9092'}'
23-03-30 13:22:49.539 [INFO] fastkafka._application.app: _create_produ

23-03-30 13:22:49.705 [INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() starting...
23-03-30 13:22:49.706 [INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer created using the following parameters: {'bootstrap_servers': '127.0.0.1:9092', 'auto_offset_reset': 'earliest', 'max_poll_records': 100}
23-03-30 13:22:49.707 [INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() starting...
23-03-30 13:22:49.707 [INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer created using the following parameters: {'bootstrap_servers': '127.0.0.1:9092', 'auto_offset_reset': 'earliest', 'max_poll_records': 100}
23-03-30 13:22:49.709 [INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop() starting...
23-03-30 13:22:49.709 [INFO] fastkafka._components.aiokafka_consumer_loop: aiokafka_consumer_loop(): Consumer created using the following parameters: {'bootstrap_server

23-03-30 13:22:50.174 [ERROR] aiokafka.consumer.group_coordinator: Group Coordinator Request failed: [Error 15] CoordinatorNotAvailableError
23-03-30 13:22:50.274 [ERROR] aiokafka.consumer.group_coordinator: Group Coordinator Request failed: [Error 15] CoordinatorNotAvailableError
23-03-30 13:22:50.275 [ERROR] aiokafka.consumer.group_coordinator: Group Coordinator Request failed: [Error 15] CoordinatorNotAvailableError
23-03-30 13:22:50.276 [ERROR] aiokafka.consumer.group_coordinator: Group Coordinator Request failed: [Error 15] CoordinatorNotAvailableError
23-03-30 13:22:50.277 [ERROR] aiokafka.consumer.group_coordinator: Group Coordinator Request failed: [Error 15] CoordinatorNotAvailableError
23-03-30 13:22:50.282 [ERROR] aiokafka.consumer.group_coordinator: Group Coordinator Request failed: [Error 15] CoordinatorNotAvailableError
23-03-30 13:22:50.382 [ERROR] aiokafka.consumer.group_coordinator: Group Coordinator Request failed: [Error 15] CoordinatorNotAvailableError
23-03-30 13:2

23-03-30 13:22:53.943 [INFO] aiokafka.consumer.group_coordinator: Setting newly assigned partitions {TopicPartition(topic='infobip_realtime_data', partition=0)} for group airt-service-kafka-group
23-03-30 13:22:53.944 [INFO] aiokafka.consumer.group_coordinator: Successfully synced group airt-service-kafka-group with generation 2
23-03-30 13:22:53.945 [INFO] aiokafka.consumer.group_coordinator: Setting newly assigned partitions {TopicPartition(topic='infobip_start_training', partition=0)} for group airt-service-kafka-group
23-03-30 13:22:53.946 [INFO] aiokafka.consumer.group_coordinator: Successfully synced group airt-service-kafka-group with generation 2
23-03-30 13:22:53.948 [INFO] aiokafka.consumer.group_coordinator: Setting newly assigned partitions {TopicPartition(topic='infobip_start_training_data', partition=0)} for group airt-service-kafka-group
23-03-30 13:22:53.948 [INFO] aiokafka.consumer.group_coordinator: Successfully synced group airt-service-kafka-group with generation 2


AttributeError: 'TesterMocks' object has no attribute 'on_None_training_data_status'

In [None]:
tester.o