In [None]:
# | default_exp cleanup

In [None]:
from airt.testing import activate_by_import

[INFO] airt.testing.activate_by_import: Testing environment activated.


2023-01-06 09:14:30.423110: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[INFO] numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[INFO] numexpr.utils: NumExpr defaulting to 8 threads.


In [None]:
# | export

from typing import *

import airt_service.sanitizer
from airt.logger import get_logger
from airt_service.auth import delete_apikey
from airt_service.aws.utils import get_s3_storage_bucket
from airt_service.confluent import delete_topics_for_user
from airt_service.data.datablob import delete_datablob
from airt_service.data.datasource import delete_datasource
from airt_service.db.models import APIKey, DataBlob, DataSource, Model, Prediction, User
from airt_service.model.prediction import delete_prediction
from airt_service.model.train import delete_model
from sqlmodel import Session, select

[INFO] airt.executor.subcommand: Module loaded.


In [None]:
import json
from datetime import datetime, timedelta
from os import environ

import pandas as pd
import pytest
import requests
from airt.remote_path import RemotePath
from airt_service.auth import create_apikey
from airt_service.aws.utils import upload_to_s3_with_retry
from airt_service.confluent import create_topics_for_user
from airt_service.data.csv import process_csv
from airt_service.data.datablob import FromLocalRequest, from_local_start_route
from airt_service.db.models import (
    APIKeyCreate,
    create_user_for_testing,
    get_session,
    get_session_with_context,
)
from airt_service.helpers import set_env_variable_context
from airt_service.model.train import TrainRequest, predict_model, train_model
from fastapi import BackgroundTasks
from sqlalchemy.exc import NoResultFound

[INFO] airt.data.importers: Module loaded:
[INFO] airt.data.importers:  - using pandas     : 1.5.1
[INFO] airt.data.importers:  - using dask       : 2022.10.0


In [None]:
# | exporti

# Module-level logger used by the cleanup_* functions defined in this notebook
logger = get_logger(__name__)

In [None]:
# Create the user that the tests in this notebook populate and then clean up
test_username = create_user_for_testing(subscription_type="small")
display(test_username)
with get_session_with_context() as session:
    # .one() raises unless exactly one row matches the username
    display(session.exec(select(User).where(User.username == test_username)).one())

'glmfgiaofy'

User(id=135, uuid=UUID('e40c4ceb-3c66-4ba8-bb8b-2a367921b180'), username='glmfgiaofy', first_name='unittest', last_name='user', email='glmfgiaofy@email.com', subscription_type=<SubscriptionType.small: 'small'>, super_user=False, disabled=False, created=datetime.datetime(2023, 1, 6, 9, 14, 34), phone_number=None, is_phone_number_verified=False, mfa_secret=****, is_mfa_active=False)

In [None]:
# Helper that gives a user topics, an API key, a datablob, a datasource, a model and a prediction to use in the following tests


def _populate_user(username: str) -> None:
    """
    Helper function to create topics, an apikey, a datablob, a datasource, a model and a prediction for the given user

    The datablob is filled by pulling a sample parquet dataset from S3, writing it out as a
    CSV file and uploading that file through the presigned url returned by from_local_start_route.

    Args:
        username: username of an existing user to create the objects for
    """
    with get_session_with_context() as session:
        # .one() raises unless exactly one user has this username
        user = session.exec(select(User).where(User.username == username)).one()

        create_topics_for_user(username=username)

        # API key that expires one day from now
        create_apikey(
            apikey_to_create=APIKeyCreate(expiry=datetime.utcnow() + timedelta(days=1)),
            user=user,
            session=session,
        )

        # Start a "from local" datablob; the response carries the presigned upload url and fields
        from_local_request = FromLocalRequest(
            path="tmp/test-folder/", tag="my_csv_datasource_tag"
        )
        from_local_response = from_local_start_route(
            from_local_request=from_local_request,
            user=user,
            session=session,
        )

        # Pull the sample parquet dataset into a local cache directory.
        # NOTE(review): push_on_exit=False presumably means the file.csv written below
        # is never pushed back to the source bucket - confirm against RemotePath.
        with RemotePath.from_url(
            remote_url=f"s3://test-airt-service/account_312571_events",
            pull_on_enter=True,
            push_on_exit=False,
            exist_ok=True,
            parents=False,
            access_key=environ["AWS_ACCESS_KEY_ID"],
            secret_key=environ["AWS_SECRET_ACCESS_KEY"],
        ) as test_s3_path:
            # Convert the parquet data to a single CSV file next to the pulled files
            df = pd.read_parquet(test_s3_path.as_path())
            display(df.head())
            df.to_csv(test_s3_path.as_path() / "file.csv", index=False)
            display(list(test_s3_path.as_path().glob("*")))
            #         !head -n 10 {test_s3_path.as_path()/"file.csv"}

            # Upload the CSV using the presigned url and fields of the datablob created above
            upload_to_s3_with_retry(
                test_s3_path.as_path() / "file.csv",
                from_local_response.presigned["url"],
                from_local_response.presigned["fields"],
            )

        # Look up the database id of the datablob from the uuid in the response
        datablob_id = (
            session.exec(
                select(DataBlob).where(DataBlob.uuid == from_local_response.uuid)
            )
            .one()
            .id
        )

        display(datablob_id)
        assert datablob_id > 0

        # Create the datasource row and commit it before its id is handed to process_csv
        datasource = DataSource(
            datablob_id=datablob_id,
            cloud_provider="aws",
            region="eu-west-1",
            total_steps=1,
            user=user,
        )
        session.add(datasource)
        session.commit()

        # Process the uploaded CSV for this datablob/datasource pair;
        # kwargs_json is a JSON-encoded dict of extra reader options
        process_csv(
            datablob_id=datablob_id,
            datasource_id=datasource.id,
            deduplicate_data=True,
            index_column="PersonId",
            sort_by="OccurredTime",
            blocksize="256MB",
            kwargs_json=json.dumps(
                dict(
                    usecols=[0, 1, 2, 3, 4],
                    parse_dates=["OccurredTime"],
                )
            ),
        )

    # Second session: re-read the datasource by id before using it for training
    with get_session_with_context() as session:
        datasource = session.exec(
            select(DataSource).where(DataSource.id == datasource.id)
        ).one()
        display(datasource)

        train_request = TrainRequest(
            data_uuid=datasource.uuid,
            client_column="AccountId",
            target_column="DefinitionId",
            target="load*",
            # 20 days expressed in seconds
            predict_after=timedelta(seconds=20 * 24 * 60 * 60),
        )

        # NOTE(review): `user` was loaded in the first session, which is closed by now - confirm this is intended
        model = train_model(train_request=train_request, user=user, session=session)
        display(model)
        # Call exec_cli train_model

        b = BackgroundTasks()
        # JOB_EXECUTOR is set to "fastapi" only for the duration of the predict_model call
        with set_env_variable_context(variable="JOB_EXECUTOR", value="fastapi"):
            predicted = predict_model(
                model_uuid=model.uuid, user=user, session=session, background_tasks=b
            )
        display(predicted)
        # Call exec_cli predict_model

In [None]:
# Populate the test user so that the cleanup_* tests below have objects to delete
_populate_user(test_username)

[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_training_data created
[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_realitime_data created
[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_training_data_status created
[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_training_model_status created
[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_model_metrics created
[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_prediction created
[INFO] botocore.credentials: Found credentials in environment variables.


%4|1672996474.003|CONFWARN|rdkafka#producer-1| [thrd:app]: Configuration property group.id is a consumer property and will be ignored by this producer instance
%4|1672996474.003|CONFWARN|rdkafka#producer-1| [thrd:app]: Configuration property auto.offset.reset is a consumer property and will be ignored by this producer instance


[INFO] airt_service.data.datablob: DataBlob.from_local(): FromLocalResponse(uuid=UUID('e478478c-258e-4622-818d-595bbcd2c5a4'), type='local', presigned={'url': 'https://kumaran-airt-service-eu-west-1.s3.amazonaws.com/', 'fields': {'key': '****************************************', 'x-amz-algorithm': 'AWS4-HMAC-SHA256', 'x-amz-credential': '********************/20230106/eu-west-1/s3/aws4_request', 'x-amz-date': '20230106T091434Z', 'policy': '************************************************************************************************************************************************************************************************************************************************************', 'x-amz-signature': '****************************'}})
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://test-airt-service/account_312571_events
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3test-airt-serviceaccount_31257

Unnamed: 0_level_0,AccountId,DefinitionId,OccurredTime,OccurredTimeTicks,PersonId
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,312571,loadTests2,2019-12-31 21:30:02,1577836802678,2
1,312571,loadTests3,2020-01-03 23:53:22,1578104602678,2
2,312571,loadTests1,2020-01-07 02:16:42,1578372402678,2
3,312571,loadTests2,2020-01-10 04:40:02,1578640202678,2
4,312571,loadTests3,2020-01-13 07:03:22,1578908002678,2


[Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_uhj2c4yl/_metadata'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_uhj2c4yl/_common_metadata'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_uhj2c4yl/file.csv'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_uhj2c4yl/part.3.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_uhj2c4yl/part.0.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_uhj2c4yl/part.1.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_uhj2c4yl/part.4.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_uhj2c4yl/part.2.parquet')]

[INFO] airt.remote_path: S3Path._clean_up(): removing local cache path /tmp/s3test-airt-serviceaccount_312571_events_cached_uhj2c4yl


46

[INFO] airt_service.data.csv: process_csv(datablob_id=46, datasource_id=29): processing user uploaded csv file for datablob_id=46 and uploading parquet back to S3 for datasource_id=29
[INFO] airt_service.data.csv: process_csv(datablob_id=46, datasource_id=29): step 1/4: downloading user uploaded file from bucket s3://kumaran-airt-service-eu-west-1/135/datablob/46
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/135/datablob/46
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-1135datablob46_cached_oq_me54e
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/135/datablob/46 locally in /tmp/s3kumaran-airt-service-eu-west-1135datablob46_cached_oq_me54e
[INFO] airt.remote_path: S3Path.__enter__(): pulling data from s3://kumaran-airt-service-eu-west-1/135/datablob/46 to /tmp/s3kumaran-airt-service-

DataSource(id=29, uuid=UUID('d7bbea03-52c0-4704-b348-4016c2654734'), hash='1dd8ee7a0f96a48110dec6e25891d18d', total_steps=1, completed_steps=1, folder_size=6619982, no_of_rows=498961, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-1', error=None, disabled=False, path='s3://kumaran-airt-service-eu-west-1/135/datasource/29', created=datetime.datetime(2023, 1, 6, 9, 14, 56), user_id=135, pulled_on=datetime.datetime(2023, 1, 6, 9, 15, 5), tags=[])

Model(predict_after=datetime.timedelta(days=20), id=13, timestamp_column=None, uuid=UUID('27ec9df7-07f0-4b42-85e0-94644470b3fb'), total_steps=5, path=None, client_column='AccountId', completed_steps=0, datasource_id=29, cloud_provider=<CloudProvider.aws: 'aws'>, error=None, user_id=135, target_column='DefinitionId', region='eu-west-1', target='load*', disabled=False, created=datetime.datetime(2023, 1, 6, 9, 15, 26))

[INFO] airt_service.batch_job: create_batch_job(): command='predict 14', task='csv_processing'
[INFO] airt_service.batch_job_components.base: Entering FastAPIBatchJobContext(task=csv_processing)
[INFO] airt_service.batch_job: batch_ctx=FastAPIBatchJobContext(task=csv_processing)
[INFO] airt_service.batch_job_components.fastapi: FastAPIBatchJobContext.create_job(self=FastAPIBatchJobContext(task=csv_processing), command='predict 14', environment_vars={'AWS_ACCESS_KEY_ID': '********************', 'AWS_SECRET_ACCESS_KEY': '****************************************', 'AWS_DEFAULT_REGION': 'eu-west-1', 'AZURE_SUBSCRIPTION_ID': '************************************', 'AZURE_TENANT_ID': '************************************', 'AZURE_CLIENT_ID': '************************************', 'AZURE_CLIENT_SECRET': '****************************************', 'AZURE_STORAGE_ACCOUNT_PREFIX': 'kumsairtsdev', 'AZURE_RESOURCE_GROUP': 'kumaran-airt-service-dev', 'STORAGE_BUCKET_PREFIX': 'kumaran-airt-service'

Prediction(error=None, uuid=UUID('e6ed0994-ee26-4fcc-9609-7ba789e884c9'), datasource_id=29, cloud_provider=<CloudProvider.aws: 'aws'>, disabled=False, created=datetime.datetime(2023, 1, 6, 9, 15, 26), region='eu-west-1', id=14, model_id=13, total_steps=3, path=None, completed_steps=0)

In [None]:
# | export


def cleanup_predictions(user_to_cleanup: User, session: Session):
    """Delete every prediction that belongs to a model of the given user

    Args:
        user_to_cleanup: user whose models' predictions are removed
        session: database session used for the query, the deletes and the commit
    """
    logger.info("deleting predictions")
    query = select(Prediction).join(Model).where(Model.user == user_to_cleanup)

    for row in session.exec(query).all():
        # delete_prediction is called first, then the row itself is deleted
        delete_prediction(
            prediction_uuid=row.uuid,  # type: ignore
            user=user_to_cleanup,
            session=session,
        )
        session.delete(row)

    # commit once, after the whole loop
    session.commit()

In [None]:
# Test: after cleanup_predictions no prediction remains for the test user's models
with get_session_with_context() as session:
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_predictions(user_to_cleanup, session)

    # Re-run the prediction query for this user and expect an empty result
    predictions = session.exec(
        select(Prediction).join(Model).where(Model.user == user_to_cleanup)
    ).all()
    assert len(predictions) == 0

[INFO] __main__: deleting predictions


In [None]:
# | export


def cleanup_models(user_to_cleanup: User, session: Session):
    """Delete every model owned by the given user

    Args:
        user_to_cleanup: user whose models are removed
        session: database session used for the query, the deletes and the commit
    """
    logger.info("deleting models")
    query = select(Model).where(Model.user == user_to_cleanup)

    for row in session.exec(query).all():
        # delete_model is called first, then the row itself is deleted
        delete_model(
            model_uuid=row.uuid,  # type: ignore
            user=user_to_cleanup,
            session=session,
        )
        session.delete(row)

    # commit once, after the whole loop
    session.commit()

In [None]:
# Test: after cleanup_models the test user owns no models
with get_session_with_context() as session:
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_models(user_to_cleanup, session)

    # Re-run the model query for this user and expect an empty result
    models = session.exec(select(Model).where(Model.user == user_to_cleanup)).all()
    assert len(models) == 0

[INFO] __main__: deleting models


In [None]:
# | export


def cleanup_datasources(user_to_cleanup: User, session: Session):
    """Delete every datasource owned by the given user

    Args:
        user_to_cleanup: user whose datasources are removed
        session: database session used for the query, the deletes and the commit
    """
    logger.info("deleting datasources")
    query = select(DataSource).where(DataSource.user == user_to_cleanup)

    for row in session.exec(query).all():
        # delete_datasource is called first, then the row itself is deleted
        delete_datasource(
            datasource_uuid=row.uuid,  # type: ignore
            user=user_to_cleanup,
            session=session,
        )
        session.delete(row)

    # commit once, after the whole loop
    session.commit()

In [None]:
# Test: after cleanup_datasources the test user owns no datasources
with get_session_with_context() as session:
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_datasources(user_to_cleanup, session)

    # Re-run the datasource query for this user and expect an empty result
    datasources = session.exec(
        select(DataSource).where(DataSource.user == user_to_cleanup)
    ).all()
    assert len(datasources) == 0

[INFO] __main__: deleting datasources


In [None]:
# | export


def cleanup_datablobs(user_to_cleanup: User, session: Session):
    """Delete every datablob owned by the given user

    Args:
        user_to_cleanup: user whose datablobs are removed
        session: database session used for the query, the deletes and the commit
    """
    logger.info("deleting datablobs")
    query = select(DataBlob).where(DataBlob.user == user_to_cleanup)

    for row in session.exec(query).all():
        # delete_datablob is called first, then the row itself is deleted
        delete_datablob(
            datablob_uuid=row.uuid,  # type: ignore
            user=user_to_cleanup,
            session=session,
        )
        session.delete(row)

    # commit once, after the whole loop
    session.commit()

In [None]:
# Test: after cleanup_datablobs the test user owns no datablobs
with get_session_with_context() as session:
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_datablobs(user_to_cleanup, session)

    # Re-run the datablob query for this user and expect an empty result
    datablobs = session.exec(
        select(DataBlob).where(DataBlob.user == user_to_cleanup)
    ).all()
    assert len(datablobs) == 0

[INFO] __main__: deleting datablobs


In [None]:
# | export


def cleanup_apikeys(user_to_cleanup: User, session: Session):
    """Delete every API key owned by the given user

    Args:
        user_to_cleanup: user whose API keys are removed
        session: database session used for the query, the deletes and the commit
    """
    logger.info("deleting apikeys")
    query = select(APIKey).where(APIKey.user == user_to_cleanup)

    for row in session.exec(query).all():
        # delete_apikey receives both identifiers as strings; the row is deleted afterwards
        delete_apikey(
            user_uuid_or_name=str(user_to_cleanup.uuid),  # type: ignore
            key_uuid_or_name=str(row.uuid),  # type: ignore
            user=user_to_cleanup,
            session=session,
        )
        session.delete(row)

    # commit once, after the whole loop
    session.commit()

In [None]:
# Test: after cleanup_apikeys the test user owns no API keys
with get_session_with_context() as session:
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_apikeys(user_to_cleanup, session)

    # Re-run the API key query for this user and expect an empty result
    apikeys = session.exec(select(APIKey).where(APIKey.user == user_to_cleanup)).all()
    assert len(apikeys) == 0

[INFO] __main__: deleting apikeys


In [None]:
# | export


def cleanup_user(user_to_cleanup: User, session: Session):
    """Delete a user together with everything this module knows how to clean up for it

    Runs the individual cleanup_* steps, deletes the objects stored under the user's
    prefix in the S3 storage bucket, deletes the user's topics and finally deletes
    the user row itself.

    Args:
        user_to_cleanup: user to remove
        session: database session used by the cleanup steps and the final delete
    """
    # Predictions first and API keys last, each step committing on its own
    cleanup_steps = (
        cleanup_predictions,
        cleanup_models,
        cleanup_datasources,
        cleanup_datablobs,
        cleanup_apikeys,
    )
    for run_step in cleanup_steps:
        run_step(user_to_cleanup, session)

    bucket, base_path = get_s3_storage_bucket()  # type: ignore
    # The user's files live under "<base_path>/<user id>", or just "<user id>" without a base path
    user_folder = str(user_to_cleanup.id)
    if base_path:
        s3_path = f"{base_path}/{user_folder}"
    else:
        s3_path = user_folder
    logger.info(f"Deleting user files in s3://{bucket.name}/{s3_path}")
    # The trailing slash keeps the prefix from matching ids that merely start with this id
    bucket.objects.filter(Prefix=f"{s3_path}/").delete()

    delete_topics_for_user(username=user_to_cleanup.username)

    logger.info("deleting user")
    session.delete(user_to_cleanup)
    session.commit()

In [None]:
# Test: cleanup_user removes the user row itself
with get_session_with_context() as session:
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_user(user_to_cleanup, session)

    # .one() must now raise NoResultFound because no user with this username remains
    with pytest.raises(NoResultFound):
        session.exec(select(User).where(User.username == test_username)).one()

[INFO] __main__: deleting predictions
[INFO] __main__: deleting models
[INFO] __main__: deleting datasources
[INFO] __main__: deleting datablobs
[INFO] __main__: deleting apikeys
[INFO] __main__: Deleting user files in s3://kumaran-airt-service-eu-west-1/135
[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_training_data deleted
[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_realitime_data deleted
[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_training_data_status deleted
[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_training_model_status deleted
[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_model_metrics deleted
[INFO] airt_service.confluent: Topic airt_service_glmfgiaofy_prediction deleted
[INFO] __main__: deleting user


%4|1672996529.940|CONFWARN|rdkafka#producer-2| [thrd:app]: Configuration property group.id is a consumer property and will be ignored by this producer instance
%4|1672996529.940|CONFWARN|rdkafka#producer-2| [thrd:app]: Configuration property auto.offset.reset is a consumer property and will be ignored by this producer instance
