In [None]:
#| default_exp cleanup

In [None]:
from airt.testing import activate_by_import

[INFO] airt.testing.activate_by_import: Testing environment activated.
[INFO] numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[INFO] numexpr.utils: NumExpr defaulting to 8 threads.
[INFO] airt.keras.helpers: Using a single GPU #0 with memory_limit 1024 MB


In [None]:
#| export

from typing import *

from sqlmodel import Session, select

import airt_service.sanitizer
from airt.logger import get_logger
from airt_service.auth import delete_apikey
from airt_service.data.datasource import delete_datasource
from airt_service.data.datablob import delete_datablob
from airt_service.db.models import (
    User,
    Prediction,
    Model,
    DataSource,
    DataBlob,
    APIKey,
)
from airt_service.aws.utils import get_s3_storage_bucket
from airt_service.model.prediction import delete_prediction
from airt_service.model.train import delete_model

[INFO] airt.executor.subcommand: Module loaded.


In [None]:
import json
from datetime import datetime, timedelta
from os import environ

import pandas as pd
import pytest
import requests
from fastapi import BackgroundTasks
from sqlalchemy.exc import NoResultFound

from airt.remote_path import RemotePath
from airt_service.auth import create_apikey
from airt_service.aws.utils import upload_to_s3_with_retry
from airt_service.data.csv import process_csv
from airt_service.data.datablob import FromLocalRequest, from_local_start_route
from airt_service.db.models import (
    get_session,
    get_session_with_context,
    create_user_for_testing,
    APIKeyCreate,
)
from airt_service.helpers import set_env_variable_context
from airt_service.model.train import TrainRequest, train_model, predict_model

[INFO] airt.data.importers: Module loaded:
[INFO] airt.data.importers:  - using pandas     : 1.5.1
[INFO] airt.data.importers:  - using dask       : 2022.10.0


In [None]:
#| exporti

# Module-level logger; the cleanup_* functions in this notebook log through it.
logger = get_logger(__name__)

In [None]:
# Create a throwaway user (subscription_type="small") for the cleanup tests below
# and show both the generated username and the matching database row.
test_username = create_user_for_testing(subscription_type="small")
display(test_username)
with get_session_with_context() as session:
    # .one() is used so the cell fails loudly if the user row is missing.
    display(session.exec(select(User).where(User.username == test_username)).one())

'ngkjpwvkdg'

User(id=4, uuid=UUID('c23cdcaf-293b-47c8-96ff-655b50d18fc3'), username='ngkjpwvkdg', first_name='unittest', last_name='user', email='ngkjpwvkdg@email.com', subscription_type=<SubscriptionType.small: 'small'>, super_user=False, disabled=False, created=datetime.datetime(2022, 11, 7, 9, 9, 59), phone_number=None, is_phone_number_verified=False, mfa_secret=****, is_mfa_active=False)

In [None]:
# Helper that seeds a user with an API key, datablob, datasource, model and prediction for the tests below


def _populate_user(username: str) -> None:
    """
    Helper function to create an apikey, datablob, datasource, model and prediction for the given user

    The objects are created in this order, using two consecutive database sessions:

    1. an API key expiring one day from now,
    2. a "local" datablob whose CSV payload is built from the parquet files stored at
       ``s3://test-airt-service/account_312571_events`` and uploaded through the presigned
       URL returned by ``from_local_start_route``,
    3. a datasource row for that datablob, filled in by ``process_csv``,
    4. a model (``train_model``) and a prediction (``predict_model``) for that datasource.

    The environment variables ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY`` must be set;
    they are read with ``environ[...]``, so a missing variable raises ``KeyError``.

    Args:
        username: username to use to create objects; the user is looked up with ``.one()``,
            so it must already exist in the database
    """
    with get_session_with_context() as session:
        user = session.exec(select(User).where(User.username == username)).one()

        # API key valid for one day from now (UTC).
        create_apikey(
            apikey_to_create=APIKeyCreate(expiry=datetime.utcnow() + timedelta(days=1)),
            user=user,
            session=session,
        )

        # Start a "from local" datablob upload; the response carries the datablob uuid
        # and the presigned url/fields used for the upload below.
        from_local_request = FromLocalRequest(
            path="tmp/test-folder/", tag="my_csv_datasource_tag"
        )
        from_local_response = from_local_start_route(
            from_local_request=from_local_request,
            user=user,
            session=session,
        )

        # Pull the test parquet dataset into a local cache (pull_on_enter=True) without
        # writing anything back to the source bucket (push_on_exit=False).
        with RemotePath.from_url(
            remote_url=f"s3://test-airt-service/account_312571_events",
            pull_on_enter=True,
            push_on_exit=False,
            exist_ok=True,
            parents=False,
            access_key=environ["AWS_ACCESS_KEY_ID"],
            secret_key=environ["AWS_SECRET_ACCESS_KEY"],
        ) as test_s3_path:
            # Convert the parquet data to a single CSV file inside the cached folder.
            df = pd.read_parquet(test_s3_path.as_path())
            display(df.head())
            df.to_csv(test_s3_path.as_path() / "file.csv", index=False)
            display(list(test_s3_path.as_path().glob("*")))
            # Debugging aid, left disabled: !head -n 10 {test_s3_path.as_path()/"file.csv"}

            # Upload the CSV through the presigned POST data of the datablob.
            upload_to_s3_with_retry(
                test_s3_path.as_path() / "file.csv",
                from_local_response.presigned["url"],
                from_local_response.presigned["fields"],
            )

        # Resolve the datablob's integer id from the uuid returned above.
        datablob_id = session.exec(
            select(DataBlob).where(DataBlob.uuid == from_local_response.uuid)
        ).one().id
        
        display(datablob_id)
        assert datablob_id > 0
        
        # Create the datasource row by hand and commit it before calling process_csv
        # with its id.
        datasource = DataSource(
            datablob_id=datablob_id,
            cloud_provider="aws",
            region="eu-west-1",
            total_steps=1,
            user=user,
        )
        session.add(datasource)
        session.commit()

        # process_csv is given only ids, not ORM objects; kwargs_json is a JSON-encoded
        # dict (presumably forwarded to the CSV reader — TODO confirm).
        process_csv(
            datablob_id=datablob_id,
            datasource_id=datasource.id,
            deduplicate_data=True,
            index_column="PersonId",
            sort_by="OccurredTime",
            blocksize="256MB",
            kwargs_json=json.dumps(
                dict(
                    usecols=[0, 1, 2, 3, 4],
                    parse_dates=["OccurredTime"],
                )
            ),
        )

    # Second session: reload the datasource so the values written by process_csv are visible.
    # NOTE(review): `datasource.id` and `user` below come from objects loaded in the first
    # session, whose `with` block has already exited — this works in the recorded run, but
    # confirm that reusing them here is intended.
    with get_session_with_context() as session:
        datasource = session.exec(
            select(DataSource).where(DataSource.id == datasource.id)
        ).one()
        display(datasource)

        train_request = TrainRequest(
            data_uuid=datasource.uuid,
            client_column="AccountId",
            target_column="DefinitionId",
            target="load*",
            # 20 days expressed in seconds
            predict_after=timedelta(seconds=20 * 24 * 60 * 60),
        )

        model = train_model(train_request=train_request, user=user, session=session)
        display(model)
        # NOTE(review): original note read "Call exec_cli train_model" — presumably a TODO
        # to run the training job through exec_cli; confirm.

        b = BackgroundTasks()
        # JOB_EXECUTOR is set to "fastapi" only for the duration of the predict_model call.
        with set_env_variable_context(variable="JOB_EXECUTOR", value="fastapi"):
            predicted = predict_model(
                model_uuid=model.uuid, user=user, session=session, background_tasks=b
            )
        display(predicted)
        # NOTE(review): original note read "Call exec_cli predict_model" — presumably a TODO
        # to run the prediction job through exec_cli; confirm.

In [None]:
# Seed the test user with the objects that the cleanup_* tests below remove.
_populate_user(test_username)

[INFO] botocore.credentials: Found credentials in environment variables.
[INFO] airt_service.data.datablob: DataBlob.from_local(): FromLocalResponse(uuid=UUID('334ee5c9-79a5-4751-be27-69a6cf185e3e'), type='local', presigned={'url': 'https://kumaran-airt-service-eu-west-1.s3.amazonaws.com/', 'fields': {'key': '****************************************', 'x-amz-algorithm': 'AWS4-HMAC-SHA256', 'x-amz-credential': '********************/20221107/eu-west-1/s3/aws4_request', 'x-amz-date': '20221107T091000Z', 'policy': '************************************************************************************************************************************************************************************************************************************************************', 'x-amz-signature': '****************************'}})
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://test-airt-service/account_312571_events
[INFO] airt.remote_path: S3Path._creat

Unnamed: 0_level_0,AccountId,DefinitionId,OccurredTime,OccurredTimeTicks,PersonId
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,312571,loadTests2,2019-12-31 21:30:02,1577836802678,2
1,312571,loadTests3,2020-01-03 23:53:22,1578104602678,2
2,312571,loadTests1,2020-01-07 02:16:42,1578372402678,2
3,312571,loadTests2,2020-01-10 04:40:02,1578640202678,2
4,312571,loadTests3,2020-01-13 07:03:22,1578908002678,2


[Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_bfs0fv0k/_metadata'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_bfs0fv0k/_common_metadata'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_bfs0fv0k/file.csv'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_bfs0fv0k/part.3.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_bfs0fv0k/part.0.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_bfs0fv0k/part.1.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_bfs0fv0k/part.4.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_bfs0fv0k/part.2.parquet')]

[INFO] airt.remote_path: S3Path._clean_up(): removing local cache path /tmp/s3test-airt-serviceaccount_312571_events_cached_bfs0fv0k


1

[INFO] airt_service.data.csv: process_csv(datablob_id=1, datasource_id=2): processing user uploaded csv file for datablob_id=1 and uploading parquet back to S3 for datasource_id=2
[INFO] airt_service.data.csv: process_csv(datablob_id=1, datasource_id=2): step 1/4: downloading user uploaded file from bucket s3://kumaran-airt-service-eu-west-1/4/datablob/1
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/4/datablob/1
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-14datablob1_cached_iapbe7vp
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/4/datablob/1 locally in /tmp/s3kumaran-airt-service-eu-west-14datablob1_cached_iapbe7vp
[INFO] airt.remote_path: S3Path.__enter__(): pulling data from s3://kumaran-airt-service-eu-west-1/4/datablob/1 to /tmp/s3kumaran-airt-service-eu-west-14datablob1_cach

DataSource(id=2, uuid=UUID('03c6196f-0042-4733-80d7-0e285b353be6'), hash='1dd8ee7a0f96a48110dec6e25891d18d', total_steps=1, completed_steps=1, folder_size=6619982, no_of_rows=498961, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-1', error=None, disabled=False, path='s3://kumaran-airt-service-eu-west-1/4/datasource/2', created=datetime.datetime(2022, 11, 7, 9, 10, 13), user_id=4, pulled_on=datetime.datetime(2022, 11, 7, 9, 10, 19), tags=[])

Model(total_steps=5, path=None, cloud_provider=<CloudProvider.aws: 'aws'>, completed_steps=0, datasource_id=2, client_column='AccountId', error=None, user_id=4, target_column='DefinitionId', region='eu-west-1', target='load*', disabled=False, predict_after=datetime.timedelta(days=20), created=datetime.datetime(2022, 11, 7, 9, 10, 38), timestamp_column=None, id=1, uuid=UUID('91e40f4e-9f9d-45a1-a016-dde4c1d5b29a'))

[INFO] airt_service.batch_job: create_batch_job(): command='predict 1', task='csv_processing'
[INFO] airt_service.batch_job_components.base: Entering FastAPIBatchJobContext(task=csv_processing)
[INFO] airt_service.batch_job: batch_ctx=FastAPIBatchJobContext(task=csv_processing)
[INFO] airt_service.batch_job_components.fastapi: FastAPIBatchJobContext.create_job(self=FastAPIBatchJobContext(task=csv_processing), command='predict 1', environment_vars={'AWS_ACCESS_KEY_ID': '********************', 'AWS_SECRET_ACCESS_KEY': '****************************************', 'AWS_DEFAULT_REGION': 'eu-west-1', 'AZURE_SUBSCRIPTION_ID': '************************************', 'AZURE_TENANT_ID': '************************************', 'AZURE_CLIENT_ID': '************************************', 'AZURE_CLIENT_SECRET': '****************************************', 'AZURE_STORAGE_ACCOUNT_PREFIX': 'kumsairtsdev', 'AZURE_RESOURCE_GROUP': 'kumaran-airt-service-dev', 'STORAGE_BUCKET_PREFIX': 'kumaran-airt-service', 

Prediction(id=1, datasource_id=2, uuid=UUID('973cc1af-23d0-45b8-a519-cc0644c3186d'), error=None, total_steps=3, disabled=False, cloud_provider=<CloudProvider.aws: 'aws'>, completed_steps=0, model_id=1, created=datetime.datetime(2022, 11, 7, 9, 10, 38), path=None, region='eu-west-1')

In [None]:
#| export


def cleanup_predictions(user_to_cleanup: User, session: Session):
    """Remove all predictions that belong to models owned by the given user

    Each matching prediction is first passed to ``delete_prediction`` and then
    deleted from the database session; a single commit is issued after the loop.

    Args:
        user_to_cleanup: user whose predictions should be removed
        session: database session used for the query, the deletes and the commit
    """
    logger.info("deleting predictions")

    # Predictions are matched through their model's owner.
    statement = select(Prediction).join(Model).where(Model.user == user_to_cleanup)

    # .all() materialises the result before any row is deleted.
    for stale_prediction in session.exec(statement).all():
        delete_prediction(
            prediction_uuid=stale_prediction.uuid,  # type: ignore
            user=user_to_cleanup,
            session=session,
        )
        session.delete(stale_prediction)

    session.commit()

In [None]:
# Test: after cleanup_predictions no prediction of the test user's models remains.
with get_session_with_context() as session:
    # Load the test user inside this session.
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_predictions(user_to_cleanup, session)

    # Same query as in cleanup_predictions; it must now return nothing.
    predictions = session.exec(
        select(Prediction).join(Model).where(Model.user == user_to_cleanup)
    ).all()
    assert len(predictions) == 0

[INFO] __main__: deleting predictions


In [None]:
#| export


def cleanup_models(user_to_cleanup: User, session: Session):
    """Remove all models owned by the given user

    Each matching model is first passed to ``delete_model`` and then deleted
    from the database session; a single commit is issued after the loop.

    Args:
        user_to_cleanup: user whose models should be removed
        session: database session used for the query, the deletes and the commit
    """
    logger.info("deleting models")

    statement = select(Model).where(Model.user == user_to_cleanup)

    # .all() materialises the result before any row is deleted.
    for stale_model in session.exec(statement).all():
        delete_model(
            model_uuid=stale_model.uuid,  # type: ignore
            user=user_to_cleanup,
            session=session,
        )
        session.delete(stale_model)

    session.commit()

In [None]:
# Test: after cleanup_models the test user owns no models.
with get_session_with_context() as session:
    # Load the test user inside this session.
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_models(user_to_cleanup, session)

    # Same query as in cleanup_models; it must now return nothing.
    models = session.exec(select(Model).where(Model.user == user_to_cleanup)).all()
    assert len(models) == 0

[INFO] __main__: deleting models


In [None]:
#| export


def cleanup_datasources(user_to_cleanup: User, session: Session):
    """Remove all datasources owned by the given user

    Each matching datasource is first passed to ``delete_datasource`` and then
    deleted from the database session; a single commit is issued after the loop.

    Args:
        user_to_cleanup: user whose datasources should be removed
        session: database session used for the query, the deletes and the commit
    """
    logger.info("deleting datasources")

    statement = select(DataSource).where(DataSource.user == user_to_cleanup)

    # .all() materialises the result before any row is deleted.
    for stale_datasource in session.exec(statement).all():
        delete_datasource(
            datasource_uuid=stale_datasource.uuid,  # type: ignore
            user=user_to_cleanup,
            session=session,
        )
        session.delete(stale_datasource)

    session.commit()

In [None]:
# Test: after cleanup_datasources the test user owns no datasources.
with get_session_with_context() as session:
    # Load the test user inside this session.
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_datasources(user_to_cleanup, session)

    # Same query as in cleanup_datasources; it must now return nothing.
    datasources = session.exec(
        select(DataSource).where(DataSource.user == user_to_cleanup)
    ).all()
    assert len(datasources) == 0

[INFO] __main__: deleting datasources


In [None]:
#| export


def cleanup_datablobs(user_to_cleanup: User, session: Session):
    """Remove all datablobs owned by the given user

    Each matching datablob is first passed to ``delete_datablob`` and then
    deleted from the database session; a single commit is issued after the loop.

    Args:
        user_to_cleanup: user whose datablobs should be removed
        session: database session used for the query, the deletes and the commit
    """
    logger.info("deleting datablobs")

    statement = select(DataBlob).where(DataBlob.user == user_to_cleanup)

    # .all() materialises the result before any row is deleted.
    for stale_datablob in session.exec(statement).all():
        delete_datablob(
            datablob_uuid=stale_datablob.uuid,  # type: ignore
            user=user_to_cleanup,
            session=session,
        )
        session.delete(stale_datablob)

    session.commit()

In [None]:
# Test: after cleanup_datablobs the test user owns no datablobs.
with get_session_with_context() as session:
    # Load the test user inside this session.
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_datablobs(user_to_cleanup, session)

    # Same query as in cleanup_datablobs; it must now return nothing.
    datablobs = session.exec(
        select(DataBlob).where(DataBlob.user == user_to_cleanup)
    ).all()
    assert len(datablobs) == 0

[INFO] __main__: deleting datablobs


In [None]:
#| export


def cleanup_apikeys(user_to_cleanup: User, session: Session):
    """Remove all API keys that belong to the given user

    Each matching key is first passed to ``delete_apikey`` (user and key are
    identified by their stringified uuids) and then deleted from the database
    session; a single commit is issued after the loop.

    Args:
        user_to_cleanup: user whose API keys should be removed
        session: database session used for the query, the deletes and the commit
    """
    logger.info("deleting apikeys")

    statement = select(APIKey).where(APIKey.user == user_to_cleanup)

    # .all() materialises the result before any row is deleted.
    for stale_key in session.exec(statement).all():
        delete_apikey(
            user_uuid_or_name=str(user_to_cleanup.uuid),  # type: ignore
            key_uuid_or_name=str(stale_key.uuid),  # type: ignore
            user=user_to_cleanup,
            session=session,
        )
        session.delete(stale_key)

    session.commit()

In [None]:
# Test: after cleanup_apikeys the test user has no API keys.
with get_session_with_context() as session:
    # Load the test user inside this session.
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_apikeys(user_to_cleanup, session)

    # Same query as in cleanup_apikeys; it must now return nothing.
    apikeys = session.exec(select(APIKey).where(APIKey.user == user_to_cleanup)).all()
    assert len(apikeys) == 0

[INFO] __main__: deleting apikeys


In [None]:
#| export


def cleanup_user(user_to_cleanup: User, session: Session):
    """Remove a user together with everything stored for that user

    Runs the per-object cleanup functions, deletes every object under the
    user's folder in the S3 storage bucket and finally deletes the user row.

    Args:
        user_to_cleanup: user to remove
        session: database session used for all deletes and commits
    """
    # Predictions go first, then models, datasources, datablobs and finally API keys.
    cleanup_steps = (
        cleanup_predictions,
        cleanup_models,
        cleanup_datasources,
        cleanup_datablobs,
        cleanup_apikeys,
    )
    for run_cleanup in cleanup_steps:
        run_cleanup(user_to_cleanup, session)

    bucket, base_path = get_s3_storage_bucket()  # type: ignore

    # The user's folder is named after the user id, below base_path when one is set.
    user_folder = str(user_to_cleanup.id)
    s3_path = f"{base_path}/{user_folder}" if base_path else user_folder

    logger.info(f"Deleting user files in s3://{bucket.name}/{s3_path}")
    # The trailing "/" keeps the prefix from matching other ids that start with the same digits.
    bucket.objects.filter(Prefix=f"{s3_path}/").delete()

    logger.info("deleting user")
    session.delete(user_to_cleanup)
    session.commit()

In [None]:
# Test: after cleanup_user the test user can no longer be found.
with get_session_with_context() as session:
    # Load the test user inside this session.
    user_to_cleanup = session.exec(
        select(User).where(User.username == test_username)
    ).one()

    cleanup_user(user_to_cleanup, session)

    # .one() must now raise because no row matches the username any more.
    with pytest.raises(NoResultFound):
        session.exec(select(User).where(User.username == test_username)).one()

[INFO] __main__: deleting predictions
[INFO] __main__: deleting models
[INFO] __main__: deleting datasources
[INFO] __main__: deleting datablobs
[INFO] __main__: deleting apikeys
[INFO] __main__: Deleting user files in s3://kumaran-airt-service-eu-west-1/4
[INFO] __main__: deleting user
