In [None]:
# | default_exp data.parquet

In [None]:
from airt.testing import activate_by_import

[INFO] airt.testing.activate_by_import: Testing environment activated.
[INFO] numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[INFO] numexpr.utils: NumExpr defaulting to 8 threads.
[INFO] airt.keras.helpers: Using a single GPU #0 with memory_limit 1024 MB


In [None]:
# | export

import json
from typing import *

import airt_service.sanitizer
from airt.data.importers import import_parquet
from airt.logger import get_logger
from airt.remote_path import RemotePath
from airt_service.aws.utils import create_s3_datasource_path
from airt_service.azure.utils import create_azure_blob_storage_datasource_path
from airt_service.data.datasource import DataSource
from airt_service.data.utils import (
    calculate_azure_data_object_folder_size_and_path,
    calculate_data_object_folder_size_and_path,
    calculate_data_object_pulled_on,
)
from airt_service.db.models import DataBlob, get_session_with_context
from airt_service.helpers import truncate
from fastcore.script import Param, call_parse
from fastcore.utils import *
from sqlmodel import select

[INFO] airt.data.importers: Module loaded:
[INFO] airt.data.importers:  - using pandas     : 1.5.1
[INFO] airt.data.importers:  - using dask       : 2022.10.0


In [None]:
from os import environ

import dask.dataframe as dd
import pandas as pd
import requests
from airt_service.constants import DS_HEAD_FILE_NAME, METADATA_FOLDER_PATH
from airt_service.data.azure_blob_storage import azure_blob_storage_pull
from airt_service.data.datablob import (
    FromAzureBlobStorageRequest,
    from_azure_blob_storage_route,
)
from airt_service.data.s3 import s3_pull
from airt_service.data.utils import create_db_uri_for_s3_datablob
from airt_service.db.models import User, create_user_for_testing, get_session
from airt_service.helpers import commit_or_rollback, set_env_variable_context
from azure.identity import DefaultAzureCredential
from azure.mgmt.storage import StorageManagementClient
from fastapi import BackgroundTasks

[INFO] airt.executor.subcommand: Module loaded.


In [None]:
# Create a user for the tests in this notebook; the returned username is used
# by later cells to look the user up in the database.
test_username = create_user_for_testing()
display(test_username)

'xityalounu'

In [None]:
# | exporti

# Module-level logger used by process_parquet below.
logger = get_logger(__name__)

In [None]:
# | export


@call_parse
def process_parquet(
    datablob_id: Param("datablob_id", int),  # type: ignore
    datasource_id: Param("datasource_id", int),  # type: ignore
    *,
    deduplicate_data: Param("deduplicate_data", bool) = False,  # type: ignore
    index_column: Param("index_column", str),  # type: ignore
    sort_by: Param("sort_by", str),  # type: ignore
    blocksize: Param("blocksize", str) = "256MB",  # type: ignore
    kwargs_json: Param("kwargs_json", str) = "{}",  # type: ignore
):
    """Download the user uploaded parquet files, process them and upload the processed parquet files

    The datablob's files are pulled from ``datablob.path``, processed with
    ``import_parquet`` and pushed to the datasource location of the datasource's
    cloud provider ("aws" or "azure"). Any exception raised while processing is
    not propagated: it is logged and stored (truncated) in ``datasource.error``,
    and the datablob and datasource are committed in either case.

    Args:
        datablob_id: Id of the datablob
        datasource_id: Id of the datasource
        deduplicate_data: If set to True, then duplicate rows are removed while uploading.
            NOTE(review): this flag is accepted but not referenced anywhere in the
            function body - confirm whether it should be forwarded to ``import_parquet``.
        index_column: Name of the column used to index and partition the data into partitions
        sort_by: Name of the column used to sort data within the same index value. A JSON
            encoded value (e.g. a JSON list of column names) is decoded before use; a value
            that is not valid JSON is used as-is.
        blocksize: Size of partition
        kwargs_json: Parameters as json string which are forwarded as keyword arguments to
            **import_parquet()**.
    """
    logger.info(
        f"process_parquet({datablob_id=}, {datasource_id=}): processing user uploaded parquet files for {datablob_id=} and uploading parquet back to S3 for {datasource_id=}"
    )
    with get_session_with_context() as session:
        datablob = session.exec(
            select(DataBlob).where(DataBlob.id == datablob_id)
        ).one()
        datasource = session.exec(
            select(DataSource).where(DataSource.id == datasource_id)
        ).one()

        # Following is needed if datablob was created from user uploaded parquet files
        calculate_data_object_folder_size_and_path(datablob)

        # Reset the attributes that are (re)computed by this run
        datasource.error = None
        datasource.completed_steps = 0
        datasource.folder_size = None
        datasource.no_of_rows = None
        datasource.path = None
        datasource.hash = None

        try:
            source_path = datablob.path
            if datasource.cloud_provider == "aws":
                destination_bucket, s3_path = create_s3_datasource_path(
                    user_id=datasource.user.id,
                    datasource_id=datasource.id,
                    region=datasource.region,
                )
                destination_remote_url = f"s3://{destination_bucket.name}/{s3_path}"
            elif datasource.cloud_provider == "azure":
                (
                    destination_container_client,
                    destination_azure_blob_storage_path,
                ) = create_azure_blob_storage_datasource_path(
                    user_id=datasource.user.id,
                    datasource_id=datasource.id,
                    region=datasource.region,
                )
                destination_remote_url = f"{destination_container_client.url}/{destination_azure_blob_storage_path}"
            else:
                # Without this branch destination_remote_url would be unbound and the
                # recorded error would be an unrelated unbound-variable message.
                raise ValueError(
                    f"unsupported cloud provider: {datasource.cloud_provider}"
                )
            logger.info(
                f"process_parquet({datablob_id=}, {datasource_id=}): step 1/4: downloading user uploaded file from bucket {source_path}"
            )

            with RemotePath.from_url(
                remote_url=source_path,
                pull_on_enter=True,
                push_on_exit=False,
                exist_ok=True,
                parents=False,
            ) as source_s3_path:
                calculate_data_object_pulled_on(datasource)
                user_uploaded_parquet_files = list(
                    source_s3_path.as_path().glob("*.parquet")
                )
                if len(user_uploaded_parquet_files) == 0:
                    raise ValueError("parquet files not found")

                logger.info(
                    f"process_parquet({datablob_id=}, {datasource_id=}): step 2/4: processing parquet files"
                )

                # The destination is pushed when this context exits (push_on_exit=True)
                with RemotePath.from_url(
                    remote_url=destination_remote_url,
                    pull_on_enter=False,
                    push_on_exit=True,
                    exist_ok=True,
                    parents=True,
                ) as destination_path:
                    processed_path = destination_path.as_path()
                    kwargs = json.loads(kwargs_json)
                    # sort_by may arrive JSON encoded; a plain column name is not
                    # valid JSON and is deliberately kept unchanged.
                    try:
                        sort_by = json.loads(sort_by)
                    except json.JSONDecodeError:
                        pass
                    import_parquet(
                        input_path=source_s3_path.as_path(),
                        output_path=processed_path,
                        index_column=index_column,
                        sort_by=sort_by,
                        blocksize=blocksize,
                        **kwargs,
                    )
                    datasource.calculate_properties(processed_path)

                    if not any(processed_path.glob("*.parquet")):
                        raise ValueError(
                            "processing failed; parquet files not available"
                        )

                    logger.info(
                        f"process_parquet({datablob_id=}, {datasource_id=}): step 3/4: uploading parquet files back to path {destination_path}"
                    )
            logger.info(
                f"process_parquet({datablob_id=}, {datasource_id=}): step 4/4: calculating datasource attributes - folder_size, no_of_rows, head, hash"
            )

            calculate_data_object_folder_size_and_path(datasource)
        except Exception as e:
            # Failures are recorded on the datasource instead of being raised
            logger.error(f"process_parquet({datasource_id=}): error: {str(e)}")
            datasource.error = truncate(str(e))
        logger.info(f"process_parquet({datablob_id=}, {datasource_id=}): completed")
        session.add(datablob)
        session.add(datasource)
        session.commit()

In [None]:
# Create an S3 datablob for the test dataset, pull it with s3_pull and create a
# datasource for it. Leaves datablob_id, datasource_id and user_id for the next cell.
with get_session_with_context() as session:
    user = session.exec(select(User).where(User.username == test_username)).one()
    uri = "s3://test-airt-service/account_312571_events"
    datablob = DataBlob(
        type="s3",
        # The stored uri is built from the source uri plus the AWS credentials
        # read from the environment.
        uri=create_db_uri_for_s3_datablob(
            uri=uri,
            access_key=environ["AWS_ACCESS_KEY_ID"],
            secret_key=environ["AWS_SECRET_ACCESS_KEY"],
        ),
        source=uri,
        cloud_provider="aws",
        region="eu-west-1",
        total_steps=1,
        user=user,
    )
    # datablob.id is used right after this block, so the datablob has to be
    # persisted here (presumably committed by commit_or_rollback - confirm).
    with commit_or_rollback(session):
        session.add(datablob)

    s3_pull(datablob_id=datablob.id)

    # Look the datablob up again by uuid to get the id used for the datasource.
    # Note: datablob_id is reassigned from datablob.id at the end of this cell.
    datablob_id = (
        session.exec(select(DataBlob).where(DataBlob.uuid == datablob.uuid)).one().id
    )
    datasource = DataSource(
        datablob_id=datablob_id,
        cloud_provider=datablob.cloud_provider,
        region=datablob.region,
        total_steps=1,
        user=user,
    )
    session.add(datasource)
    session.commit()

    # Keep plain ids so that the next cell does not depend on this session
    datasource_id = datasource.id
    datablob_id = datablob.id
    user_id = user.id

[INFO] botocore.credentials: Found credentials in environment variables.
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/131/datablob/50
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-1131datablob50_cached_j5v7or1k
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/131/datablob/50 locally in /tmp/s3kumaran-airt-service-eu-west-1131datablob50_cached_j5v7or1k
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://test-airt-service/account_312571_events
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3test-airt-serviceaccount_312571_events_cached_iir5s2mh
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://test-airt-service/account_312571_events locally in /tmp/s3test-airt-serviceaccount_31257

In [None]:
# Test process_parquet
process_parquet(
    datablob_id=datablob_id,
    datasource_id=datasource_id,
    deduplicate_data=True,
    index_column="PersonId",
    sort_by="OccurredTime",
    blocksize="256MB",
    kwargs_json=json.dumps(
        dict(
            usecols=[0, 1, 2, 3, 4],
            parse_dates=["OccurredTime"],
        )
    ),
)

# process_parquet does not raise on failure (it stores the error on the
# datasource), so the result is verified by reloading the datasource.
with get_session_with_context() as session:
    datasource = session.exec(
        select(DataSource).where(DataSource.id == datasource_id)
    ).one()
    display(datasource)
    # Expected values for the test dataset used in the previous cell
    assert (
        datasource.folder_size == 6619982
    ), f"{datasource=}, {datasource.folder_size=}"
    assert datasource.no_of_rows == 498961
    assert (
        datasource.path
        == f"s3://{environ['STORAGE_BUCKET_PREFIX']}-eu-west-1/{user_id}/datasource/{datasource.id}"
    ), datasource.path
    assert datasource.hash == "1dd8ee7a0f96a48110dec6e25891d18d", datasource.hash

    destination_bucket, parquet_s3_path = create_s3_datasource_path(
        user_id=datasource.user.id,
        datasource_id=datasource.id,
        region=datasource.region,
    )

    # tests for datasource head and dtypes
    with RemotePath.from_url(
        remote_url=f"s3://{destination_bucket.name}/{parquet_s3_path}",
        pull_on_enter=True,
        push_on_exit=False,
        exist_ok=True,
        parents=False,
    ) as s3_path:
        # Read the processed parquet files and show the first rows
        ddf = dd.read_parquet(s3_path.as_path())
        ddf_head = ddf.head(n=10)
        display(ddf_head)

        # Pull the location stored in datasource.path and check the head file
        # saved in the metadata folder.
        with RemotePath.from_url(
            remote_url=f"{datasource.path}",
            pull_on_enter=True,
            push_on_exit=False,
            exist_ok=True,
            parents=False,
        ) as test_s3_info_path:
            processed_test_s3_info_path = test_s3_info_path.as_path()

            head_df = pd.read_parquet(
                processed_test_s3_info_path / METADATA_FOLDER_PATH / DS_HEAD_FILE_NAME
            )
            # The index column passed to process_parquet becomes the index
            assert head_df.index.name == "PersonId"
            assert head_df.shape == (10, 4)

            # Map each column to the name of its dtype for comparison
            dtypes_dict = head_df.dtypes.apply(lambda x: x.name).to_dict()
            assert dtypes_dict == {
                "AccountId": "int64",
                "DefinitionId": "object",
                "OccurredTime": "datetime64[ns]",
                "OccurredTimeTicks": "int64",
            }, dtypes_dict

[INFO] __main__: process_parquet(datablob_id=50, datasource_id=30): processing user uploaded parquet files for datablob_id=50 and uploading parquet back to S3 for datasource_id=30
[INFO] __main__: process_parquet(datablob_id=50, datasource_id=30): step 1/4: downloading user uploaded file from bucket s3://kumaran-airt-service-eu-west-1/131/datablob/50
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/131/datablob/50
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-1131datablob50_cached_gbxl8lmn
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/131/datablob/50 locally in /tmp/s3kumaran-airt-service-eu-west-1131datablob50_cached_gbxl8lmn
[INFO] airt.remote_path: S3Path.__enter__(): pulling data from s3://kumaran-airt-service-eu-west-1/131/datablob/50 to /tmp/s3kumaran-airt-service-eu-west-1131d

DataSource(id=30, uuid=UUID('042590ba-afaf-4aff-8750-84de221e73dc'), hash='1dd8ee7a0f96a48110dec6e25891d18d', total_steps=1, completed_steps=1, folder_size=6619982, no_of_rows=498961, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-1', error=None, disabled=False, path='s3://kumaran-airt-service-eu-west-1/131/datasource/30', created=datetime.datetime(2022, 10, 27, 8, 10, 36), user_id=131, pulled_on=datetime.datetime(2022, 10, 27, 8, 10, 46), tags=[])

[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/131/datasource/30
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-1131datasource30_cached_oa1qbq69
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/131/datasource/30 locally in /tmp/s3kumaran-airt-service-eu-west-1131datasource30_cached_oa1qbq69
[INFO] airt.remote_path: S3Path.__enter__(): pulling data from s3://kumaran-airt-service-eu-west-1/131/datasource/30 to /tmp/s3kumaran-airt-service-eu-west-1131datasource30_cached_oa1qbq69


Unnamed: 0_level_0,AccountId,DefinitionId,OccurredTime,OccurredTimeTicks
PersonId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,312571,loadTests2,2019-12-31 21:30:02,1577836802678
2,312571,loadTests3,2020-01-03 23:53:22,1578104602678
2,312571,loadTests1,2020-01-07 02:16:42,1578372402678
2,312571,loadTests2,2020-01-10 04:40:02,1578640202678
2,312571,loadTests3,2020-01-13 07:03:22,1578908002678
2,312571,loadTests1,2020-01-16 09:26:42,1579175802678
2,312571,loadTests2,2020-01-19 11:50:02,1579443602678
2,312571,loadTests3,2020-01-22 14:13:22,1579711402678
2,312571,loadTests1,2020-01-25 16:36:42,1579979202678
2,312571,loadTests2,2020-01-28 19:00:02,1580247002678


[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/131/datasource/30
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-1131datasource30_cached_om_fvjrc
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/131/datasource/30 locally in /tmp/s3kumaran-airt-service-eu-west-1131datasource30_cached_om_fvjrc
[INFO] airt.remote_path: S3Path.__enter__(): pulling data from s3://kumaran-airt-service-eu-west-1/131/datasource/30 to /tmp/s3kumaran-airt-service-eu-west-1131datasource30_cached_om_fvjrc
[INFO] airt.remote_path: S3Path._clean_up(): removing local cache path /tmp/s3kumaran-airt-service-eu-west-1131datasource30_cached_om_fvjrc
[INFO] airt.remote_path: S3Path._clean_up(): removing local cache path /tmp/s3kumaran-airt-service-eu-west-1131datasource30_cached_oa1qbq69


In [None]:
# Create azure datablob and datasource from it.
# Leaves datablob_id, datasource_id and user_id for the next cell.

with get_session_with_context() as session:
    user = session.exec(select(User).where(User.username == test_username)).one()
    uri = "https://testairtservice.blob.core.windows.net/test-container/account_312571_events"

    # Fetch an access key of the source storage account; list_keys takes the
    # resource group name followed by the storage account name.
    storage_client = StorageManagementClient(
        DefaultAzureCredential(), environ["AZURE_SUBSCRIPTION_ID"]
    )
    keys = storage_client.storage_accounts.list_keys(
        "test-airt-service", "testairtservice"
    )
    credential = keys.keys[0].value

    from_azure_blob_storage_request = FromAzureBlobStorageRequest(
        uri=uri,
        credential=credential,
        region="westeurope",
        tag="my_azure_blob_storage_datablob_tag",
    )
    b = BackgroundTasks()

    # Test using FastAPIBatchJobContext with set_env_variable_context
    with set_env_variable_context(variable="JOB_EXECUTOR", value="fastapi"):
        datablob = from_azure_blob_storage_route(
            from_azure_blob_storage_request=from_azure_blob_storage_request,
            user=user,
            session=session,
            background_tasks=b,
        )
    display(datablob)

    # Run the pull directly for the datablob created by the route
    azure_blob_storage_pull(datablob_id=datablob.id)

    # Look the datablob up again by uuid to get the id used for the datasource.
    # Note: datablob_id is reassigned from datablob.id at the end of this cell.
    datablob_id = (
        session.exec(select(DataBlob).where(DataBlob.uuid == datablob.uuid)).one().id
    )
    datasource = DataSource(
        datablob_id=datablob_id,
        cloud_provider=datablob.cloud_provider,
        region=datablob.region,
        total_steps=1,
        user=user,
    )
    session.add(datasource)
    session.commit()

    # Keep plain ids so that the next cell does not depend on this session
    datasource_id = datasource.id
    datablob_id = datablob.id
    user_id = user.id

[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] airt_service.batch_job: create_batch_job(): command='azure_blob_storage_pull 52', task='csv_processing'
[INFO] airt_service.batch_job_components.base: Entering FastAPIBatchJobContext(task=csv_processing)
[INFO] airt_service.batch_job: batch_ctx=FastAPIBatchJobContext(task=csv_processing)
[INFO] airt_service.batch_job_components.fastapi: FastAPIBatchJobContext.create_job(self=FastAPIBatchJobContext(task=csv_processing), command='azure_blob_storage_pull 52', environment_vars={'AWS_ACCESS_KEY_ID': '********************', 'AWS_SECRET_ACCESS_KEY': '****************************************', 'AWS_DEFAULT_REGION': 'eu-west-1', 'AZURE_SUBSCRIPTION_ID': '**********************************

DataBlob(id=52, uuid=UUID('be427bf1-11c5-47c1-b725-99ac8456fe60'), type='azure_blob_storage', uri='https://****************************************@testairtservice.blob.core.windows.net/test-container/account_312571_events', source='https://testairtservice.blob.core.windows.net/test-container/account_312571_events', total_steps=1, completed_steps=0, folder_size=None, cloud_provider=<CloudProvider.azure: 'azure'>, region='westeurope', error=None, disabled=False, path=None, created=datetime.datetime(2022, 10, 27, 8, 11, 46), user_id=131, pulled_on=None, tags=[Tag(uuid=UUID('29dc9d93-1484-46f6-85dd-e5e8b9e57319'), name='latest', id=2, created=datetime.datetime(2022, 10, 27, 7, 59, 7)), Tag(uuid=UUID('3bd8829c-d4e4-4e27-af72-13a75e4df15f'), name='my_azure_blob_storage_datablob_tag', id=6, created=datetime.datetime(2022, 10, 27, 8, 0, 49))])

[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url https://kumsairtsdevwesteurope.blob.core.windows.net/kumsairtsdevwesteurope/131/datablob/52
[INFO] airt.remote_path: AzureBlobPath._create_cache_path(): created cache path: /tmp/httpskumsairtsdevwesteuropeblobcorewindowsnetkumsairtsdevwesteurope131datablob52_cached_ms1bfay0
[INFO] airt.remote_path

In [None]:
# Test process_parquet for azure datablob's datasource
process_parquet(
    datablob_id=datablob_id,
    datasource_id=datasource_id,
    deduplicate_data=True,
    index_column="PersonId",
    sort_by="OccurredTime",
    blocksize="256MB",
    kwargs_json=json.dumps(
        dict(
            usecols=[0, 1, 2, 3, 4],
            parse_dates=["OccurredTime"],
        )
    ),
)

with get_session_with_context() as session:
    datasource = session.exec(
        select(DataSource).where(DataSource.id == datasource_id)
    ).one()
    display(datasource)
    # process_parquet stores failures on datasource.error instead of raising,
    # so without these checks this cell would pass even when processing failed.
    assert datasource.error is None, datasource.error
    # Same dataset as the AWS test above, so the same values are expected
    assert (
        datasource.folder_size == 6619982
    ), f"{datasource=}, {datasource.folder_size=}"
    assert datasource.no_of_rows == 498961, datasource.no_of_rows
    assert datasource.hash == "1dd8ee7a0f96a48110dec6e25891d18d", datasource.hash

[INFO] __main__: process_parquet(datablob_id=52, datasource_id=31): processing user uploaded parquet files for datablob_id=52 and uploading parquet back to S3 for datasource_id=31
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredentia

[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] __main__: process_parquet(datablob_id=52, datasource_id=31): completed


DataSource(id=31, uuid=UUID('5634e432-0102-4fd8-94fe-d5996b62fd86'), hash='1dd8ee7a0f96a48110dec6e25891d18d', total_steps=1, completed_steps=1, folder_size=6619982, no_of_rows=498961, cloud_provider=<CloudProvider.azure: 'azure'>, region='westeurope', error=None, disabled=False, path='https://kumsairtsdevwesteurope.blob.core.windows.net/kumsairtsdevwesteurope/131/datasource/31', created=datetime.datetime(2022, 10, 27, 8, 12, 17), user_id=131, pulled_on=datetime.datetime(2022, 10, 27, 8, 12, 32), tags=[])