In [1]:
import boto3
from utils import apply_schema
import awswrangler as wr
import pandas as pd

# AWS session bound to the named local profile "prod-nomo"; every S3 and Glue
# call in this notebook goes through it.
session = boto3.Session(profile_name="prod-nomo")
# Glue client, passed to get_athena_schema to look up the table definition.
glue_client = session.client("glue")

############################################
# Edit these two values to choose which table / partition the rewrite loop
# further down operates on.
athena_table_name = "dynamo_sls_clearbank_transactions"
partition = "date=2025-06-18"
############################################

# S3 prefix of the selected partition in the raw datalake bucket. It has no
# trailing slash, so it is matched as a plain key prefix when listing objects.
partition_prefix = f"s3://bb2-prod-datalake-raw/{athena_table_name}/{partition}"


def get_athena_schema(database_name, table_name, glue_client):
    """Look up a table in the Glue catalog and return its column types.

    Args:
        database_name: Glue database that holds the table.
        table_name: Name of the table to describe.
        glue_client: Client object exposing ``get_table`` and
            ``exceptions.EntityNotFoundException`` (a boto3 Glue client).

    Returns:
        A dict mapping each column name to its type string, taken from the
        table's ``StorageDescriptor.Columns`` list, or ``None`` when the
        table does not exist or any other error occurs. Failures are
        reported with ``print`` and never raised to the caller.
    """
    try:
        table = glue_client.get_table(DatabaseName=database_name, Name=table_name)
        schema = {}
        # Later entries win if a column name repeats, as in a dict literal.
        for column in table["Table"]["StorageDescriptor"]["Columns"]:
            schema[column["Name"]] = column["Type"]
    except glue_client.exceptions.EntityNotFoundException:
        print(f"Table '{table_name}' not found in database '{database_name}'.")
        schema = None
    except Exception as e:
        # Best-effort lookup: report the problem and signal failure with None.
        print(f"Error fetching schema: {e}")
        schema = None
    return schema


def write_processed_to_s3(df, file):
    """Write ``df`` to ``file`` as snappy-compressed parquet.

    The write goes through the notebook-level ``session`` and uses
    ``dataset=False`` and ``index=False``. Anything already stored at
    ``file`` is presumably replaced - confirm against the awswrangler docs.

    Args:
        df: DataFrame to persist.
        file: Destination S3 path of the parquet object.

    Returns:
        Whatever ``wr.s3.to_parquet`` returns for the write.
    """
    # The message says "Athena" but the write target is the S3 path itself.
    print("info", "Writing data to Athena", {"path": file})
    return wr.s3.to_parquet(
        df=df,
        path=file,
        boto3_session=session,
        compression="snappy",
        dataset=False,
        index=False,
    )

In [2]:
# Column-name -> type mapping for the configured table in the "datalake_raw"
# Glue database. get_athena_schema prints the error and returns None if the
# lookup fails, so check the displayed value before running the rewrite loop.
schema = get_athena_schema("datalake_raw", athena_table_name, glue_client)
schema

{'aws_region': 'string',
 'dynamodb_new_image_transaction_id_s': 'string',
 'dynamodb_new_image_transaction_m_status_s': 'string',
 'dynamodb_new_image_transaction_m_supplementary_data_l_0_m_value_s': 'string',
 'dynamodb_new_image_transaction_m_supplementary_data_l_0_m_name_s': 'string',
 'dynamodb_new_image_transaction_m_supplementary_data_l_1_m_value_s': 'string',
 'dynamodb_new_image_transaction_m_supplementary_data_l_1_m_name_s': 'string',
 'dynamodb_new_image_transaction_m_supplementary_data_l_2_m_value_s': 'string',
 'dynamodb_new_image_transaction_m_supplementary_data_l_2_m_name_s': 'string',
 'dynamodb_new_image_transaction_m_supplementary_data_l_3_m_value_s': 'string',
 'dynamodb_new_image_transaction_m_supplementary_data_l_3_m_name_s': 'string',
 'dynamodb_new_image_transaction_m_supplementary_data_l_4_m_value_s': 'string',
 'dynamodb_new_image_transaction_m_supplementary_data_l_4_m_name_s': 'string',
 'dynamodb_new_image_transaction_m_supplementary_data_l_5_m_value_s': 'str

In [3]:
# One hard-coded parquet object used for the spot-check read in the next cell.
# NOTE(review): it sits in the date=2024-10-25 partition, not the `partition`
# configured at the top, and the name `file` is reused as the loop variable in
# the rewrite cell, so this value is overwritten once that loop runs.
file = "s3://bb2-prod-datalake-raw/dynamo_sls_clearbank_transactions/date=2024-10-25/00c8d2c350e8472ea0d2b8ef9d18166d.snappy.parquet"

In [4]:
# Read the sample object to eyeball its current columns before any rewrite.
# NOTE(review): the rendered head() below includes customer names and IBANs;
# clear this cell's output before sharing or committing the notebook.
df = wr.s3.read_parquet(path=file, boto3_session=session)
df.head()

Unnamed: 0,aws_region,event_id,event_name,user_identity,record_format,table_name,dynamodb_approximate_creation_date_time,dynamodb_keys_transaction_id_s,dynamodb_new_image_transaction_id_s,dynamodb_new_image_transaction_m_status_s,...,dynamodb_new_image_transaction_m_counterpart_account_m_owner_name_s,dynamodb_new_image_transaction_m_counterpart_account_m_iban_s,dynamodb_new_image_transaction_m_counterpart_account_m_institution_name_s,dynamodb_new_image_transaction_m_counterpart_account_m_transaction_owner_name_s,dynamodb_new_image_transaction_m_end_to_end_transaction_id_s,dynamodb_new_image_transaction_m_timestamp_settled_s,dynamodb_new_image_transaction_m_actual_end_to_end_transaction_id_s,dynamodb_size_bytes,event_source,timestamp_extracted
0,eu-west-2,b08b5dc9-07da-42bf-81ac-43b2587dd2a9,INSERT,,application/json,sls-clearbank-default-ClearBankTransactions-1L...,2024-10-25 08:27:54.206,417d15995616499a96e5f1705ac80418,417d15995616499a96e5f1705ac80418,Settled,...,Not Provided,GB80BLME04136800695958,Bank of London and The Middle East,LEENA H S A ALABDELI,8a5e05cf92c210000192c2c9d937111a,2024-10-25T08:27:53.56Z,c64dfdf051d44db3bf1020241025826041368,812,aws:dynamodb,2024-10-25 08:28:51.252


In [None]:


# Guard: get_athena_schema returns None when the Glue lookup fails. Stop before
# touching S3 rather than rewriting production raw files without a schema.
if schema is None:
    raise RuntimeError(
        f"No Glue schema available for '{athena_table_name}'; "
        f"refusing to rewrite objects under {partition_prefix}"
    )

# Every parquet object whose key starts with the configured partition prefix.
files = wr.s3.list_objects(
    path=partition_prefix,
    suffix=".parquet",
    boto3_session=session,
)

# Each object is read, passed through apply_schema (imported from utils;
# presumably casts columns to the Glue types - confirm) and written back to the
# SAME S3 path, i.e. the raw file is overwritten in place with no backup copy.
for file in files:
    print("********")
    print(file)
    df = wr.s3.read_parquet(path=file, boto3_session=session)
    final_df = apply_schema(df, schema)
    write_processed_to_s3(final_df, file)
    print("********")


********
s3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-30/004d6604b83b4c258b6c5ea4bd1cdd1d.snappy.parquet
info Writing data to Athena {'path': 's3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-30/004d6604b83b4c258b6c5ea4bd1cdd1d.snappy.parquet'}
********
********
s3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-30/0079c87d79a74214a0de65bb33a34c5f.snappy.parquet
info Writing data to Athena {'path': 's3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-30/0079c87d79a74214a0de65bb33a34c5f.snappy.parquet'}
********
********
s3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-30/00e9b9160d6a47a4afdbe6953afd1a1a.snappy.parquet
info Writing data to Athena {'path': 's3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-30/00e9b9160d6a47a4afdbe6953afd1a1a.snappy.parquet'}
********
********
s3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-30/013d0997c33746608b82d7bc8ab5d8e3.snappy.parquet
info Writing data to Athena {'pat