In [1]:
import boto3
from utils import apply_schema
import awswrangler as wr
import pandas as pd

# AWS session bound to a named local profile. It backs the Glue client and is
# passed explicitly to the awswrangler calls in this notebook.
session = boto3.Session(profile_name="prod-nomo")
glue_client = session.client("glue")

# ---- Edit these two values to target a different table / partition ----
athena_table_name = "dynamo_sls_payees_v2"
partition = "date=2025-01-31"
# -----------------------------------------------------------------------

# S3 prefix under which the partition's Parquet objects are listed.
partition_prefix = "/".join(
    ["s3://bb2-prod-datalake-raw", athena_table_name, partition]
)


def get_athena_schema(database_name, table_name, glue_client):
    """Look up a table in the Glue catalog and return its column types.

    Args:
        database_name: Glue database that holds the table.
        table_name: Name of the table to look up.
        glue_client: Client object exposing ``get_table`` and
            ``exceptions.EntityNotFoundException``.

    Returns:
        A dict mapping each column name to its type string, built from the
        table's ``StorageDescriptor`` columns only. ``None`` is returned when
        the table does not exist or when any other error occurs; in both
        cases a message is printed instead of raising.
    """
    try:
        table = glue_client.get_table(DatabaseName=database_name, Name=table_name)["Table"]

        name_to_type = {}
        for column in table["StorageDescriptor"]["Columns"]:
            # A repeated column name keeps the last type seen.
            name_to_type[column["Name"]] = column["Type"]
    except glue_client.exceptions.EntityNotFoundException:
        print(f"Table '{table_name}' not found in database '{database_name}'.")
        return None
    except Exception as e:
        # Best-effort lookup: report the problem and signal failure with None.
        print(f"Error fetching schema: {e}")
        return None
    else:
        return name_to_type


def write_processed_to_s3(df, file):
    """Write ``df`` as a Parquet object at the S3 path ``file``.

    Prints a log line with the target path, then calls ``wr.s3.to_parquet``
    using the module-level ``session``.

    Args:
        df: DataFrame to serialise.
        file: Destination S3 path, passed to ``to_parquet`` as ``path``.

    Returns:
        Whatever ``wr.s3.to_parquet`` returns.
    """
    print("info", "Writing data to Athena", {"path": file})

    # dataset=False: ``file`` is handed over as-is rather than in dataset mode.
    parquet_options = {
        "index": False,
        "dataset": False,
        "compression": "snappy",
        "boto3_session": session,
    }
    return wr.s3.to_parquet(df=df, path=file, **parquet_options)

In [2]:
# Column-name -> type mapping from the Glue catalog; passed to apply_schema.
schema = get_athena_schema("datalake_raw", athena_table_name, glue_client)

# get_athena_schema signals failure by returning None. Every file below is
# overwritten in place, so stop before touching anything if there is no schema.
if schema is None:
    raise RuntimeError(
        f"Could not load Glue schema for 'datalake_raw.{athena_table_name}'; "
        "aborting before any file is rewritten."
    )

# All Parquet objects under the selected partition prefix.
files = wr.s3.list_objects(
    path=partition_prefix,
    suffix=".parquet",
    boto3_session=session,
)

for file in files:
    print("********")
    print(file)
    df = wr.s3.read_parquet(path=file, boto3_session=session)
    if df.empty:
        # wr.s3.to_parquet refuses empty DataFrames; leaving the object as it
        # is avoids aborting the run with the partition half-rewritten.
        print("Skipping empty file")
    else:
        final_df = apply_schema(df, schema)
        # The destination is the source path: the object is replaced in place.
        write_processed_to_s3(final_df, file)
    print("********")


********
s3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-31/001cf1eac17849b3940efd7e8d0b5508.snappy.parquet
info Writing data to Athena {'path': 's3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-31/001cf1eac17849b3940efd7e8d0b5508.snappy.parquet'}
********
********
s3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-31/003a6dbbf00a431b89afa47dc8a9eb52.snappy.parquet
info Writing data to Athena {'path': 's3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-31/003a6dbbf00a431b89afa47dc8a9eb52.snappy.parquet'}
********
********
s3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-31/00b3813007974f6090cb6af7ca12e9ed.snappy.parquet
info Writing data to Athena {'path': 's3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-31/00b3813007974f6090cb6af7ca12e9ed.snappy.parquet'}
********
********
s3://bb2-prod-datalake-raw/dynamo_sls_payees_v2/date=2025-01-31/028f25b0341b430ca923eaaae0acb1ce.snappy.parquet
info Writing data to Athena {'pat

In [3]:
# Disabled alternative to the per-file rewrite: it reads every Parquet file
# under the partition prefix, keeps the non-empty frames, concatenates them
# into one DataFrame, applies the schema once and writes a single output.
# The output path below is a placeholder and must be replaced before this
# is re-enabled.
# schema = get_athena_schema("datalake_raw", athena_table_name, glue_client)

# files = wr.s3.list_objects(
#     path=partition_prefix,
#     suffix=".parquet",
#     boto3_session=session,
# )

# dfs = []

# for file in files:
#     print("********")
#     print(file)
#     print("********")
#     df = wr.s3.read_parquet(path=file, boto3_session=session)

#     if not df.empty:
#         dfs.append(df)

# # Concatenate all into one dataframe
# if dfs:
#     final_df = pd.concat(dfs, ignore_index=True)
#     final_df = apply_schema(final_df, schema)

#     # Write processed dataframe to S3
#     write_processed_to_s3(final_df, "s3://your-output-path/processed/")
# else:
#     print("⚠️ No dataframes collected. Nothing to write.")