In [None]:
import os

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import StringType

# Homebrew JDK 11 location on macOS. Only override JAVA_HOME when this
# directory actually exists, so the notebook does not replace a valid
# JAVA_HOME with a missing path on other machines.
HOMEBREW_JDK11_HOME = "/opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home"
if os.path.isdir(HOMEBREW_JDK11_HOME):
    os.environ["JAVA_HOME"] = HOMEBREW_JDK11_HOME

# Local Spark session; the driver is bound to the loopback address.
spark = (
    SparkSession.builder
    .appName("PaymentsBronzeTest")
    .master("local[*]")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.driver.host", "127.0.0.1")
    .getOrCreate()
)

# Keep notebook output readable: only log errors.
spark.sparkContext.setLogLevel("ERROR")
print("Spark version:", spark.version)


In [None]:
# Load the raw transactions CSV for ingest date 2025-09-20. The first row is
# treated as the header and column types are inferred by Spark.
raw_transactions_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/raw/transactions/ingest_date=2025-09-20/transactions_2025-09-20.csv")
)

# Preview the first 10 rows.
raw_transactions_df.show(10)


# Data cleaning / transformation

### Cast numeric and timestamp fields

Input: amount (string/float), txn_ts (string).

Output:

- `amount` → `DecimalType(12,2)`
- `txn_ts` → `TimestampType`

Why: Casting explicitly keeps the schema consistent, and a fixed-scale decimal preserves numeric precision for amounts.


In [5]:
from pyspark.sql.types import DecimalType
from pyspark.sql import functions as F


def cast_dtypes(df: DataFrame) -> DataFrame:
    """
    Cast columns to correct data types for Bronze layer.

    Reads the ``amount`` and ``txn_ts`` columns of ``df`` and replaces them
    with typed versions; all other columns are passed through unchanged.

    Args:
        df (pyspark.sql.DataFrame): Input dataframe with raw schema.

    Returns:
        pyspark.sql.DataFrame: New dataframe with ``amount`` cast to
        Decimal(12,2) and ``txn_ts`` converted with ``to_timestamp``.
    """
    # The annotations use the imported DataFrame class: spelling them as
    # `pyspark.sql.dataframe.DataFrame` needs the bare name `pyspark` to be
    # bound, which it is not on a fresh kernel (NameError at definition time).
    return (
        df
        .withColumn("amount", F.col("amount").cast(DecimalType(12, 2)))
        .withColumn("txn_ts", F.to_timestamp("txn_ts"))
    )

In [None]:
# Sanity check: print the schema returned by cast_dtypes for the raw transactions.
cast_dtypes(raw_transactions_df).printSchema()

In [None]:
def normalise_strings(df: DataFrame) -> DataFrame:
    """Normalise string columns by trimming whitespace and converting to uppercase.

    Only columns whose schema type is ``StringType`` are touched; every other
    column is passed through unchanged.

    Args:
        df (pyspark.sql.DataFrame): Input DataFrame with string columns to normalise.

    Returns:
        pyspark.sql.DataFrame: DataFrame with normalised string columns.
    """
    # A DataType instance never compares equal to the Python str 'string', so
    # select string columns by type. The previous equality check always
    # produced an empty list and left the data untouched.
    string_cols = [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, StringType)
    ]

    for col_name in string_cols:
        df = df.withColumn(col_name, F.upper(F.trim(F.col(col_name))))

    return df

In [None]:
# Print the schema of the dataframe returned by normalise_strings.
# NOTE(review): trimming / upper-casing does not change column types, so the
# schema alone cannot show the effect; inspect values (e.g. .show()) to confirm.
normalise_strings(raw_transactions_df).printSchema()