# DATASET 5 — TRANSACTION LOGS (MIXED DATES & NUMBERS)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Create (or reuse, if one is already running) the Spark session used by
# every DataFrame operation in this notebook.
spark = (
    SparkSession.builder
    .appName("TRANSACTION LOGS")
    .getOrCreate()
)

In [1]:
# Raw transaction log rows: (transaction_id, transaction_date, transaction_amount).
# The values are deliberately inconsistent -- dates appear in several textual
# layouts (or are missing), and amounts are a mix of numeric strings, Python
# ints and free text -- so the cleaning steps below have something to do.
raw_transactions = [
    ("T001", "2024-01-05", "45000"),
    ("T002", "05/01/2024", 52000),
    ("T003", "Jan 06 2024", "Thirty Thousand"),
    ("T004", None, 38000),
    ("T005", "2024/01/07", "42000"),
]

1. Design schema using StructType

In [3]:
from pyspark.sql.types import StructType, StructField, StringType
# Every column is ingested as a string so that no raw value is rejected at
# load time; typed columns are derived in the later steps.
# Only the transaction id is declared non-nullable.
transactions_schema = (
    StructType()
    .add("transaction_id", StringType(), False)
    .add("transaction_date", StringType(), True)
    .add("transaction_amount", StringType(), True)
)

2. Normalize all dates into DateType

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, when
from pyspark.sql.types import DateType

# Reuse the session created earlier when the notebook is run top to bottom;
# only build one here if this cell is executed on its own.
if 'spark' not in locals() or not isinstance(spark, SparkSession):
    spark = SparkSession.builder \
        .appName("TRANSACTION LOGS") \
        .getOrCreate()

df_transactions = spark.createDataFrame(data=raw_transactions, schema=transactions_schema)

# Normalise the raw date strings into a DateType column.
#
# Each branch pairs a regex that recognises one textual layout with the
# to_date() pattern used to parse it; the first matching branch wins and
# anything unmatched (including NULL) becomes NULL.
#
# The regexes are raw strings so that sequences such as `\s` and `\d` reach the
# regex engine untouched instead of being treated as (invalid) Python string
# escapes, which is what triggered the warning on the previous version.
raw_date = col("transaction_date")

df_transactions_clean = df_transactions.withColumn(
    "transaction_date_clean",
    # ISO layout, e.g. 2024-01-05
    when(raw_date.rlike(r"^\d{4}-\d{2}-\d{2}$"), to_date(raw_date, "yyyy-MM-dd"))
    # Day-first slashes, e.g. 05/01/2024
    .when(raw_date.rlike(r"^\d{2}/\d{2}/\d{4}$"), to_date(raw_date, "dd/MM/yyyy"))
    # Three-letter month name: optional trailing dot, one- or two-digit day,
    # or the fully compact form (e.g. Jan062024). This matches the same set of
    # strings as the original six alternatives.
    # NOTE(review): every variant is parsed with "MMM dd yyyy"; confirm that the
    # variants other than `Mon dd yyyy` actually parse under the configured
    # Spark time parser policy.
    .when(
        raw_date.rlike(r"^[A-Za-z]{3}\.?\s\d{1,2}\s\d{4}$|^[A-Za-z]{3}\d{6}$"),
        to_date(raw_date, "MMM dd yyyy"),
    )
    # Year-first slashes, e.g. 2024/01/07
    .when(raw_date.rlike(r"^\d{4}/\d{2}/\d{2}$"), to_date(raw_date, "yyyy/MM/dd"))
    .otherwise(None)
)

print("Transactions DataFrame with normalized dates:")
df_transactions_clean.select("transaction_id", "transaction_date", "transaction_date_clean", "transaction_amount").show(truncate=False)
df_transactions_clean.printSchema()

  .when(col("transaction_date").rlike("^[A-Za-z]{3} \\d{2} \\d{4}$|^[A-Za-z]{3}\\s\\d{2}\\s\\d{4}$|^[A-Za-z]{3}\\.\s\\d{2}\\s\\d{4}$|^[A-Za-z]{3}\\s\\d{1}\\s\\d{4}$|^[A-Za-z]{3}\\.\s\\d{1}\\s\\d{4}$|^[A-Za-z]{3}\\d{2}\\d{4}$"), to_date(col("transaction_date"), "MMM dd yyyy"))


Transactions DataFrame with normalized dates:
+--------------+----------------+----------------------+------------------+
|transaction_id|transaction_date|transaction_date_clean|transaction_amount|
+--------------+----------------+----------------------+------------------+
|T001          |2024-01-05      |2024-01-05            |45000             |
|T002          |05/01/2024      |2024-01-05            |52000             |
|T003          |Jan 06 2024     |2024-01-06            |Thirty Thousand   |
|T004          |NULL            |NULL                  |38000             |
|T005          |2024/01/07      |2024-01-07            |42000             |
+--------------+----------------+----------------------+------------------+

root
 |-- transaction_id: string (nullable = false)
 |-- transaction_date: string (nullable = true)
 |-- transaction_amount: string (nullable = true)
 |-- transaction_date_clean: date (nullable = true)



3. Convert amount into integer

In [6]:
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType

# Derive an integer amount column from the raw string amount.
# A value is converted only when it is present, non-empty and consists solely
# of ASCII digits; everything else (e.g. amounts written in words) becomes NULL.
amount_raw = col("transaction_amount")
amount_is_digits = (
    amount_raw.isNotNull()
    & (amount_raw != '')
    & amount_raw.rlike("^[0-9]+$")
)
amount_as_int = when(amount_is_digits, amount_raw.cast(IntegerType())).otherwise(None)

df_amount = df_transactions_clean.withColumn("transaction_amount_int", amount_as_int)

print("Transactions DataFrame with normalized amounts:")
df_amount.select(
    "transaction_id",
    "transaction_amount",
    "transaction_amount_int",
).show(truncate=False)
df_amount.printSchema()

Transactions DataFrame with normalized amounts:
+--------------+------------------+----------------------+
|transaction_id|transaction_amount|transaction_amount_int|
+--------------+------------------+----------------------+
|T001          |45000             |45000                 |
|T002          |52000             |52000                 |
|T003          |Thirty Thousand   |NULL                  |
|T004          |38000             |38000                 |
|T005          |42000             |42000                 |
+--------------+------------------+----------------------+

root
 |-- transaction_id: string (nullable = false)
 |-- transaction_date: string (nullable = true)
 |-- transaction_amount: string (nullable = true)
 |-- transaction_date_clean: date (nullable = true)
 |-- transaction_amount_int: integer (nullable = true)



4. Identify unrecoverable records

In [7]:
from pyspark.sql.functions import col

# A record is "unrecoverable" for a field when the raw value was supplied
# (not NULL and not an empty string) but the cleaned column is still NULL,
# i.e. the value was present yet could not be converted.
date_raw, date_parsed = col("transaction_date"), col("transaction_date_clean")
amount_raw, amount_parsed = col("transaction_amount"), col("transaction_amount_int")

date_not_converted = date_parsed.isNull() & date_raw.isNotNull() & (date_raw != '')
amount_not_converted = amount_parsed.isNull() & amount_raw.isNotNull() & (amount_raw != '')

failed_date_conversion = df_amount.where(date_not_converted)
failed_amount_conversion = df_amount.where(amount_not_converted)

print("Records with unrecoverable transaction dates:")
failed_date_conversion.select(
    "transaction_id", "transaction_date", "transaction_date_clean"
).show(truncate=False)

print("Records with unrecoverable transaction amounts:")
failed_amount_conversion.select(
    "transaction_id", "transaction_amount", "transaction_amount_int"
).show(truncate=False)

Records with unrecoverable transaction dates:
+--------------+----------------+----------------------+
|transaction_id|transaction_date|transaction_date_clean|
+--------------+----------------+----------------------+
+--------------+----------------+----------------------+

Records with unrecoverable transaction amounts:
+--------------+------------------+----------------------+
|transaction_id|transaction_amount|transaction_amount_int|
+--------------+------------------+----------------------+
|T003          |Thirty Thousand   |NULL                  |
+--------------+------------------+----------------------+



5. Separate valid vs invalid transactions

In [8]:
from pyspark.sql.functions import col

# Split the transactions into two sets that together cover every input row.
#
# A transaction is valid only when BOTH the cleaned date and the cleaned amount
# are present. Invalid is defined as the exact complement of that predicate.
# Previously the invalid filter required the raw value to be non-null and
# non-empty, so a row whose date or amount was simply missing (e.g. T004, whose
# transaction_date is NULL) matched neither filter and silently vanished from
# both outputs.
#
# isNotNull() never evaluates to NULL, so negating the predicate with `~` is a
# true complement: every row lands in exactly one of the two DataFrames.
is_valid_transaction = (
    col("transaction_date_clean").isNotNull()
    & col("transaction_amount_int").isNotNull()
)

df_valid_transactions = df_amount.filter(is_valid_transaction)

df_invalid_transactions = df_amount.filter(~is_valid_transaction)

print("Valid Transactions:")
df_valid_transactions.show(truncate=False)

print("Invalid Transactions:")
df_invalid_transactions.show(truncate=False)

Valid Transactions:
+--------------+----------------+------------------+----------------------+----------------------+
|transaction_id|transaction_date|transaction_amount|transaction_date_clean|transaction_amount_int|
+--------------+----------------+------------------+----------------------+----------------------+
|T001          |2024-01-05      |45000             |2024-01-05            |45000                 |
|T002          |05/01/2024      |52000             |2024-01-05            |52000                 |
|T005          |2024/01/07      |42000             |2024-01-07            |42000                 |
+--------------+----------------+------------------+----------------------+----------------------+

Invalid Transactions:
+--------------+----------------+------------------+----------------------+----------------------+
|transaction_id|transaction_date|transaction_amount|transaction_date_clean|transaction_amount_int|
+--------------+----------------+------------------+--------------

6. Produce a clean transactions DataFrame

In [9]:
# Publish the cleaned view: keep the id and expose the typed date/amount
# columns under the original column names.
clean_date = col("transaction_date_clean").alias("transaction_date")
clean_amount = col("transaction_amount_int").alias("transaction_amount")

df_final_clean_transactions = df_valid_transactions.select(
    "transaction_id",
    clean_date,
    clean_amount,
)

print("Final Clean Transactions DataFrame:")
df_final_clean_transactions.show(truncate=False)
df_final_clean_transactions.printSchema()

Final Clean Transactions DataFrame:
+--------------+----------------+------------------+
|transaction_id|transaction_date|transaction_amount|
+--------------+----------------+------------------+
|T001          |2024-01-05      |45000             |
|T002          |2024-01-05      |52000             |
|T005          |2024-01-07      |42000             |
+--------------+----------------+------------------+

root
 |-- transaction_id: string (nullable = false)
 |-- transaction_date: date (nullable = true)
 |-- transaction_amount: integer (nullable = true)

