# 1. Read the "transactions.csv" file

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this notebook
spark = SparkSession.builder.appName("Spark CSV Analysis").getOrCreate()

# Load the semicolon-delimited transactions file: column names come from the
# header row and column types are inferred by Spark
transactionsDF = (
    spark.read
    .options(header="true", inferSchema="true", delimiter=";")
    .csv("input_csv_files/transactions.csv")
)


# 2. Create a function that assigns each transaction a level (low / average / high) based on amount quartiles

In [2]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lit, when

def assignTransactionLevel(
    transactions: DataFrame,
    amountColumn: str = "amount",
    relativeError: float = 0.0,
) -> DataFrame:
    """Label each transaction as "low", "average" or "high" by its amount.

    The 25th and 75th percentiles of ``amountColumn`` are computed once and
    then used as fixed thresholds for every row:

    * amount > 75th percentile                      -> "high"
    * 25th percentile < amount <= 75th percentile   -> "average"
    * amount <= 25th percentile                     -> "low"
    * amount is null                                -> null (no level)

    Args:
        transactions: Input DataFrame containing a numeric ``amountColumn``.
        amountColumn: Name of the column to classify. Defaults to "amount".
        relativeError: Precision forwarded to ``approxQuantile``. The default
            of 0.0 requests exact quantiles, which Spark documents as
            potentially very expensive on large data; pass a small positive
            value (e.g. 0.01) to trade accuracy for speed.

    Returns:
        A new DataFrame with an additional string column "level". If no
        quantiles can be computed (empty DataFrame, or a column holding only
        nulls/NaNs) the "level" column is null for every row.
    """
    quantiles = transactions.stat.approxQuantile(
        amountColumn, [0.25, 0.75], relativeError
    )

    # approxQuantile returns an empty list when there is nothing to measure;
    # indexing it would raise IndexError, so emit an all-null level instead.
    if len(quantiles) < 2:
        return transactions.withColumn("level", lit(None).cast("string"))

    lowerQuantile, upperQuantile = quantiles
    amount = col(amountColumn)

    # Branches are evaluated in order, so "average" only needs the lower
    # bound. There is deliberately no otherwise(): rows with a null amount
    # match no branch and stay null instead of being mislabelled "low".
    level = (
        when(amount > upperQuantile, "high")
        .when(amount > lowerQuantile, "average")
        .when(amount.isNotNull(), "low")
    )
    return transactions.withColumn("level", level)

# Label every transaction, then preview the first 20 rows of the result
transactionsWithLevel = assignTransactionLevel(transactionsDF)
transactionsWithLevel.show(n=20)


+------+-------+------------+----------------+-------+-------+
|    id| amount|account_type|transaction_date|country|  level|
+------+-------+------------+----------------+-------+-------+
|179528|-730.86|    Business|      2013-07-10|     SV|    low|
|378343|-946.98|    Personal|      2018-04-06|     YE|    low|
| 75450|7816.92|Professional|      2016-11-20|     SI|   high|
|357719| 704.02|    Business|      2016-11-06|     ID|    low|
|110511| 3462.6|    Personal|      2018-01-18|     BS|average|
|461830| 762.81|Professional|      2017-06-20|     CN|    low|
| 30180|5390.24|Professional|      2021-05-26|     GN|average|
| 65398|4765.77|    Personal|      2018-05-01|     TR|average|
|170899|8775.89|    Business|      2013-10-16|     SK|   high|
|234300|8455.18|Professional|      2015-10-06|     LU|   high|
|208027| 6244.1|    Business|      2020-03-06|     AE|average|
|161212|5904.56|    Personal|      2016-09-07|     EG|average|
|105372|4079.76|Professional|      2015-02-12|     MT|a