# 1. Calculate how many accounts of each type there are in the "transactions.csv" file using Spark SQL.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, DateType

def calculateAccountTypes(filePath: str):
    """Count the rows of a transactions CSV file per account type via Spark SQL.

    The file at ``filePath`` is read as a semicolon-delimited CSV with a
    header row, using a fixed five-column schema. The loaded data is
    registered as the temporary view ``transactions`` and aggregated with a
    SQL ``GROUP BY`` query.

    Args:
        filePath: Location of the CSV file to load.

    Returns:
        The query result as a DataFrame with the columns ``account_type``
        and ``account_type_count`` (number of rows in each group).
    """
    # Column layout of the CSV, in file order; every column is nullable.
    columnTypes = [
        ("id", IntegerType()),
        ("amount", DoubleType()),
        ("account_type", StringType()),
        ("transaction_date", DateType()),
        ("country", StringType()),
    ]
    csvSchema = StructType(
        [StructField(name, dataType, nullable=True) for name, dataType in columnTypes]
    )

    session = SparkSession.builder.appName("AccountTypeCalculator").getOrCreate()

    # Load the file with the explicit schema instead of inferring one.
    csvReader = (
        session.read
        .option("header", "true")
        .option("delimiter", ";")
        .schema(csvSchema)
    )
    transactions = csvReader.csv(filePath)

    # The view name must match the table referenced in the query below.
    transactions.createOrReplaceTempView("transactions")

    countQuery = (
        "SELECT account_type, COUNT(*) as account_type_count "
        "FROM transactions "
        "GROUP BY account_type"
    )
    return session.sql(countQuery)

# Aggregate the transactions file into one row per account type
accountCounts = calculateAccountTypes(
    filePath="input_csv_files//transactions.csv"
)

# Print the resulting table to standard output
accountCounts.show()


+------------+------------------+
|account_type|account_type_count|
+------------+------------------+
|    Personal|           1667072|
|Professional|           1667358|
|    Business|           1665570|
+------------+------------------+

