In [1]:
from email.feedparser import headerRE

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

from pyspark.sql.types import StringType

In [2]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.4-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
spark = (
    SparkSession
    .builder
    .appName("PySpark Zero to Hero")
    .master("local[*]")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.cores", "4")
    .config("spark.sql.shuffle.partitions", "80")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.initialExecutors", "24")
    .config("spark.dynamicAllocation.maxExecutors", "50")
    .config('spark.dynamicAllocation.shuffleTrackingEnabled', 'true')
    .config('spark.dynamicAllocation.executorIdleTimeout', '60')
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
spark.conf.set('spark.sql.adaptive.enabled', 'true')
spark.conf.set('spark.sql.adaptive.coalescePartitions.enabled', 'true')
spark.conf.set('spark.sql.autoBroadcastJoinThreshold', '-1')

In [5]:
df_path = r"F:\Datasets\CSV datasets\Bank_Transaction_Fraud_Detection.csv"

In [6]:
df = (
    spark.read.format('csv')
      .options(header='true', inferSchema='true')
      .load(df_path)
)

In [7]:
df.show(truncate=False)

+------------------------------------+-------------------+------+---+----------------------------------------+------------------+-------------------------+------------+------------------------------------+----------------+-------------------+------------------+------------------------------------+----------------+-----------------+---------------+----------------------------+--------------------------------------------------+-----------+--------+--------------------+----------------+------------------------------+------------------------+
|Customer_ID                         |Customer_Name      |Gender|Age|State                                   |City              |Bank_Branch              |Account_Type|Transaction_ID                      |Transaction_Date|Transaction_Time   |Transaction_Amount|Merchant_ID                         |Transaction_Type|Merchant_Category|Account_Balance|Transaction_Device          |Transaction_Location                              |Device_Type|Is_Fraud|Trans

In [8]:
df.select(*[F.count_distinct(col).alias(col) for col in df.columns]).show()

+-----------+-------------+------+---+-----+----+-----------+------------+--------------+----------------+----------------+------------------+-----------+----------------+-----------------+---------------+------------------+--------------------+-----------+--------+--------------------+----------------+-----------------------+--------------+
|Customer_ID|Customer_Name|Gender|Age|State|City|Bank_Branch|Account_Type|Transaction_ID|Transaction_Date|Transaction_Time|Transaction_Amount|Merchant_ID|Transaction_Type|Merchant_Category|Account_Balance|Transaction_Device|Transaction_Location|Device_Type|Is_Fraud|Transaction_Currency|Customer_Contact|Transaction_Description|Customer_Email|
+-----------+-------------+------+---+-----+----+-----------+------------+--------------+----------------+----------------+------------------+-----------+----------------+-----------------+---------------+------------------+--------------------+-----------+--------+--------------------+----------------+--------

In [9]:
df.select([
    F.sum(F.when(F.col(column).isNull(), 1).otherwise(0)).alias(column) for column in df.columns
]).show()

+-----------+-------------+------+---+-----+----+-----------+------------+--------------+----------------+----------------+------------------+-----------+----------------+-----------------+---------------+------------------+--------------------+-----------+--------+--------------------+----------------+-----------------------+--------------+
|Customer_ID|Customer_Name|Gender|Age|State|City|Bank_Branch|Account_Type|Transaction_ID|Transaction_Date|Transaction_Time|Transaction_Amount|Merchant_ID|Transaction_Type|Merchant_Category|Account_Balance|Transaction_Device|Transaction_Location|Device_Type|Is_Fraud|Transaction_Currency|Customer_Contact|Transaction_Description|Customer_Email|
+-----------+-------------+------+---+-----+----+-----------+------------+--------------+----------------+----------------+------------------+-----------+----------------+-----------------+---------------+------------------+--------------------+-----------+--------+--------------------+----------------+--------

In [10]:
df.count()

200000

In [11]:
(df.select(
    F.col('Customer_Name'),
    F.col('Age'),
    F.col('Transaction_Amount'),
    F.col('Is_Fraud')
).where(
    F.col('Is_Fraud') == 1
).show())

+-------------------+---+------------------+--------+
|      Customer_Name|Age|Transaction_Amount|Is_Fraud|
+-------------------+---+------------------+--------+
|      Theodore Suri| 56|           85269.3|       1|
|        Jalsa Menon| 24|          70219.41|       1|
|       Ishanvi Kant| 39|          28020.64|       1|
|       Peter Dhawan| 52|          37118.69|       1|
|        Gopal Ratta| 28|          17317.33|       1|
|          Ryan Nagi| 43|           1517.59|       1|
|  Rachita Jayaraman| 21|          82488.69|       1|
|      Niharika Kari| 29|          70716.71|       1|
|     Yashoda Parmer| 33|          32001.45|       1|
|Prisha Venkataraman| 41|          39955.85|       1|
|        Zayan Sidhu| 30|          95667.96|       1|
|   Warinder Agrawal| 67|          60502.58|       1|
|       Nitara Batta| 24|          76437.89|       1|
|      Varenya Batta| 33|          50499.73|       1|
|      Yadavi Mammen| 29|          63840.09|       1|
|      Lakshmi Sethi| 38|   

In [12]:
df.createOrReplaceTempView("transactions")

In [13]:
spark.sql(
    """
    SELECT
        Customer_Name,
        Age,
        Transaction_Amount,
        Is_Fraud
    FROM
        transactions
    WHERE
        Is_Fraud = 1
    """
).show()

+-------------------+---+------------------+--------+
|      Customer_Name|Age|Transaction_Amount|Is_Fraud|
+-------------------+---+------------------+--------+
|      Theodore Suri| 56|           85269.3|       1|
|        Jalsa Menon| 24|          70219.41|       1|
|       Ishanvi Kant| 39|          28020.64|       1|
|       Peter Dhawan| 52|          37118.69|       1|
|        Gopal Ratta| 28|          17317.33|       1|
|          Ryan Nagi| 43|           1517.59|       1|
|  Rachita Jayaraman| 21|          82488.69|       1|
|      Niharika Kari| 29|          70716.71|       1|
|     Yashoda Parmer| 33|          32001.45|       1|
|Prisha Venkataraman| 41|          39955.85|       1|
|        Zayan Sidhu| 30|          95667.96|       1|
|   Warinder Agrawal| 67|          60502.58|       1|
|       Nitara Batta| 24|          76437.89|       1|
|      Varenya Batta| 33|          50499.73|       1|
|      Yadavi Mammen| 29|          63840.09|       1|
|      Lakshmi Sethi| 38|   

In [14]:
spark.sql("""
    SELECT
        *
    FROM
        transactions
    WHERE
        Transaction_Type = "Debit"
    AND
        Transaction_Amount > 1000
    AND
        State = "Andhra Pradesh"
""").show()

+--------------------+------------------+------+---+--------------+-------------+--------------------+------------+--------------------+----------------+-------------------+------------------+--------------------+----------------+-----------------+---------------+--------------------+--------------------+-----------+--------+--------------------+----------------+-----------------------+--------------------+
|         Customer_ID|     Customer_Name|Gender|Age|         State|         City|         Bank_Branch|Account_Type|      Transaction_ID|Transaction_Date|   Transaction_Time|Transaction_Amount|         Merchant_ID|Transaction_Type|Merchant_Category|Account_Balance|  Transaction_Device|Transaction_Location|Device_Type|Is_Fraud|Transaction_Currency|Customer_Contact|Transaction_Description|      Customer_Email|
+--------------------+------------------+------+---+--------------+-------------+--------------------+------------+--------------------+----------------+-------------------+-----

In [15]:
spark.sql("""
    SELECT
        State,
        COUNT(State) AS Count
    FROM
        transactions
    GROUP BY
        State
    ORDER BY
        Count DESC
""").show()

+-----------------+-----+
|            State|Count|
+-----------------+-----+
|         Nagaland| 6031|
|        Meghalaya| 6003|
|    Uttar Pradesh| 6002|
|      Uttarakhand| 5985|
|      Lakshadweep| 5954|
|        Telangana| 5952|
|          Haryana| 5947|
|            Delhi| 5943|
|           Kerala| 5933|
|   Madhya Pradesh| 5928|
|Arunachal Pradesh| 5919|
|           Punjab| 5912|
|          Gujarat| 5901|
|           Odisha| 5899|
|        Jharkhand| 5898|
|          Mizoram| 5892|
| Himachal Pradesh| 5875|
|              Goa| 5871|
|          Tripura| 5869|
|          Manipur| 5869|
+-----------------+-----+
only showing top 20 rows



In [16]:
df.select("State").distinct().show()

+----------------+
|           State|
+----------------+
|Himachal Pradesh|
|         Haryana|
|          Kerala|
|       Jharkhand|
|      Tamil Nadu|
|       Telangana|
|           Bihar|
|         Gujarat|
|    Chhattisgarh|
|         Mizoram|
|  Andhra Pradesh|
|         Tripura|
|   Uttar Pradesh|
|        Nagaland|
|         Manipur|
|       Karnataka|
|          Sikkim|
|      Puducherry|
|           Delhi|
|          Odisha|
+----------------+
only showing top 20 rows



In [17]:
from pyspark.sql.types import StringType

In [18]:
df.withColumn('Name_Parts', F.split(F.col('Customer_Name'), ' ')) \
    .withColumn('First_Name', F.element_at(F.col('Name_Parts'), 1)) \
    .withColumn('Last_Name', F.element_at(F.col('Name_Parts'), 2)) \
    .withColumn('First_Initial', F.substring(F.col('First_Name'), 1, 1)) \
    .withColumn('Second_Surname_Letter', F.substring(F.col('Last_Name'), 1, 1)) \
    .withColumn('Combined_Initials', F.concat(F.col('First_Initial'), F.col('Second_Surname_Letter'))) \
    .show()

+--------------------+-------------------+------+---+--------------------+------------------+--------------------+------------+--------------------+----------------+-------------------+------------------+--------------------+----------------+-----------------+---------------+--------------------+--------------------+-----------+--------+--------------------+----------------+-----------------------+--------------------+--------------------+----------+------------+-------------+---------------------+-----------------+
|         Customer_ID|      Customer_Name|Gender|Age|               State|              City|         Bank_Branch|Account_Type|      Transaction_ID|Transaction_Date|   Transaction_Time|Transaction_Amount|         Merchant_ID|Transaction_Type|Merchant_Category|Account_Balance|  Transaction_Device|Transaction_Location|Device_Type|Is_Fraud|Transaction_Currency|Customer_Contact|Transaction_Description|      Customer_Email|          Name_Parts|First_Name|   Last_Name|First_Initi

In [19]:
name_surname = df.select(
    F.split(F.col('Customer_Name'), ' ')[0].alias('First_Name'),
    F.split(F.col('Customer_Name'), ' ')[1].alias('Last_Name')
)

take_first_letter = F.udf(lambda name: name[0] if name else None, StringType())

In [20]:
name_surname.select(
    'First_Name', take_first_letter('First_Name').alias('First_Letter'),
    'Last_Name', take_first_letter('Last_Name').alias('First_Name_Letter'),
).select(
    F.col('First_Letter'), F.col('First_Name_Letter')
)

DataFrame[First_Letter: string, First_Name_Letter: string]

In [21]:
spark.sql("""
WITH name_parts AS (
    SELECT
        Customer_Name,
        split(Customer_Name, ' ')[0] AS First_Name,
        split(Customer_Name, ' ')[1] AS Last_Name,
        substr(split(Customer_Name, ' ')[0], 1, 1) AS First_Initial,
        substr(split(Customer_Name, ' ')[1], 1, 1) AS Second_Surname_Letter,
        concat(
            First_Initial,
            Second_Surname_Letter
        ) AS Combined_Initials
    FROM
        transactions)
SELECT Combined_Initials FROM name_parts
""").show()

+-----------------+
|Combined_Initials|
+-----------------+
|               OT|
|               HK|
|               EN|
|               YR|
|               KR|
|               ID|
|               AS|
|               JS|
|               BR|
|               MG|
|               CB|
|               YD|
|               AS|
|               MP|
|               EY|
|               RB|
|               BD|
|               TS|
|               GV|
|               SS|
+-----------------+
only showing top 20 rows



In [22]:
df.select(
    F.round(F.avg('Account_Balance'), 3).alias('Average_Balance')
).show()

+---------------+
|Average_Balance|
+---------------+
|      52437.989|
+---------------+



In [23]:
spark.sql("""
    SELECT
        round(avg(Account_Balance), 3) AS Average_Balance
    FROM
        transactions
""").show()

+---------------+
|Average_Balance|
+---------------+
|      52437.989|
+---------------+



In [24]:
df.select(
    F.max('Transaction_Amount').alias('Max_Transaction_Amount'),
).show()

+----------------------+
|Max_Transaction_Amount|
+----------------------+
|              98999.98|
+----------------------+



In [25]:
spark.sql("""
    SELECT
        max(Transaction_Amount)
    FROM transactions
""").show()

+-----------------------+
|max(Transaction_Amount)|
+-----------------------+
|               98999.98|
+-----------------------+



In [26]:
df.groupBy(
    'Customer_ID',
    # 'Transaction_Amount',
).agg(
    F.sum('Transaction_Amount').alias('Total_Transaction_Amount'),
).orderBy(
    'Total_Transaction_Amount', ascending=False
).show(truncate=False, n=5)

+------------------------------------+------------------------+
|Customer_ID                         |Total_Transaction_Amount|
+------------------------------------+------------------------+
|6137084d-c143-437c-b676-e5de957cd9dc|98999.98                |
|052c66e7-dda4-44ce-b384-53eba907d790|98999.45                |
|56b3c3af-2f8b-4717-9d01-ea25c68947a7|98999.02                |
|5eb26096-850d-49e8-899e-fb190dc3ca5b|98997.99                |
|aeb47222-c508-42ec-bf46-1735dcf25805|98997.02                |
+------------------------------------+------------------------+
only showing top 5 rows



In [27]:
spark.sql("""
    SELECT
        Customer_ID,
        sum(Transaction_Amount) AS Total_Transaction_Amount
    FROM
        transactions
    GROUP BY Customer_ID
    ORDER BY sum(Transaction_Amount) DESC
""").show(truncate=False, n=5)

+------------------------------------+------------------------+
|Customer_ID                         |Total_Transaction_Amount|
+------------------------------------+------------------------+
|6137084d-c143-437c-b676-e5de957cd9dc|98999.98                |
|052c66e7-dda4-44ce-b384-53eba907d790|98999.45                |
|56b3c3af-2f8b-4717-9d01-ea25c68947a7|98999.02                |
|5eb26096-850d-49e8-899e-fb190dc3ca5b|98997.99                |
|aeb47222-c508-42ec-bf46-1735dcf25805|98997.02                |
+------------------------------------+------------------------+
only showing top 5 rows



In [28]:
df.groupBy(
    'Merchant_Category'
).count().orderBy('count', ascending=False).show()

+-----------------+-----+
|Merchant_Category|count|
+-----------------+-----+
|       Restaurant|33525|
|    Entertainment|33421|
|      Electronics|33409|
|         Clothing|33340|
|        Groceries|33187|
|           Health|33118|
+-----------------+-----+



In [29]:
spark.sql("""
    SELECT
        Merchant_Category,
        count(*) AS Count
    FROM
        transactions
    GROUP BY
        Merchant_Category
    ORDER BY
        Count DESC
""").show()

+-----------------+-----+
|Merchant_Category|Count|
+-----------------+-----+
|       Restaurant|33525|
|    Entertainment|33421|
|      Electronics|33409|
|         Clothing|33340|
|        Groceries|33187|
|           Health|33118|
+-----------------+-----+



In [30]:
df.groupBy(
    'Account_Type'
).agg(
    F.avg('Transaction_Amount').alias('Average_Transaction_Amount')
).orderBy(
    'Average_Transaction_Amount', ascending=False
).show()

+------------+--------------------------+
|Account_Type|Average_Transaction_Amount|
+------------+--------------------------+
|    Business|         49649.43531278669|
|    Checking|         49646.34864263941|
|     Savings|         49317.90828435421|
+------------+--------------------------+



In [31]:
spark.sql("""
    SELECT
        Account_Type,
        avg(Transaction_Amount) AS Average_Transaction_Amount
    FROM
        transactions
    GROUP BY Account_Type
    ORDER BY Average_Transaction_Amount DESC
""").show()

+------------+--------------------------+
|Account_Type|Average_Transaction_Amount|
+------------+--------------------------+
|    Business|         49649.43531278669|
|    Checking|         49646.34864263941|
|     Savings|         49317.90828435421|
+------------+--------------------------+



In [40]:
df = df.withColumn(
    'Transaction_Date',
    F.to_date(F.col('Transaction_Date'), format='dd-MM-yyyy'),
)

In [68]:
df.show()

+--------------------+-------------------+------+---+--------------------+------------------+--------------------+------------+--------------------+----------------+-------------------+------------------+--------------------+----------------+-----------------+---------------+--------------------+--------------------+-----------+--------+--------------------+----------------+-----------------------+--------------------+---------------------+----------------------+--------------------+----------------------+
|         Customer_ID|      Customer_Name|Gender|Age|               State|              City|         Bank_Branch|Account_Type|      Transaction_ID|Transaction_Date|   Transaction_Time|Transaction_Amount|         Merchant_ID|Transaction_Type|Merchant_Category|Account_Balance|  Transaction_Device|Transaction_Location|Device_Type|Is_Fraud|Transaction_Currency|Customer_Contact|Transaction_Description|      Customer_Email|Year_Transaction_Date|Month_Transaction_Date|Day_transaction_Date|

In [44]:
df = df.withColumn('Year_Transaction_Date', F.year(F.col('Transaction_Date'))) \
    .withColumn('Month_Transaction_Date', F.month(F.col('Transaction_Date'))) \
    .withColumn('Day_transaction_Date', F.day(F.col('Transaction_Date')))

In [56]:
df = df.withColumn(
    "Transaction_Amount_EUR",
    F.round(F.col('Transaction_Amount') * 0.010996533, 2)
)

In [134]:
spark.sql("""
SELECT
    *,
    year(Transaction_Date) AS Year_Transaction_Date,
    month(Transaction_Date) AS Month_Transaction_Date,
    day(Transaction_Date) AS Day_Transaction_Date
FROM (
    SELECT
        date_format(to_date(t.Transaction_Date, 'dd-MM-yyyy'), 'yyyy-MM-dd') as Transaction_Date
    FROM
        transactions t
) AS data_format
""").show()

+----------------+---------------------+----------------------+--------------------+
|Transaction_Date|Year_Transaction_Date|Month_Transaction_Date|Day_Transaction_Date|
+----------------+---------------------+----------------------+--------------------+
|      2025-01-23|                 2025|                     1|                  23|
|      2025-01-11|                 2025|                     1|                  11|
|      2025-01-25|                 2025|                     1|                  25|
|      2025-01-19|                 2025|                     1|                  19|
|      2025-01-30|                 2025|                     1|                  30|
|      2025-01-25|                 2025|                     1|                  25|
|      2025-01-04|                 2025|                     1|                   4|
|      2025-01-16|                 2025|                     1|                  16|
|      2025-01-25|                 2025|                     1|  

In [57]:
df.show()

+--------------------+-------------------+------+---+--------------------+------------------+--------------------+------------+--------------------+----------------+-------------------+------------------+--------------------+----------------+-----------------+---------------+--------------------+--------------------+-----------+--------+--------------------+----------------+-----------------------+--------------------+---------------------+----------------------+--------------------+----------------------+
|         Customer_ID|      Customer_Name|Gender|Age|               State|              City|         Bank_Branch|Account_Type|      Transaction_ID|Transaction_Date|   Transaction_Time|Transaction_Amount|         Merchant_ID|Transaction_Type|Merchant_Category|Account_Balance|  Transaction_Device|Transaction_Location|Device_Type|Is_Fraud|Transaction_Currency|Customer_Contact|Transaction_Description|      Customer_Email|Year_Transaction_Date|Month_Transaction_Date|Day_transaction_Date|

In [63]:
df.select(
    '*',
    F.concat(
        F.col('City'), F.lit(', '), F.col('State')
    ).alias('Combined_Location'),
)

+------------------------------------+-------------------+------+---+-----------+------------------+-------------------------+------------+------------------------------------+----------------+-------------------+------------------+------------------------------------+----------------+-----------------+---------------+------------------+--------------------------+-----------+--------+--------------------+----------------+-----------------------+-----------------------+---------------------+----------------------+--------------------+----------------------+--------------------------+
|Customer_ID                         |Customer_Name      |Gender|Age|State      |City              |Bank_Branch              |Account_Type|Transaction_ID                      |Transaction_Date|Transaction_Time   |Transaction_Amount|Merchant_ID                         |Transaction_Type|Merchant_Category|Account_Balance|Transaction_Device|Transaction_Location      |Device_Type|Is_Fraud|Transaction_Currency|Cu

In [145]:
df_spark = spark.sql("""
    SELECT
        *,
        concat(City, ', ', State) AS Combined_Location
    FROM
        transactions
""")

df_spark.write.mode('overwrite').saveAsTable('transactions')

In [148]:
spark.sql(
    """
        SELECT * FROM transactions
    """
)

DataFrame[Customer_ID: string, Customer_Name: string, Gender: string, Age: int, State: string, City: string, Bank_Branch: string, Account_Type: string, Transaction_ID: string, Transaction_Date: string, Transaction_Time: timestamp, Transaction_Amount: double, Merchant_ID: string, Transaction_Type: string, Merchant_Category: string, Account_Balance: double, Transaction_Device: string, Transaction_Location: string, Device_Type: string, Is_Fraud: int, Transaction_Currency: string, Customer_Contact: string, Transaction_Description: string, Customer_Email: string]

In [143]:
spark.sql("""
    SELECT
        *
    FROM
        transactions
""").show()

+--------------------+-------------------+------+---+--------------------+------------------+--------------------+------------+--------------------+----------------+-------------------+------------------+--------------------+----------------+-----------------+---------------+--------------------+--------------------+-----------+--------+--------------------+----------------+-----------------------+--------------------+
|         Customer_ID|      Customer_Name|Gender|Age|               State|              City|         Bank_Branch|Account_Type|      Transaction_ID|Transaction_Date|   Transaction_Time|Transaction_Amount|         Merchant_ID|Transaction_Type|Merchant_Category|Account_Balance|  Transaction_Device|Transaction_Location|Device_Type|Is_Fraud|Transaction_Currency|Customer_Contact|Transaction_Description|      Customer_Email|
+--------------------+-------------------+------+---+--------------------+------------------+--------------------+------------+--------------------+------

In [155]:
df.withColumn(
    'Is_High_Value',
    F.when(F.col('Transaction_Amount') > 5000, 'Yes').otherwise('No')
).show()

+--------------------+-------------------+------+---+--------------------+------------------+--------------------+------------+--------------------+----------------+-------------------+------------------+--------------------+----------------+-----------------+---------------+--------------------+--------------------+-----------+--------+--------------------+----------------+-----------------------+--------------------+---------------------+----------------------+--------------------+----------------------+-------------+
|         Customer_ID|      Customer_Name|Gender|Age|               State|              City|         Bank_Branch|Account_Type|      Transaction_ID|Transaction_Date|   Transaction_Time|Transaction_Amount|         Merchant_ID|Transaction_Type|Merchant_Category|Account_Balance|  Transaction_Device|Transaction_Location|Device_Type|Is_Fraud|Transaction_Currency|Customer_Contact|Transaction_Description|      Customer_Email|Year_Transaction_Date|Month_Transaction_Date|Day_tra