In [10]:
# import kagglehub
# import shutil
# from pathlib import Path

# # Step 1: Download to cache
# path = kagglehub.dataset_download("ealaxi/paysim1")
# print("Downloaded to:", path)

# # Step 2: Move the file to /datafiles/kagglehub/paysim
# src = Path(path) / "PS_20174392719_1491204439457_log.csv"
# dst = Path("/datafiles/kagglehub/paysim")
# dst.mkdir(parents=True, exist_ok=True)
# shutil.copy(src, dst)
# print("Copied to:", dst)

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, substring, reduce, max, min
from functools import reduce as py_reduce
from pyspark.sql.types import DecimalType

In [12]:
# Start SparkSession
spark = SparkSession.builder \
    .appName("PaySim Dataset Analysis") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

df = spark.read.csv("/data/kagglehub/paysim/PS_20174392719_1491204439457_log.csv", header=True, inferSchema=True)
df.show(5)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|      

### Summary about dataset

step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

type - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

amount -
amount of the transaction in local currency.

nameOrig - customer who started the transaction

oldbalanceOrg - initial balance before the transaction

newbalanceOrig - new balance after the transaction.

nameDest - customer who is the recipient of th.

oldbalanceDest - initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants).

newbalanceDest - new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants).

isFraud - This is the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to another account and then cashing out of the system.

isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.e transaction

In [13]:
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



In [14]:
df.select('amount','step','nameOrig','oldbalanceOrg').summary().show()

+-------+------------------+------------------+-----------+------------------+
|summary|            amount|              step|   nameOrig|     oldbalanceOrg|
+-------+------------------+------------------+-----------+------------------+
|  count|           6362620|           6362620|    6362620|           6362620|
|   mean|179861.90354913156|243.39724563151657|       NULL| 833883.1040744876|
| stddev| 603858.2314629381| 142.3319710491294|       NULL|2888242.6730375625|
|    min|               0.0|                 1|C1000000639|               0.0|
|    25%|          13388.29|               156|       NULL|               0.0|
|    50%|           74859.9|               239|       NULL|           14207.0|
|    75%|         208699.28|               335|       NULL|          107303.0|
|    max|     9.244551664E7|               743| C999999784|     5.958504037E7|
+-------+------------------+------------------+-----------+------------------+



In [15]:
# Get all column names
columns = df.columns

In [16]:
Dict_Null = {col:df.filter(df[col].isNull()).count() for col in df.columns}
Dict_Null

{'step': 0,
 'type': 0,
 'amount': 0,
 'nameOrig': 0,
 'oldbalanceOrg': 0,
 'newbalanceOrig': 0,
 'nameDest': 0,
 'oldbalanceDest': 0,
 'newbalanceDest': 0,
 'isFraud': 0,
 'isFlaggedFraud': 0}

In [17]:
# different type of transacton.
df.groupby('type').count().show()

+--------+-------+
|    type|  count|
+--------+-------+
|TRANSFER| 532909|
| CASH_IN|1399284|
|CASH_OUT|2237500|
| PAYMENT|2151495|
|   DEBIT|  41432|
+--------+-------+



In [18]:
df_flagged = df.filter(df.isFlaggedFraud == 1)
df_flagged.count()

16

In [19]:
df_flagged.select(max(df_flagged.amount).cast(DecimalType(38, 20)).alias("maximum")).show()

+--------------------+
|             maximum|
+--------------------+
|10000000.00000000...|
+--------------------+



In [20]:
df_flagged.select(min(df_flagged.amount).alias("minimum")).show()

+---------+
|  minimum|
+---------+
|353874.22|
+---------+



In [21]:
df.filter((col('isFlaggedFraud')== 0)
          & (col('type')=='TRANSFER' )
          & (col('oldbalanceOrg') == col('newbalanceOrig'))).count()

282784

In [22]:
#lets check the fraud transactions.
df_fraud = df.filter(col('isFraud')==1)
df_fraud.show(4)

+----+--------+------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1|TRANSFER| 181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT| 181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1|TRANSFER|2806.0|C1420196421|       2806.0|           0.0| C972765878|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|2806.0|C2101527076|       2806.0|           0.0|C1007251739|       26202.0|           0.0|      1|             0|
+----+--------+------+-----------+-------------+--------------+-----------+--------------+-------

In [23]:
df_fraud.groupBy('type').count().show()

+--------+-----+
|    type|count|
+--------+-----+
|TRANSFER| 4097|
|CASH_OUT| 4116|
+--------+-----+



In [24]:
df.filter("nameDest = 0").count()
# seems no null value for nameDest.

0

In [25]:
# lets distribute the data between Merchant and Customer.
df_split = df.withColumn("dest_type", substring("nameDest", 1, 1)) \
             .withColumn("dest_id", substring("nameDest", 2, 100))

df_split.select("nameDest", "dest_type", "dest_id").show(5, truncate=False)

+-----------+---------+----------+
|nameDest   |dest_type|dest_id   |
+-----------+---------+----------+
|M1979787155|M        |1979787155|
|M2044282225|M        |2044282225|
|C553264065 |C        |553264065 |
|C38997010  |C        |38997010  |
|M1230701703|M        |1230701703|
+-----------+---------+----------+
only showing top 5 rows



In [26]:
df_split.groupby("dest_type").count().show()

+---------+-------+
|dest_type|  count|
+---------+-------+
|        M|2151495|
|        C|4211125|
+---------+-------+



In [27]:
df_merchant = df_split.filter(df_split["dest_type"]=="M")
df_customer = df_split.filter(df_split["dest_type"]=="C")

In [28]:
df_merchant.show(3)

+----+-------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+---------+----------+
|step|   type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|dest_type|   dest_id|
+----+-------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+---------+----------+
|   1|PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|        M|1979787155|
|   1|PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|        M|2044282225|
|   1|PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|           0.0|           0.0|      0|             0|        M|1230701703|
+----+-------+--------+-----------+-------------+--------------+-----------+--------------+---

In [29]:
df_merchant.groupby("type").count().show()
# It seems Merchants have only one payment type.

+-------+-------+
|   type|  count|
+-------+-------+
|PAYMENT|2151495|
+-------+-------+



In [30]:
df_merchant.groupby("oldbalanceDest").count().show()

+--------------+-------+
|oldbalanceDest|  count|
+--------------+-------+
|           0.0|2151495|
+--------------+-------+



In [31]:
df_merchant.groupby("newbalanceDest").count().show()

+--------------+-------+
|newbalanceDest|  count|
+--------------+-------+
|           0.0|2151495|
+--------------+-------+



In [32]:
df_merchant.filter("isFlaggedFraud = 1").count()
#seems nothing is flagged fraud by the tracker.

0

In [33]:
df_merchant.filter((df_merchant["amount"] > 200000)).show()


+----+-------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+---------+----------+
|step|   type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|dest_type|   dest_id|
+----+-------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+---------+----------+
| 419|PAYMENT|230549.18|C1010785271|        653.0|           0.0| M928086060|           0.0|           0.0|      0|             0|        M| 928086060|
| 518|PAYMENT|203795.61|C1019618849|          0.0|           0.0| M942583605|           0.0|           0.0|      0|             0|        M| 942583605|
| 522|PAYMENT| 207417.7|C1591112582|          0.0|           0.0|M2089516721|           0.0|           0.0|      0|             0|        M|2089516721|
| 524|PAYMENT|219732.65| C590102634|          0.0|           0.0|M1157448222|           

In [34]:
first_7_days = df.filter(col("step") < 24 * 7).count()
first_7_days

1926356

In [35]:
df.groupBy("step") \
  .agg({"isFraud": "sum"})   \
  .orderBy("step") \
  .show()


+----+------------+
|step|sum(isFraud)|
+----+------------+
|   1|          16|
|   2|           8|
|   3|           4|
|   4|          10|
|   5|           6|
|   6|          22|
|   7|          12|
|   8|          12|
|   9|          19|
|  10|          11|
|  11|           7|
|  12|          14|
|  13|          14|
|  14|          12|
|  15|          20|
|  16|          10|
|  17|           7|
|  18|          16|
|  19|          11|
|  20|           4|
+----+------------+
only showing top 20 rows

