In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("OutlierDetection")
    .getOrCreate()
)

# Load the transactions CSV: the first row is the header and Spark infers
# the column types from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/opt/spark_data/ecommerce_transactions_1000.csv")
)

# Missing amounts become 0 first; afterwards the column is forced to double.
# NOTE(review): zero-filling feeds directly into the quantiles computed later,
# so missing amounts are counted as real 0-value transactions.
df = df.na.fill({"amount": 0})
df = df.withColumn("amount", df["amount"].astype("double"))


In [2]:
# Outlier fence using the IQR rule: anything below Q1 - 1.5*IQR or above
# Q3 + 1.5*IQR is treated as an outlier by the filter cell that follows.
#
# The third argument is approxQuantile's relative error: 0.05 allows the
# returned quartiles to be off by up to 5% in rank.
# NOTE(review): passing 0.0 would give exact quartiles; on a dataset of this
# size that is probably cheap -- confirm before changing, since it alters the
# bounds and every downstream number.
quantiles = df.approxQuantile("amount", [0.25, 0.75], 0.05)

# approxQuantile returns an empty list when there is nothing to summarise
# (empty DataFrame, or a column holding only null/NaN). Fail with a clear
# message instead of an opaque tuple-unpacking error.
if len(quantiles) != 2:
    raise ValueError(
        "approxQuantile returned no quartiles for 'amount'; "
        "the DataFrame is empty or the column has no usable values"
    )

Q1, Q3 = quantiles
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# NOTE(review): a Q1 of exactly 0.0 most likely reflects the missing amounts
# that were filled with 0 when the data was loaded -- verify that this is the
# intended treatment, because it widens the IQR and both bounds.
print(f"Q1 = {Q1}, Q3 = {Q3}, IQR = {IQR}")
print(f"Lower Bound = {lower_bound}, Upper Bound = {upper_bound}")

Q1 = 0.0, Q3 = 24763.06, IQR = 24763.06
Lower Bound = -37144.590000000004, Upper Bound = 61907.65000000001


In [6]:
# An outlier is any row whose amount is NOT inside the closed interval
# [lower_bound, upper_bound]; rows with a null amount are dropped either way.
outliers = df.where(~df["amount"].between(lower_bound, upper_bound))
outliers.show()

+--------------+-------+--------+--------------------+-------------------+
|transaction_id|user_id|  amount|               email|   transaction_time|
+--------------+-------+--------+--------------------+-------------------+
|         T0002|   U253|70921.08| porteramy@yahoo.com|2025-03-30 21:07:41|
|         T0005|   U064|81176.73|   louis64@gmail.com|2025-04-14 08:50:35|
|         T0011|   U093| 82119.7|roberttucker@john...|2025-04-20 02:52:35|
|         T0012|   U279| 63515.6|brucesmith@gmail.com|2025-04-20 09:58:53|
|         T0035|   U180|74468.55|michaelcarey@gmai...|2025-04-01 16:09:24|
|         T0036|   U066|88464.76|stephanie50@yahoo...|2025-04-11 05:50:57|
|         T0049|   U050|93898.14|carlsonjames@gard...|2025-04-05 03:12:16|
|         T0052|   U088|70959.19|jessica48@hotmail...|2025-04-25 00:09:15|
|         T0060|   U265|80521.08|      kaitlynsalazar|2025-04-10 17:07:00|
|         T0063|   U098|87681.99|         rachelhayes|2025-04-13 16:25:19|
|         T0066|   U108|8

In [7]:
# Count the flagged rows (this runs a Spark action) and report the total.
jumlah_outliers = outliers.count()
print("Jumlah Outliers: ", jumlah_outliers)

Jumlah Outliers:  158


In [11]:
from pyspark.sql.functions import col

# Rank every transaction from the largest amount downwards and keep the
# first five rows.
amount5 = (
    df.sort(col("amount").desc())
      .limit(5)
)

print("Top 5 transaksi dengan amount terbesar")
amount5.show()

Top 5 transaksi dengan amount terbesar
+--------------+-------+--------+--------------------+-------------------+
|transaction_id|user_id|  amount|               email|   transaction_time|
+--------------+-------+--------+--------------------+-------------------+
|         T0437|   U233|99830.84|franklincraig@gma...|2025-03-31 01:07:47|
|         T0175|   U224|99410.65|natalie63@hotmail...|2025-04-10 14:15:20|
|         T0320|   U046|99399.22|bonniemack@yahoo.com|2025-04-05 21:15:08|
|         T0115|   U148|98589.66|          hillsophia|2025-03-29 20:30:24|
|         T0451|   U293|98343.68|  sean46@walters.com|2025-04-17 14:27:35|
+--------------+-------+--------+--------------------+-------------------+



In [12]:
# Total number of rows in df, i.e. all transactions after the zero-fill of
# missing amounts (no rows are dropped by the loading cell).
total_transaksi = df.count()
print(f"Jumlah total transaksi: {total_transaksi}")


Jumlah total transaksi: 1000


In [13]:
from pyspark.sql.functions import sum as spark_sum

# Collapse the whole amount column into a single total: the aggregate yields
# a one-row result, and the scalar is read from that row's only column.
total_pendapatan = df.agg(spark_sum("amount")).first()[0]
print(f"Total pendapatan dari seluruh transaksi: {total_pendapatan}")


Total pendapatan dari seluruh transaksi: 19644994.95


In [14]:
# Report the number of outlier rows again.
# NOTE(review): this repeats the count already printed in an earlier cell and
# runs the count a second time; no .cache() on `outliers` is visible here, so
# the filter is presumably re-evaluated -- consider reusing a stored count.
print("Jumlah Outliers: ", outliers.count())

Jumlah Outliers:  158


In [17]:
# Share of transactions flagged as outliers, expressed as a percentage of
# all rows in df.
outliers_count = outliers.count()
total_transaksi = df.count()

# Guard the division: with zero transactions there is no ratio to compute,
# so report 0.0 instead of raising ZeroDivisionError.
if total_transaksi == 0:
    persentase_outlier = 0.0
else:
    persentase_outlier = (outliers_count / total_transaksi) * 100

print(f"Persentase outlier terhadap seluruh transaksi: {persentase_outlier:.2f}%")


Persentase outlier terhadap seluruh transaksi: 15.80%
