In [52]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('boneka popeye').getOrCreate()


In [53]:
df = spark.read.csv('/content/sample_data/Jual boneka popeye.csv', header = True, inferSchema = True)
df.show(100)

+---------------------+------+-------------+----------------+
|          Nama Produk| Harga|      Terjual|          Lokasi|
+---------------------+------+-------------+----------------+
| Boneka Popeye dan...|  55.0|   86 Terjual|   Jakarta Pusat|
| Boneka Popeye The...|  58.5|    1 Terjual|   Jakarta Barat|
|   Boneka Popeye uk L|  65.0|   18 Terjual|   Jakarta Pusat|
| CBT Boneka Popeye...| 61.38|   22 Terjual|        Surabaya|
|           BONEKA 3kg|  59.0|   21 Terjual|    Kab. Bandung|
| Boneka Popeye kec...|  40.0|   11 Terjual|   Jakarta Utara|
| Boneka Popeye Unisex|  35.0|         NULL|          Bekasi|
| Boneka Popeye Bes...|  55.0|    2 Terjual|   Jakarta Utara|
| Boneka Beruang Te...|  55.5|10RB+ Terjual|            NULL|
| [FREE ONGKIR] Dor...| 89.25|  156 Terjual|   Jakarta Pusat|
| BONEKA POPEYE & O...|  62.7|    7 Terjual|   Kab. Boyolali|
| boneka popeye ata...|  75.0|    7 Terjual|Surakarta (Solo)|
| Boneka Popeye TNI...| 190.0|   27 Terjual|     Kab. Kediri|
| Boneka

In [54]:
df_cleaned = df.dropDuplicates()
df.show()

+--------------------+-----+-------------+----------------+
|         Nama Produk|Harga|      Terjual|          Lokasi|
+--------------------+-----+-------------+----------------+
|Boneka Popeye dan...| 55.0|   86 Terjual|   Jakarta Pusat|
|Boneka Popeye The...| 58.5|    1 Terjual|   Jakarta Barat|
|  Boneka Popeye uk L| 65.0|   18 Terjual|   Jakarta Pusat|
|CBT Boneka Popeye...|61.38|   22 Terjual|        Surabaya|
|          BONEKA 3kg| 59.0|   21 Terjual|    Kab. Bandung|
|Boneka Popeye kec...| 40.0|   11 Terjual|   Jakarta Utara|
|Boneka Popeye Unisex| 35.0|         NULL|          Bekasi|
|Boneka Popeye Bes...| 55.0|    2 Terjual|   Jakarta Utara|
|Boneka Beruang Te...| 55.5|10RB+ Terjual|            NULL|
|[FREE ONGKIR] Dor...|89.25|  156 Terjual|   Jakarta Pusat|
|BONEKA POPEYE & O...| 62.7|    7 Terjual|   Kab. Boyolali|
|boneka popeye ata...| 75.0|    7 Terjual|Surakarta (Solo)|
|Boneka Popeye TNI...|190.0|   27 Terjual|     Kab. Kediri|
|Boneka Popeye The...| 58.5|         NUL

In [55]:
df_cleaned = df_cleaned.na.drop()
df_cleaned.show(100)

+---------------------+------+-------------+------------------+
|          Nama Produk| Harga|      Terjual|            Lokasi|
+---------------------+------+-------------+------------------+
| Boneka Miki Mini ...|  32.5|1,3RB Terjual|       Kab. Bekasi|
| Sepasang boneka I...|  34.0|4,6RB Terjual|     Kab. Karawang|
| BONEKA SESAME/BON...|  95.0|   22 Terjual|  Surakarta (Solo)|
| Boneka Putri Duyu...|  52.0|  318 Terjual|            Bekasi|
| 40cm Boneka Pomni...|  35.0|2,2RB Terjual|            Bekasi|
| Mainan Anak Bonek...|  58.0|   12 Terjual|     Jakarta Utara|
| Boneka ular bahan...|  13.5|  868 Terjual|     Kab. Karawang|
| Boneka Popeye Paj...|  70.0|    3 Terjual|     Jakarta Pusat|
| Boneka Jumbo Tedd...|  83.9|10RB+ Terjual|    Kab. Mojokerto|
| Boneka Miki Mini ...|  33.0|  459 Terjual|            Bekasi|
| Mainan Boneka Lan...|  26.0|  887 Terjual|            Bekasi|
| Mainan Boneka Luc...|  30.0|10RB+ Terjual|            Bekasi|
| PROMO!!!BONEKA UL...|  10.2|1,1RB Terj

In [57]:
from os import truncate
from pyspark.sql.functions import regexp_replace, when, col

# Step 1: Bersihkan koma (ganti ke titik) dan hapus "Terjual" dan "++"
df_cleaned = df_cleaned.withColumn(
    "Terjual",
    regexp_replace(col("Terjual"), r",", ".")  # koma jadi titik
).withColumn(
    "Terjual",
    regexp_replace(col("Terjual"), r"(\s*Terjual|\+)", "")  # hapus "Terjual" dan "++"
)

# Step 2: Tangani "RB"
df_cleaned = df_cleaned.withColumn(
    "Terjual",
    when(
        col("Terjual").rlike("(?i)RB"),  # kalau ada RB
        (regexp_replace(col("Terjual"), "(?i)RB", "").cast("float") * 1000).cast("int")
    ).otherwise(
        col("Terjual").cast("int")  # kalau tidak ada RB
    )
)

df_cleaned = df_cleaned.withColumn(
    "Harga",
    regexp_replace(col("Harga"), r"\.", "").cast("int")
)

df_cleaned.show(100)

+---------------------+-----+-------+------------------+
|          Nama Produk|Harga|Terjual|            Lokasi|
+---------------------+-----+-------+------------------+
| Boneka Miki Mini ...|  325|   1300|       Kab. Bekasi|
| Sepasang boneka I...|  340|   4600|     Kab. Karawang|
| BONEKA SESAME/BON...|  950|     22|  Surakarta (Solo)|
| Boneka Putri Duyu...|  520|    318|            Bekasi|
| 40cm Boneka Pomni...|  350|   2200|            Bekasi|
| Mainan Anak Bonek...|  580|     12|     Jakarta Utara|
| Boneka ular bahan...|  135|    868|     Kab. Karawang|
| Boneka Popeye Paj...|  700|      3|     Jakarta Pusat|
| Boneka Jumbo Tedd...|  839|  10000|    Kab. Mojokerto|
| Boneka Miki Mini ...|  330|    459|            Bekasi|
| Mainan Boneka Lan...|  260|    887|            Bekasi|
| Mainan Boneka Luc...|  300|  10000|            Bekasi|
| PROMO!!!BONEKA UL...|  102|   1100|     Kab. Karawang|
| Boneka Paus biru ...|  385|      1|            Bekasi|
| Bantal Boneka Emo...|  320|  

In [58]:
df_cleaned.show(100)

+---------------------+-----+-------+------------------+
|          Nama Produk|Harga|Terjual|            Lokasi|
+---------------------+-----+-------+------------------+
| Boneka Miki Mini ...|  325|   1300|       Kab. Bekasi|
| Sepasang boneka I...|  340|   4600|     Kab. Karawang|
| BONEKA SESAME/BON...|  950|     22|  Surakarta (Solo)|
| Boneka Putri Duyu...|  520|    318|            Bekasi|
| 40cm Boneka Pomni...|  350|   2200|            Bekasi|
| Mainan Anak Bonek...|  580|     12|     Jakarta Utara|
| Boneka ular bahan...|  135|    868|     Kab. Karawang|
| Boneka Popeye Paj...|  700|      3|     Jakarta Pusat|
| Boneka Jumbo Tedd...|  839|  10000|    Kab. Mojokerto|
| Boneka Miki Mini ...|  330|    459|            Bekasi|
| Mainan Boneka Lan...|  260|    887|            Bekasi|
| Mainan Boneka Luc...|  300|  10000|            Bekasi|
| PROMO!!!BONEKA UL...|  102|   1100|     Kab. Karawang|
| Boneka Paus biru ...|  385|      1|            Bekasi|
| Bantal Boneka Emo...|  320|  

In [59]:
df_cleaned.describe(['Harga', 'Terjual']).show()

+-------+------------------+------------------+
|summary|             Harga|           Terjual|
+-------+------------------+------------------+
|  count|                96|                96|
|   mean|2392.4270833333335|1690.2291666666667|
| stddev|10505.729705606307|2770.4317979206467|
|    min|               100|                 1|
|    max|             94999|             10000|
+-------+------------------+------------------+



In [60]:
from pyspark.sql.functions import corr
df_cleaned.select(corr('Harga', 'Terjual')).show()

+--------------------+
|corr(Harga, Terjual)|
+--------------------+
|0.006712635974657086|
+--------------------+



In [61]:
from pyspark.sql.functions import sum

df_cleaned.groupBy("Lokasi").agg(sum("Terjual").alias("total_terjual")) \
    .orderBy("total_terjual", ascending=False).show()

+------------------+-------------+
|            Lokasi|total_terjual|
+------------------+-------------+
|            Bekasi|        70547|
|     Kab. Karawang|        20904|
|       Kab. Bekasi|        10949|
|     Jakarta Timur|        10041|
|       Tasikmalaya|        10000|
|        Kab. Bogor|        10000|
|    Kab. Mojokerto|        10000|
|            Malang|         6700|
|    Kab. Tangerang|         4600|
|     Jakarta Barat|         1736|
|      Kab. Bandung|         1439|
|      Kab. Jombang|         1300|
|     Kab. Banyumas|         1300|
|   Jakarta Selatan|          691|
|     Jakarta Utara|          412|
|     Jakarta Pusat|          404|
|          Surabaya|          390|
|            Cimahi|          284|
|        Yogyakarta|          258|
|Kab. Bandung Barat|           77|
+------------------+-------------+
only showing top 20 rows



In [62]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer

tokenizer = Tokenizer(inputCol="Nama Produk", outputCol="words")
wordsData = tokenizer.transform(df_cleaned)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filteredData = remover.transform(wordsData)

vectorizer = CountVectorizer(inputCol="filtered", outputCol="features")
model = vectorizer.fit(filteredData)
result = model.transform(filteredData)

# Tampilkan 10 kata yang paling sering muncul
vocab = model.vocabulary
frequencies = result.select("features").rdd \
    .map(lambda row: row.features.toArray()) \
    .reduce(lambda x, y: x + y)

for word, freq in sorted(zip(vocab, frequencies), key=lambda x: -x[1])[:10]:
    print(word, int(freq))

boneka 119
mainan 25
/ 16
lembut 15
ukuran 15
anak 14
sni 14
popeye 14
bahan 12
plush 10


In [63]:
#tokenize
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="Nama Produk", outputCol="items")
data_tokenized = tokenizer.transform(df_cleaned)

In [64]:
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import array_distinct

# Apply array_distinct to the "items" column to remove duplicates
data_tokenized_unique = data_tokenized.withColumn("items", array_distinct("items"))

# Now, apply FPGrowth to the data with unique items
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.01, minConfidence=0.3)
model = fpGrowth.fit(data_tokenized_unique)

In [65]:
model.freqItemsets.show(100, truncate=False)

+---------------------------------------------------------+----+
|items                                                    |freq|
+---------------------------------------------------------+----+
|[size]                                                   |1   |
|[size, ukuran]                                           |1   |
|[size, ukuran, boneka]                                   |1   |
|[size, m]                                                |1   |
|[size, m, ukuran]                                        |1   |
|[size, m, ukuran, boneka]                                |1   |
|[size, m, 40cm]                                          |1   |
|[size, m, 40cm, ukuran]                                  |1   |
|[size, m, 40cm, ukuran, boneka]                          |1   |
|[size, m, 40cm, boneka]                                  |1   |
|[size, m, boneka]                                        |1   |
|[size, 40cm]                                             |1   |
|[size, 40cm, ukuran]    

In [67]:
model.associationRules.show(truncate=False)

+----------------------------------------------------------------------+----------+----------+------------------+--------------------+
|antecedent                                                            |consequent|confidence|lift              |support             |
+----------------------------------------------------------------------+----------+----------+------------------+--------------------+
|[90, laut, hiu, hadiah, tahun, bantal, lembut, ukuran, mainan, boneka]|[paus]    |1.0       |48.0              |0.010416666666666666|
|[90, laut, hiu, hadiah, tahun, bantal, lembut, ukuran, mainan, boneka]|[karakter]|1.0       |19.2              |0.010416666666666666|
|[90, laut, hiu, hadiah, tahun, bantal, lembut, ukuran, mainan, boneka]|[ulang]   |1.0       |13.714285714285714|0.010416666666666666|
|[90, laut, hiu, hadiah, tahun, bantal, lembut, ukuran, mainan, boneka]|[besar]   |1.0       |12.0              |0.010416666666666666|
|[90, laut, hiu, hadiah, tahun, bantal, lembut, ukuran,

In [68]:
model.associationRules.filter("array_contains(antecedent, 'murah')").show(truncate=False)

+-----------------------------------------------+----------+----------+-----------------+--------------------+
|antecedent                                     |consequent|confidence|lift             |support             |
+-----------------------------------------------+----------+----------+-----------------+--------------------+
|[murah, ee, mobil, bantal, /]                  |[sofa]    |1.0       |48.0             |0.010416666666666666|
|[murah, ee, mobil, bantal, /]                  |[boneka]  |1.0       |1.032258064516129|0.010416666666666666|
|[murah, ee, mobil, bantal, /]                  |[pup]     |1.0       |48.0             |0.010416666666666666|
|[murah, ee, mobil, bantal, /]                  |[emoticon]|1.0       |48.0             |0.010416666666666666|
|[murah, ee, emoticon, mobil, sofa, pup, bantal]|[/]       |1.0       |8.0              |0.010416666666666666|
|[murah, ee, emoticon, mobil, sofa, pup, bantal]|[boneka]  |1.0       |1.032258064516129|0.010416666666666666|
|

In [70]:
!mkdir -p /content/output/

In [73]:
df_cleaned.write.csv('/content/output/boneka_cleaned', header=True, mode="overwrite")