In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create Spark Session
spark = SparkSession.builder.appName("Big Data Frequent Itemsets").getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x7f90c36ef9d0>


In [2]:
# Load excel
import pandas as pd
from pyspark.sql.types import *

df_excel = pd.read_excel("/root/Lecture/BIGDATA/datasets/Online Retail.xlsx")

header = StructType([ StructField("InvoiceNo", StringType(), True)\
                       ,StructField("StockCode", StringType(), True)\
                       ,StructField("Description", StringType(), True)\
                       ,StructField("Quantity", IntegerType(), True)\
                       ,StructField("InvoiceDate", StringType(), True)\
                       ,StructField("UnitPrice", DoubleType(), True)\
                       ,StructField("CustomerID", StringType(), True)\
                       ,StructField("Country", StringType(), True)])

df = spark.createDataFrame(df_excel, schema=header)
type(df)

pyspark.sql.dataframe.DataFrame

In [3]:
# Show dataset Online Retail
df.show()

+---------+---------+--------------------+--------+--------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|         InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|java.util.Gregori...|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|java.util.Gregori...|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|java.util.Gregori...|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|java.util.Gregori...|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|java.util.Gregori...|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|java.util.Gregori...|     7.65|   17850.0|United Kingdom|
|   536365|    2173

In [4]:
# Drop some column
dropped_column = ['Quantity', 'InvoiceDate', 'UnitPrice', 'Country', 'Description', 'CustomerID']
df = df.drop(*dropped_column)

# Show dataset after some column dropped
df.show()

+---------+---------+
|InvoiceNo|StockCode|
+---------+---------+
|   536365|   85123A|
|   536365|    71053|
|   536365|   84406B|
|   536365|   84029G|
|   536365|   84029E|
|   536365|    22752|
|   536365|    21730|
|   536366|    22633|
|   536366|    22632|
|   536367|    84879|
|   536367|    22745|
|   536367|    22748|
|   536367|    22749|
|   536367|    22310|
|   536367|    84969|
|   536367|    22623|
|   536367|    22622|
|   536367|    21754|
|   536367|    21755|
|   536367|    21777|
+---------+---------+
only showing top 20 rows



In [5]:
# Group dataset 
from pyspark.sql import functions as F

gdf = df.groupby("InvoiceNo").agg(F.collect_set("StockCode"))

# Show groupped dataset
gdf.show()

+---------+----------------------+
|InvoiceNo|collect_set(StockCode)|
+---------+----------------------+
|   536596|  [22900, 22114, 84...|
|   536938|  [22112, 21931, 84...|
|   537252|               [22197]|
|   537691|  [22505, 46000R, 2...|
|   538041|               [22145]|
|   538184|  [22561, 22147, 21...|
|   538517|  [22749, 21212, 22...|
|   538879|  [21212, 22759, 22...|
|   539275|  [22083, 22150, 22...|
|   539630|  [22111, 22971, 22...|
|   540499|  [22697, 22796, 21...|
|   540540|  [22111, 22834, 22...|
|   540976|  [22413, 21212, 22...|
|   541432|  [22113, 22457, 21...|
|   541518|  [21212, 22432, 22...|
|   541783|  [22561, 22697, 22...|
|   542026|  [22398, 22194, 22...|
|   542375|  [22629, 21731, 22...|
|   543641|  [22645, 75131, 22...|
|   544303|  [84596L, 22931, 8...|
+---------+----------------------+
only showing top 20 rows



In [6]:
# Frequent itemsets with Minimum Support = 0.01 and Minimum Confidence = 0.8
from pyspark.ml.fpm import FPGrowth

fpGrowth = FPGrowth(itemsCol="collect_set(StockCode)", minSupport=0.01, minConfidence=0.8)
model = fpGrowth.fit(gdf)
model.freqItemsets.show()

+----------------+----+
|           items|freq|
+----------------+----+
|         [22633]| 487|
|         [23236]| 344|
|        [85123A]|2246|
|         [22423]|2172|
| [22423, 85123A]| 355|
|         [22667]| 486|
|         [22579]| 343|
|  [22579, 22578]| 282|
|        [85099B]|2135|
| [85099B, 22423]| 288|
|[85099B, 85123A]| 404|
|         [22620]| 486|
|        [84536A]| 342|
|         [71053]| 342|
|         [47566]|1706|
| [47566, 85099B]| 332|
|  [47566, 22423]| 398|
| [47566, 85123A]| 391|
|         [85150]| 483|
|         [20725]|1608|
+----------------+----+
only showing top 20 rows



In [7]:
# Association rule with Minimum Support = 0.01 and Minimum Confidence = 0.8
model.associationRules.show()

+--------------------+----------+------------------+
|          antecedent|consequent|        confidence|
+--------------------+----------+------------------+
|      [20723, 22355]|   [20724]|0.8038277511961722|
|      [22356, 20719]|   [20724]|0.8211586901763224|
|      [22356, 22355]|   [20724]|               0.8|
|        [DOT, 22386]|  [85099B]|0.8028985507246377|
|      [22698, 22697]|   [22699]|0.8524844720496895|
|[20723, 20719, 22...|   [20724]|0.8585526315789473|
|      [21931, 22386]|  [85099B]| 0.803088803088803|
|[22411, 21931, 22...|  [85099B]| 0.867741935483871|
|             [22579]|   [22578]|0.8221574344023324|
|      [22698, 22423]|   [22697]|0.8606965174129353|
|      [22698, 22423]|   [22699]|0.8383084577114428|
|             [22698]|   [22697]|0.8029925187032418|
|      [22411, 22386]|  [85099B]|0.8050847457627118|
|      [20723, 22356]|   [20724]|0.8586387434554974|
|      [21928, 22386]|  [85099B]|0.8333333333333334|
|      [22698, 22699]|   [22697]|0.89413680781

In [8]:
# Item prediction with Minimum Support = 0.01 and Minimum Confidence = 0.8
model.transform(gdf).show()

+---------+----------------------+----------+
|InvoiceNo|collect_set(StockCode)|prediction|
+---------+----------------------+----------+
|   536596|  [22900, 22114, 84...|        []|
|   536938|  [22112, 21931, 84...|  [85099B]|
|   537252|               [22197]|        []|
|   537691|  [22505, 46000R, 2...|        []|
|   538041|               [22145]|        []|
|   538184|  [22561, 22147, 21...|        []|
|   538517|  [22749, 21212, 22...|        []|
|   538879|  [21212, 22759, 22...|        []|
|   539275|  [22083, 22150, 22...|        []|
|   539630|  [22111, 22971, 22...|        []|
|   540499|  [22697, 22796, 21...|        []|
|   540540|  [22111, 22834, 22...|        []|
|   540976|  [22413, 21212, 22...|        []|
|   541432|  [22113, 22457, 21...|        []|
|   541518|  [21212, 22432, 22...|        []|
|   541783|  [22561, 22697, 22...|        []|
|   542026|  [22398, 22194, 22...|        []|
|   542375|  [22629, 21731, 22...|        []|
|   543641|  [22645, 75131, 22...|