In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('Frequent Item Set') \
    .getOrCreate()

spark   

In [3]:
# Import the data from CSV
spark_df = spark.read \
            .option("header", True) \
            .option("sep", ";") \
            .csv("MidTermExam_data/input-market-05111740000122/market-basket.csv")
spark_df.show()

+------+--------------------+--------+----------------+-----+----------+--------------+
|BillNo|            Itemname|Quantity|            Date|Price|CustomerID|       Country|
+------+--------------------+--------+----------------+-----+----------+--------------+
|536365|WHITE HANGING HEA...|       6|01.12.2010 08:26| 2,55|     17850|United Kingdom|
|536365| WHITE METAL LANTERN|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|CREAM CUPID HEART...|       8|01.12.2010 08:26| 2,75|     17850|United Kingdom|
|536365|KNITTED UNION FLA...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|RED WOOLLY HOTTIE...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|SET 7 BABUSHKA NE...|       2|01.12.2010 08:26| 7,65|     17850|United Kingdom|
|536365|GLASS STAR FROSTE...|       6|01.12.2010 08:26| 4,25|     17850|United Kingdom|
|536366|HAND WARMER UNION...|       6|01.12.2010 08:28| 1,85|     17850|United Kingdom|
|536366|HAND WARMER RED P...|   

In [4]:
# Select only the first two column
basket_df = spark_df.select("BillNo", "Itemname")
basket_df.show()

+------+--------------------+
|BillNo|            Itemname|
+------+--------------------+
|536365|WHITE HANGING HEA...|
|536365| WHITE METAL LANTERN|
|536365|CREAM CUPID HEART...|
|536365|KNITTED UNION FLA...|
|536365|RED WOOLLY HOTTIE...|
|536365|SET 7 BABUSHKA NE...|
|536365|GLASS STAR FROSTE...|
|536366|HAND WARMER UNION...|
|536366|HAND WARMER RED P...|
|536367|ASSORTED COLOUR B...|
|536367|POPPY'S PLAYHOUSE...|
|536367|POPPY'S PLAYHOUSE...|
|536367|FELTCRAFT PRINCES...|
|536367|IVORY KNITTED MUG...|
|536367|BOX OF 6 ASSORTED...|
|536367|BOX OF VINTAGE JI...|
|536367|BOX OF VINTAGE AL...|
|536367|HOME BUILDING BLO...|
|536367|LOVE BUILDING BLO...|
|536367|RECIPE BOX WITH M...|
+------+--------------------+
only showing top 20 rows



In [5]:
basket_df.count()

522064

In [6]:
# Remove the duplicates if available
basket_df = basket_df.dropDuplicates(["BillNo", "Itemname"])
basket_df.count()

511280

In [7]:
#Rearrange the Item based on Bill Number

from pyspark.sql.functions import *
from pyspark.ml.fpm import FPGrowth
import pandas

basket_new = basket_df.groupBy("BillNo").agg(collect_list("Itemname").alias("ItemList")).sort('BillNo')
basket_new.show()

+------+--------------------+
|BillNo|            ItemList|
+------+--------------------+
|536365|[KNITTED UNION FL...|
|536366|[HAND WARMER UNIO...|
|536367|[BOX OF VINTAGE J...|
|536368|[YELLOW COAT RACK...|
|536369|[BATH BUILDING BL...|
|536370|[SPACEBOY LUNCH B...|
|536371|[PAPER CHAIN KIT ...|
|536372|[HAND WARMER UNIO...|
|536373|[GLASS STAR FROST...|
|536374|[VICTORIAN SEWING...|
|536375|[SAVE THE PLANET ...|
|536376|[RED HANGING HEAR...|
|536377|[HAND WARMER RED ...|
|536378|[PACK OF 60 PINK ...|
|536380|[JAM MAKING SET P...|
|536381|[ZINC WILLIE WINK...|
|536382|[VINTAGE SNAKES &...|
|536384|[ENAMEL BREAD BIN...|
|536385|[TRADITIONAL CHRI...|
|536386|[JUMBO BAG RED RE...|
+------+--------------------+
only showing top 20 rows



In [8]:
fpGrowth = FPGrowth(itemsCol="ItemList", minSupport=0.006, minConfidence=0.006)

# 2. Train/fit the data training to become a model
model = fpGrowth.fit(basket_new)

In [9]:
# Frequent Itemset
model.freqItemsets.show()



ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [None]:
# Display generated association rules.
model.associationRules.show()

In [11]:
# Write the output to console sink to check the output
writing_df = model.write \
    .format("xlsx") \
    .option("path", "MidTermExam_data/output-market-05111740000122/") \
    .outputMode("append") \
    .start()
    
# Start the streaming application to run until the following happens
# 1. Exception in the running program
# 2. Manual Interruption
writing_df.awaitTermination()

AttributeError: 'function' object has no attribute 'format'