In [18]:
import findspark
findspark.init('/opt/spark')
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
# Build (or reuse) the Spark session used to load and aggregate the invoice data.
#
# Fixes relative to the previous configuration:
#   * "spark.executor.JavaOptions" is not a Spark option, so -XX:+UseG1GC was
#     silently ignored; it now travels in "spark.executor.extraJavaOptions".
#   * the JVM flag was misspelled ("InitiatingHeapOcuupancyPercent"), which an
#     executor JVM would reject as an unrecognized option.
#   * "spark.executor.memory" was set twice; only the last value ('14G') applied.
spark = (
    SparkSession.builder
    .appName('localiza')
    # NOTE(review): legacy memory-manager knob kept from the original; recent
    # Spark versions are believed to ignore it -- confirm whether it can go.
    .config("spark.storage.memoryFraction", "0")
    .config("spark.driver.memory", '14G')
    .config("spark.executor.memory", '14G')
    .config("spark.driver.maxResultSize", '4G')
    .config(
        "spark.executor.extraJavaOptions",
        "-XX:+UseG1GC -XX:+UseCompressedOops -XX:ConcGCThreads=20 "
        "-XX:InitiatingHeapOccupancyPercent=35",
    )
    .getOrCreate()
)

In [21]:
# Read the raw invoice lines (first CSV row is the header), drop rows that have
# no product description, and upper-case the description and invoice number.
df = (
    spark.read.csv('data.csv', header=True)
    .dropna(subset=['Description'])
    .withColumn('Description', f.upper(f.col('Description')))
    .withColumn('InvoiceNo', f.upper(f.col('InvoiceNo')))
)

In [22]:
# Print the column names and types of the loaded frame. The CSV is read without
# a schema, so every column (including Quantity and UnitPrice) shows as string.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



In [23]:
# One row per invoice; 'product' holds the distinct descriptions on that invoice.
df2 = (
    df.groupBy('InvoiceNo')
    .agg(f.collect_set('Description').alias('product'))
)

In [24]:
# Pull the per-invoice product sets to the driver as a list of tuples
# (one tuple of descriptions per invoice).
transactions = [
    tuple(basket)
    for (basket,) in df2.select('product').collect()
]

In [25]:
# Same baskets as `transactions`, as lists instead of tuples.
# Derived from the already-collected data rather than calling .collect() again:
# `df2` is not cached, so a second collect re-runs the whole CSV read and
# group-by, and could return the invoices in a different order.
transactions_fp = [list(basket) for basket in transactions]

In [26]:
from efficient_apriori import apriori
import pyfpgrowth

In [27]:
import time
import pandas as pd
# Benchmark grid: every support threshold is combined with every confidence.
confidence = [0.1, 0.5]

# Minimum support as a fraction of all transactions (passed to `apriori`).
suporte = [0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01]

# The same thresholds as absolute transaction counts (passed to pyfpgrowth).
# Derived from `suporte` so the two lists cannot drift apart; int() truncates
# exactly as the previous hand-written list did, so the values are unchanged.
suport_fp = [int(len(transactions) * fraction) for fraction in suporte]

In [28]:
# Empty results table; the benchmark loops below add one row per
# (support, confidence, algorithm) run.
dfObj = pd.DataFrame(
    columns=[
        'suport',
        'confidence',
        'time',
        'frequents',
        'rule_size',
        'algorithm',
        'itemset',
        'rules',
    ]
)

In [29]:
# Benchmark efficient_apriori over the full (support, confidence) grid.
# Each run is timed end to end (itemset mining plus rule generation) and its
# results are kept as one record; the records are added to `dfObj` in a single
# concat afterwards. DataFrame.append was removed in pandas 2.0, and calling it
# per row copied the whole frame on every iteration.
apriori_records = []
for i in suporte:
    for j in confidence:
        # perf_counter is a monotonic clock meant for measuring intervals.
        start = time.perf_counter()
        itemsets, rules = apriori(transactions, min_support=i, min_confidence=j)
        end = time.perf_counter()
        total_time = end - start
        # `itemsets` maps itemset size -> {itemset: count}; record how many
        # frequent itemsets were found for each size.
        number_of_frequents = [len(found) for found in itemsets.values()]
        apriori_records.append({'suport': i, 'confidence': j,
                                'time': total_time, 'frequents': number_of_frequents,
                                'rule_size': len(rules), 'algorithm': 'apriori',
                                'itemset': itemsets, 'rules': rules})

dfObj = pd.concat([dfObj, pd.DataFrame(apriori_records)], ignore_index=True)

In [30]:
# Benchmark pyfpgrowth over the full (support, confidence) grid.
# Note that 'suport' here is the absolute transaction count from `suport_fp`,
# not a fraction.
#
# Fixes relative to the previous version:
#   * the record literal listed the key 'rules' twice (None, then the rules),
#     so 'rule_size' was never stored for fp_growth rows; it is now recorded.
#   * 'frequents' is filled in instead of None, replacing the commented-out
#     code that assumed apriori's nested result layout.
#   * rows are collected and concatenated once; DataFrame.append was removed
#     in pandas 2.0.
fp_growth_records = []
for i in suport_fp:
    for j in confidence:
        # perf_counter is a monotonic clock meant for measuring intervals.
        start = time.perf_counter()
        itemsets = pyfpgrowth.find_frequent_patterns(transactions, i)
        rules = pyfpgrowth.generate_association_rules(itemsets, j)
        end = time.perf_counter()
        total_time = end - start
        # Assumes pyfpgrowth returns a flat {pattern_tuple: count} dict --
        # TODO confirm. Count the patterns per length, smallest length first,
        # to match the list stored for the apriori rows.
        size_counts = {}
        for pattern in itemsets:
            size_counts[len(pattern)] = size_counts.get(len(pattern), 0) + 1
        number_of_frequents = [size_counts[size] for size in sorted(size_counts)]
        fp_growth_records.append({'suport': i, 'confidence': j,
                                  'time': total_time, 'frequents': number_of_frequents,
                                  'rule_size': len(rules), 'algorithm': 'fp_growth',
                                  'itemset': itemsets, 'rules': rules})

dfObj = pd.concat([dfObj, pd.DataFrame(fp_growth_records)], ignore_index=True)

In [31]:
# Save the benchmark table with a header row and without the index column.
# NOTE(review): 'itemset' and 'rules' hold Python objects, which presumably end
# up as their string form in the CSV -- confirm if they must be read back.
dfObj.to_csv('results_trab_desc_all.csv', header=True, index=False)

In [32]:
# Display the collected benchmark rows.
dfObj

Unnamed: 0,suport,confidence,time,frequents,rule_size,algorithm,itemset,rules
0,0.1,0.1,0.14586,[],0.0,apriori,{},[]
1,0.1,0.5,0.112128,[],0.0,apriori,{},[]
2,0.09,0.1,0.113984,[1],0.0,apriori,"{1: {('WHITE HANGING HEART T-LIGHT HOLDER',): ...",[]
3,0.09,0.5,0.112541,[1],0.0,apriori,"{1: {('WHITE HANGING HEART T-LIGHT HOLDER',): ...",[]
4,0.08,0.1,0.164124,[3],0.0,apriori,"{1: {('REGENCY CAKESTAND 3 TIER',): 2169, ('JU...",[]
5,0.08,0.5,0.16596,[3],0.0,apriori,"{1: {('REGENCY CAKESTAND 3 TIER',): 2169, ('JU...",[]
6,0.07,0.1,0.164233,[3],0.0,apriori,"{1: {('REGENCY CAKESTAND 3 TIER',): 2169, ('JU...",[]
7,0.07,0.5,0.172891,[3],0.0,apriori,"{1: {('REGENCY CAKESTAND 3 TIER',): 2169, ('JU...",[]
8,0.06,0.1,0.195371,[6],0.0,apriori,"{1: {('PARTY BUNTING',): 1706, ('ASSORTED COLO...",[]
9,0.06,0.5,0.190114,[6],0.0,apriori,"{1: {('PARTY BUNTING',): 1706, ('ASSORTED COLO...",[]


In [33]:
# Run apriori once more at the lowest thresholds of the grid (support 0.01,
# confidence 0.1) so the itemsets and rules can be inspected below.
# `start` and `end` bracket the call; the elapsed time is not computed here.
start = time.time()
itemsets, rules = apriori(transactions, min_support=0.01, min_confidence=0.1)
end = time.time()

In [34]:
# Show the frequent itemsets from the run above: a dict keyed by itemset size,
# each value mapping an item tuple to its transaction count.
itemsets

{1: {(' SET 2 TEA TOWELS I LOVE LONDON ',): 272,
  ('HOT WATER BOTTLE TEA AND SYMPATHY',): 648,
  ('CHILLI LIGHTS',): 669,
  ('WHITE SKULL HOT WATER BOTTLE ',): 503,
  ('JUMBO BAG PINK POLKADOT',): 1231,
  ('JUMBO  BAG BAROQUE BLACK WHITE',): 947,
  ('JUMBO BAG WOODLAND ANIMALS',): 872,
  ('RED RETROSPOT CHARLOTTE BAG',): 1050,
  ('CHARLOTTE BAG PINK POLKADOT',): 760,
  ('JUMBO BAG OWLS',): 663,
  ('JUMBO STORAGE BAG SUKI',): 1201,
  ('CHOCOLATE HOT WATER BOTTLE',): 866,
  ('SMALL POPCORN HOLDER',): 603,
  ('MEMO BOARD COTTAGE DESIGN',): 316,
  ('FELTCRAFT BUTTERFLY HEARTS',): 489,
  ('12 PENCILS TALL TUBE WOODLAND',): 253,
  ('12 PENCILS SMALL TUBE RED RETROSPOT',): 366,
  ('HANGING METAL HEART LANTERN',): 363,
  ('COLOUR GLASS T-LIGHT HOLDER HANGING',): 633,
  ('FELTCRAFT 6 FLOWER FRIENDS',): 556,
  ('HEART OF WICKER LARGE',): 942,
  ('HEART OF WICKER SMALL',): 1212,
  ('AGED GLASS SILVER T-LIGHT HOLDER',): 345,
  ('RECIPE BOX PANTRY YELLOW DESIGN',): 1164,
  ('WOOD S/3 CABINET ANT W