# TASK 02 Frequent pattern mining with FPGrowth

> Load the dataset sample_fpgrowth.txt, which is available in the folder Google Drive\DemoNov29.


In [1]:
import findspark
findspark.init()

import os
# Absolute path of the current working directory; the data file paths used
# further down are built relative to it.
PROJECT_HOME = os.path.abspath(os.getcwd())
print(PROJECT_HOME)

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth
import pyspark.sql.functions as F

/home/noobcoder/0_Project/school/BigData/DemoNov29


## SETUP

In [2]:
# Host of the Spark master; taken from the SPARK_MASTER_HOST environment
# variable and falling back to 'localhost' when it is not set.
SPARK_MASTER_HOST = os.getenv('SPARK_MASTER_HOST', 'localhost')

# NOTE: despite its name, `sc` is a SparkSession (not a SparkContext).
# getOrCreate() returns an already-running session if there is one.
sc = (
    SparkSession.builder
    .master(f'spark://{SPARK_MASTER_HOST}:7077')
    .appName('Quiz04_2')
    .config('spark.executor.memory', '512m')
    .getOrCreate()
)

In [3]:
# Let pandas show up to 200 characters per cell so the list-valued columns
# displayed below are not truncated.
pd.options.display.max_colwidth = 200

## a. Apply the appropriate Spark ML API (DataFrame-based) to the given dataset

In [4]:
# Each line of the input file is one transaction made of space-separated item tokens.
# Read it as plain text (one row per line, in a column named `value`) rather than
# through the CSV parser, so a comma or quote character inside a line is never
# interpreted as CSV syntax.
df = (
    sc.read.text(f"file://{PROJECT_HOME}/data/sample_fpgrowth.txt")
    # Trim surrounding spaces so the split below cannot produce empty-string items
    # (the RDD-based section strips each line in the same way).
    .select(F.trim(F.col('value')).alias('item'))
    # Skip blank lines; they would otherwise become a transaction containing only ''.
    .filter(F.col('item') != '')
    # Turn the raw line into an array of items, which is what FPGrowth consumes.
    .withColumn('item', F.split(F.col('item'), ' '))
)
df.toPandas()

Unnamed: 0,item
0,"[r, z, h, k, p]"
1,"[z, y, x, w, v, u, t, s]"
2,"[s, x, o, n, r]"
3,"[x, z, y, m, t, s, q, e]"
4,[z]
5,"[x, z, y, r, q, t, p]"


In [5]:
# Configure the DataFrame-based FP-Growth estimator on the `item` array column,
# with a support threshold of 0.5 and a confidence threshold of 0.6 for rules.
fpGrowth = FPGrowth(
    itemsCol="item",
    minSupport=0.5,
    minConfidence=0.6,
)

# Fit on the transactions DataFrame; the fitted model is inspected in the next cells.
model = fpGrowth.fit(df)

#### Display frequent itemsets.

In [6]:
# Frequent itemsets found by the fitted model (columns: items, freq),
# collected into pandas for display.
(
    model
    .freqItemsets
    .toPandas()
)

Unnamed: 0,items,freq
0,[s],3
1,"[s, x]",3
2,[r],3
3,[y],3
4,"[y, x]",3
5,"[y, x, z]",3
6,"[y, t]",3
7,"[y, t, x]",3
8,"[y, t, x, z]",3
9,"[y, t, z]",3


#### Display generated association rules.

In [7]:
# Association rules derived by the fitted model
# (columns: antecedent, consequent, confidence, lift), collected into pandas for display.
(
    model
    .associationRules
    .toPandas()
)

Unnamed: 0,antecedent,consequent,confidence,lift
0,[t],[y],1.0,2.0
1,[t],[x],1.0,1.5
2,[t],[z],1.0,1.2
3,"[y, t, x]",[z],1.0,1.2
4,[x],[s],0.75,1.5
5,[x],[y],0.75,1.5
6,[x],[z],0.75,0.9
7,[x],[t],0.75,1.5
8,"[y, z]",[x],1.0,1.5
9,"[y, z]",[t],1.0,2.0


In [8]:
# Apply the fitted model to the original transactions; the result carries the
# input `item` column plus a `prediction` column.
(
    model
    .transform(df)
    .toPandas()
)

Unnamed: 0,item,prediction
0,"[r, z, h, k, p]","[y, x, t]"
1,"[z, y, x, w, v, u, t, s]",[]
2,"[s, x, o, n, r]","[y, z, t]"
3,"[x, z, y, m, t, s, q, e]",[]
4,[z],"[y, x, t]"
5,"[x, z, y, r, q, t, p]",[s]


## b. Apply the appropriate MLlib API (RDD-based) to the given dataset

In [9]:
from pyspark.mllib.fpm import FPGrowth as MLibFPGrowth

In [10]:
def _to_transaction(raw_line):
    """Convert one line of the input file into a list of item tokens.

    The line is stripped of surrounding whitespace and then split on single spaces.
    """
    return str(raw_line).strip().split(' ')


# The RDD API is reached through the SparkContext that backs the session `sc`.
context = sc.sparkContext
data = context.textFile(f"file://{PROJECT_HOME}/data/sample_fpgrowth.txt")
# One list of items per text line.
lines = data.map(_to_transaction)

In [11]:
# Train the RDD-based FP-Growth model under its own name. Rebinding `model` here
# would replace the DataFrame-based model from section (a), whose API differs
# (there `freqItemsets` is an attribute, here it is a method), so re-running the
# section (a) cells afterwards would fail.
rdd_model = MLibFPGrowth.train(lines, minSupport=0.5)

# freqItemsets() returns an RDD; convert it to a DataFrame and then to pandas for display.
rdd_model.freqItemsets().toDF().toPandas()

Unnamed: 0,items,freq
0,[t],3
1,"[t, x]",3
2,"[t, x, z]",3
3,"[t, z]",3
4,[s],3
5,"[s, x]",3
6,[z],5
7,[y],3
8,"[y, t]",3
9,"[y, t, x]",3
