# FP-Growth Examples

In [1]:
import findspark
findspark.init()

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# getOrCreate() reuses a live context/session instead of constructing a second
# one, so this cell can be re-run without restarting the kernel (a bare
# SparkContext() raises ValueError when a context is already running).
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

## Numerical example from the MLlib website
https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html

In [2]:
from pyspark.ml.fpm import FPGrowth

# Toy transaction table: one row per transaction, `items` holds its contents.
df = spark.createDataFrame(
    data=[
        (0, [1, 2, 5]),
        (1, [1, 2, 3, 5]),
        (2, [1, 2]),
    ],
    schema=["id", "items"],
)
df.show()

# Configure the FP-Growth estimator, then fit it on the toy transactions.
fpGrowth = FPGrowth(minSupport=0.5, minConfidence=0.6, itemsCol="items")
model = fpGrowth.fit(df)

# Frequent itemsets found by the fitted model.
model.freqItemsets.show()

# Association rules derived from those itemsets.
model.associationRules.show()

+---+------------+
| id|       items|
+---+------------+
|  0|   [1, 2, 5]|
|  1|[1, 2, 3, 5]|
|  2|      [1, 2]|
+---+------------+

+---------+----+
|    items|freq|
+---------+----+
|      [5]|   2|
|   [5, 1]|   2|
|[5, 1, 2]|   2|
|   [5, 2]|   2|
|      [2]|   3|
|      [1]|   3|
|   [1, 2]|   3|
+---------+----+

+----------+----------+------------------+
|antecedent|consequent|        confidence|
+----------+----------+------------------+
|       [5]|       [1]|               1.0|
|       [5]|       [2]|               1.0|
|    [1, 2]|       [5]|0.6666666666666666|
|    [5, 2]|       [1]|               1.0|
|    [5, 1]|       [2]|               1.0|
|       [2]|       [5]|0.6666666666666666|
|       [2]|       [1]|               1.0|
|       [1]|       [5]|0.6666666666666666|
|       [1]|       [2]|               1.0|
+----------+----------+------------------+



## msnbc.com anonymous web data
http://archive.ics.uci.edu/ml/machine-learning-databases/msnbc-mld/msnbc.data.html

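This dataset describes the page visits of users on msnbc.com. Each row lists the page categories (coded as integers; see the data dictionary below) visited by one user.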

In [3]:
# Load the raw text file and reshape it into the (id, items) layout that
# FPGrowth consumes. The reshaping is done in pandas on the driver, then the
# result is handed back to Spark.
df = (
    spark.read.text("msnbc990928.seq")
    .toPandas()
    # Skip the first 7 rows -- presumably the file's header/preamble; confirm
    # against the data file if it changes.
    .iloc[7:]
    # Keep the original row position as a column; it becomes `id` below.
    .reset_index()
    .dropna()
)
df.columns = ['id', 'items']

# FPGrowth needs each transaction's items to be unique. dict.fromkeys removes
# duplicates while keeping first-seen order, so the result is the same on every
# run (iteration order of a set of strings varies between Python processes).
df['items'] = df['items'].apply(lambda x: list(dict.fromkeys(x.split())))

spark_df = spark.createDataFrame(df)
spark_df.show()

+---+-------------------+
| id|              items|
+---+-------------------+
|  7|                [1]|
|  8|                [2]|
|  9|          [3, 2, 4]|
| 10|                [5]|
| 11|                [1]|
| 12|                [6]|
| 13|                [1]|
| 14|                [6]|
| 15|          [8, 7, 6]|
| 16|[10, 3, 5, 4, 6, 9]|
| 17|            [1, 11]|
| 18|               [12]|
| 19|                [1]|
| 20|                [8]|
| 21|                [6]|
| 22|                [2]|
| 23|            [9, 12]|
| 24|                [3]|
| 25|                [9]|
| 26|                [3]|
+---+-------------------+
only showing top 20 rows



## Data Dictionary
* 1	frontpage 
* 2	news 
* 3	tech 
* 4	local 
* 5	opinion 
* 6	on-air 
* 7	misc 
* 8	weather 
* 9	msn-news 
* 10	health 
* 11	living 
* 12	business 
* 13	msn-sports 
* 14	sports 
* 15	summary 
* 16	bbs 
* 17	travel

In [4]:
# define values for minSupport and minConfidence
min_support = 0.01       # passed to FPGrowth as minSupport
min_confidence = 0.005   # passed to FPGrowth as minConfidence

# Fit the model
fpGrowth = FPGrowth(itemsCol="items", minSupport=min_support, minConfidence=min_confidence)
model = fpGrowth.fit(spark_df)

# Display frequent itemsets, most frequent first (30 rows, untruncated).
model.freqItemsets.sort("freq", ascending=False).show(30, False)

# Display generated association rules, highest confidence first (30 rows, untruncated).
model.associationRules.sort("confidence", ascending=False).show(30, False)

+-------+------+
|items  |freq  |
+-------+------+
|[1]    |313181|
|[6]    |217101|
|[2]    |175286|
|[3]    |121948|
|[4]    |121719|
|[14]   |119138|
|[12]   |112183|
|[8]    |95615 |
|[9]    |90192 |
|[7]    |80514 |
|[13]   |76948 |
|[2, 1] |74704 |
|[11]   |57597 |
|[10]   |50606 |
|[12, 1]|43178 |
|[6, 1] |40078 |
|[4, 1] |39183 |
|[14, 1]|38826 |
|[7, 1] |36419 |
|[3, 1] |33364 |
|[7, 6] |33283 |
|[11, 1]|32722 |
|[15]   |29200 |
|[4, 2] |27533 |
|[7, 4] |27029 |
|[10, 1]|26248 |
|[2, 6] |25425 |
|[3, 2] |25292 |
|[5]    |24987 |
|[12, 2]|23453 |
+-------+------+
only showing top 30 rows

+----------+----------+-------------------+
|antecedent|consequent|confidence         |
+----------+----------+-------------------+
|[11, 2]   |[1]       |0.7542324081538638 |
|[10, 2]   |[1]       |0.6829185520361991 |
|[12, 2]   |[1]       |0.6619195838485482 |
|[7, 2]    |[1]       |0.6501154734411085 |
|[14, 2]   |[1]       |0.6325142375026366 |
|[4, 2]    |[1]       |0.6016779864163004 |
