# Apriori Algorithm for Association Rule Learning

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data Preprocessing

In [2]:
# Loading the dataset.
# NOTE(review): header=0 uses the first line of the file as column names and
# index_col=0 moves the first column into the index, which is discarded below;
# on_bad_lines='skip' drops every line pandas cannot parse. If groceries.csv is a
# raw basket file (no header row, no id column, rows of varying length), these
# settings silently lose purchase data -- confirm against the file.
dataset = pd.read_csv('groceries.csv', header = 0, index_col = 0, on_bad_lines = 'skip')

# Strip parentheses and slashes from every text cell, drop the index and renumber
# the columns 0..n-1. Chained (not in-place) so the cell can be re-run safely.
dataset = dataset.replace(r'[()\/]', '', regex=True).reset_index(drop=True)
dataset.columns = range(len(dataset.columns))

# Build one list of item names per row.
# Missing cells are skipped: converting them with str() would add a fake 'nan'
# product to every basket shorter than the widest one. Rows that end up empty are
# kept so the number of transactions (the support denominator) is unchanged.
# to_numpy() is called once instead of rebuilding the array for every row.
transactions = [
    [str(item) for item in row if pd.notna(item)]
    for row in dataset.to_numpy()
]

## Training the Apriori model on the dataset

In [3]:
from apyori import apriori

# Mine rules from the transaction lists; one threshold per line so each is easy to tune.
# NOTE(review): min_length is passed through as given -- confirm that the installed
# apyori version actually honours it (max_length = 2 already caps itemsets at pairs).
rules = apriori(
    transactions = transactions,
    min_support = 0.001,
    min_confidence = 0.2,
    min_lift = 3,
    min_length = 2,
    max_length = 2,
)

## Visualising the results

### Displaying the first results coming directly from the output of the apriori function

In [4]:
# Materialise the rules into a list so later cells can iterate over them repeatedly.
# NOTE(review): if `rules` is a one-shot iterator, re-running this cell without
# re-running the training cell would yield an empty list -- confirm.
results = [*rules]

In [5]:
# Show the raw records exactly as collected from the apriori call.
results

[RelationRecord(items=frozenset({'liquor', 'bottled beer'}), support=0.002457002457002457, ordered_statistics=[OrderedStatistic(items_base=frozenset({'liquor'}), items_add=frozenset({'bottled beer'}), confidence=0.3, lift=7.963043478260869)]),
 RelationRecord(items=frozenset({'other vegetables', 'citrus fruit'}), support=0.0014742014742014742, ordered_statistics=[OrderedStatistic(items_base=frozenset({'citrus fruit'}), items_add=frozenset({'other vegetables'}), confidence=0.23684210526315788, lift=5.806911857958148)]),
 RelationRecord(items=frozenset({'whole milk', 'curd'}), support=0.002620802620802621, ordered_statistics=[OrderedStatistic(items_base=frozenset({'curd'}), items_add=frozenset({'whole milk'}), confidence=0.253968253968254, lift=4.642144282862846)]),
 RelationRecord(items=frozenset({'redblush wine', 'liquor'}), support=0.002293202293202293, ordered_statistics=[OrderedStatistic(items_base=frozenset({'liquor'}), items_add=frozenset({'redblush wine'}), confidence=0.279999999

### Organising the results into a pandas DataFrame

In [6]:
def inspect(results):
    """Flatten apriori output into ``(lhs, rhs, support, confidence, lift)`` rows.

    Parameters
    ----------
    results : iterable
        Records indexable as ``(items, support, ordered_statistics)``, where every
        entry of ``ordered_statistics`` is indexable as
        ``(items_base, items_add, confidence, lift)``.

    Returns
    -------
    list of tuple
        One ``(lhs, rhs, support, confidence, lift)`` tuple per ordered statistic,
        in input order. ``lhs`` and ``rhs`` are the base and added items as text;
        several items are sorted and joined with ``', '``, and an empty set becomes
        ``''``.

    Notes
    -----
    Every ordered statistic of a record is reported, so a pair that yields both
    A -> B and B -> A produces two rows. Reading only the first statistic would
    silently drop the other rule. ``results`` is traversed once, so a one-shot
    iterable is accepted as well.
    """
    def label(items):
        # Sorting makes the text independent of set iteration order; joining avoids
        # both picking an arbitrary member of a multi-item set and an IndexError on
        # an empty one. A single item is returned unchanged.
        return ', '.join(sorted(str(item) for item in items))

    rows = []
    for result in results:
        support = result[1]
        for statistic in result[2]:
            rows.append((
                label(statistic[0]),
                label(statistic[1]),
                support,
                statistic[2],
                statistic[3],
            ))
    return rows
# Tabulate the rows returned by inspect(); column order matches the tuple order.
resultsinDataFrame = pd.DataFrame(
    data = inspect(results),
    columns = [
        'Left Hand Side',
        'Right Hand Side',
        'Support',
        'Confidence',
        'Lift',
    ],
)

### Displaying the unsorted results

In [7]:
# Rules in the order inspect(results) returned them (no sorting applied).
resultsinDataFrame

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
0,liquor,bottled beer,0.002457,0.3,7.963043
1,citrus fruit,other vegetables,0.001474,0.236842,5.806912
2,curd,whole milk,0.002621,0.253968,4.642144
3,liquor,redblush wine,0.002293,0.28,24.773913
4,pip fruit,other vegetables,0.001966,0.210526,5.161699
5,soft cheese,rollsbuns,0.001638,0.30303,4.057018
6,root vegetables,whole milk,0.002948,0.214286,3.916809


### Displaying the results sorted by descending lift

In [8]:
# Up to ten rules with the highest lift, strongest first.
resultsinDataFrame.nlargest(10, 'Lift')

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
3,liquor,redblush wine,0.002293,0.28,24.773913
0,liquor,bottled beer,0.002457,0.3,7.963043
1,citrus fruit,other vegetables,0.001474,0.236842,5.806912
4,pip fruit,other vegetables,0.001966,0.210526,5.161699
2,curd,whole milk,0.002621,0.253968,4.642144
5,soft cheese,rollsbuns,0.001638,0.30303,4.057018
6,root vegetables,whole milk,0.002948,0.214286,3.916809


In [9]:
# Format the results for easier use
import json

# One entry per rule (ordered statistic), not per itemset: a record can carry more
# than one rule, and exporting only the first would drop the rest. 'items_base' and
# 'items_add' record the rule direction, which the unordered 'items' cannot express;
# the original keys ('items', 'support', 'confidence', 'lift') are kept unchanged.
# Item lists are sorted because frozenset iteration order is not stable across
# runs, which would otherwise make the exported file differ between executions.
formatted_results = []
for result in results:
    for statistic in result.ordered_statistics:
        formatted_results.append({
            'items': sorted(result.items),
            'items_base': sorted(statistic.items_base),
            'items_add': sorted(statistic.items_add),
            'support': result.support,
            'confidence': statistic.confidence,
            'lift': statistic.lift
        })

# Save the results to a file (e.g., JSON)
with open('apriori_results.json', 'w', encoding='utf-8') as f:
    json.dump(formatted_results, f)