In [6]:
import pandas as pd
from itertools import combinations

# Read the grocery baskets from CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/groceries_dataset.csv')

# Preview the leading five rows to check the column layout
df.head(5)


Unnamed: 0,Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,Item 10,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,,...,,,,,,,,,,
1,tropical fruit,yogurt,coffee,,,,,,,,...,,,,,,,,,,
2,whole milk,,,,,,,,,,...,,,,,,,,,,
3,pip fruit,yogurt,cream cheese,meat spreads,,,,,,,...,,,,,,,,,,
4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,...,,,,,,,,,,


In [8]:
# Build one set of item names per row of `df`.
#
# Missing cells are dropped with pd.notna() instead of comparing the
# stringified value against "nan", so None / pd.NA are handled as well.
# Rows are walked as plain tuples (itertuples) rather than through a
# per-cell `df.iloc[i][col]` lookup, which builds a Series for every cell
# and is very slow on thousands of rows.
transactions = [
    {str(item) for item in row if pd.notna(item)}
    for row in df.itertuples(index=False, name=None)
]

print("Total transactions:", len(transactions))
print("Example transaction:", list(transactions[0]))


Total transactions: 9835
Example transaction: ['margarine', 'semi-finished bread', 'ready soups', 'citrus fruit']


In [9]:
def get_support(transactions, itemset):
    """Return the fraction of transactions that contain every item of ``itemset``.

    Args:
        transactions: Sized collection (e.g. list) of item collections, one
            per basket.
        itemset: ``set`` or ``frozenset`` of items to look for.

    Returns:
        ``matching / total`` as a float, or ``0.0`` when ``transactions`` is
        empty.
    """
    total = len(transactions)
    if total == 0:
        # No baskets means no evidence; report zero support instead of
        # failing with ZeroDivisionError.
        return 0.0
    matching = sum(1 for basket in transactions if itemset.issubset(basket))
    return matching / total


In [10]:
def get_frequent_itemsets(transactions, candidates, min_support):
    """Keep the candidates whose support reaches ``min_support``.

    Args:
        transactions: Collection of baskets, passed through to ``get_support``.
        candidates: Iterable of candidate itemsets.
        min_support: Inclusive lower bound on support.

    Returns:
        ``(frequent, support_data)``: the surviving candidates in their
        original order, and a dict mapping ``frozenset(candidate)`` to its
        support for those survivors only.
    """
    # Lazily pair every candidate with its measured support.
    scored = ((cand, get_support(transactions, cand)) for cand in candidates)

    frequent, support_data = [], {}
    for cand, support in scored:
        if support >= min_support:
            support_data[frozenset(cand)] = support
            frequent.append(cand)
    return frequent, support_data


In [11]:
def generate_candidates(frequent_itemsets, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets (join step).

    Two itemsets are merged when the first ``k - 2`` items of their sorted
    forms are identical; the candidate is their union. No subset-based
    pruning is performed here.

    Args:
        frequent_itemsets: Indexable sequence of sets whose items are
            mutually sortable.
        k: Size of the candidates to build.

    Returns:
        List of unique candidate itemsets, in the order the pairs
        ``(i, j)`` with ``i < j`` produce them.
    """
    # Sort every itemset once up front instead of re-sorting both operands
    # for each pair inside the double loop.
    prefixes = [sorted(itemset)[:k - 2] for itemset in frequent_itemsets]

    candidates = []
    seen = set()  # frozenset mirror of `candidates` for O(1) duplicate checks
    n = len(frequent_itemsets)
    for i in range(n):
        for j in range(i + 1, n):
            if prefixes[i] != prefixes[j]:
                continue
            candidate = frequent_itemsets[i] | frequent_itemsets[j]
            key = frozenset(candidate)
            if key not in seen:
                seen.add(key)
                candidates.append(candidate)
    return candidates


In [12]:
def apriori(transactions, min_support=0.02):
    """Mine frequent itemsets level by level.

    Args:
        transactions: Re-iterable collection of baskets (sets of hashable
            items).
        min_support: Inclusive minimum support passed to
            ``get_frequent_itemsets`` at every level.

    Returns:
        ``(L, support_data)``: ``L[i]`` is the list of frequent itemsets
        kept at level ``i`` (``L[0]`` holds the 1-itemsets and may be empty),
        and ``support_data`` maps ``frozenset(itemset)`` to its support for
        every itemset kept at any level.
    """
    # Candidate 1-itemsets: each distinct item once, in first-seen order.
    # dict.fromkeys de-duplicates in O(1) per item while preserving insertion
    # order; a list-membership test would rescan C1 for every item of every
    # transaction.
    distinct_items = dict.fromkeys(item for t in transactions for item in t)
    C1 = [{item} for item in distinct_items]

    L1, support_data = get_frequent_itemsets(transactions, C1, min_support)
    L = [L1]

    k = 2
    # L[k-2] is the most recently accepted level; stop once it is empty or
    # the next level yields nothing frequent.
    while L[k - 2]:
        Ck = generate_candidates(L[k - 2], k)
        Lk, supK = get_frequent_itemsets(transactions, Ck, min_support)
        support_data.update(supK)
        if not Lk:
            break
        L.append(Lk)
        k += 1

    return L, support_data


In [13]:
def generate_rules(L, support_data, min_conf=0.5):
    """Build single-consequent association rules labelled with a text "rule".

    Args:
        L: List of levels of frequent itemsets (sets); ``L[0]`` is skipped.
        support_data: Dict mapping ``frozenset(itemset)`` to its support.
        min_conf: Inclusive minimum confidence for a rule to be kept.

    Returns:
        List of dicts with keys ``"rule"``, ``"support"``, ``"confidence"``
        and ``"lift"``.
    """
    rules = []
    # Start at the second level: a rule needs at least two items.
    for level in L[1:]:
        for freq_set in level:
            # Try each item in turn as the lone consequent.
            for (item,) in combinations(freq_set, 1):
                conseq = {item}
                antecedent = freq_set - conseq
                if not antecedent:
                    continue
                joint_support = support_data[frozenset(freq_set)]
                conf = joint_support / support_data[frozenset(antecedent)]
                lift = conf / support_data[frozenset(conseq)]
                if conf >= min_conf:
                    rules.append(dict(
                        rule=f"{antecedent} → {conseq}",
                        support=joint_support,
                        confidence=conf,
                        lift=lift,
                    ))
    return rules


In [14]:
# Compute the rules before displaying them so this cell also works on a fresh
# kernel (Restart & Run All); no earlier cell assigns `rules`.
# NOTE(review): thresholds chosen to match the recorded output below
# (support >= 0.02, a single rule with confidence >= 0.5) - confirm.
L, support_data = apriori(transactions, min_support=0.02)
rules = generate_rules(L, support_data, min_conf=0.5)
rules

[{'rule': "{'other vegetables', 'yogurt'} → {'whole milk'}",
  'support': 0.02226741230299949,
  'confidence': 0.5128805620608898,
  'lift': 2.0072345116867694}]

In [16]:
# Mine the itemsets whose support is at least 0.02
L, support_data = apriori(transactions, min_support=0.02)

print("Frequent Itemsets:")
# Walk the levels in order, flattening them into one stream of itemsets
for itemset in (member for level in L for member in level):
    support = support_data[frozenset(itemset)]
    print(itemset, "Support:", support)


Frequent Itemsets:
{'margarine'} Support: 0.05856634468734113
{'citrus fruit'} Support: 0.08276563294356888
{'tropical fruit'} Support: 0.10493136756481952
{'coffee'} Support: 0.05805795627859685
{'yogurt'} Support: 0.13950177935943062
{'whole milk'} Support: 0.25551601423487547
{'pip fruit'} Support: 0.07564819522114896
{'cream cheese'} Support: 0.03965429588205389
{'other vegetables'} Support: 0.1934926283680732
{'long life bakery product'} Support: 0.037417386883579054
{'butter'} Support: 0.05541433655312659
{'rolls/buns'} Support: 0.18393492628368074
{'UHT-milk'} Support: 0.03345195729537367
{'bottled beer'} Support: 0.08052872394509406
{'bottled water'} Support: 0.11052364006100661
{'chocolate'} Support: 0.04961870869344179
{'white bread'} Support: 0.042094560244026434
{'curd'} Support: 0.05327910523640061
{'beef'} Support: 0.05246568378240976
{'soda'} Support: 0.17437722419928825
{'frankfurter'} Support: 0.058973055414336555
{'chicken'} Support: 0.04290798169801729
{'newspapers'}

In [22]:
def generate_rules(L, support_data, min_conf=0.5):
    """Build single-consequent association rules from frequent itemsets.

    For every itemset in ``L[1:]`` each item is tried as the lone consequent,
    with the remaining items as the antecedent.

    Args:
        L: List of levels of frequent itemsets (sets); ``L[0]`` is skipped
            because a rule needs at least two items.
        support_data: Dict mapping ``frozenset(itemset)`` to its support. It
            must contain every itemset in ``L[1:]`` plus each antecedent and
            single-item consequent derived from them.
        min_conf: Inclusive minimum confidence for a rule to be kept.

    Returns:
        List of dicts with keys ``"antecedent"`` (set), ``"consequent"``
        (set), ``"support"``, ``"confidence"`` and ``"lift"``.

    Raises:
        KeyError: If a required itemset is missing from ``support_data``.
    """
    rules = []
    for level in L[1:]:
        for freq_set in level:
            for (item,) in combinations(freq_set, 1):  # single consequent
                conseq = {item}
                antecedent = freq_set - conseq
                if not antecedent:
                    continue
                antecedent_support = support_data[frozenset(antecedent)]
                conseq_support = support_data[frozenset(conseq)]
                if antecedent_support == 0 or conseq_support == 0:
                    # Confidence (or lift) would be a division by zero, e.g.
                    # when itemsets were mined with min_support=0. Such a
                    # rule is undefined, so skip it instead of crashing.
                    continue
                joint_support = support_data[frozenset(freq_set)]
                conf = joint_support / antecedent_support
                if conf >= min_conf:
                    rules.append({
                        "antecedent": antecedent,
                        "consequent": conseq,
                        "support": joint_support,
                        "confidence": conf,
                        # Lift is only needed for rules that are kept.
                        "lift": conf / conseq_support
                    })
    return rules


In [None]:
def print_rules_natural(rules, top_n=10):
    """Print the first ``top_n`` rules as plain-English sentences.

    Rules are taken in list order (``rules[:top_n]``); no ranking is applied
    here, so sort ``rules`` beforehand if a particular order is wanted.

    Args:
        rules: Sequence of dicts with keys ``"antecedent"``, ``"consequent"``
            (iterables of items), ``"support"``, ``"confidence"`` and
            ``"lift"`` (numbers).
        top_n: Number of leading rules to print.
    """
    for r in rules[:top_n]:
        # Sets have no stable iteration order for strings across interpreter
        # sessions, so sort the items to keep the printed text reproducible;
        # str() lets non-string items be joined as well.
        antecedent = ', '.join(sorted(map(str, r['antecedent'])))
        consequent = ', '.join(sorted(map(str, r['consequent'])))
        print(f"If someone buys {antecedent}, "
              f"then with probability = {r['confidence']:.2f}, "
              f"they will also buy {consequent}. "
              f"(Support: {r['support']:.2f}, Lift: {r['lift']:.2f})")


In [25]:
# Mine frequent itemsets, then derive the rules that reach 0.45 confidence
L, support_data = apriori(transactions=transactions, min_support=0.02)
rules = generate_rules(L=L, support_data=support_data, min_conf=0.45)

# Report up to ten rules as plain-English sentences
print("Natural Language Association Rules:\n")
print_rules_natural(rules=rules, top_n=10)


Natural Language Association Rules:

If someone buys butter, then with probability = 0.50, they will also buy whole milk. (Support: 0.03, Lift: 1.95)
If someone buys curd, then with probability = 0.49, they will also buy whole milk. (Support: 0.03, Lift: 1.92)
If someone buys domestic eggs, then with probability = 0.47, they will also buy whole milk. (Support: 0.03, Lift: 1.85)
If someone buys other vegetables, yogurt, then with probability = 0.51, they will also buy whole milk. (Support: 0.02, Lift: 2.01)
If someone buys root vegetables, whole milk, then with probability = 0.47, they will also buy other vegetables. (Support: 0.02, Lift: 2.45)
If someone buys root vegetables, other vegetables, then with probability = 0.49, they will also buy whole milk. (Support: 0.02, Lift: 1.91)
