# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset_of_sets

In [2]:
# definiujemy stale

PATH = './basket.csv'
EPSILON = 0.0005 # Changed to 0.0005 so that the minimal 
                 # number of baskets required to draw any statistical conclusions is EPSILON * len(baskets) ~= 7
K = 4

In [3]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

baskets = read_baskets(PATH)
products = unique_products(baskets)

In [4]:
baskets[:10]

[{'pastry', 'salty snack', 'whole milk'},
 {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
 {'pickled vegetables', 'soda'},
 {'canned beer', 'misc. beverages'},
 {'hygiene articles', 'sausage'},
 {'rolls/buns', 'sausage', 'whole milk'},
 {'soda', 'whole milk'},
 {'frankfurter', 'soda', 'whipped/sour cream'},
 {'curd', 'frankfurter'},
 {'beef', 'white bread'}]

In [5]:
products[:10]

['abrasive cleaner',
 'artif. sweetener',
 'baby cosmetics',
 'bags',
 'baking powder',
 'bathroom cleaner',
 'beef',
 'berries',
 'beverages',
 'bottled beer']

In [6]:
len(baskets), len(products)

(14963, 167)

## Część 2. - obliczanie wskaźników

In [7]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

from collections import defaultdict
from tqdm import tqdm

def key(basket: tuple[str]) -> str:
    return ",".join(sorted(basket))

def get_supports(baskets: list[set[str]], all_products: list[str], epsilon: float):
    # raise NotImplementedError()
    N = len(baskets)
    supports = defaultdict(lambda : epsilon)

    def supp(subbasket: set[str]) -> float:
        return 1/N * sum(1 for basket in baskets if subbasket.issubset(basket))
        
    for basket in tqdm(baskets):            
        for subbasket in powerset_of_sets(basket):
            if len(subbasket) > K:
                break
            if len(subbasket) == 0:
                continue
            if (k := key(subbasket)) not in supports and (s := supp(subbasket)) > epsilon:
                supports[k] = s
    return supports

    
supports = get_supports(baskets, products, EPSILON)

100%|███████████████████████████████████████████████████████████████████████| 14963/14963 [00:16<00:00, 918.61it/s]


In [8]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: set[str]) -> float:
    return supports[key(products)]

def confidence(supports, prior_products: set[str], following_products: set[str]) -> float:
    return support(supports, prior_products | following_products) / support(supports, prior_products)
    
def lift(supports, prior_products: set[str], following_products: tuple[str]) -> float:
    return support(supports, prior_products | following_products) / (support(supports, prior_products) * support(supports, following_products))

In [9]:
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.114292629344851


## Część 3. - generowanie rekomendacji

In [10]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence

def generate_basic_candidates(basket: set[str], products: list[str], supports) -> list[tuple[str, set[str], float, float]]:
    # return [(item, subbasket, confidence, lift)]
    # raise NotImplementedError()
    recommendations = []
    
    for subbasket in powerset_of_sets(basket):
        if len(subbasket) > K-1:
            break
        if len(subbasket) == 0:
            continue
            
        for product in products:
            if product not in basket:
                lift_value = lift(supports, subbasket, {product})
                confidence_value = confidence(supports, subbasket, {product})
                # If dataset contains enough data to draw conclusions and baskets are not independent
                if support(supports, subbasket | {product}) > EPSILON and lift_value > 1:
                    recommendations.append((product, subbasket, confidence_value, lift_value))
                    
    return sorted(recommendations, key=lambda t: t[2], reverse=True)
    

In [16]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie configence i lift

import math

def generate_advanced_candidates(basket: set[str], products: list[str], supports) -> list[tuple[str, set[str], float, float]]:
    # return [(item, subbasket, confidence, lift)]
    # raise NotImplementedError()
    
    basic_recommendations = generate_basic_candidates(basket, products, supports)
    recommendations = {}
    
    for item, subbasket, confidence_value, lift_value in basic_recommendations:
        if item not in recommendations:
            recommendations[item] = ([subbasket], [confidence_value], [lift_value])
        else:
            recommendations[item][0].append(subbasket)
            recommendations[item][1].append(confidence_value)
            recommendations[item][2].append(lift_value)

    recommendations = [(product, *x) for product, x in recommendations.items()]
    
    return sorted(recommendations, key=lambda t: math.prod(t[2])**(1/len(t[2])), reverse=True)


In [21]:
print(baskets[1])
generate_basic_candidates(baskets[1], products, supports)[:5]

{'semi-finished bread', 'yogurt', 'sausage', 'whole milk'}


[('rolls/buns',
  {'sausage', 'whole milk'},
  0.12686567164179108,
  1.153275239839684),
 ('rolls/buns',
  {'whole milk', 'yogurt'},
  0.11976047904191615,
  1.08868532679477),
 ('soda', {'sausage', 'whole milk'}, 0.11940298507462688, 1.2296124333596985),
 ('soda', {'sausage', 'yogurt'}, 0.11627906976744186, 1.1974423406264505),
 ('soda', {'sausage'}, 0.09856035437430787, 1.0149749363405152)]

In [22]:
generate_advanced_candidates(baskets[1], products, supports)[:5]

[('rolls/buns',
  [{'sausage', 'whole milk'}, {'whole milk', 'yogurt'}],
  [0.12686567164179108, 0.11976047904191615],
  [1.153275239839684, 1.08868532679477]),
 ('soda',
  [{'sausage', 'whole milk'}, {'sausage', 'yogurt'}, {'sausage'}],
  [0.11940298507462688, 0.11627906976744186, 0.09856035437430787],
  [1.2296124333596985, 1.1974423406264505, 1.0149749363405152]),
 ('tropical fruit',
  [{'semi-finished bread'}, {'whole milk', 'yogurt'}],
  [0.07746478873239437, 0.0718562874251497],
  [1.1431022029613578, 1.0603408567480421]),
 ('root vegetables',
  [{'semi-finished bread'}],
  [0.0704225352112676],
  [1.0122309263844351]),
 ('pork',
  [{'sausage', 'whole milk'}],
  [0.06716417910447761],
  [1.8107704719645015])]

In [20]:
print(baskets[33])
generate_basic_candidates(baskets[33], products, supports)[:5]

{'yogurt', 'photo/film', 'soda', 'tropical fruit', 'white wine', 'root vegetables', 'domestic eggs'}


[('rolls/buns',
  {'root vegetables', 'tropical fruit'},
  0.16363636363636364,
  1.4875400419750358),
 ('whole milk', {'soda', 'yogurt'}, 0.16091954022988506, 1.0189754889800127),
 ('sausage', {'soda', 'yogurt'}, 0.11494252873563217, 1.904634615139827),
 ('sausage',
  {'root vegetables', 'soda'},
  0.11392405063291139,
  1.887758105891753),
 ('pip fruit', {'white wine'}, 0.07428571428571427, 1.5143557804593226)]

In [19]:
generate_advanced_candidates(baskets[33], products, supports)[:5]

[('rolls/buns',
  [{'root vegetables', 'tropical fruit'}],
  [0.16363636363636364],
  [1.4875400419750358]),
 ('whole milk',
  [{'soda', 'yogurt'}],
  [0.16091954022988506],
  [1.0189754889800127]),
 ('sausage',
  [{'soda', 'yogurt'},
   {'root vegetables', 'soda'},
   {'white wine'},
   {'yogurt'},
   {'soda'}],
  [0.11494252873563217,
   0.11392405063291139,
   0.06857142857142857,
   0.06692607003891049,
   0.06125258086717137],
  [1.904634615139827,
   1.887758105891753,
   1.13625059326056,
   1.1089864739670183,
   1.0149749363405152]),
 ('pip fruit', [{'white wine'}], [0.07428571428571427], [1.5143557804593226]),
 ('citrus fruit',
  [{'white wine'}, {'yogurt'}],
  [0.057142857142857134, 0.053696498054474705],
  [1.0755076370170706, 1.010642390426547])]