# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [58]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset

In [59]:
# definiujemy stale

PATH = './basket.csv'
EPSILON = 0.001
K = 4

In [60]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

baskets = read_baskets(PATH)
products = unique_products(baskets)
N = len(baskets)

## Część 2. - obliczanie wskaźników

In [102]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float):
    def helper(supports, basket, products, matching_baskets):
        nonlocal baskets
        for product in products:
            if product in basket:
                continue
            new_basket = basket + (product,)
            if len(new_basket) > K+1:
                continue
            hits = [x for x in matching_baskets if product in x]
            if len(hits) / N > epsilon:
                supports[new_basket] = 1.0 * len(hits) / N
                helper(supports, new_basket, products, hits)
    supports = {}
    helper(supports, tuple(), all_products, baskets)
    return supports

    
supports = get_supports(baskets, products, EPSILON)
supports

{('abrasive cleaner',): 0.0014702933903628951,
 ('artif. sweetener',): 0.0019381140145692708,
 ('baking powder',): 0.008086613646995923,
 ('bathroom cleaner',): 0.0011361358016440553,
 ('beef',): 0.03395041101383412,
 ('beef', 'bottled beer'): 0.0010693042839002875,
 ('beef', 'bottled water'): 0.0013366303548753592,
 ('beef', 'brown bread'): 0.0015371249081066632,
 ('beef', 'butter'): 0.0011361358016440553,
 ('beef', 'canned beer'): 0.0010024727661565194,
 ('beef', 'citrus fruit'): 0.001804450979081735,
 ('beef', 'curd'): 0.0012697988371315912,
 ('beef', 'domestic eggs'): 0.0011361358016440553,
 ('beef', 'frankfurter'): 0.0010024727661565194,
 ('beef', 'frozen vegetables'): 0.0012697988371315912,
 ('beef', 'fruit/vegetable juice'): 0.0010693042839002875,
 ('beef', 'margarine'): 0.001403461872619127,
 ('beef', 'newspapers'): 0.001670787943594199,
 ('beef', 'other vegetables'): 0.002806923745238254,
 ('beef', 'pastry'): 0.0012029673193878234,
 ('beef', 'rolls/buns'): 0.001603956425850431

In [107]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: tuple[str]) -> float:
    return supports[tuple(products)] if tuple(products) in supports.keys() else EPSILON

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    return support(supports, prior_products + following_products) / support(supports, tuple(prior_products))
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    return confidence(supports, prior_products, following_products) / support(supports, tuple(following_products))

In [108]:
print(support(supports, ('whole milk', 'rolls/buns')))
print(confidence(supports, ('whole milk', 'rolls/buns'), ('yogurt',)))
print(lift(supports, ('whole milk', 'rolls/buns'), ('yogurt',)))
print(support(supports, ('sausage', 'semi-finished bread', 'pot plants')))
print(support(supports, ('sausage', 'semi-finished bread')))
print(confidence(supports, ('sausage', 'semi-finished bread'), ('pot plants',)))

0.013967787208447505
0.09569377990430622
1.1142926293448514
1e-06
1e-06
1.0


## Część 3. - generowanie rekomendacji

In [109]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence

def generate_basic_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    def helper(result, subbasket, basket, supports):
        nonlocal products
        if len(subbasket) == K:
            return
        for product in basket:
            if product in subbasket:
                continue
            new_subbasket = subbasket + (product,)
            helper(result, new_subbasket, basket, supports)
        if len(subbasket) == 0:
            return
        for product in products:
            if product in basket:
                continue
            if lift(supports, subbasket, (product,)) <= 1:
                continue
            if any([[product, subbasket] == [x[0], x[1]] for x in result]):
                continue
            result.append((product, subbasket, confidence(supports, subbasket, (product,)), lift(supports, subbasket, (product,))))

    result = []
    helper(result, tuple(), tuple(basket), supports)
    return result

print(baskets[1])
print(generate_basic_candidates(baskets[1], products, supports))

{'sausage', 'semi-finished bread', 'yogurt', 'whole milk'}
[('abrasive cleaner', ('sausage', 'semi-finished bread', 'yogurt'), 1.0, 680.1363636363636), ('artif. sweetener', ('sausage', 'semi-finished bread', 'yogurt'), 1.0, 515.9655172413793), ('baby cosmetics', ('sausage', 'semi-finished bread', 'yogurt'), 1.0, 1000000.0), ('bags', ('sausage', 'semi-finished bread', 'yogurt'), 1.0, 1000000.0), ('baking powder', ('sausage', 'semi-finished bread', 'yogurt'), 1.0, 123.66115702479338), ('bathroom cleaner', ('sausage', 'semi-finished bread', 'yogurt'), 1.0, 880.1764705882354), ('beef', ('sausage', 'semi-finished bread', 'yogurt'), 1.0, 29.454724409448822), ('berries', ('sausage', 'semi-finished bread', 'yogurt'), 1.0, 45.898773006134974), ('beverages', ('sausage', 'semi-finished bread', 'yogurt'), 1.0, 60.33467741935484), ('bottled beer', ('sausage', 'semi-finished bread', 'yogurt'), 1.0, 22.069321533923304), ('bottled water', ('sausage', 'semi-finished bread', 'yogurt'), 1.0, 16.479074889

In [79]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie configence i lift

def generate_advanced_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    # return [(item, subbasket, confidence, lift)]
    raise NotImplementedError()

KeyboardInterrupt: 

In [66]:
print(baskets[1])
print(generate_basic_candidates(baskets[1], products, supports))
print(generate_advanced_candidates(baskets[1], products, supports))

{'sausage', 'semi-finished bread', 'yogurt', 'whole milk'}


KeyError: ('sausage', 'semi-finished bread', 'yogurt', 'whole milk', 'sausage', 'abrasive cleaner')

In [None]:
print(baskets[33])
print(generate_basic_candidates(baskets[33], products, supports))
print(generate_advanced_candidates(baskets[33], products, supports))