# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] utwórz i aktywuj wirtualne środowisko:
 `python3 -m venv ./recsyslab1 && source ./recsyslab1/bin/activate` (Windows: `recsyslab1\Scripts\activate`)
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [52]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset

In [53]:
# definiujemy stale

# Path to the CSV file with the baskets (read_baskets() skips its first line).
PATH = './basket.csv'
# Minimum support threshold used when building the support table; support()
# also returns this value for itemsets that are missing from the table.
EPSILON = 0.001
# Itemset-size limit: get_supports() stores itemsets of up to K + 1 products,
# and generate_basic_candidates() stops extending a sub-basket at K products.
K = 4

In [54]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a comma-separated text file.

    The first line is treated as a header and skipped. Every following
    non-empty line becomes one basket: the set of its non-empty,
    lower-cased, comma-separated fields.

    Args:
        path: Location of the CSV file.

    Returns:
        One set of product names per data row, in file order. A row that
        consists only of commas yields an empty set.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        raw = f.read()
    rows = raw.split('\n')[1:]
    return [
        {field.lower() for field in row.split(',') if field}
        for row in rows
        if row
    ]

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product found in ``baskets``, sorted ascending."""
    distinct = {product for basket in baskets for product in basket}
    return sorted(distinct)

# Load every basket from the CSV file.
baskets = read_baskets(PATH)
# Sorted list of all distinct product names.
products = unique_products(baskets)
# Total number of baskets (support values are fractions of this count).
N = len(baskets)

## Część 2. - obliczanie wskaźników

In [55]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def get_supports(baskets: list[set[str]], all_products: list[str], epsilon: float,
                 max_size: "int | None" = None) -> dict[tuple[str, ...], float]:
    """Compute the support of every sufficiently frequent itemset.

    Itemsets are grown depth-first, one product at a time. An itemset is
    stored (and extended further) only when the fraction of baskets that
    contain all of its products is strictly greater than ``epsilon``.

    Keys are ordered tuples and every ordering of a frequent itemset gets its
    own entry (``('a', 'b')`` and ``('b', 'a')`` both appear), so a lookup
    succeeds regardless of the order in which the products are listed.

    Args:
        baskets: The baskets to analyse; each must support ``in`` tests.
        all_products: Products to build itemsets from.
        epsilon: Exclusive lower bound on the support of a stored itemset.
        max_size: Largest itemset size to store. Defaults to ``K + 1``.

    Returns:
        Mapping from product tuples to their support, in discovery order.
        Empty when ``baskets`` is empty.
    """
    if max_size is None:
        max_size = K + 1
    # Use the size of the data actually passed in rather than a module global.
    total = len(baskets)
    supports = {}
    if total == 0:
        return supports

    def extend(itemset, matching_baskets):
        # matching_baskets holds exactly the baskets containing all of itemset,
        # so each extension only has to test the newly added product.
        if len(itemset) >= max_size:
            return
        for product in all_products:
            if product in itemset:
                continue
            hits = [basket for basket in matching_baskets if product in basket]
            frequency = len(hits) / total
            if frequency > epsilon:
                grown = itemset + (product,)
                supports[grown] = frequency
                extend(grown, hits)

    extend((), baskets)
    return supports

    
# Build the support table for all itemsets whose support exceeds EPSILON.
supports = get_supports(baskets, products, EPSILON)
# Bare expression; presumably there to display the table as the cell output.
supports

{('abrasive cleaner',): 0.0014702933903628951,
 ('artif. sweetener',): 0.0019381140145692708,
 ('baking powder',): 0.008086613646995923,
 ('bathroom cleaner',): 0.0011361358016440553,
 ('beef',): 0.03395041101383412,
 ('beef', 'bottled beer'): 0.0010693042839002875,
 ('beef', 'bottled water'): 0.0013366303548753592,
 ('beef', 'brown bread'): 0.0015371249081066632,
 ('beef', 'butter'): 0.0011361358016440553,
 ('beef', 'canned beer'): 0.0010024727661565194,
 ('beef', 'citrus fruit'): 0.001804450979081735,
 ('beef', 'curd'): 0.0012697988371315912,
 ('beef', 'domestic eggs'): 0.0011361358016440553,
 ('beef', 'frankfurter'): 0.0010024727661565194,
 ('beef', 'frozen vegetables'): 0.0012697988371315912,
 ('beef', 'fruit/vegetable juice'): 0.0010693042839002875,
 ('beef', 'margarine'): 0.001403461872619127,
 ('beef', 'newspapers'): 0.001670787943594199,
 ('beef', 'other vegetables'): 0.002806923745238254,
 ('beef', 'pastry'): 0.0012029673193878234,
 ('beef', 'rolls/buns'): 0.001603956425850431

In [56]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: tuple[str]) -> float:
    """Look up the support of an itemset.

    Args:
        supports: Mapping from product tuples to support values.
        products: Products forming the itemset, in the order used as the key.
            A bare string is treated as a single product.

    Returns:
        The stored support, or ``EPSILON`` when the itemset is not present
        in ``supports``.
    """
    if isinstance(products, str):
        # tuple("abc") would split the name into characters and never match.
        products = (products,)
    return supports.get(tuple(products), EPSILON)

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the confidence of the rule ``prior_products -> following_products``.

    The value is support(prior + following) / support(prior). ``EPSILON`` is
    returned instead when either of those two supports comes back as
    ``EPSILON``, i.e. the itemset is missing from ``supports``.
    """
    joint = support(supports, prior_products + following_products)
    if joint == EPSILON:
        return EPSILON
    prior = support(supports, tuple(prior_products))
    if prior == EPSILON:
        return EPSILON
    return joint / prior
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the lift of the rule ``prior_products -> following_products``.

    The value is confidence(prior -> following) / support(following).
    ``EPSILON`` is returned instead when the support of the combined itemset,
    of the prior products, or of the following products comes back as
    ``EPSILON``, i.e. the itemset is missing from ``supports``.
    """
    if support(supports, prior_products + following_products) == EPSILON:
        return EPSILON
    if support(supports, tuple(prior_products)) == EPSILON:
        return EPSILON
    following = support(supports, tuple(following_products))
    if following == EPSILON:
        return EPSILON
    return confidence(supports, prior_products, following_products) / following

In [57]:
# First example: support, confidence and lift for the rule
# (whole milk, rolls/buns) -> yogurt.
print(support(supports, ('whole milk', 'rolls/buns')))
print(confidence(supports, ('whole milk', 'rolls/buns'), ('yogurt',)))
print(lift(supports, ('whole milk', 'rolls/buns'), ('yogurt',)))

# Second example: the rule (sausage, semi-finished bread) -> pot plants.
print(support(supports, ('sausage', 'semi-finished bread', 'pot plants')))
print(support(supports, ('sausage', 'semi-finished bread')))
# A single product needs the trailing comma: ('pot plants') is just a string.
print(support(supports, ('pot plants',)))
print(confidence(supports, ('sausage', 'semi-finished bread'), ('pot plants',)))
print(lift(supports, ('sausage', 'semi-finished bread'), ('pot plants',)))

0.013967787208447505
0.09569377990430622
1.1142926293448514
0.001
0.001
0.001
0.001
0.001


## Część 3. - generowanie rekomendacji

In [58]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence

def generate_basic_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    """List association rules that recommend products missing from ``basket``.

    Every ordered selection of fewer than ``K`` basket items is used as a rule
    antecedent. For each antecedent, every product that is not already in the
    basket and whose lift exceeds 1 becomes a candidate.

    Args:
        basket: Products currently in the basket (any iterable of names).
        products: All products that may be recommended.
        supports: Support table consumed by ``confidence`` and ``lift``.

    Returns:
        ``(product, antecedent, confidence, lift)`` tuples. Longer antecedents
        sharing a prefix are reported before the shorter prefix itself.

    NOTE: antecedents are ordered tuples, so the same set of basket items is
    visited once per ordering and a rule may appear once for each of them.
    """
    items = tuple(basket)
    in_basket = set(items)
    result = []
    # (product, antecedent) pairs already emitted; replaces a linear scan of
    # ``result`` for every candidate.
    emitted = set()

    def helper(subbasket):
        if len(subbasket) == K:
            return
        # Visit longer antecedents first, then evaluate the current one.
        for item in items:
            if item not in subbasket:
                helper(subbasket + (item,))
        if not subbasket:
            return
        for product in products:
            if product in in_basket:
                continue
            rule_lift = lift(supports, subbasket, (product,))
            if rule_lift <= 1:
                continue
            key = (product, subbasket)
            if key in emitted:
                continue
            emitted.add(key)
            rule_confidence = confidence(supports, subbasket, (product,))
            result.append((product, subbasket, rule_confidence, rule_lift))

    helper(())
    return result

In [59]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie confidence i lift

def generate_advanced_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, float]]:
    """Score every product by aggregating its basic candidate rules.

    A product's score is the geometric mean of ``confidence * lift`` over all
    rules from ``generate_basic_candidates`` that recommend it; a product with
    no such rule scores 0.

    Returns:
        ``(product, score)`` for every entry of ``products``, sorted by score
        in ascending order, so the highest-scoring products are at the end.
    """
    # Group rule scores by recommended product, keeping rule order.
    scores_by_product = {}
    for name, _antecedent, rule_confidence, rule_lift in generate_basic_candidates(basket, products, supports):
        scores_by_product.setdefault(name, []).append(rule_confidence * rule_lift)

    ranked = []
    for product in products:
        scores = scores_by_product.get(product)
        if scores:
            accumulated = 1
            for score in scores:
                accumulated *= score
            ranked.append((product, accumulated ** (1 / len(scores))))
        else:
            ranked.append((product, 0))
    return sorted(ranked, key=lambda pair: pair[1])

In [60]:
# Basket 1 followed by its rule-level candidates.
print(baskets[1])
print(generate_basic_candidates(baskets[1], products, supports))
# Basket 1 again, followed by the aggregated per-product scores.
print(baskets[1])
print(generate_advanced_candidates(baskets[1], products, supports))

{'yogurt', 'semi-finished bread', 'sausage', 'whole milk'}
[('rolls/buns', ('yogurt', 'whole milk'), 0.11976047904191618, 1.0886853267947703), ('chewing gum', ('yogurt',), 0.016342412451361865, 1.3585084306095978), ('citrus fruit', ('yogurt',), 0.053696498054474705, 1.0106423904265471), ('detergent', ('yogurt',), 0.012451361867704281, 1.4442614544686756), ('hard cheese', ('yogurt',), 0.014785992217898832, 1.0056490979837283), ('herbs', ('yogurt',), 0.013229571984435798, 1.2528739595133724), ('soft cheese', ('yogurt',), 0.014785992217898832, 1.4749520103761349), ('specialty bar', ('yogurt',), 0.014007782101167316, 1.002863366410366), ('rolls/buns', ('sausage', 'whole milk'), 0.12686567164179105, 1.1532752398396837), ('soda', ('sausage', 'whole milk'), 0.11940298507462688, 1.2296124333596987), ('beverages', ('sausage',), 0.02547065337763012, 1.536763655199514), ('bottled beer', ('sausage',), 0.05537098560354374, 1.2220000849348451), ('curd', ('sausage',), 0.04872646733111849, 1.446615338

In [61]:
# Basket 33 followed by its rule-level candidates.
print(baskets[33])
print(generate_basic_candidates(baskets[33], products, supports))
# Basket 33 again, followed by the aggregated per-product scores.
print(baskets[33])
print(generate_advanced_candidates(baskets[33], products, supports))

{'tropical fruit', 'root vegetables', 'white wine', 'photo/film', 'soda', 'domestic eggs', 'yogurt'}
[('cat food', ('tropical fruit',), 0.014792899408284025, 1.250543242068666), ('flour', ('tropical fruit',), 0.01577909270216963, 1.6171408500175628), ('specialty chocolate', ('tropical fruit',), 0.019723865877712035, 1.2348460465615279), ('uht-milk', ('tropical fruit',), 0.022682445759368838, 1.0606169871794873), ('frozen vegetables', ('root vegetables',), 0.030739673390970224, 1.0977511526231203), ('grapes', ('root vegetables',), 0.015369836695485112, 1.064716974419184), ('hygiene articles', ('root vegetables',), 0.01440922190201729, 1.0517326210726083), ('meat', ('root vegetables',), 0.019212295869356386, 1.1407681868776969), ('processed cheese', ('root vegetables',), 0.015369836695485112, 1.5130188583851563), ('shopping bags', ('root vegetables',), 0.04803073967339097, 1.0093875810856028), ('sliced cheese', ('root vegetables',), 0.01729106628242075, 1.2320296418279129), ('waffles', (