# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [100]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset

In [101]:
# definiujemy stale

PATH = './basket.csv'
EPSILON = 0.00001
K = 10

In [102]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

baskets = read_baskets(PATH)
products = unique_products(baskets)
N = len(baskets)

## Część 2. - obliczanie wskaźników

In [103]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float):
    def helper(supports, basket, products, matching_baskets):
        nonlocal baskets
        for product in products:
            if product in basket:
                continue
            new_basket = basket + (product,)
            if len(new_basket) > K+1:
                continue
            hits = [x for x in matching_baskets if product in x]
            if len(hits) / N > epsilon:
                supports[new_basket] = 1.0 * len(hits) / N
                helper(supports, new_basket, products, hits)
    supports = {}
    helper(supports, tuple(), all_products, baskets)
    return supports

    
supports = get_supports(baskets, products, EPSILON)
supports

{('abrasive cleaner',): 0.0014702933903628951,
 ('abrasive cleaner', 'beef'): 0.00013366303548753594,
 ('abrasive cleaner', 'frozen meals'): 0.00013366303548753594,
 ('abrasive cleaner', 'frozen vegetables'): 0.00013366303548753594,
 ('abrasive cleaner', 'meat'): 0.00013366303548753594,
 ('abrasive cleaner', 'other vegetables'): 0.00020049455323130388,
 ('abrasive cleaner', 'pip fruit'): 0.00013366303548753594,
 ('abrasive cleaner', 'soda'): 0.00013366303548753594,
 ('abrasive cleaner', 'whipped/sour cream'): 0.00013366303548753594,
 ('abrasive cleaner', 'whole milk'): 0.00020049455323130388,
 ('abrasive cleaner', 'yogurt'): 0.00013366303548753594,
 ('artif. sweetener',): 0.0019381140145692708,
 ('artif. sweetener', 'bottled water'): 0.00013366303548753594,
 ('artif. sweetener', 'bottled water', 'soda'): 0.00013366303548753594,
 ('artif. sweetener', 'butter'): 0.00013366303548753594,
 ('artif. sweetener', 'curd'): 0.00013366303548753594,
 ('artif. sweetener', 'domestic eggs'): 0.000200

In [104]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: tuple[str]) -> float:
    return supports[tuple(products)] if tuple(products) in supports.keys() else EPSILON

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    return support(supports, prior_products + following_products) / support(supports, tuple(prior_products)) if support(supports, prior_products + following_products) != EPSILON and support(supports, tuple(prior_products)) != EPSILON else EPSILON
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    return confidence(supports, prior_products, following_products) / support(supports, tuple(following_products)) if support(supports, prior_products + following_products) != EPSILON and support(supports, tuple(prior_products)) != EPSILON and support(supports, tuple(following_products)) != EPSILON else EPSILON

In [105]:
print(support(supports, ('whole milk', 'rolls/buns')))
print(confidence(supports, ('whole milk', 'rolls/buns'), ('yogurt',)))
print(lift(supports, ('whole milk', 'rolls/buns'), ('yogurt',)))

print(support(supports, ('sausage', 'semi-finished bread', 'pot plants')))
print(support(supports, ('sausage', 'semi-finished bread')))
print(support(supports, ('pot plants')))
print(confidence(supports, ('sausage', 'semi-finished bread'), ('pot plants',)))
print(lift(supports, ('sausage', 'semi-finished bread'), ('pot plants',)))

0.013967787208447505
0.09569377990430622
1.1142926293448514
0.0001
0.0006014836596939117
0.0001
0.0001
0.0001


## Część 3. - generowanie rekomendacji

In [106]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence

def generate_basic_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    def helper(result, subbasket, basket, supports):
        nonlocal products
        if len(subbasket) == K:
            return
        for product in basket:
            if product in subbasket:
                continue
            new_subbasket = subbasket + (product,)
            helper(result, new_subbasket, basket, supports)
        if len(subbasket) == 0:
            return
        for product in products:
            if product in basket:
                continue
            if lift(supports, subbasket, (product,)) <= 1:
                continue
            if any([[product, subbasket] == [x[0], x[1]] for x in result]):
                continue
            result.append((product, subbasket, confidence(supports, subbasket, (product,)), lift(supports, subbasket, (product,))))

    result = []
    helper(result, tuple(), tuple(basket), supports)
    return result

In [107]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie confidence i lift

def generate_advanced_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, float]]:
    candidates = generate_basic_candidates(basket, products, supports)
    result = []
    for product in products:
        mean = 1
        n = 0
        for candidate in candidates:
            if candidate[0] == product:
                mean *= candidate[2]*candidate[3]
                n += 1

        mean = mean**(1/n) if n != 0 else 0
        result.append((product, mean))
    result.sort(key=lambda x : x[1])
    return result

In [108]:
print(baskets[1])
print(generate_basic_candidates(baskets[1], products, supports))
print(baskets[1])
print(generate_advanced_candidates(baskets[1], products, supports))

{'yogurt', 'semi-finished bread', 'sausage', 'whole milk'}
[('bottled beer', ('yogurt', 'sausage', 'whole milk'), 0.09090909090909093, 2.0063019576293915), ('butter', ('yogurt', 'sausage', 'whole milk'), 0.09090909090909093, 2.5811626703467314), ('butter milk', ('yogurt', 'sausage', 'whole milk'), 0.09090909090909093, 5.172139647424819), ('chewing gum', ('yogurt', 'sausage', 'whole milk'), 0.09090909090909093, 7.557070707070709), ('curd', ('yogurt', 'sausage', 'whole milk'), 0.09090909090909093, 2.6989538239538247), ('dessert', ('yogurt', 'sausage', 'whole milk'), 0.09090909090909093, 3.853463816636622), ('margarine', ('yogurt', 'sausage', 'whole milk'), 0.09090909090909093, 2.822142587702754), ('misc. beverages', ('yogurt', 'sausage', 'whole milk'), 0.09090909090909093, 5.76386748844376), ('other vegetables', ('yogurt', 'sausage', 'whole milk'), 0.13636363636363635, 1.1168084788774444), ('pastry', ('yogurt', 'sausage', 'whole milk'), 0.09090909090909093, 1.7574583039699323), ('pork', 

In [109]:
print(baskets[33])
print(generate_basic_candidates(baskets[33], products, supports))
print(baskets[33])
print(generate_advanced_candidates(baskets[33], products, supports))

{'tropical fruit', 'root vegetables', 'white wine', 'photo/film', 'soda', 'domestic eggs', 'yogurt'}
[('pastry', ('tropical fruit', 'root vegetables', 'soda'), 0.4, 7.732816537467701), ('onions', ('tropical fruit', 'root vegetables', 'yogurt'), 0.42857142857142855, 21.164073550212162), ('whole milk', ('tropical fruit', 'root vegetables', 'yogurt'), 0.28571428571428575, 1.8092013783930843), ('beef', ('tropical fruit', 'root vegetables'), 0.03636363636363637, 1.071080887616321), ('canned beer', ('tropical fruit', 'root vegetables'), 0.05454545454545454, 1.1626262626262627), ('citrus fruit', ('tropical fruit', 'root vegetables'), 0.05454545454545454, 1.0266209262435677), ('condensed milk', ('tropical fruit', 'root vegetables'), 0.03636363636363637, 5.55213358070501), ('cream cheese ', ('tropical fruit', 'root vegetables'), 0.05454545454545454, 2.3055469953775036), ('instant food products', ('tropical fruit', 'root vegetables'), 0.03636363636363637, 9.06848484848485), ('onions', ('tropical