# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset_of_sets

In [42]:
# definiujemy stale

PATH = './basket.csv'
EPSILON = 0.0001
K = 4

In [43]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

baskets = read_baskets(PATH)
products = unique_products(baskets)

In [44]:
baskets[:10]

[{'pastry', 'salty snack', 'whole milk'},
 {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
 {'pickled vegetables', 'soda'},
 {'canned beer', 'misc. beverages'},
 {'hygiene articles', 'sausage'},
 {'rolls/buns', 'sausage', 'whole milk'},
 {'soda', 'whole milk'},
 {'frankfurter', 'soda', 'whipped/sour cream'},
 {'curd', 'frankfurter'},
 {'beef', 'white bread'}]

In [46]:
products[:10]

['abrasive cleaner',
 'artif. sweetener',
 'baby cosmetics',
 'bags',
 'baking powder',
 'bathroom cleaner',
 'beef',
 'berries',
 'beverages',
 'bottled beer']

In [56]:
len(baskets), len(products)

(14963, 167)

## Część 2. - obliczanie wskaźników

In [49]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

from collections import defaultdict
from tqdm import tqdm

def key(basket: tuple[str]) -> str:
    return ",".join(sorted(basket))

def get_supports(baskets: list[set[str]], all_products: list[str], epsilon: float):
    # raise NotImplementedError()
    N = len(baskets)
    supports = {}

    def supp(subbasket: set[str]) -> float:
        return 1/N * sum(1 for basket in baskets if subbasket.issubset(basket))
        
    for basket in tqdm(baskets):            
        for subbasket in powerset_of_sets(basket):
            if len(subbasket) > K:
                break
            if len(subbasket) > 0 and (k := key(subbasket)) not in supports:
                supports[k] = supp(subbasket)
    return supports

    
supports = get_supports(baskets, products, EPSILON)
# supports

100%|██████████████████████████████████████████████████████████████████████| 14963/14963 [00:12<00:00, 1180.92it/s]


In [50]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: set[str]) -> float:
    return supports[key(products)]

def confidence(supports, prior_products: set[str], following_products: set[str]) -> float:
    return support(supports, prior_products | following_products) / support(supports, prior_products)
    
def lift(supports, prior_products: set[str], following_products: tuple[str]) -> float:
    return support(supports, prior_products | following_products) / (support(supports, prior_products) * support(supports, following_products))

In [51]:
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.114292629344851


## Część 3. - generowanie rekomendacji

In [52]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence

def generate_basic_candidates(basket: set[str], products: list[str], supports) -> list[tuple[str, set[str], float, float]]:
    # return [(item, subbasket, confidence, lift)]
    # raise NotImplementedError()
    recommendations = []
    
    for subbasket in powerset_of_sets(basket):
        if len(subbasket) > K-1:
            break
        
        if len(subbasket) == 0 or key(subbasket) not in supports:
            continue
            
        for product in products:
            if product not in basket and key({product}) in supports and key(subbasket | {product}) in supports:
                lift_value = lift(supports, subbasket, {product})
                confidence_value = confidence(supports, subbasket, {product})
                
                if support(supports, subbasket | {product}) > EPSILON and lift_value > 1:
                    recommendations.append((product, subbasket, confidence_value, lift_value))
                    
    return sorted(recommendations, key=lambda t: t[2], reverse=True)
    

In [53]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie configence i lift

def generate_advanced_candidates(basket: set[str], products: list[str], supports) -> list[tuple[str, set[str], float, float]]:
    # return [(item, subbasket, confidence, lift)]
    raise NotImplementedError()

In [54]:
print(baskets[1])
generate_basic_candidates(baskets[1], products, supports)[:5]
# generate_advanced_candidates(baskets[1], products, supports)

{'semi-finished bread', 'sausage', 'whole milk', 'yogurt'}


[('rolls/buns',
  {'sausage', 'whole milk', 'yogurt'},
  0.22727272727272724,
  2.0660278360764384),
 ('pork',
  {'sausage', 'whole milk', 'yogurt'},
  0.18181818181818182,
  4.901883701883701),
 ('soda',
  {'sausage', 'whole milk', 'yogurt'},
  0.18181818181818182,
  1.8723643871613587),
 ('other vegetables',
  {'sausage', 'whole milk', 'yogurt'},
  0.13636363636363635,
  1.1168084788774442),
 ('rolls/buns',
  {'sausage', 'whole milk'},
  0.12686567164179108,
  1.153275239839684)]

In [55]:
print(baskets[33])
generate_basic_candidates(baskets[33], products, supports)[:5]
# generate_advanced_candidates(baskets[33], products, supports)

{'soda', 'domestic eggs', 'yogurt', 'photo/film', 'tropical fruit', 'root vegetables', 'white wine'}


[('whole milk', {'photo/film', 'yogurt'}, 0.5, 3.1661024121878967),
 ('onions',
  {'root vegetables', 'tropical fruit', 'yogurt'},
  0.42857142857142855,
  21.16407355021216),
 ('pastry',
  {'root vegetables', 'soda', 'tropical fruit'},
  0.4,
  7.732816537467699),
 ('frozen vegetables', {'white wine', 'yogurt'}, 0.375, 13.39170644391408),
 ('bottled beer',
  {'tropical fruit', 'white wine'},
  0.33333333333333337,
  7.356440511307768)]