# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset_of_sets

In [29]:
# definiujemy stale

PATH = './basket.csv'
EPSILON = 0.000001
K = 4

In [30]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

baskets = read_baskets(PATH)
products = unique_products(baskets)

In [31]:
baskets[:10]

[{'pastry', 'salty snack', 'whole milk'},
 {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
 {'pickled vegetables', 'soda'},
 {'canned beer', 'misc. beverages'},
 {'hygiene articles', 'sausage'},
 {'rolls/buns', 'sausage', 'whole milk'},
 {'soda', 'whole milk'},
 {'frankfurter', 'soda', 'whipped/sour cream'},
 {'curd', 'frankfurter'},
 {'beef', 'white bread'}]

In [32]:
len(baskets)

14963

In [33]:
products[:10]

['abrasive cleaner',
 'artif. sweetener',
 'baby cosmetics',
 'bags',
 'baking powder',
 'bathroom cleaner',
 'beef',
 'berries',
 'beverages',
 'bottled beer']

In [34]:
len(products)

167

In [35]:
import math

math.comb(len(products), K)

31256555

## Część 2. - obliczanie wskaźników

In [36]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

from collections import defaultdict
from tqdm import tqdm

def key(basket: tuple[str]) -> str:
    return ",".join(sorted(basket))

def get_supports(baskets: list[set[str]], all_products: list[str], epsilon: float):
    # raise NotImplementedError()
    N = len(baskets)
    supports = defaultdict(lambda: epsilon)

    def supp(subbasket: set[str]) -> float:
        return 1/N * sum(1 for basket in baskets if subbasket.issubset(basket))
        
    for basket in tqdm(baskets):            
        for subbasket in powerset_of_sets(basket):                        
            if len(subbasket) > 0 and (k := key(subbasket)) not in supports:
                supports[k] = supp(subbasket)
            
            if len(subbasket) == K:
                break
                
    return supports

    
supports = get_supports(baskets, products, EPSILON)
# supports

100%|██████████████████████████████████████████████████████████████████████████████████████| 14963/14963 [00:08<00:00, 1763.96it/s]


In [37]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: set[str]) -> float:
    return supports[key(products)]

def confidence(supports, prior_products: set[str], following_products: tuple[str]) -> float:
    return support(supports, prior_products | following_products) / support(supports, prior_products)
    
def lift(supports, prior_products: set[str], following_products: tuple[str]) -> float:
    return support(supports, prior_products | following_products) / (support(supports, prior_products) * support(supports, following_products))

In [38]:
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.114292629344851


In [39]:
print(support(supports, {'sausage', 'semi-finished bread', 'yogurt'}))
print(support(supports, {'abrasive cleaner'}))
print(support(supports, {'sausage', 'semi-finished bread', 'yogurt'} | {'abrasive cleaner'}))
print(confidence(supports, {'sausage', 'semi-finished bread', 'yogurt'}, {'abrasive cleaner'}))
print(lift(supports, {'sausage', 'semi-finished bread', 'yogurt'}, {'abrasive cleaner'}))

0.00013366303548753594
0.0014702933903628953
1e-06
0.007481499999999999
5.088440204545453


In [43]:
print(support(supports, {'domestic eggs', 'root vegetables', 'soda', 'white wine'}))
print(support(supports, {'abrasive cleaner'}))
print(support(supports, {'domestic eggs', 'root vegetables', 'soda', 'white wine'} | {'abrasive cleaner'}))
print(confidence(supports, {'domestic eggs', 'root vegetables', 'soda', 'white wine'}, {'abrasive cleaner'}))
print(lift(supports, {'domestic eggs', 'root vegetables', 'soda', 'white wine'}, {'abrasive cleaner'}))

1e-06
0.0014702933903628953
1e-06
1.0
680.1363636363636


In [28]:
s = {'sausage', 'semi-finished bread', 'yogurt'} | {'abrasive cleaner'}
list(filter(lambda x: s.issubset(x), products))


[]

## Część 3. - generowanie rekomendacji

In [40]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence

def generate_basic_candidates(basket: set[str], products: list[str], supports) -> list[tuple[str, set[str], float, float]]:
    # return [(item, subbasket, confidence, lift)]
    # raise NotImplementedError()
    recommendations = []
    
    for subbasket in powerset_of_sets(basket):
        if len(subbasket) > K:
            break
        if len(subbasket) == 0:
            continue
        for product in products:
            if product not in basket and (lift_value := lift(supports, subbasket, {product})) > 1:
                recommendations.append((product, subbasket, confidence(supports, subbasket, {product}), lift_value))

    return sorted(recommendations, key=lambda t: t[2], reverse=True)
    

In [14]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie configence i lift

def generate_advanced_candidates(basket: set[str], products: list[str], supports) -> list[tuple[str, set[str], float, float]]:
    # return [(item, subbasket, confidence, lift)]
    # raise NotImplementedError()

    base_recommendations = generate_basic_candidates(basket, products, supports)
    recommendations = {}



In [41]:
print(baskets[1])
generate_basic_candidates(baskets[1], products, supports)[:10]
# generate_advanced_candidates(baskets[1], products, supports)

{'yogurt', 'whole milk', 'semi-finished bread', 'sausage'}


[('other vegetables',
  {'semi-finished bread', 'whole milk', 'yogurt'},
  0.3333333333333333,
  2.729976281700419),
 ('rolls/buns',
  {'sausage', 'whole milk'},
  0.12686567164179108,
  1.153275239839684),
 ('domestic eggs',
  {'semi-finished bread', 'whole milk'},
  0.12,
  3.2352432432432425),
 ('rolls/buns',
  {'whole milk', 'yogurt'},
  0.11976047904191615,
  1.08868532679477),
 ('soda', {'sausage', 'whole milk'}, 0.11940298507462688, 1.2296124333596985),
 ('soda', {'sausage', 'yogurt'}, 0.11627906976744186, 1.1974423406264505),
 ('beverages',
  {'sausage', 'semi-finished bread'},
  0.11111111111111112,
  6.703853046594981),
 ('bottled beer',
  {'sausage', 'semi-finished bread'},
  0.11111111111111112,
  2.4521468371025894),
 ('chocolate',
  {'sausage', 'semi-finished bread'},
  0.11111111111111112,
  4.709789109222537),
 ('coffee',
  {'sausage', 'semi-finished bread'},
  0.11111111111111112,
  3.5149166079398637)]

In [42]:
print(baskets[33])
generate_basic_candidates(baskets[33], products, supports)
# generate_advanced_candidates(baskets[33], products, supports)

{'white wine', 'domestic eggs', 'soda', 'tropical fruit', 'root vegetables', 'photo/film', 'yogurt'}


[('abrasive cleaner',
  {'domestic eggs', 'root vegetables', 'soda', 'white wine'},
  1.0,
  680.1363636363636),
 ('artif. sweetener',
  {'domestic eggs', 'root vegetables', 'soda', 'white wine'},
  1.0,
  515.9655172413793),
 ('baby cosmetics',
  {'domestic eggs', 'root vegetables', 'soda', 'white wine'},
  1.0,
  4987.666666666666),
 ('bags',
  {'domestic eggs', 'root vegetables', 'soda', 'white wine'},
  1.0,
  3740.75),
 ('baking powder',
  {'domestic eggs', 'root vegetables', 'soda', 'white wine'},
  1.0,
  123.66115702479338),
 ('bathroom cleaner',
  {'domestic eggs', 'root vegetables', 'soda', 'white wine'},
  1.0,
  880.1764705882351),
 ('beef',
  {'domestic eggs', 'root vegetables', 'soda', 'white wine'},
  1.0,
  29.45472440944882),
 ('berries',
  {'domestic eggs', 'root vegetables', 'soda', 'white wine'},
  1.0,
  45.89877300613497),
 ('beverages',
  {'domestic eggs', 'root vegetables', 'soda', 'white wine'},
  1.0,
  60.33467741935483),
 ('bottled beer',
  {'domestic eggs',