In [2]:
import pandas as pd
import numpy as np
import json
import random

In [3]:
train_df = pd.read_csv("../data/v1/train.csv")
test_df = pd.read_csv("../data/v1/test.csv")
meta_df = pd.read_csv("../data/item_metadata_filtered.csv")
with open("../data/id_mappings.json") as f:
    id_map = json.load(f)

In [4]:
# User id and asin to int id mapping
asin_to_id = {asin: int(item_id) for asin, item_id in id_map["item_mapping"].items()} # asin (str) -> item_id (int)
item_id_to_asin = {int(item_id): asin for item_id, asin in id_map["item_reverse_mapping"].items()} # item_id (int) -> asin (str)

# Create a lookup dictionary for true items from the test set
true_items_map = pd.Series(test_df.item_id.values, index=test_df.user_id).to_dict()

In [5]:
all_categories = sorted(meta_df["main_category"].dropna().unique().tolist())
category_options = ["Any"] + all_categories

In [6]:
# Preprocess metadata
meta_df["title"] = meta_df["title"].fillna("")
meta_df["store"] = meta_df["store"].fillna("")
meta_df["description"] = meta_df["description"].fillna("")
meta_df["average_rating"] = meta_df["average_rating"].fillna("")
meta_df["price"] = meta_df["price"].fillna("")
meta_df["image_urls"] = meta_df["image_urls"].fillna("[]")

# Index by ASIN for lookup
meta_df = meta_df.set_index("parent_asin")

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# --- TF-IDF ---
all_items = sorted(asin_to_id.keys())  # all ASINs
texts = [
    meta_df.loc[asin]["title"] + " " +
    meta_df.loc[asin]["store"] + " " +
    meta_df.loc[asin]["description"]
    if asin in meta_df.index else ""
    for asin in all_items
]

In [25]:
vectorizer = TfidfVectorizer(max_features=2_000)
tfidf_matrix = vectorizer.fit_transform(texts)

# Basic TD-IDF
with de-duplication of results

MAP@10 ~0.011 dla randomowego sampla 1k użytkowników

In [10]:
def build_user_profile(user_id):
    user_ratings = train_df[train_df["user_id"] == user_id]
    indices = user_ratings["item_id"].values
    ratings = user_ratings["rating"].values
    item_vectors = tfidf_matrix[indices]
    weighted = item_vectors.multiply(ratings[:, None])
    profile = weighted.mean(axis=0)
    return np.asarray(profile)

In [11]:
def recommend_for_user_v1(user_id, top_k=10):
    profile = build_user_profile(user_id)
    scores = cosine_similarity(profile, tfidf_matrix).flatten()
    # Remove previously rated items
    seen_items = set(train_df[train_df["user_id"] == user_id]["item_id"])
    recs = [i for i in scores.argsort()[::-1] if i not in seen_items]
    return recs[:top_k]

In [12]:
def average_precision_at_k(recommendations, true_item_id, k=10):
    if true_item_id in recommendations[:k]:
        # Find the rank (position) of the true item, adding 1 for 1-based ranking
        rank = recommendations[:k].index(true_item_id) + 1
        return 1.0 / rank
    return 0.0

In [13]:
def pick_random_user_simple(min_interactions, max_interactions):
    counts = train_df["user_id"].value_counts()
    if max_interactions <= 0:
        valid = counts[counts >= min_interactions].index.tolist()
    else:
        valid = counts[min_interactions <= counts <= max_interactions].index.tolist()
    return random.choice(valid), len(valid), len(counts) if valid else None

In [14]:
from tqdm import tqdm

In [15]:
def map_at_10(validation_df, recommend_func, k=10):
    ap_scores = []
    for _, row in tqdm(validation_df.iterrows(), total=len(validation_df), desc="Calculating MAP@10"):
        user_id = row["user_id"]
        true_item_id = row["item_id"]
        recommendations = recommend_func(user_id, top_k=k)
        ap = average_precision_at_k(recommendations, true_item_id, k)
        ap_scores.append(ap)
    return np.mean(ap_scores) if ap_scores else 0.0


In [16]:
def map_at_10_sampled(validation_df, recommend_func, k=10, sample_size=10_000, random_state=42):
    unique_users = validation_df["user_id"].unique()
    rng = np.random.default_rng(random_state)
    sampled_users = rng.choice(unique_users, size=min(sample_size, len(unique_users)), replace=False)
    sampled_df = validation_df[validation_df["user_id"].isin(sampled_users)]

    ap_scores = []
    for _, row in tqdm(sampled_df.iterrows(), total=len(sampled_df), desc="Calculating MAP@10 (sampled)"):
        user_id = row["user_id"]
        true_item_id = row["item_id"]
        recommendations = recommend_func(user_id, top_k=k)
        ap = average_precision_at_k(recommendations, true_item_id, k)
        ap_scores.append(ap)
    return np.mean(ap_scores) if ap_scores else 0.0

In [None]:
rec_v1 = map_at_10_sampled(test_df, recommend_for_user_v1, k=10, sample_size=1_000)

In [None]:
print(f"MAP@10 for TD-IDF v1: {rec_v1:.4f}")

# TD-IDF v2
with boosting most popular items from user's main category

MAP@10 ~0.023 dla randomowego sampla 1k użytkowników

In [19]:
# Precompute user -> seen items
user_seen_items = train_df.groupby("user_id")["item_id"].apply(set).to_dict()

In [21]:
# Precompute user -> main category
def precompute_user_main_category():
    user_main_cat = {}
    for user_id, items in tqdm(train_df.groupby("user_id")["item_id"], desc="User main category"):
        asins = [item_id_to_asin[i] for i in items if i in item_id_to_asin]
        categories = meta_df.loc[asins, "main_category"].dropna()
        if not categories.empty:
            user_main_cat[user_id] = categories.value_counts().idxmax()
    return user_main_cat
user_main_category = precompute_user_main_category()

User main category: 100%|██████████| 868218/868218 [05:17<00:00, 2737.01it/s]


In [22]:
# Precompute category -> popularity vector
def precompute_category_popularity():
    cat_pop = {}
    for cat in tqdm(meta_df["main_category"].dropna().unique(), desc="Category popularity"):
        items_in_cat = meta_df[meta_df["main_category"] == cat].index
        item_ids = [asin_to_id[a] for a in items_in_cat if a in asin_to_id]
        pop = train_df[train_df["item_id"].isin(item_ids)]["item_id"].value_counts()
        pop_vec = np.zeros(tfidf_matrix.shape[0])
        for item_id, count in pop.items():
            pop_vec[item_id] = count
        if pop_vec.max() > 0:
            pop_vec = pop_vec / pop_vec.max()
        cat_pop[cat] = pop_vec
    return cat_pop
category_popularity = precompute_category_popularity()

Category popularity: 100%|██████████| 8/8 [00:00<00:00, 30.42it/s]


In [33]:
# Fit once, outside the recommend function (e.g., after tfidf_matrix is created)
ann = NearestNeighbors(n_neighbors=top_k + 100, metric="cosine", n_jobs=-1)
ann.fit(tfidf_matrix)

def recommend_for_user_v2_fast(user_id, top_k=10, alpha=0.7):
    profile = build_user_profile(user_id)
    scores = cosine_similarity(profile, tfidf_matrix).flatten()
    seen_items = user_seen_items.get(user_id, set())
    main_cat = user_main_category.get(user_id, None)
    if main_cat is None or main_cat not in category_popularity:
        recs = [i for i in scores.argsort()[::-1] if i not in seen_items]
        return recs[:top_k]
    pop_scores = category_popularity[main_cat]
    if scores.max() > 0:
        scores = scores / scores.max()
    combined = alpha * scores + (1 - alpha) * pop_scores
    recs = [i for i in combined.argsort()[::-1] if i not in seen_items]
    return recs[:top_k]

In [34]:
rec_v2 = map_at_10_sampled(test_df, recommend_for_user_v2_fast, k=10, sample_size=1_000)

Calculating MAP@10 (sampled): 100%|██████████| 1000/1000 [00:40<00:00, 24.81it/s]


In [27]:
print(f"MAP@10 for TD-IDF v2: {rec_v2:.4f}")

MAP@10 for TD-IDF v2: 0.0239


In [31]:
len(train_df["user_id"].unique()), len(test_df["user_id"].unique())

(868218, 868218)

In [79]:
from joblib import Parallel, delayed

In [80]:
def ap_for_user(user_id, true_item_id, recommend_func, k):
    recommendations = recommend_func(user_id, top_k=k)
    return average_precision_at_k(recommendations, true_item_id, k)

def map_at_10_sampled_parallel(validation_df, recommend_func, k=10, sample_size=10_000, random_state=42, n_jobs=-1):
    unique_users = validation_df["user_id"].unique()
    rng = np.random.default_rng(random_state)
    sampled_users = rng.choice(unique_users, size=min(sample_size, len(unique_users)), replace=False)
    sampled_df = validation_df[validation_df["user_id"].isin(sampled_users)]

    iterator = (row for _, row in sampled_df.iterrows())
    results = Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(ap_for_user)(row["user_id"], row["item_id"], recommend_func, k)
        for row in tqdm(iterator, total=len(sampled_df), desc="Parallel MAP@10")
    )
    return np.mean(results) if results else 0.0

In [None]:
rec_v2_par = map_at_10_sampled_parallel(test_df, recommend_for_user_v2_fast, k=10, sample_size=10_000)

In [68]:
print(f"MAP@10 for TD-IDF v2 parallel: {rec_v2_par:.4f}")

MAP@10 for TD-IDF v2 parallel: 0.0229


---

In [39]:
def recommend_for_user_v2_fast_half(user_id, top_k=10, alpha=0.7):
    profile = build_user_profile(user_id)
    scores = cosine_similarity(profile, tfidf_matrix).flatten()
    seen_items = user_seen_items.get(user_id, set())
    main_cat = user_main_category.get(user_id, None)
    if main_cat is None or main_cat not in category_popularity:
        recs = [i for i in scores.argsort()[::-1] if i not in seen_items]
        return recs[:top_k]
    pop_scores = category_popularity[main_cat]
    if scores.max() > 0:
        scores = scores / scores.max()
    combined = alpha * scores + (1 - alpha) * pop_scores
    recs = [i for i in combined.argsort()[::-1] if i not in seen_items]
    return recs[:top_k]

In [64]:
import os

def recommend_for_user_v2_fast_half_v2(user_id, top_k=10, alpha=0.7):
    profile = build_user_profile(user_id)
    scores = cosine_similarity(profile, tfidf_matrix).flatten()
    seen_items = user_seen_items.get(user_id, set())
    main_cat = user_main_category.get(user_id, None)
    if main_cat is None or main_cat not in category_popularity:
        recs = [i for i in scores.argsort()[::-1] if i not in seen_items]
        return recs[:top_k]
    pop_scores = category_popularity[main_cat]
    if scores.max() > 0:
        scores = scores / scores.max()
    combined = alpha * scores + (1 - alpha) * pop_scores
    recs = [i for i in combined.argsort()[::-1] if i not in seen_items]
    u = {"user_id": user_id, "items_id": ' '.join(recs[:top_k]) }
    # u_df = pd.DataFrame(u)
    # u_df.to_csv(f"user_recommendation{user_id}.csv")
    u_df = pd.DataFrame([u])
    file_path = os.path.abspath(f"C:\\Users\\Thyrmite\\Documents\\dev\\Shopping_Recomendation\\nimus\\data\\user_recommendation{user_id}.csv")
    u_df.to_csv(file_path, index=False)
    print(f"Saved to: {file_path}")

    print(f"Ukończono plik o numerze: {user_id}")
    return u

In [54]:
from multiprocessing import Process

In [65]:
processes = []
for i, user_id in tqdm(enumerate(range(0, 1000))):
    p = Process(target=recommend_for_user_v2_fast_half_v2, args=(user_id, ))
    processes.append(p)
    # print(f"Ukończono plik o numerze: {i}")
    p.start()

1000it [00:04, 210.57it/s]


In [67]:
print(os.getcwd())
print([f for f in os.listdir() if f.startswith("user_recommendation") and f.endswith(".csv")])

C:\Users\Thyrmite\Documents\dev\Shopping_Recomendation\nimus
[]
