### Podstawowe rzeczy

1. wprowadzamy sety
2. przypisujemy item_id do metadanych
3. dodajemy mape złożoną z tablic TF-IDF

In [None]:
import pandas as pd
import json 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

train_df = pd.read_csv("data/train.csv")
meta_df = pd.read_csv("data/item_metadata_filtered.csv")
with open("data/id_mappings.json") as f:
    id_map = json.load(f)

asin_to_id = {asin: int(item_id) for asin, item_id in id_map["item_mapping"].items()}
item_mapping_df = pd.DataFrame(list(asin_to_id.items()), columns=["parent_asin", "item_id"])
meta_df = pd.merge(meta_df, item_mapping_df, on="parent_asin", how="left")
train_df = train_df.merge(meta_df[["item_id", "main_category"]], on="item_id", how="left")


# Preprocess metadata
meta_df["title"] = meta_df["title"].fillna("")
meta_df["store"] = meta_df["store"].fillna("")
meta_df["description"] = meta_df["description"].fillna("")
meta_df["average_rating"] = meta_df["average_rating"].fillna("")
meta_df["price"] = meta_df["price"].fillna("")
meta_df["image_urls"] = meta_df["image_urls"].fillna("[]")
meta_df["main_category"] = meta_df["main_category"].fillna("")

# Index by ASIN for lookup
meta_df = meta_df.set_index("parent_asin")

# --- TF-IDF ---
tfidf_matrix_dictionary = {}
for category in meta_df["main_category"].unique():
    # Filter ASINs for current category
    category_df = meta_df[meta_df["main_category"] == category]
    
    texts = [
        row["title"] + " " + row["store"] + " " + row["description"]
        for _, row in category_df.iterrows()
    ]
    
    vectorizer = TfidfVectorizer(max_features=10000)
    tfidf_matrix_dictionary[category] = vectorizer.fit_transform(texts)
    
    print(f"Processed category: {category}, TF-IDF shape: {tfidf_matrix_dictionary[category].shape}")
tfidf_matrix_dictionary

Processed category: All Beauty, TF-IDF shape: (17952, 10000)
Processed category: Premium Beauty, TF-IDF shape: (111, 751)
Processed category: Health & Personal Care, TF-IDF shape: (10716, 10000)
Processed category: Appstore for Android, TF-IDF shape: (43463, 10000)
Processed category: Software, TF-IDF shape: (5471, 10000)
Processed category: , TF-IDF shape: (113, 4449)
Processed category: Gift Cards, TF-IDF shape: (4, 17)
Processed category: Computers, TF-IDF shape: (2, 56)
Processed category: Home Audio & Theater, TF-IDF shape: (1, 11)


{'All Beauty': <17952x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 456065 stored elements in Compressed Sparse Row format>,
 'Premium Beauty': <111x751 sparse matrix of type '<class 'numpy.float64'>'
 	with 1412 stored elements in Compressed Sparse Row format>,
 'Health & Personal Care': <10716x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 303313 stored elements in Compressed Sparse Row format>,
 'Appstore for Android': <43463x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 4076735 stored elements in Compressed Sparse Row format>,
 'Software': <5471x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 715216 stored elements in Compressed Sparse Row format>,
 '': <113x4449 sparse matrix of type '<class 'numpy.float64'>'
 	with 15271 stored elements in Compressed Sparse Row format>,
 'Gift Cards': <4x17 sparse matrix of type '<class 'numpy.float64'>'
 	with 60 stored elements in Compressed Sparse Row format>,
 'Computers': <2x56 sparse 

In [3]:
category_df = {}
for category in meta_df["main_category"].unique():
    category_df[category] = meta_df[meta_df["main_category"] == category]
    category_df[category].drop(columns="main_category", axis=1)
    
category_df["All Beauty"].head()
item_id_to_index_dictionary = {
    category: {
        item_id: idx for idx, item_id in enumerate(df["item_id"].values)
    }
    for category, df in category_df.items()
    }
index_to_item_id_dictionary = {
    category: {v: k for k, v in item_id_to_index_dictionary[category].items()}
    for category in item_id_to_index_dictionary
}


In [29]:

def build_user_profile(user_id):
    try:
        user_ratings = train_df[train_df["user_id"] == user_id]
        categories = user_ratings["main_category"].unique()

        profile_parts = []
        tfidf_vectors = None
        for category in categories:
            category_ratings = user_ratings[user_ratings["main_category"] == category]
            item_ids = category_ratings["item_id"].values
            ratings = category_ratings["rating"].values

            # Mapping: item_id -> index in TF-IDF matrix
            id_to_index = item_id_to_index_dictionary.get(category, {})
            indices = [id_to_index[item_id] for item_id in item_ids if item_id in id_to_index]
            filtered_ratings = [r for item_id, r in zip(item_ids, ratings) if item_id in id_to_index]

            if not indices:
                continue

            tfidf_vectors = tfidf_matrix_dictionary[category][indices]
            weighted = tfidf_vectors.multiply(np.array(filtered_ratings)[:, None])

            # Ensure weighted.mean(axis=0) is converted to a dense 1D array
            mean_vector = weighted.mean(axis=0)
            if hasattr(mean_vector, "toarray"):
                mean_vector = mean_vector.toarray().ravel()  # Convert sparse matrix to flat array
            else:
                mean_vector = np.asarray(mean_vector).ravel()

            profile_parts.append(mean_vector)

        if not profile_parts:
            return None  # or np.zeros(shape) if you prefer to return a default profile

        # Safely average dense 1D vectors
        profile = np.mean(profile_parts, axis=0)
        return profile, tfidf_vectors 
    except Exception as e:
        print(e)
        print("Wyszło dla user o id: ", user_id)


def recommend_for_user(user_id, top_k=10):
    user_items = train_df[train_df["user_id"] == user_id]["item_id"]
    categories = meta_df[meta_df["item_id"].isin(user_items)]["main_category"].unique()
    
    profile_parts = []
    all_vectors = []
    all_indices = []
    seen_items = set(train_df.loc[train_df["user_id"] == user_id, "item_id"])
    
    for category in categories:
        user_ratings = train_df[(train_df["user_id"] == user_id) & (train_df["main_category"] == category)]
        item_ids = user_ratings["item_id"].values
        ratings = user_ratings["rating"].values
        
        id_to_index = item_id_to_index_dictionary.get(category, {})
        indices = [id_to_index[item_id] for item_id in item_ids if item_id in id_to_index]
        filtered_ratings = [r for item_id, r in zip(item_ids, ratings) if item_id in id_to_index]
        
        if not indices:
            continue
        
        tfidf_vectors = tfidf_matrix_dictionary[category][indices]
        weighted = tfidf_vectors.multiply(np.array(filtered_ratings)[:, None])
        profile_parts.append(weighted.mean(axis=0))
    
    if not profile_parts:
        return ""
        
    try:
        profile = np.mean(profile_parts, axis=0)
    except Exception as e: 
        print(f"user id: {user_id}")
        raise Exception(e)
    
    # Try to find recommendations from user's categories first
    recommendations = []
    for category in categories:
        tfidf_matrix = tfidf_matrix_dictionary.get(category)
        index_to_item_id = index_to_item_id_dictionary.get(category)
        scores = cosine_similarity(profile, tfidf_matrix).ravel()
        
        ranked_indices = np.argsort(-scores)
        recs = [index_to_item_id[i] for i in ranked_indices if index_to_item_id[i] not in seen_items]
        recommendations.extend(recs)
        
        if len(recommendations) >= top_k:
            return ' '.join(map(str, recommendations[:top_k]))
    
    # Fallback: try other categories if needed
    other_categories = [cat for cat in tfidf_matrix_dictionary.keys() if cat not in categories]
    for category in other_categories:
        tfidf_matrix = tfidf_matrix_dictionary.get(category)
        index_to_item_id = index_to_item_id_dictionary.get(category)
        scores = cosine_similarity(profile, tfidf_matrix).ravel()

        ranked_indices = np.argsort(-scores)
        recs = [index_to_item_id[i] for i in ranked_indices if index_to_item_id[i] not in seen_items]
        recommendations.extend(recs)

        if len(recommendations) >= top_k:
            break

    return ' '.join(map(str, recommendations[:top_k]))


def solve_and_save(min, max, num):
    df_final = pd.DataFrame({
            "user_id": train_df["user_id"].unique()[min:max],
    })
        # df_final["predictions"] = df_final["user_id"].apply(recommend_for_user, args=(train_df["main_category"]))
    df_final["predictions"] = df_final["user_id"].apply(
            lambda user_id: recommend_for_user(user_id)
    )

    name = f"user_predictions{num}.csv"
    print(name, flush=True)
    df_final.to_csv(name, index=False)


## Dziwny problem z tym rekordem:

In [19]:
train_df[train_df["user_id"] == 639]

Unnamed: 0,user_id,item_id,rating,timestamp,main_category
1930,639,75774,3.0,1659731231155,Premium Beauty
1931,639,47417,5.0,1677127171442,All Beauty


In [30]:
solve_and_save(0, 20000, 20000)

user id: 639


  return array(a, dtype, copy=False, order=order, subok=True)


Exception: could not broadcast input array from shape (751) into shape (1)

### Tutaj poniżej kiedy będzie wszystko działać

In [None]:
from multiprocessing import Process
from tqdm import tqdm


processes = []
for i, min_idx in tqdm(enumerate(range(0, 20000, 20000))):
    max_idx = min(min_idx + 20000, 868218)
    p = Process(target=solve_and_save, args=(min_idx, max_idx, i))
    processes.append(p)
    print(f"Ukończono plik o numerze: {i}")
    p.start()


0it [00:00, ?it/s]

Ukończono plik o numerze: 0


1it [00:00, 73.08it/s]


  return array(a, dtype, copy=False, order=order, subok=True)
Process Process-2:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-8-2b7580ec1215>", line 135, in solve_and_save
    lambda user_id: recommend_for_user(user_id)
  File "/home/pete/Coding/Python/DSC_task/env/lib/python3.6/site-packages/pandas/core/series.py", line 4213, in apply
    mapped = lib.map_infer(values, f, convert=convert_dtype)
  File "pandas/_libs/lib.pyx", line 2403, in pandas._libs.lib.map_infer
  File "<ipython-input-8-2b7580ec1215>", line 135, in <lambda>
    lambda user_id: recommend_for_user(user_id)
  File "<ipython-input-8-2b7580ec1215>", line 95, in recommend_for_user
    profile = np.mean(profile_parts, axis=0)
  File "<__array_function__ internals>", line 6, in mean
  File "/home/pete