In [None]:
import os, glob, ast
import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn.functional as F
from transformers import ViltProcessor, ViltModel
from sklearn.decomposition import PCA



In [None]:

# -----------------------------
# 0) Load item_feature (without the precomputed embeddings)
# -----------------------------
# Only the columns needed to build the text (plus item_id) are kept.
keep_cols = ["item_id", "item_title", "item_tags", "likes_level", "views_level"]

item_feat = pd.read_parquet(
    "/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/item_feature.parquet"
)

# Independent copy so later edits never write back into item_feat.
item_feat_small = item_feat.loc[:, keep_cols].copy()



In [None]:
# -----------------------------
# 1) Build the raw text from item_title + item_tags (+ optional likes/views)
# -----------------------------
def _is_missing_tag(value):
    """Return True when ``value`` is None or a float NaN (Python or NumPy float)."""
    return value is None or (
        isinstance(value, (float, np.floating)) and np.isnan(value)
    )


def _clean_tags(values):
    """Stringify and strip every entry, dropping missing and blank ones."""
    cleaned = []
    for value in values:
        if _is_missing_tag(value):
            # A None/NaN element must not become the literal tag "None"/"nan".
            continue
        tag = str(value).strip()
        if tag:
            cleaned.append(tag)
    return cleaned


def parse_tags(x):
    """Return a list of tag strings from a list/tuple/array, a string, or a scalar.

    Accepted inputs:
      * None or a float NaN            -> []
      * list / tuple / numpy array     -> one tag per non-missing, non-blank element
      * string that looks like a Python list/tuple literal ("['a', 'b']")
                                       -> parsed with ast.literal_eval
      * string containing "|" or ","   -> split on "|" first, otherwise on ","
      * any other non-empty string     -> a single tag
      * any other value                -> its stripped string form (if non-blank)

    Every returned tag is stripped and non-empty.
    """
    if _is_missing_tag(x):
        return []
    if isinstance(x, (list, tuple, np.ndarray)):
        return _clean_tags(x)
    if isinstance(x, str):
        s = x.strip()
        if not s:
            return []
        # Case: string that looks like a Python sequence literal, e.g. "['a','b']".
        if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
            try:
                obj = ast.literal_eval(s)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a valid literal: fall through to the separator heuristics.
                obj = None
            if isinstance(obj, (list, tuple)):
                return _clean_tags(obj)
        # Case: tags separated by "|" (checked first) or ",".
        for sep in ("|", ","):
            if sep in s:
                return _clean_tags(s.split(sep))
        # Otherwise the whole string is a single tag.
        return [s]
    # Any other scalar (e.g. an int): use its string form, unless it is blank.
    return _clean_tags([x])

def build_text(row, use_levels=True):
    """Assemble the text fed to the model for one item row.

    The result joins, with " | ", whichever of these segments apply:
      * the stripped ``item_title`` (when not null and non-empty),
      * "tags: a, b, ..." built from ``parse_tags(row["item_tags"])``,
      * "likes_level: ..." and "views_level: ..." when ``use_levels`` is true
        and the corresponding value is not null.

    Returns an empty string when no segment applies.
    """
    raw_title = row["item_title"]
    title = str(raw_title).strip() if pd.notna(raw_title) else ""
    tag_list = parse_tags(row["item_tags"])

    segments = [title] if title else []
    if tag_list:
        segments.append("tags: " + ", ".join(tag_list))

    # Optional: expose the popularity buckets as plain text.
    if use_levels:
        likes, views = row["likes_level"], row["views_level"]
        if pd.notna(likes):
            segments.append(f"likes_level: {likes}")
        if pd.notna(views):
            segments.append(f"views_level: {views}")

    # Joining an empty list already yields "".
    return " | ".join(segments)

# Item ids and their texts, both in the row order of item_feat_small.
item_ids = item_feat_small["item_id"].astype(int).tolist()
texts = item_feat_small.apply(build_text, axis=1, use_levels=True).tolist()



In [None]:
# -----------------------------
# 2) Locate the raw images (without img_emb_CLIPRN50)
# -----------------------------
IMAGES_DIR = "/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/item_images"  # adjust if needed

def find_image_path(item_id: int, images_dir=None):
    """Return the path of the image file for ``item_id``, or None if there is none.

    Args:
        item_id: item identifier; the file is expected to be named
            "<item_id>.<extension>".
        images_dir: directory to search. Defaults to the module-level
            ``IMAGES_DIR`` when omitted.

    Returns:
        The first existing path among the common extensions (jpg, png, jpeg,
        webp, in that priority order); otherwise the alphabetically first file
        matching "<item_id>.*"; otherwise None.
    """
    if images_dir is None:
        images_dir = IMAGES_DIR

    # Try the common extensions first, in priority order.
    for ext in ("jpg", "png", "jpeg", "webp"):
        p = os.path.join(images_dir, f"{item_id}.{ext}")
        if os.path.exists(p):
            return p

    # Fallback: any other extension. glob returns matches in arbitrary order,
    # so sort them to make the choice reproducible between runs. The fixed
    # part of the pattern is escaped so special characters in the directory
    # name are matched literally.
    pattern = glob.escape(os.path.join(images_dir, str(item_id))) + ".*"
    cand = sorted(glob.glob(pattern))
    return cand[0] if cand else None

# One entry per item id, in the same order; None where no image file was found.
img_paths = list(map(find_image_path, item_ids))



In [None]:
# -----------------------------
# 3) ViLT (raw image + raw text) -> fused embedding
# -----------------------------
model_id = "dandelin/vilt-b32-mlm"

# Use the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = ViltProcessor.from_pretrained(model_id)
model = ViltModel.from_pretrained(model_id)
# Move the weights to the target device and switch to evaluation mode.
model = model.to(device).eval()

@torch.no_grad()
def vilt_cls_batch(images_pil, texts_batch):
    """Encode a batch of (image, text) pairs and return L2-normalised vectors.

    Args:
        images_pil: sequence of images, one per item.
        texts_batch: sequence of strings, aligned with ``images_pil``.

    Returns:
        numpy array of shape (B, hidden): the hidden state at sequence
        position 0 of the model output, L2-normalised along the last axis.

    Uses the module-level ``processor``, ``model`` and ``device``. Runs under
    ``torch.no_grad()`` so no autograd graph is kept.
    """
    inputs = processor(
        images=images_pil,
        text=texts_batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        # Pin the truncation length to the model's text position-embedding
        # size instead of relying on whatever limit the tokenizer config
        # carries, so long title+tags texts can never exceed it.
        max_length=model.config.max_position_embeddings,
    ).to(device)
    out = model(**inputs)
    cls = out.last_hidden_state[:, 0, :]      # (B, hidden)
    cls = F.normalize(cls, p=2, dim=-1)
    return cls.cpu().numpy()

BATCH = 16  # raise if the GPU has room

# Black 224x224 stand-in for items without a usable image; built once and reused.
BLANK_IMAGE = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8)).convert("RGB")

unreadable_paths = []  # image files that exist but could not be decoded
vecs = []

for s in range(0, len(item_ids), BATCH):
    batch_texts = texts[s:s+BATCH]
    batch_paths = img_paths[s:s+BATCH]

    images = []
    for p in batch_paths:
        if p is None:
            images.append(BLANK_IMAGE)
            continue
        try:
            # The context manager releases the file handle right away;
            # convert() returns a new, fully decoded image that outlives it.
            with Image.open(p) as im:
                images.append(im.convert("RGB"))
        except OSError:
            # Corrupt or truncated file: keep the run alive, record the path,
            # and treat the item like one with no image.
            unreadable_paths.append(p)
            images.append(BLANK_IMAGE)

    vecs.append(vilt_cls_batch(images, batch_texts))

if unreadable_paths:
    print(f"{len(unreadable_paths)} unreadable image(s) replaced by a blank image, e.g. {unreadable_paths[:5]}")

X = np.vstack(vecs)  # (N, hidden)



In [None]:
# -----------------------------
# 4) PCA -> 128 dims + L2 normalisation
# -----------------------------
pca = PCA(n_components=128, random_state=42)

# Project, then cast to float32.
X128 = pca.fit_transform(X).astype(np.float32)

# Scale every row to unit length; the epsilon avoids dividing by zero.
X128 /= np.linalg.norm(X128, axis=1, keepdims=True) + 1e-12

print("fin")



In [None]:
import numpy as np
import pandas as pd

# --- paths ---
ITEM_INFO_PATH = "/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/MicroLens_1M_x1/item_info.parquet"
OUT_PATH = "./item_info_updated_emb.parquet"

# --- inputs computed in the earlier cells ---
# item_ids : list of item_id values, in the same order as the rows of X128
# X128     : numpy array (N, 128) float32
# e.g. item_ids = item_feat_small["item_id"].astype(int).tolist()

col_name = "item_emb_d128"   # exact name of the column to overwrite

# 1) read item_info
item_info = pd.read_parquet(ITEM_INFO_PATH)

# 2) make sure the target column exists
if col_name not in item_info.columns:
    raise ValueError(f"La colonne '{col_name}' n'existe pas dans item_info. Colonnes: {list(item_info.columns)}")

# 3) build the item_id -> embedding mapping
id_to_vec = {int(i): X128[k].astype(np.float32).tolist() for k, i in enumerate(item_ids)}

# 4) replace the column content row by row
item_info["item_id"] = item_info["item_id"].astype(int)
emb_dim = X128.shape[1]  # zero vectors follow the actual embedding width
# Record which rows have a computed embedding BEFORE filling the gaps;
# after the fill no cell is NaN, so isna() could never count them.
has_vec = item_info["item_id"].isin(list(id_to_vec))
item_info[col_name] = item_info["item_id"].map(lambda x: id_to_vec.get(x, [0.0] * emb_dim))  # zero vector if missing

# 5) report how many rows received the zero-vector placeholder
missing = int((~has_vec).sum())
print(f"Nombre d'embeddings manquants remplacés par 0 : {missing}")

# 6) save
item_info.to_parquet(OUT_PATH, index=False)
print("Saved:", OUT_PATH)
