# Modelo de Preferência do Usuário — Last.fm 360K

Este notebook implementa e avalia um **modelo de recomendação Top-N por usuário**, utilizando a base de dados **Last.fm 360K**, com foco em **interpretabilidade** e **avaliação quantitativa**.  
O modelo serve como componente de **preferência do usuário** que pode ser posteriormente combinado com modelos contextuais (ex.: Markov para next-track).

In [11]:
import sys 
print("Python exe:",sys.executable)

import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

Python exe: d:\dev\music-recsys\venv\Scripts\python.exe


In [12]:
# LOADER + PADRONIZADOR
def load_parquet(path):
    df = pd.read_parquet(path)
    # normaliza nomes
    df.columns = [c.strip() for c in df.columns]
    return df

def ensure_cols(df, rename_map):
    # renomeia somente se existir
    rename = {k: v for k, v in rename_map.items() if k in df.columns}
    return df.rename(columns=rename)

def force_str(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = df[c].astype(str)
    return df

In [13]:
# CARREGAR OS PARQUETS

df_events = load_parquet("data/als_events.parquet")
train_pairs = load_parquet("data/train_pairs_ids.parquet")
test_pairs  = load_parquet("data/test_pairs_ids.parquet")
top200 = load_parquet("data/global_rank_ids_top200.parquet")

# padronizar possíveis variações
df_events = ensure_cols(df_events, {"track_id": "item", "item_id": "item"})
train_pairs = ensure_cols(train_pairs, {"item": "item_id", "next_item": "next_item_id"})
test_pairs  = ensure_cols(test_pairs,  {"item": "item_id", "next_item": "next_item_id"})
top200 = ensure_cols(top200, {"item": "item_id"})

# garantir strings
df_events  = force_str(df_events,  ["user_id", "item"])
train_pairs = force_str(train_pairs, ["user_id", "item_id", "next_item_id"])
test_pairs  = force_str(test_pairs,  ["user_id", "item_id", "next_item_id"])
top200 = force_str(top200, ["item_id"])

print("events:", df_events.shape)
print("train_pairs:", train_pairs.shape, "| cols:", train_pairs.columns.tolist())
print("test_pairs:", test_pairs.shape,  "| cols:", test_pairs.columns.tolist())
print("top200:", top200.shape, "| cols:", top200.columns.tolist())


events: (16857878, 5)
train_pairs: (12820359, 5) | cols: ['user_id', 'item_id', 'next_item_id', 'session_id', 'timestamp']
test_pairs: (3163359, 5) | cols: ['user_id', 'item_id', 'next_item_id', 'session_id', 'timestamp']
top200: (200, 3) | cols: ['item_id', 'pop_count', 'rank']


In [14]:
# 1.IMPORTAR DADOS DO DF_SESS

df_events = pd.read_parquet("data/als_events.parquet")
print("df_events:", df_events.shape)
df_events.head()


df_events: (16857878, 5)


Unnamed: 0,user_id,item,timestamp,artist_name,track_name
0,user_000001,c4633ab1-e715-477f-8685-afa5f2058e42,2006-08-13 13:59:20+00:00,Plaid & Bob Jaroc,The Launching Of Big Face
1,user_000001,bc2765af-208c-44c5-b3b0-cf597a646660,2006-08-13 14:03:29+00:00,Plaid & Bob Jaroc,Zn Zero
2,user_000001,aa9c5a80-5cbe-42aa-a966-eb3cfa37d832,2006-08-13 14:10:43+00:00,Plaid & Bob Jaroc,The Return Of Super Barrio - End Credits
3,user_000001,d9b1c1da-7e47-4f97-a135-77260f2f559d,2006-08-13 14:17:40+00:00,Tommy Guerrero,Mission Flats
4,user_000001,120bb01c-03e4-465f-94a0-dce5e9fac711,2006-08-13 14:19:06+00:00,Artful Dodger,What You Gonna Do?


In [15]:
# CONSTRUIR UNIVERSO DE ITENS (MARKOV ATIVOS E GLOBAL TOP200)

markov_items = pd.Index(train_pairs["item_id"]).append(pd.Index(train_pairs["next_item_id"])).dropna().unique()
top_items = pd.Index(top200["item_id"]).dropna().unique()

als_items = pd.Index(markov_items).append(pd.Index(top_items)).dropna().unique()
als_item_set = set(als_items)

print("Itens Markov:", len(markov_items))
print("Itens Top200:", len(top_items))
print("Universo ALS:", len(als_item_set))

Itens Markov: 852555
Itens Top200: 200
Universo ALS: 852555


In [16]:
# FILTRAR EVENTOS + AGREGAR PLAYS

df_events_f = df_events[df_events["item"].isin(als_item_set)].copy()

user_item_plays = (
    df_events_f
    .groupby(["user_id", "item"], observed=True)
    .size()
    .rename("plays")
    .reset_index()
)

print("user_item_plays:", user_item_plays.shape)
print("users:", user_item_plays["user_id"].nunique(), "| items:", user_item_plays["item"].nunique())


user_item_plays: (3826972, 3)
users: 990 | items: 852555


In [17]:
# CONSTRUINDO USER_ITEM

user_cat = user_item_plays["user_id"].astype("category")
item_cat = user_item_plays["item"].astype("category")

user_item_plays["user_idx"] = user_cat.cat.codes.astype(np.int32)
user_item_plays["item_idx"] = item_cat.cat.codes.astype(np.int32)

n_users = user_cat.cat.categories.size
n_items = item_cat.cat.categories.size

rows = user_item_plays["user_idx"].to_numpy()
cols = user_item_plays["item_idx"].to_numpy()
data = user_item_plays["plays"].to_numpy(dtype=np.float32)

user_item = csr_matrix((data, (rows, cols)), shape=(n_users, n_items))
user_item.eliminate_zeros()

user_index_to_id = user_cat.cat.categories.to_numpy()
item_index_to_id = item_cat.cat.categories.to_numpy()

user_id_to_index = {uid: i for i, uid in enumerate(user_index_to_id)}
item_id_to_index = {iid: i for i, iid in enumerate(item_index_to_id)}

print("user_item:", user_item.shape, "| nnz:", user_item.nnz)


user_item: (990, 852555) | nnz: 3826972


In [18]:
# TREINO DO ALS
alpha = 15.0
conf_ui = (user_item * alpha).astype(np.float32)  # (users, items)

model_als = AlternatingLeastSquares(
    factors=64,
    regularization=0.02,
    iterations=15,
    num_threads=1
)

print("conf_ui:", conf_ui.shape, "esperado:", (n_users, n_items))
model_als.fit(conf_ui)

print("user_factors:", model_als.user_factors.shape, "esperado:", (n_users, 64))
print("item_factors:", model_als.item_factors.shape, "esperado:", (n_items, 64))

conf_ui: (990, 852555) esperado: (990, 852555)


  0%|          | 0/15 [00:00<?, ?it/s]

user_factors: (990, 64) esperado: (990, 64)
item_factors: (852555, 64) esperado: (852555, 64)


In [19]:
# SANITY CHECK

from scipy.sparse import csr_matrix

uid = 0
user_items_1 = user_item[uid]  # 1 x n_items (continua sparse)

rec_items, rec_scores = model_als.recommend(
    userid=uid,
    user_items=user_items_1,
    N=10,
    filter_already_liked_items=True,
    recalculate_user=True
)

print(rec_items[:5], rec_scores[:5])
print("item_id do 1º:", item_index_to_id[rec_items[0]])

[    85   4829 547201 555911 792349] [0.5614764  0.55090684 0.53572273 0.53054893 0.52103305]
item_id do 1º: 0006fc51-adb4-4417-b8fb-7b954b853923


In [20]:
# TESTE DE integridade dos pares + cobertura no universo ALS

# 1) checar colunas
print("cols test_pairs:", test_pairs.columns.tolist())
assert {"user_id","item_id","next_item_id"}.issubset(test_pairs.columns)

# 2) cobertura (quantos pares são avaliáveis)
pairs_valid = test_pairs[
    test_pairs["user_id"].isin(user_id_to_index) &
    test_pairs["next_item_id"].isin(item_id_to_index)
]

print("Total pares:", len(test_pairs))
print("Pares válidos (u e next no ALS):", len(pairs_valid))
print("Cobertura:", len(pairs_valid)/len(test_pairs) if len(test_pairs) else 0)
pairs_valid.head()


cols test_pairs: ['user_id', 'item_id', 'next_item_id', 'session_id', 'timestamp']
Total pares: 3163359
Pares válidos (u e next no ALS): 2923570
Cobertura: 0.9241979806907784


Unnamed: 0,user_id,item_id,next_item_id,session_id,timestamp
0,user_000001,f4fb4539-90b5-401e-8520-1a0dd4e41a32,df8c41f0-1c02-4060-81ca-035ef3085663,1004,2009-01-22 01:10:26+00:00
1,user_000001,df8c41f0-1c02-4060-81ca-035ef3085663,3c224468-a9c4-43d2-9b26-5973718a4f96,1004,2009-01-22 01:14:22+00:00
2,user_000001,3c224468-a9c4-43d2-9b26-5973718a4f96,8c88d32a-8bb1-4e3c-b30f-73107509c8c6,1004,2009-01-22 01:19:34+00:00
3,user_000001,8c88d32a-8bb1-4e3c-b30f-73107509c8c6,8efa69bc-8453-42c8-8949-2a0afd4c8c99,1004,2009-01-22 01:24:26+00:00
4,user_000001,8efa69bc-8453-42c8-8949-2a0afd4c8c99,07ae32f3-f0aa-4479-bb2d-bc6737a4df44,1004,2009-01-22 01:28:19+00:00


In [21]:
import numpy as np

rng = np.random.default_rng(42)

def als_affinity(uidx: int, item_idx: int) -> float:
    return float(model_als.user_factors[uidx] @ model_als.item_factors[item_idx])

# escolhe 10 usuários aleatórios
users_sample = rng.choice(n_users, size=min(10, n_users), replace=False)

for uidx in users_sample:
    # itens que o usuário consumiu (no agregado)
    user_row = user_item[uidx]
    seen_items = user_row.indices
    if len(seen_items) < 5:
        continue

    # pega 5 vistos e 5 aleatórios não vistos
    seen_pick = rng.choice(seen_items, size=5, replace=False)

    # amostra itens não vistos
    all_items = np.arange(n_items)
    mask = np.ones(n_items, dtype=bool)
    mask[seen_items] = False
    unseen_pool = all_items[mask]
    unseen_pick = rng.choice(unseen_pool, size=5, replace=False)

    seen_scores = [als_affinity(uidx, ii) for ii in seen_pick]
    unseen_scores = [als_affinity(uidx, ii) for ii in unseen_pick]

    print(f"uidx={uidx} | média score vistos={np.mean(seen_scores):.4f} | média score não-vistos={np.mean(unseen_scores):.4f}")


uidx=84 | média score vistos=0.7898 | média score não-vistos=-0.0465
uidx=760 | média score vistos=0.8308 | média score não-vistos=0.0011
uidx=87 | média score vistos=0.0454 | média score não-vistos=0.0014
uidx=643 | média score vistos=0.6578 | média score não-vistos=-0.0000
uidx=431 | média score vistos=0.6220 | média score não-vistos=0.0556
uidx=426 | média score vistos=0.5845 | média score não-vistos=-0.0494
uidx=688 | média score vistos=0.6864 | média score não-vistos=0.1713
uidx=93 | média score vistos=0.8141 | média score não-vistos=0.0077
uidx=199 | média score vistos=0.2608 | média score não-vistos=0.0112


In [22]:
# SALVAR ARTEFATOS DO ALS

import os
import numpy as np

os.makedirs("artifacts", exist_ok=True)

np.savez_compressed(
    "artifacts/als_pref_model_top200_markov.npz",
    user_factors=model_als.user_factors.astype(np.float32),
    item_factors=model_als.item_factors.astype(np.float32),
    user_index_to_id=user_index_to_id.astype(str),
    item_index_to_id=item_index_to_id.astype(str),
    alpha=np.array([alpha], dtype=np.float32),
)

print("✅ Salvo: artifacts/als_pref_model_top200_markov.npz")

✅ Salvo: artifacts/als_pref_model_top200_markov.npz


In [23]:
# FUNÇÃO DE SCORE PARA UTILIZAR NO HÍBRIDO

user_id_to_index = {uid: i for i, uid in enumerate(user_index_to_id)}
item_id_to_index = {iid: i for i, iid in enumerate(item_index_to_id)}

def als_score(user_id: str, item_id: str) -> float:
    uidx = user_id_to_index.get(user_id)
    iidx = item_id_to_index.get(item_id)
    if uidx is None or iidx is None:
        return float("-inf")  # para nunca escolher item desconhecido
    return float(model_als.user_factors[uidx] @ model_als.item_factors[iidx])

def rerank_markov_candidates(user_id: str, candidate_item_ids: list[str]) -> tuple[str, float]:
    # retorna (melhor_item, score)
    scored = [(iid, als_score(user_id, iid)) for iid in candidate_item_ids]
    best_item, best_score = max(scored, key=lambda x: x[1])
    return best_item, best_score
