In [11]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse.linalg import inv
import json
from pathlib import Path

DATA_DIR = Path("/data/ephemeral/home/Seung/data/train")
TRAIN_PATH = DATA_DIR / "train_ratings.csv"

GENRES_PATH    = DATA_DIR / "genres.tsv"
DIRECTORS_PATH = DATA_DIR / "directors.tsv"
WRITERS_PATH   = DATA_DIR / "writers.tsv"
YEARS_PATH     = DATA_DIR / "years.tsv"
TITLES_PATH    = DATA_DIR / "titles.tsv"

ITEM2ATTR_PATH = DATA_DIR / "Ml_item2attributes.json"
OUT_DIR = Path("./preprocess_out")
OUT_DIR.mkdir(parents=True, exist_ok=True)

SEED = 42
rng = np.random.default_rng(SEED)


In [12]:
train = pd.read_csv(TRAIN_PATH)
print(train.shape)
print(train.columns.tolist())
display(train.head())

n_rows = len(train)
n_users = train["user"].nunique()
n_items = train["item"].nunique()
dup_ui = train.duplicated(["user","item"]).sum()

print(f"rows={n_rows:,} users={n_users:,} items={n_items:,} dup(user,item)={dup_ui:,}")
print(f"time range: {train['time'].min()} ~ {train['time'].max()}")

uc = train.groupby("user").size()
ic = train.groupby("item").size()
print("\n[user interactions]")
print(uc.describe(percentiles=[.01,.05,.1,.25,.5,.75,.9,.95,.99]))
print("\n[item interactions]")
print(ic.describe(percentiles=[.01,.05,.1,.25,.5,.75,.9,.95,.99]))


(5154471, 3)
['user', 'item', 'time']


Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563


rows=5,154,471 users=31,360 items=6,807 dup(user,item)=0
time range: 1113220585 ~ 1427781052

[user interactions]
count    31360.000000
mean       164.364509
std        150.009107
min         16.000000
1%          46.000000
5%          49.000000
10%         54.000000
25%         71.000000
50%        114.000000
75%        200.000000
90%        335.000000
95%        450.050000
99%        758.000000
max       2912.000000
dtype: float64

[item interactions]
count     6807.000000
mean       757.230939
std       1682.973090
min         27.000000
1%          45.000000
5%          52.000000
10%         59.000000
25%         90.000000
50%        197.000000
75%        610.500000
90%       1852.800000
95%       3312.100000
99%       9285.040000
max      19699.000000
dtype: float64


In [15]:
train_sorted = train.sort_values(["user","time"]).reset_index(drop=True)

# 유저별 index 모으기
user_groups = train_sorted.groupby("user").indices

holdout_mask = np.zeros(len(train_sorted), dtype=bool)

for u, idxs in user_groups.items():
    idxs = np.array(idxs)
    m = len(idxs)

    # 최소 학습 상호작용 보장 (EASE는 co-occurrence가 생겨야 해서 너무 적으면 망함)
    min_train = 10
    if m <= min_train + 1:
        continue

    # "중간" 위주로 뽑기 위해, 양끝 일부를 제외하고 샘플링
    # (너 데이터에서 user min inter=16이라 이게 꽤 잘 먹힘)
    left = int(m * 0.10)
    right = int(m * 0.90)
    cand = idxs[left:right] if (right - left) >= 3 else idxs[1:-1]

    k = max(1, int(round(m * 0.01)))  # 1% 홀드아웃(감각적으로 public/private 100:1 근처)
    k = min(k, max(1, len(cand)//3))  # 과도 홀드아웃 방지

    chosen = rng.choice(cand, size=k, replace=False)
    holdout_mask[chosen] = True

valid = train_sorted[holdout_mask].copy()
trn   = train_sorted[~holdout_mask].copy()

print(f"train: {len(trn):,}  valid: {len(valid):,}  ratio(valid)={len(valid)/len(train_sorted):.4f}")
print(f"valid per-user avg = {valid.groupby('user').size().mean():.3f}")


train: 5,099,242  valid: 55,229  ratio(valid)=0.0107
valid per-user avg = 1.761


In [16]:
user2idx = {u:i for i,u in enumerate(sorted(trn["user"].unique()))}
item2idx = {it:i for i,it in enumerate(sorted(trn["item"].unique()))}

trn["u"] = trn["user"].map(user2idx).astype(np.int32)
trn["i"] = trn["item"].map(item2idx).astype(np.int32)

valid = valid[valid["user"].isin(user2idx) & valid["item"].isin(item2idx)].copy()
valid["u"] = valid["user"].map(user2idx).astype(np.int32)
valid["i"] = valid["item"].map(item2idx).astype(np.int32)

U = len(user2idx)
I = len(item2idx)
print(f"Encoded: U={U:,}, I={I:,}")
print("valid kept:", len(valid))


Encoded: U=31,360, I=6,807
valid kept: 55229


In [17]:
# 1) 기본 binary
rows = trn["u"].to_numpy()
cols = trn["i"].to_numpy()
data = np.ones(len(trn), dtype=np.float32)

X = sparse.csr_matrix((data, (rows, cols)), shape=(U, I))

print("X nnz:", X.nnz, "shape:", X.shape)


X nnz: 5099242 shape: (31360, 6807)


In [18]:
# 1) 기본 binary
rows = trn["u"].to_numpy()
cols = trn["i"].to_numpy()
data = np.ones(len(trn), dtype=np.float32)

X = sparse.csr_matrix((data, (rows, cols)), shape=(U, I))

print("X nnz:", X.nnz, "shape:", X.shape)

def bm25_weight(X, K1=1.2, B=0.75):
    # X: user-item (csr), binary or counts
    X = X.tocsr().astype(np.float32)
    N = X.shape[0]

    # item DF
    df = np.diff(X.tocsc().indptr)  # per item nonzero
    idf = np.log((N - df + 0.5) / (df + 0.5))
    idf = np.maximum(idf, 0).astype(np.float32)

    # user length
    row_sums = np.array(X.sum(axis=1)).ravel().astype(np.float32)
    avg_len = row_sums.mean()

    # BM25 transform on nonzeros
    X = X.tocoo()
    denom = X.data + K1 * (1 - B + B * (row_sums[X.row] / (avg_len + 1e-8)))
    new_data = X.data * (K1 + 1) / (denom + 1e-8)
    new_data = new_data * idf[X.col]

    return sparse.csr_matrix((new_data.astype(np.float32), (X.row, X.col)), shape=X.shape)

X_bm25 = bm25_weight(X, K1=1.2, B=0.75)
print("BM25 nnz:", X_bm25.nnz, "data range:", float(X_bm25.data.min()), float(X_bm25.data.max()))



X nnz: 5099242 shape: (31360, 6807)
BM25 nnz: 5099242 data range: 0.0 10.327887535095215


In [19]:
def fit_ease(X, lam=500.0):
    # X: (U,I) csr
    X = X.tocsr().astype(np.float32)

    G = (X.T @ X).toarray().astype(np.float32)  # (I,I)
    diag = np.diag_indices_from(G)
    G[diag] += lam

    P = np.linalg.inv(G)  # (I,I)
    B = -P / np.diag(P)
    B[diag] = 0.0
    return B  # dense (I,I)

lam = 500.0
B = fit_ease(X_bm25, lam=lam)
print("B shape:", B.shape, "B finite:", np.isfinite(B).all())


B shape: (6807, 6807) B finite: True


In [23]:
def predict_topk(X_train, B, k=10):
    X_train = X_train.tocsr()

    # (U, I) score
    scores = X_train @ B
    scores = np.asarray(scores, dtype=np.float32)

    # seen item 제거
    for u in range(X_train.shape[0]):
        scores[u, X_train[u].indices] = -1e9

    # 1️⃣ 후보 k개 뽑기 (정렬 X)
    topk = np.argpartition(-scores, kth=k-1, axis=1)[:, :k]   # (U, k)

    # 2️⃣ 후보 내부 점수
    topk_scores = scores[np.arange(scores.shape[0])[:, None], topk]  # (U, k)

    # 3️⃣ 후보 내부 정렬
    order = np.argsort(-topk_scores, axis=1)                  # (U, k)
    topk = topk[np.arange(topk.shape[0])[:, None], order]     # (U, k)

    return topk.astype(np.int32)



def normalized_recall_at_10(topk, valid_df, k=10):
    gt = valid_df.groupby("u")["i"].apply(set).to_dict()
    recalls = []
    for u in range(topk.shape[0]):
        if u not in gt:
            continue
        pred = topk[u, :k]
        hit = sum((i in gt[u]) for i in pred)
        denom = min(k, len(gt[u]))
        recalls.append(hit / denom if denom > 0 else 0.0)
    return float(np.mean(recalls))

topk = predict_topk(X_bm25, B, k=10)
print("topk shape:", topk.shape, "dtype:", topk.dtype)
print("sample element type:", type(topk[0,0]))

score = normalized_recall_at_10(topk, valid, k=10)
print(f"normalized Recall@10 = {score:.6f}")


topk shape: (31360, 10) dtype: int32
sample element type: <class 'numpy.int32'>
normalized Recall@10 = 0.112978


In [None]:
item_pop = np.array(X.sum(axis=0)).ravel().astype(np.float32) + 1.0
pop_penalty = np.log(item_pop)  # 완만하게

def predict_topk_with_pop_penalty(X_train, B, k=10, alpha=0.15):
    X_train = X_train.tocsr()

    # score 계산
    scores = np.asarray(X_train @ B, dtype=np.float32)

    # popularity penalty
    scores -= alpha * pop_penalty[None, :]

    # seen 제거
    for u in range(X_train.shape[0]):
        scores[u, X_train[u].indices] = -1e9

    # 1️⃣ 후보 k개 추출
    topk = np.argpartition(-scores, kth=k-1, axis=1)[:, :k]  # (U, k)

    # 2️⃣ 후보 점수
    topk_scores = scores[np.arange(scores.shape[0])[:, None], topk]  # (U, k)

    # 3️⃣ 후보 내부 정렬
    order = np.argsort(-topk_scores, axis=1)
    topk = topk[np.arange(topk.shape[0])[:, None], order]

    return topk.astype(np.int32)


topk2 = predict_topk_with_pop_penalty(X_bm25, B, k=10, alpha=0.15)
score2 = normalized_recall_at_10(topk2, valid, k=10)
print(f"normalized Recall@10 (pop-penalty) = {score2:.6f}")


TypeError: unhashable type: 'numpy.ndarray'

In [None]:
with open(ITEM2ATTR_PATH, "r") as f:
    item2attrs = json.load(f)

# item2attrs 키가 문자열이라 int로 캐스팅
# 현재 item2idx는 "학습에 존재하는 item id"만 포함
all_attr = sorted({a for v in item2attrs.values() for a in v})
attr2idx = {a:i for i,a in enumerate(all_attr)}
A_cols = []
A_rows = []
A_data = []

for item_id, attrs in item2attrs.items():
    item_id = int(item_id)
    if item_id not in item2idx:
        continue
    i = item2idx[item_id]
    for a in attrs:
        A_rows.append(i)
        A_cols.append(attr2idx[a])
        A_data.append(1.0)

A = sparse.csr_matrix((A_data, (A_rows, A_cols)), shape=(I, len(attr2idx)), dtype=np.float32)
print("A shape:", A.shape, "nnz:", A.nnz)

beta = 0.5  # 속성 비중 (전처리 하이퍼)
X_user_attr = X @ A  # (U, attr)
X_ext = sparse.hstack([X, np.sqrt(beta) * X_user_attr], format="csr")
print("X_ext:", X_ext.shape, "nnz:", X_ext.nnz)


In [None]:
# EASE 학습용(마스킹 반영된) train 저장
trn_out = trn[["user","item","time"]].copy()
val_out = valid[["user","item","time"]].copy()

trn_out.to_csv(OUT_DIR / "train_for_ease.csv", index=False)
val_out.to_csv(OUT_DIR / "valid_for_ease.csv", index=False)

print("saved:", OUT_DIR / "train_for_ease.csv")
print("saved:", OUT_DIR / "valid_for_ease.csv")
