In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# 1) Veri yükleme (ml-100k split)
train = pd.read_csv("ml-100k/u1.base", delimiter="\t", header=None, names=["user","item","rating","ts"])
test  = pd.read_csv("ml-100k/u1.test",  delimiter="\t", header=None, names=["user","item","rating","ts"])

In [5]:
nb_users = max(train.user.max(), test.user.max())
nb_items = max(train.item.max(), test.item.max())

print(f"Kullanıcı sayısı: {nb_users} , film sayısı: {nb_items}")


Kullanıcı sayısı: 943 , film sayısı: 1682


In [7]:
def to_user_item_matrix(df, nb_users, nb_items):
    X = np.zeros((nb_users, nb_items), dtype=np.float32)
    for u, it, r, _ in df.values:
        X[u-1, it-1] = r
    return X

train_mat = to_user_item_matrix(train, nb_users, nb_items)
test_mat  = to_user_item_matrix(test,  nb_users, nb_items)

print(f"Kullanıcı matrisi: {train_mat}-{train_mat.shape} , film matrisi: {test_mat}-{test_mat.shape}")

Kullanıcı matrisi: [[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]-(943, 1682) , film matrisi: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]-(943, 1682)


In [8]:
# 2) Binarize + mask
#  rating == 0 -> bilinmiyor (-1); 1-2 -> 0; 3-5 -> 1
def binarize(x):
    y = x.copy()
    y[y == 0]  = -1.0
    y[(y==1) | (y==2)] = 0.0
    y[y >= 3]  = 1.0
    return y

train_bin = binarize(train_mat)
test_bin  = binarize(test_mat)

train_t = torch.tensor(train_bin, device=device)
test_t  = torch.tensor(test_bin,  device=device)

In [9]:
# 3) RBM tanımı (Bernoulli-Bernoulli)
class RBM(nn.Module):
    def __init__(self, nv, nh):
        super().__init__()
        # W: nh x nv (PyTorch param değil; manuel güncelleyeceğiz)
        self.W = torch.randn(nh, nv, device=device) * 0.01
        self.hbias = torch.zeros(1, nh, device=device)
        self.vbias = torch.zeros(1, nv, device=device)

    def sample_h(self, v):
        # p(h=1|v) = sigmoid(v W^T + hbias)
        prob = torch.sigmoid(v @ self.W.t() + self.hbias)
        return prob, torch.bernoulli(prob)

    def sample_v(self, h):
        # p(v=1|h) = sigmoid(h W + vbias)
        prob = torch.sigmoid(h @ self.W + self.vbias)
        return prob, torch.bernoulli(prob)

    @torch.no_grad()
    def update(self, v0, vk, ph0, phk, lr=0.01, momentum=0.5, weight_decay=1e-4,
               vW=None, vh=None, vv=None):
        # momentum buffer'ları
        if vW is None:
            vW = torch.zeros_like(self.W)
            vh = torch.zeros_like(self.hbias)
            vv = torch.zeros_like(self.vbias)

        dW = (ph0.t() @ v0 - phk.t() @ vk) / v0.size(0)  # ortalama
        db = torch.sum(v0 - vk, dim=0, keepdim=True) / v0.size(0)
        da = torch.sum(ph0 - phk, dim=0, keepdim=True) / v0.size(0)

        # weight decay (L2 benzeri)
        dW -= weight_decay * self.W

        # momentum güncellemesi
        vW = momentum * vW + lr * dW
        vv = momentum * vv + lr * db
        vh = momentum * vh + lr * da

        self.W     += vW
        self.vbias += vv
        self.hbias += vh
        return vW, vh, vv

nv, nh = nb_items, 128
rbm = RBM(nv, nh)

In [10]:
# 4) Mini-batch eğitim (CD-k)
batch_size = 128
epochs = 15
k = 5          # CD-k adımı
lr = 0.05
momentum = 0.5
weight_decay = 1e-4

dataset = TensorDataset(train_t)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

vW = vh = vv = None
for epoch in range(1, epochs+1):
    running = 0.0
    cnt = 0
    for (v0,) in loader:
        # v0: (B, nv)
        vk = v0.clone()
        # mask: bilinmeyenler (-1) bu pozisyonları koru
        mask = (v0 >= 0).float()

        ph0, _ = rbm.sample_h(v0)
        for _ in range(k):
            _, hk = rbm.sample_h(vk)
            pvk, vk_sample = rbm.sample_v(hk)
            # bilinmeyen pozisyonları (mask==0) v0’dan taşımayız; sadece bilinenleri güncelliyoruz
            vk = v0 * (1 - mask) + vk_sample * mask

        phk, _ = rbm.sample_h(vk)

        vW, vh, vv = rbm.update(v0, vk, ph0, phk, lr, momentum, weight_decay, vW, vh, vv)

        # rekonstrüksiyon hatası: sadece bilinenlerde
        loss = torch.mean(torch.abs(v0[mask.bool()] - vk[mask.bool()]))
        running += loss.item()
        cnt += 1
    print(f"Epoch {epoch:02d} | train recon loss: {running/cnt:.4f}")

Epoch 01 | train recon loss: 0.4340
Epoch 02 | train recon loss: 0.3196
Epoch 03 | train recon loss: 0.2990
Epoch 04 | train recon loss: 0.2869
Epoch 05 | train recon loss: 0.2815
Epoch 06 | train recon loss: 0.2765
Epoch 07 | train recon loss: 0.2772
Epoch 08 | train recon loss: 0.2713
Epoch 09 | train recon loss: 0.2711
Epoch 10 | train recon loss: 0.2686
Epoch 11 | train recon loss: 0.2676
Epoch 12 | train recon loss: 0.2651
Epoch 13 | train recon loss: 0.2637
Epoch 14 | train recon loss: 0.2645
Epoch 15 | train recon loss: 0.2617


In [11]:
# 5) Test rekonstrüksiyon hatası
with torch.no_grad():
    v = train_t  # görünür başlangıç olarak train’i kullan (standart yaklaşım)
    ph, _ = rbm.sample_h(v)
    pv, _ = rbm.sample_v(ph)
    mask_te = (test_t >= 0).float()
    test_loss = torch.mean(torch.abs(test_t[mask_te.bool()] - pv[mask_te.bool()]))
    print("Test recon loss:", float(test_loss))

Test recon loss: 0.2696119248867035


In [12]:
def topk_metrics(train_bin, test_bin, proba, K=10):
    # train'de zaten 1 olanları öneriden hariç tutmak istersen maskelersin.
    P = []
    R = []
    for u in range(train_bin.shape[0]):
        # testteki pozitifler
        pos = np.where(test_bin[u] == 1.0)[0]
        if len(pos) == 0:
            continue
        scores = proba[u].copy()

        # (Opsiyonel) zaten train'de 1 olanları öneriden çıkar
        already_pos = np.where(train_bin[u] == 1.0)[0]
        scores[already_pos] = -1e9

        topk = np.argsort(scores)[-K:][::-1]
        hit = len(set(topk) & set(pos))
        P.append(hit / K)
        R.append(hit / len(pos))
    return float(np.mean(P)), float(np.mean(R))

with torch.no_grad():
    ph, _ = rbm.sample_h(train_t)
    pv, _ = rbm.sample_v(ph)      # öneri olasılıkları ~ p(v=1|h)
    proba = pv.detach().cpu().numpy()
    P10, R10 = topk_metrics(train_bin, test_bin, proba, K=10)
    print(f"Precision@10={P10:.4f} | Recall@10={R10:.4f}")


Precision@10=0.1009 | Recall@10=0.0270


In [18]:
# 1) Film başlıklarını oku (ml-1m için)
# movies.dat biçimi: MovieID::Title::Genres
movies = pd.read_csv("ml-1m/movies.dat", sep="::", header=None, engine="python", encoding="latin-1")
movies.columns = ["movie_id", "title", "genres"]

# 2) Yardımcı: kullanıcı vektörünü al (binarize edilmiş train_bin'den)
#    train_bin: (nb_users, nb_items) numpy float32; 1=pozitif, 0=negatif, -1=bilinmiyor
def get_user_vector(user_id_1_based, train_bin):
    # RBM kodunda kullanıcı indeksleri 0-bazlı; MovieLens ise 1-bazlı idi
    return train_bin[user_id_1_based - 1].copy()

# 3) Modelden olasılık skorlarını üret (p(v=1|h))
@torch.no_grad()
def rbm_predict_proba_for_user(rbm, user_vec_np):
    # user_vec_np: shape (nb_items,), değerler {-1,0,1}
    v = torch.tensor(user_vec_np, dtype=torch.float32, device=rbm.W.device).unsqueeze(0)  # (1, nv)
    ph, _ = rbm.sample_h(v)
    pv, _ = rbm.sample_v(ph)   # (1, nv)
    return pv.squeeze(0).cpu().numpy()  # (nv,)

# 4) Tek kullanıcı için top-K öneri
def recommend_topk_for_user(user_id_1b, K, rbm, train_bin, movies_df):
    user_vec = get_user_vector(user_id_1b, train_bin)
    proba = rbm_predict_proba_for_user(rbm, user_vec)

    # Zaten kullanıcının pozitif verdiği filmleri öneriden çıkar (yeni öneri)
    already_pos = np.where(user_vec == 1.0)[0]
    proba[already_pos] = -1e9

    # (Opsiyonel) açıkça 0 verdiği (beğenmediği) filmleri de çıkarabilirsin:
    # already_neg = np.where(user_vec == 0.0)[0]
    # proba[already_neg] = -1e9

    topk_idx = np.argsort(proba)[-K:][::-1]
    # MovieID’ler 1-bazlı olduğu için +1
    rec_movie_ids = (topk_idx + 1)

    # Başlıkları eşleştir
    rec = movies_df[movies_df["movie_id"].isin(rec_movie_ids)][["movie_id","title","genres"]]
    # Sıralamayı proba’ya göre yapalım
    order = {mid: proba[mid-1] for mid in rec_movie_ids}
    rec = rec.sort_values(by="movie_id", key=lambda s: s.map(order), ascending=False)
    rec["score"] = rec["movie_id"].map(order)
    return rec.reset_index(drop=True)

In [65]:
# --- Örnek kullanım:
user_id = 10
K = 10
recs = recommend_topk_for_user(user_id, K, rbm, train_bin, movies)
print(recs)

   movie_id                                    title  \
0       483                  King of the Hill (1993)   
1       169  Free Willy 2: The Adventure Home (1995)   
2       498                         Mr. Jones (1993)   
3       315                   Specialist, The (1994)   
4       408                         8 Seconds (1994)   
5       603                     Bye Bye, Love (1995)   
6        64                     Two if by Sea (1996)   
7       316                          Stargate (1994)   
8       318         Shawshank Redemption, The (1994)   
9       191               Scarlet Letter, The (1995)   

                       genres     score  
0                       Drama  0.987347  
1  Adventure|Children's|Drama  0.983292  
2               Drama|Romance  0.981222  
3                      Action  0.979614  
4                       Drama  0.979212  
5                      Comedy  0.978529  
6              Comedy|Romance  0.978144  
7     Action|Adventure|Sci-Fi  0.977154  
8    