In [6]:
import os, random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, callbacks

SEED = 42
np.random.seed(SEED); random.seed(SEED); tf.random.set_seed(SEED)
DATA_DIR = "ml-100k"

def load_movielens_100k(data_dir):
    base_path = os.path.join(data_dir, "u1.base")
    test_path = os.path.join(data_dir, "u1.test")
    data_path = os.path.join(data_dir, "u.data")

    if os.path.exists(base_path) and os.path.exists(test_path):
        train_df = pd.read_csv(base_path, sep="\t", header=None,
                               names=["user","item","rating","ts"])
        test_df  = pd.read_csv(test_path,  sep="\t", header=None,
                               names=["user","item","rating","ts"])
    elif os.path.exists(data_path):
        df = pd.read_csv(data_path, sep="\t", header=None,
                         names=["user","item","rating","ts"])
        train_rows, test_rows = [], []
        for uid, grp in df.groupby("user"):
            grp = grp.sample(frac=1.0, random_state=SEED)
            n_test = max(1, int(0.2*len(grp)))
            test_rows.append(grp.iloc[:n_test])
            train_rows.append(grp.iloc[n_test:])
        train_df = pd.concat(train_rows).reset_index(drop=True)
        test_df  = pd.concat(test_rows).reset_index(drop=True)
    else:
        raise FileNotFoundError("MovieLens 100k dosyaları bulunamadı.")

    # tipleri garantiye al
    for col in ["user","item","rating"]:
        train_df[col] = pd.to_numeric(train_df[col], errors="raise")
        test_df[col]  = pd.to_numeric(test_df[col],  errors="raise")

    n_users = int(max(train_df["user"].max(), test_df["user"].max()))
    n_items = int(max(train_df["item"].max(), test_df["item"].max()))

    # --- Hızlı ve güvenli kurulum: pivot -> değerler ---
    def build_matrix_fast(df, n_users, n_items):
        # kullanıcı ve item id'lerini 0-index'e indir
        piv = df.copy()
        piv["user0"] = piv["user"].astype(int) - 1
        piv["item0"] = piv["item"].astype(int) - 1
        mat = np.zeros((n_users, n_items), dtype=np.float32)
        mat[piv["user0"].to_numpy(), piv["item0"].to_numpy()] = piv["rating"].to_numpy(dtype=np.float32)
        return mat

    train_mat = build_matrix_fast(train_df, n_users, n_items)
    test_mat  = build_matrix_fast(test_df,  n_users, n_items)

    train_mask = (train_mat > 0).astype(np.float32)
    test_mask  = (test_mat  > 0).astype(np.float32)

    print(f"Users={n_users} | Items={n_items} | "
          f"Train obs={int(train_mask.sum())} | Test obs={int(test_mask.sum())}")
    return train_mat, test_mat, train_mask, test_mask

# BURADAN SONRASI önceki AutoEncoder kodunla aynı:
train_mat, test_mat, train_mask, test_mask = load_movielens_100k(DATA_DIR)

@tf.function
def masked_mse(y_true, y_pred):
    mask = tf.cast(tf.not_equal(y_true, 0.0), tf.float32)
    diff = (y_true - y_pred) * mask
    return tf.reduce_sum(tf.square(diff)) / (tf.reduce_sum(mask) + 1e-8)

def build_ae(n_items, l2=1e-4, dropout=0.2, bottleneck=128):
    inp = layers.Input(shape=(n_items,))
    x = layers.Dense(512, activation="relu", kernel_regularizer=regularizers.l2(l2))(inp)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(256, activation="relu", kernel_regularizer=regularizers.l2(l2))(x)
    z = layers.Dense(bottleneck, activation="relu", kernel_regularizer=regularizers.l2(l2), name="bottleneck")(x)
    x = layers.Dense(256, activation="relu", kernel_regularizer=regularizers.l2(l2))(z)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(512, activation="relu", kernel_regularizer=regularizers.l2(l2))(x)
    out = layers.Dense(n_items, activation=None)(x)
    return models.Model(inp, out)

n_users, n_items = train_mat.shape
ae = build_ae(n_items)
ae.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss=masked_mse)

es  = callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True, verbose=1)
rlr = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, verbose=1, min_lr=1e-5)
mc  = callbacks.ModelCheckpoint("ml100k_ae_best.keras", monitor="val_loss", save_best_only=True, verbose=1)

history = ae.fit(
    train_mat, train_mat,
    validation_data=(train_mat, train_mat),
    epochs=50, batch_size=128, shuffle=True,
    callbacks=[es, rlr, mc], verbose=1
)

best = tf.keras.models.load_model("ml100k_ae_best.keras", custom_objects={"masked_mse": masked_mse})
pred = np.clip(best.predict(train_mat, verbose=0), 1.0, 5.0)

# Basit RMSE (yalnızca test maskeli yerlerde)
sqerr = ((pred - test_mat) ** 2) * (test_mat > 0)
rmse = float(np.sqrt(sqerr.sum() / (test_mat > 0).sum()))
print(f"[TEST] RMSE: {rmse:.4f}")


Users=943 | Items=1682 | Train obs=80000 | Test obs=20000
Epoch 1/50
Epoch 1: val_loss improved from inf to 4.24531, saving model to ml100k_ae_best.keras
Epoch 2/50
Epoch 2: val_loss improved from 4.24531 to 2.34077, saving model to ml100k_ae_best.keras
Epoch 3/50
Epoch 3: val_loss improved from 2.34077 to 1.53806, saving model to ml100k_ae_best.keras
Epoch 4/50
Epoch 4: val_loss did not improve from 1.53806
Epoch 5/50
Epoch 5: val_loss improved from 1.53806 to 1.25604, saving model to ml100k_ae_best.keras
Epoch 6/50
Epoch 6: val_loss improved from 1.25604 to 1.17404, saving model to ml100k_ae_best.keras
Epoch 7/50
Epoch 7: val_loss did not improve from 1.17404
Epoch 8/50
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 8: val_loss did not improve from 1.17404
Epoch 9/50
Epoch 9: val_loss did not improve from 1.17404
Epoch 10/50
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 10: val_loss did not improve from 1.17404

In [7]:
# ---------------------------------------------------
# 6) Precision / Recall @K (kullanıcı-bazlı, train'de görülmeyenlere öner)
# ---------------------------------------------------
def evaluate_topk(pred_scores, train_mat, test_mat, K_list=(5, 10, 20), rel_thresh=4.0):
    """
    pred_scores: (n_users, n_items) tahmin puanları
    train_mat:   eğitimde görülen puanlar (görülen item'ları hariç tutacağız)
    test_mat:    testteki gerçek puanlar
    rel_thresh:  testte "ilgili" kabul eşiği (>=4.0 => ilgili)
    """
    train_seen = (train_mat > 0)
    test_rel   = (test_mat >= rel_thresh)

    results = {}
    for K in K_list:
        tot_prec, tot_rec, n_users_eval = 0.0, 0.0, 0
        for u in range(pred_scores.shape[0]):
            # bu kullanıcının testte ilgili sayısı:
            n_rel = int(test_rel[u].sum())
            if n_rel == 0:
                continue  # kullanıcıda ölçülecek ilgili yoksa atla

            scores = pred_scores[u].copy()
            # eğitimde görülen item'ları öneriden çıkar
            scores[train_seen[u]] = -1e9

            topk_idx = np.argpartition(scores, -K)[-K:]
            topk_idx = topk_idx[np.argsort(scores[topk_idx])[::-1]]  # skor sırasına getir

            hits = test_rel[u, topk_idx].sum()
            prec = float(hits) / K
            rec  = float(hits) / n_rel

            tot_prec += prec
            tot_rec  += rec
            n_users_eval += 1

        results[K] = {
            "precision": tot_prec / max(1, n_users_eval),
            "recall":    tot_rec  / max(1, n_users_eval),
            "n_users":   n_users_eval
        }
    return results

metrics = evaluate_topk(pred, train_mat, test_mat, K_list=(5,10,20), rel_thresh=4.0)
print("\nTop-K değerlendirme (relevant>=4):")
for K, m in metrics.items():
    print(f"@{K}: Precision={m['precision']:.4f} | Recall={m['recall']:.4f} | users={m['n_users']}")

# ---------------------------------------------------
# 7) Tek kullanıcı için öneri örneği
# ---------------------------------------------------
def recommend_for_user(user_id_1based, pred_scores, train_mat, top_k=10):
    u = user_id_1based - 1
    scores = pred_scores[u].copy()
    scores[train_mat[u] > 0] = -1e9  # zaten izlediklerini ele
    topk_idx = np.argpartition(scores, -top_k)[-top_k:]
    topk_idx = topk_idx[np.argsort(scores[topk_idx])[::-1]]
    # 1-based item ID’leri döndürelim
    items_1based = (topk_idx + 1).tolist()
    return items_1based, scores[topk_idx].tolist()

# Örnek: kullanıcı 1 için 10 öneri
items, scores = recommend_for_user(1, pred, train_mat, top_k=10)
print("\nKullanıcı #1 için öneriler (itemID:score):")
for iid, sc in zip(items, scores):
    print(f"{iid}: {sc:.3f}")

# ---------------------------------------------------
# 8) Modeli kaydet (ileride tekrar yüklemek için)
# ---------------------------------------------------
best.save("ml100k_ae_best.keras")
print("\nModel kaydedildi: ml100k_ae_best.keras")



Top-K değerlendirme (relevant>=4):
@5: Precision=0.0026 | Recall=0.0003 | users=456
@10: Precision=0.0055 | Recall=0.0015 | users=456
@20: Precision=0.0202 | Recall=0.0117 | users=456

Kullanıcı #1 için öneriler (itemID:score):
1367: 5.000
1599: 4.863
1467: 4.822
1449: 4.668
1452: 4.598
1368: 4.597
408: 4.570
913: 4.570
1064: 4.549
1500: 4.535

Model kaydedildi: ml100k_ae_best.keras


In [8]:
def load_movie_meta(data_dir=DATA_DIR):
    uitem = os.path.join(data_dir, "u.item")
    mdat  = os.path.join("ml-1m", "movies.dat")  # varsa 1M
    if os.path.exists(uitem):
        # MovieLens 100k
        genre_cols = ["unknown","Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama",
                      "Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western"]
        cols = ["movieId","title","release_date","video_release_date","imdb_url"] + genre_cols
        meta = pd.read_csv(uitem, sep="|", header=None, names=cols, encoding="latin-1")
        # Tür listesini tek kolonda topla (True olanları birleştir)
        genres = []
        gmat = meta[genre_cols].astype(bool)
        for i in range(len(meta)):
            glist = [g for g,flag in zip(genre_cols, gmat.iloc[i].tolist()) if flag]
            genres.append(", ".join(glist) if glist else "Unknown")
        meta = meta[["movieId","title"]].assign(genres=genres)
        return meta
    elif os.path.exists(mdat):
        # MovieLens 1M (opsiyonel)
        meta = pd.read_csv(mdat, sep="::", engine="python", header=None,
                           names=["movieId","title","genres"], encoding="latin-1")
        return meta[["movieId","title","genres"]]
    else:
        raise FileNotFoundError("Ne ml-100k/u.item ne de ml-1m/movies.dat bulunamadı.")

meta = load_movie_meta(DATA_DIR)
meta.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"Animation, Children, Comedy"
1,2,GoldenEye (1995),"Action, Adventure, Thriller"
2,3,Four Rooms (1995),Thriller
3,4,Get Shorty (1995),"Action, Comedy, Drama"
4,5,Copycat (1995),"Crime, Drama, Thriller"


In [22]:
def recommend_for_user(user_id, score_matrix, train_mat, meta_df, topk=10, min_score=1.0, clip_to_1_5=True):
    """
    user_id: 1-index (MovieLens kullanıcı id'si gibi)
    score_matrix: model çıktı skoru (ör: best.predict(train_mat)) shape=(n_users, n_items)
    train_mat: orijinal eğitim matrisi (izlenen/puanlananları elemek için)
    meta_df: load_movie_meta çıktısı (movieId=1..n_items)
    """
    uidx = user_id - 1  # 0-index'e indir
    scores = score_matrix[uidx].copy()

    # İstersek 1..5’e kırp
    if clip_to_1_5:
        scores = np.clip(scores, 1.0, 5.0)

    # Kullanıcının zaten puanladıklarını çıkar
    already_rated = train_mat[uidx] > 0
    scores[already_rated] = -np.inf

    # En yüksek skorlu top-k indeksleri al
    top_idx = np.argpartition(-scores, range(topk))[:topk]
    top_idx = top_idx[np.argsort(-scores[top_idx])]

    # movieId = column_index + 1 (100k setinde kolonlar 1..1682)
    rec_df = pd.DataFrame({
        "movieId": top_idx + 1,
        "score": scores[top_idx]
    })
    rec_df = rec_df.merge(meta_df, on="movieId", how="left")
    # skor eşiği uygula (opsiyonel)
    rec_df = rec_df[rec_df["score"] >= min_score]
    # kolon sırası ve format
    rec_df = rec_df[["movieId","title","genres","score"]]
    return rec_df.reset_index(drop=True)

# --- ÖRNEK KULLANIM ---
# Eğitimden sonra zaten şunları üretmiştin:
# best = tf.keras.models.load_model("ml100k_ae_best.keras", custom_objects={"masked_mse": masked_mse})
# pred = np.clip(best.predict(train_mat, verbose=0), 1.0, 5.0)

user_id = 11
rec_df = recommend_for_user(user_id, pred, train_mat, meta, topk=10)
print(rec_df.to_string(index=False))


 movieId                                title                      genres    score
    1367                         Faust (1994)                   Animation 4.656963
    1599        Someone Else's America (1995)                       Drama 4.608151
    1467 Saint of Fort Washington, The (1993)                       Drama 4.593640
    1449               Pather Panchali (1955)                       Drama 4.366854
    1645              Butcher Boy, The (1998)                       Drama 4.275976
    1235          Big Bang Theory, The (1994)                       Crime 4.148536
     913 Love and Death on Long Island (1997)                      Comedy 4.140354
    1064                     Crossfire (1947)            Crime, Film-Noir 4.102300
     408                Close Shave, A (1995) Animation, Comedy, Thriller 4.101787
    1500            Santa with Muscles (1996)                      Comedy 4.097122
