In [1]:
import os.path

import torch

from pandas import read_csv
from torch.optim import SGD
from torch.nn import MSELoss
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
from torch import randperm
from config import DATA_DIR, MODELS_DIR
from src.data_set import RatingsDataset
from src.loss import MiningOutliersLoss
from src.model import MF
from src.runner import Runner
from src.utils import (
    create_dataset,
    mine_outliers_scipy,
    mine_outliers_sklearn,
    mine_outliers_torch,
    DataConverter,
    DataProcessor,
)

DF_PATH = f"{DATA_DIR}/BookCrossing/BX-Book-Ratings.csv"

In [2]:
columns = ["user_id", "item_id", "rating"]
original_df = read_csv(
    DF_PATH, skipinitialspace=True, sep=";", names=columns, encoding="latin-1", low_memory=False
)
original_df = original_df.iloc[1:, :]
original_df = original_df.astype({"user_id": "int32"})
original_df = original_df.astype({"item_id": str})
original_df = original_df.astype({"rating": "int32"})

n_users = original_df.user_id.nunique()
n_items = original_df.item_id.nunique()
print(n_users)
print(n_items)


105283
340556


In [4]:
data_converter = DataConverter(original_df=original_df)

model = MF(
    n_users=data_converter.n_users,
    n_items=data_converter.n_item,
    include_bias=True
)

criterion = MSELoss()
optimizer = SGD(model.parameters(), lr=5, weight_decay=1e-5)
epochs = 50

if os.path.exists(f"{MODELS_DIR}/book_crossing/model.pt"):
    model.load_state_dict(torch.load(f"{MODELS_DIR}/book_crossing/model.pt"))
#
# runner = Runner(
#     model=model,
#     criterion=criterion,
#     optimizer=optimizer,
#     epochs=epochs
# )
#
# train_set = create_dataset(data_frame=data_converter.encoded_df)
# train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
# with SummaryWriter("runs/Book-Crossing") as writer:
#     for epoch in range(epochs):
#         epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
#         print(f"epoch={epoch + 1}, loss={epoch_loss}")
        # torch.save(model.state_dict(), f"{MODELS_DIR}/book_crossing/model.pt")

In [5]:
from collections import namedtuple
from pandas import DataFrame
from src.utils import ProcColumn

Row = namedtuple("Row", "user_id item_id rating")
working_df = original_df.copy()

user_original_id_to_encoded_id = ProcColumn(working_df.user_id)
item_original_id_to_encoded_id = ProcColumn(working_df.item_id)
working_df.user_id = user_original_id_to_encoded_id.encoded_col
working_df.item_id = item_original_id_to_encoded_id.encoded_col

df_after_mf = []
with torch.no_grad():
    for (index, worker_id, song_id, rating) in working_df.itertuples():
        user_id_as_tensor = torch.LongTensor([worker_id])
        item_id_as_tensor = torch.LongTensor([song_id])
        predicted_rating = model(users=user_id_as_tensor, items=item_id_as_tensor,).item()

        original_worker_id = user_original_id_to_encoded_id.get_name(index=worker_id)
        original_item_id = item_original_id_to_encoded_id.get_name(index=song_id)
        df_after_mf.append(
            Row(
                user_id=original_worker_id,
                item_id=original_item_id,
                rating=predicted_rating,
            )
        )

df_after_mf = DataFrame(
    df_after_mf, columns=["user_id", "item_id", "rating"]
)
df_after_mf.head()

Unnamed: 0,user_id,item_id,rating
0,276725,034545104X,0.105944
1,276726,0155061224,2.668854
2,276727,0446520802,0.103979
3,276729,052165615X,2.252617
4,276729,0521795028,3.478775


In [7]:
compare = original_df[["user_id", "item_id", "rating"]].reset_index(drop=True)
df_after_mf = df_after_mf.reset_index(drop=True)
mask = (compare["rating"] == df_after_mf["rating"])
changes = compare[mask].copy()
changes["New rating"] = df_after_mf.rating
print(f"Number of hits: {len(changes)} / {len(original_df)}")
print(f"Hit ratio: {(len(changes) / len(original_df)) * 100}")
changes.head(len(changes))

Number of hits: 1 / 1149780
Hit ratio: 8.697316008279845e-05


Unnamed: 0,user_id,item_id,rating,New rating
8172,278418,816741271,0,0.0


In [10]:
import recmetrics
original_df["cf_prediction"] = df_after_mf.rating
print("MSE: ", recmetrics.mse(original_df.rating, df_after_mf.rating))
print("RMSE: ", recmetrics.rmse(original_df.rating, df_after_mf.rating))
original_df.head(100)

MSE:  0.32380431222722267
RMSE:  0.569038058680808


Unnamed: 0,user_id,item_id,rating,cf_prediction
1,276725,034545104X,0,2.668854
2,276726,0155061224,5,0.103979
3,276727,0446520802,0,2.252617
4,276729,052165615X,3,3.478775
5,276729,0521795028,6,0.852733
...,...,...,...,...
96,276798,3499134004,0,0.070007
97,276798,349915398X,0,5.866992
98,276798,3548603203,6,0.174416
99,276798,3764501383,0,-0.156772
