In [1]:
import torch
import os
from pandas import read_csv
from torch.optim import SGD
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
from config import DATA_DIR, MODELS_DIR
from src.loss import Loss
from src.model import HistogramMF
from src.runner import Runner
from src.create_dataset import create_dataset
from src.data_processor import DataProcessor
from src.data_encoder import DataEncoder

DF_PATH = f"{DATA_DIR}/BookCrossing/BX-Book-Ratings-With-Histogram_features.csv"

In [2]:
columns = ["user_id", "item_id", "rating", "original_mass", "total_mass"]
original_df = read_csv(DF_PATH, skipinitialspace=True, names=columns)
original_df = original_df.iloc[1:, :]

original_df = original_df.astype({"user_id": "int32"})
original_df = original_df.astype({"item_id": str})
original_df = original_df.astype({"rating": "int32"})
original_df = original_df.astype({"original_mass": "float64"})
original_df = original_df.astype({"total_mass": "float64"})

  original_df = read_csv(DF_PATH, skipinitialspace=True, names=columns)


In [None]:
data_encoder = DataEncoder(original_df=original_df)
data_processor = DataProcessor(original_df=original_df)

n_users = original_df.user_id.nunique()
n_items = original_df.item_id.nunique()

min_rating = min(original_df.rating.values)
max_rating = max(original_df.rating.values)

model = HistogramMF(
    n_users=n_users,
    n_items=n_items,
    data_encoder=data_encoder,
    data_processor=data_processor,
    min_rating=min_rating,
    max_rating=max_rating,
)

if os.path.exists(f"{MODELS_DIR}/book_crossing/model.pt"):
    model.load_state_dict(torch.load(f"{MODELS_DIR}/book_crossing/model.pt"))
else:
    epochs = 10

    criterion = Loss()
    optimizer = SGD(model.parameters(), lr=5, weight_decay=1e-7)
    runner = Runner(model=model, criterion=criterion, optimizer=optimizer)

    train_set = create_dataset(data_encoder=data_encoder)
    train_load = DataLoader(train_set, batch_size=1000, shuffle=True)

    with SummaryWriter(f"runs/book_crossing/dev") as writer:
        for epoch in range(epochs):
            epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
            print(f"epoch={epoch + 1}, loss={epoch_loss}")

    torch.save(model.state_dict(), f"{MODELS_DIR}/book_crossing/model.pt")

 74%|███████▍  | 853/1150 [03:02<00:59,  4.97batch/s, train_loss=0.282]

In [None]:
from pandas import DataFrame
from collections import namedtuple

Row = namedtuple("Row", "user_id item_id rating")

data_encoder = DataEncoder(original_df=original_df)
dataframe_after_mf = []

with torch.no_grad():
    for (index, user_id, item_id, rating) in original_df.itertuples():
        encoded_user_id = data_encoder.get_encoded_user_id(original_id=user_id)
        encoded_item_id = data_encoder.get_encoded_item_id(original_id=item_id)

        user_id_as_tensor = torch.LongTensor([encoded_user_id])
        item_id_as_tensor = torch.LongTensor([encoded_item_id])
        output = model(users=user_id_as_tensor, items=item_id_as_tensor,).squeeze()[0]
        predicted_rating = torch.round(output).item()

        dataframe_after_mf.append(Row(user_id=user_id, item_id=item_id, rating=predicted_rating))

df_after_mf = DataFrame(dataframe_after_mf, columns=["user_id", "item_id", "rating"])
df_after_mf.head()

In [None]:
mask = (original_df["rating"] == df_after_mf["rating"])
changes = original_df[mask].copy()
changes["New rating"] = df_after_mf.rating
print(f"Number of hits: {len(changes)} / {len(original_df)}")
changes.head(len(changes))