In [1]:
import torch
import os
from pandas import read_csv
from torch.optim import SGD
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
from config import DATA_DIR, MODELS_DIR
from src.loss import Loss
from src.model import HistogramMF
from src.runner import Runner
from src.create_dataset import create_dataset
from src.data_processor import DataProcessor
from src.data_encoder import DataEncoder

# CSV file under DATA_DIR/Steam that the next cell loads with read_csv.
DF_PATH = f"{DATA_DIR}/Steam/steam-200k_with_histogram_features.csv"

In [2]:
# Column names assigned to the CSV; "action" and "null" are read but not kept.
columns = ["user_id", "item_id", "action", "rating", "null", "original_mass", "total_mass"]

# Because `names=` is given without `header=`, every line of the file is parsed
# as data. The first parsed row is dropped before casting (presumably it is the
# file's own header line -- TODO confirm), then each kept column gets its dtype.
original_df = (
    read_csv(DF_PATH, skipinitialspace=True, names=columns, low_memory=False)[
        ["user_id", "item_id", "rating", "original_mass", "total_mass"]
    ]
    .iloc[1:]
    .astype(
        {
            "user_id": "int32",
            "item_id": str,
            "rating": "float64",
            "original_mass": "float64",
            "total_mass": "float64",
        }
    )
)

original_df.head()

Unnamed: 0,user_id,item_id,rating,original_mass,total_mass
1.0,151603712,Fallout 4,1.0,32.0,65.0
2.0,151603712,Fallout 4,87.0,63.5,65.0
3.0,151603712,Spore,1.0,32.0,65.0
4.0,151603712,Spore,14.9,63.0,65.0
5.0,151603712,Fallout New Vegas,1.0,32.0,65.0


In [3]:
# --- Model setup -----------------------------------------------------------
# The encoder and processor are both built from the full interaction frame and
# handed to the model.
data_encoder = DataEncoder(original_df=original_df)
data_processor = DataProcessor(original_df=original_df)

n_users = original_df.user_id.nunique()
n_items = original_df.item_id.nunique()

# Vectorised pandas reductions instead of the Python builtins over the raw
# array: much faster on a large column, and NaN entries are skipped rather
# than producing an order-dependent result.
min_rating = original_df.rating.min()
max_rating = original_df.rating.max()

model = HistogramMF(
    n_users=n_users,
    n_items=n_items,
    data_encoder=data_encoder,
    data_processor=data_processor,
    min_rating=min_rating,
    max_rating=max_rating,
)

# --- Checkpoint handling ---------------------------------------------------
# Single definition of the checkpoint location, used for both resume and save.
checkpoint_dir = f"{MODELS_DIR}/steam"
checkpoint_path = f"{checkpoint_dir}/model.pt"

# Create the directory up front so torch.save cannot fail after a full epoch
# of training just because the folder is missing.
os.makedirs(checkpoint_dir, exist_ok=True)

# Resume from a previous run when a checkpoint exists. map_location="cpu" lets
# a checkpoint written on a GPU machine be restored here, where the model is
# never moved off the CPU.
if os.path.exists(checkpoint_path):
    model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))

# --- Training --------------------------------------------------------------
epochs = 50

criterion = Loss()
# NOTE(review): the recorded run of this cell ends with train_loss=inf and a
# NaN ValueError; lr=5 is a likely suspect but is left unchanged -- confirm.
optimizer = SGD(model.parameters(), lr=5, weight_decay=1e-7)
runner = Runner(model=model, criterion=criterion, optimizer=optimizer)

train_set = create_dataset(data_encoder=data_encoder)
train_load = DataLoader(train_set, batch_size=1000, shuffle=True)

with SummaryWriter("runs/steam/dev") as writer:
    for epoch in range(epochs):
        epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
        print(f"epoch={epoch + 1}, loss={epoch_loss}")

        # Never persist a diverged model: a checkpoint saved after an inf/NaN
        # epoch would be silently reloaded by the resume logic on the next run.
        if not torch.isfinite(torch.as_tensor(epoch_loss, dtype=torch.float64)).all():
            raise RuntimeError(
                f"Training diverged at epoch {epoch + 1} (loss={epoch_loss}); "
                "checkpoint was not overwritten. Consider lowering the learning rate."
            )

        torch.save(model.state_dict(), checkpoint_path)

  7%|▋         | 14/200 [00:07<01:44,  1.77batch/s, train_loss=inf]     


ValueError: cannot convert float NaN to integer

In [None]:
from pandas import DataFrame
from collections import namedtuple

# One predicted interaction: original (un-encoded) ids plus the rounded prediction.
Row = namedtuple("Row", "user_id item_id rating")

# The `data_encoder` created in the training cell is reused on purpose: it is
# the instance the model was constructed with, so the encoded ids looked up
# here are the same ones the model was built around.
dataframe_after_mf = []

# Switch to inference mode so any train-only layers behave deterministically.
model.eval()

with torch.no_grad():
    # Only the two id columns are needed to query the model.
    for user_id, item_id in original_df[["user_id", "item_id"]].itertuples(index=False, name=None):
        encoded_user_id = data_encoder.get_encoded_user_id(original_id=user_id)
        encoded_item_id = data_encoder.get_encoded_item_id(original_id=item_id)

        user_id_as_tensor = torch.tensor([encoded_user_id], dtype=torch.long)
        item_id_as_tensor = torch.tensor([encoded_item_id], dtype=torch.long)

        # The first element of the squeezed model output is taken as the
        # prediction and rounded to the nearest integer.
        output = model(users=user_id_as_tensor, items=item_id_as_tensor).squeeze()[0]
        predicted_rating = torch.round(output).item()

        dataframe_after_mf.append(Row(user_id=user_id, item_id=item_id, rating=predicted_rating))

df_after_mf = DataFrame(dataframe_after_mf, columns=["user_id", "item_id", "rating"])
df_after_mf.head()

In [None]:
# Give both frames a fresh 0..n-1 index so their rows line up one-to-one.
df_after_mf = df_after_mf.reset_index(drop=True)
compare = original_df.loc[:, ["user_id", "item_id", "rating"]].reset_index(drop=True)

# A "hit" is a row whose predicted rating equals the original rating exactly.
mask = compare["rating"].eq(df_after_mf["rating"])
changes = compare.loc[mask].copy()
changes["New rating"] = df_after_mf["rating"]

print(f"Number of hits: {len(changes)} / {len(original_df)}")
changes.head(len(changes))