In [1]:
import torch
import os
from pandas import read_csv
from torch.optim import SGD
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
from config import DATA_DIR, MODELS_DIR
from src.loss import Loss
from src.model import HistogramMF
from src.runner import Runner
from src.create_dataset import create_dataset
from src.data_processor import DataProcessor
from src.data_encoder import DataEncoder

# Path of the BookCrossing ratings CSV, resolved under the project's DATA_DIR.
# The file name indicates that histogram-derived feature columns are included.
DF_PATH = f"{DATA_DIR}/BookCrossing/BX-Book-Ratings-With-Histogram_features.csv"

In [2]:
# Column names applied to the ratings CSV; they replace the file's own header row.
columns = ["user_id", "item_id", "rating", "original_mass", "total_mass"]

# Target dtype of every column, enforced while parsing. `item_id` stays text so
# ISBN-style identifiers such as "034545104X" (and ids with leading zeros) are
# never coerced to numbers.
column_dtypes = {
    "user_id": "int32",
    "item_id": str,
    "rating": "int32",
    "original_mass": "float64",
    "total_mass": "float64",
}

# header=0 tells pandas that the first line is a header to be *replaced* by
# `columns`. Without it the header line is parsed as a data row, which forces
# every column through mixed-type inference and then has to be sliced off and
# re-cast column by column.
# NOTE(review): because the header is no longer a data row, a default row index
# now starts at 0 rather than 1 -- confirm nothing relies on the old labels.
original_df = read_csv(
    DF_PATH,
    skipinitialspace=True,
    names=columns,
    header=0,
    dtype=column_dtypes,
)

  original_df = read_csv(DF_PATH, skipinitialspace=True, names=columns)


In [3]:
# Build the id encoder and the data processor from the full ratings frame.
data_encoder = DataEncoder(original_df=original_df)
data_processor = DataProcessor(original_df=original_df)

n_users = original_df.user_id.nunique()
n_items = original_df.item_id.nunique()

# Series.min()/max() are vectorised; the built-in min()/max() would iterate
# over every element of the underlying numpy array in Python.
min_rating = original_df.rating.min()
max_rating = original_df.rating.max()

model = HistogramMF(
    n_users=n_users,
    n_items=n_items,
    data_encoder=data_encoder,
    data_processor=data_processor,
    min_rating=min_rating,
    max_rating=max_rating,
)

# Single definition of the checkpoint location (used for both resume and save).
model_path = f"{MODELS_DIR}/book_crossing/model.pt"

# Create the checkpoint directory up front: otherwise the first torch.save()
# fails only after a complete epoch has already been trained.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Resume from an existing checkpoint when one is available.
if os.path.exists(model_path):
    # map_location="cpu" lets a checkpoint written on another device be read
    # here; load_state_dict then copies the values into the model's own tensors.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))

epochs = 30

criterion = Loss()
optimizer = SGD(model.parameters(), lr=5, weight_decay=1e-7)
runner = Runner(model=model, criterion=criterion, optimizer=optimizer)

train_set = create_dataset(data_encoder=data_encoder)
train_load = DataLoader(train_set, batch_size=1000, shuffle=True)

# Train, logging to TensorBoard and checkpointing after every epoch so an
# interrupted run can be resumed by re-executing this cell.
with SummaryWriter("runs/book_crossing/dev") as writer:
    for epoch in range(epochs):
        epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
        print(f"epoch={epoch + 1}, loss={epoch_loss}")
        torch.save(model.state_dict(), model_path)

100%|██████████| 1150/1150 [03:56<00:00,  4.86batch/s, train_loss=0.107] 


epoch=1, loss=126.46271296398457


100%|██████████| 1150/1150 [04:14<00:00,  4.52batch/s, train_loss=0.108] 


epoch=2, loss=116.80160687568862


100%|██████████| 1150/1150 [04:07<00:00,  4.65batch/s, train_loss=0.0976]


epoch=3, loss=108.09200614968316


100%|██████████| 1150/1150 [04:13<00:00,  4.53batch/s, train_loss=0.093] 


epoch=4, loss=100.69215547356235


100%|██████████| 1150/1150 [04:30<00:00,  4.25batch/s, train_loss=0.0806]


epoch=5, loss=93.92955021569662


100%|██████████| 1150/1150 [04:23<00:00,  4.36batch/s, train_loss=0.0806]


epoch=6, loss=88.12323430574853


100%|██████████| 1150/1150 [04:20<00:00,  4.41batch/s, train_loss=0.0746]


epoch=7, loss=82.73913739522304


100%|██████████| 1150/1150 [03:57<00:00,  4.84batch/s, train_loss=0.0767]


epoch=8, loss=78.05626472277531


100%|██████████| 1150/1150 [03:57<00:00,  4.84batch/s, train_loss=0.066] 


epoch=9, loss=73.60863434933397


100%|██████████| 1150/1150 [03:57<00:00,  4.84batch/s, train_loss=0.0643]


epoch=10, loss=69.79698466667756


100%|██████████| 1150/1150 [03:57<00:00,  4.83batch/s, train_loss=0.0534]


epoch=11, loss=66.33225483194985


100%|██████████| 1150/1150 [03:57<00:00,  4.83batch/s, train_loss=0.0584]


epoch=12, loss=63.23476320012412


100%|██████████| 1150/1150 [03:57<00:00,  4.85batch/s, train_loss=0.0558]


epoch=13, loss=60.420259927603


100%|██████████| 1150/1150 [03:57<00:00,  4.85batch/s, train_loss=0.0501]


epoch=14, loss=57.91521319677888


100%|██████████| 1150/1150 [03:56<00:00,  4.86batch/s, train_loss=0.0645]


epoch=15, loss=55.606179252233275


100%|██████████| 1150/1150 [03:55<00:00,  4.88batch/s, train_loss=0.0452]


epoch=16, loss=53.51069722473927


100%|██████████| 1150/1150 [03:55<00:00,  4.89batch/s, train_loss=0.0421]


epoch=17, loss=51.58726173254153


100%|██████████| 1150/1150 [03:55<00:00,  4.89batch/s, train_loss=0.0438]


epoch=18, loss=49.95414984160208


100%|██████████| 1150/1150 [03:55<00:00,  4.89batch/s, train_loss=0.0428]


epoch=19, loss=48.34943963945828


100%|██████████| 1150/1150 [04:02<00:00,  4.74batch/s, train_loss=0.0415]


epoch=20, loss=46.95314278803115


100%|██████████| 1150/1150 [03:55<00:00,  4.89batch/s, train_loss=0.0432]


epoch=21, loss=45.65478127362179


100%|██████████| 1150/1150 [03:56<00:00,  4.87batch/s, train_loss=0.0385]


epoch=22, loss=44.52379343355622


100%|██████████| 1150/1150 [03:55<00:00,  4.88batch/s, train_loss=0.0395]


epoch=23, loss=43.514243885382704


100%|██████████| 1150/1150 [04:00<00:00,  4.79batch/s, train_loss=0.0407]


epoch=24, loss=42.60378576224886


100%|██████████| 1150/1150 [04:00<00:00,  4.77batch/s, train_loss=0.0415]


epoch=25, loss=41.689758625226155


100%|██████████| 1150/1150 [04:00<00:00,  4.77batch/s, train_loss=0.0416]


epoch=26, loss=40.8852017771892


100%|██████████| 1150/1150 [04:00<00:00,  4.78batch/s, train_loss=0.0342]


epoch=27, loss=40.25177603227672


100%|██████████| 1150/1150 [04:02<00:00,  4.74batch/s, train_loss=0.0357]


epoch=28, loss=39.49946111527469


100%|██████████| 1150/1150 [04:01<00:00,  4.76batch/s, train_loss=0.0375]


epoch=29, loss=38.837134739998085


100%|██████████| 1150/1150 [04:02<00:00,  4.75batch/s, train_loss=0.0329]

epoch=30, loss=38.1259297607129





In [4]:
from pandas import DataFrame
from collections import namedtuple

# One predicted rating per (user, item) pair of the original frame.
Row = namedtuple("Row", "user_id item_id rating")

# `data_encoder` is deliberately NOT rebuilt here: the model was constructed with
# the existing instance, so reusing it guarantees the same id -> index mapping
# and avoids a second pass over the whole frame.
dataframe_after_mf = []

# Switch to evaluation mode for prediction and restore the previous mode
# afterwards, so re-running the training cell later is unaffected.
# NOTE(review): assumes HistogramMF is a torch.nn.Module (it exposes
# state_dict/load_state_dict/parameters) -- confirm.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        # Only the ids are needed; the stored rating and mass columns are unused.
        for user_id, item_id in zip(original_df["user_id"], original_df["item_id"]):
            encoded_user_id = data_encoder.get_encoded_user_id(original_id=user_id)
            encoded_item_id = data_encoder.get_encoded_item_id(original_id=item_id)

            user_id_as_tensor = torch.LongTensor([encoded_user_id])
            item_id_as_tensor = torch.LongTensor([encoded_item_id])

            # The first element of the squeezed model output is taken as the
            # rating prediction, exactly as before.
            output = model(users=user_id_as_tensor, items=item_id_as_tensor).squeeze()[0]

            # Rounding a small negative value yields -0.0; adding 0.0 turns it
            # into a plain 0.0 and leaves every other value untouched.
            predicted_rating = torch.round(output).item() + 0.0

            dataframe_after_mf.append(
                Row(user_id=user_id, item_id=item_id, rating=predicted_rating)
            )
finally:
    model.train(was_training)

df_after_mf = DataFrame(dataframe_after_mf, columns=["user_id", "item_id", "rating"])
df_after_mf.head()

Unnamed: 0,user_id,item_id,rating
0,276725,034545104X,0.0
1,276726,0155061224,3.0
2,276727,0446520802,-0.0
3,276729,052165615X,3.0
4,276729,0521795028,4.0


In [14]:
# Put predictions and originals on the same fresh 0..N-1 index so that the
# element-wise comparison pairs each original row with its own prediction.
df_after_mf = df_after_mf.reset_index(drop=True)
compare = original_df.loc[:, ["user_id", "item_id", "rating"]].reset_index(drop=True)

# A "hit" is a row whose rounded prediction equals the stored rating.
mask = compare["rating"] == df_after_mf["rating"]
changes = compare.loc[mask].copy()
changes["New rating"] = df_after_mf["rating"]

print(f"Number of hits: {len(changes)} / {len(original_df)}")
changes.head(len(changes))

Number of hits: 1061273 / 1149780


Unnamed: 0,user_id,item_id,rating,New rating
0,276725,034545104X,0,0.0
2,276727,0446520802,0,-0.0
3,276729,052165615X,3,3.0
8,276744,038550120X,7,7.0
10,276746,0425115801,0,-0.0
...,...,...,...,...
1149773,276704,0806917695,5,5.0
1149774,276704,0876044011,0,0.0
1149775,276704,1563526298,9,9.0
1149776,276706,0679447156,0,-0.0
