In [12]:
import torch
import os
from pandas import read_csv
from torch.optim import SGD
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
from config import DATA_DIR, MODELS_DIR
from src.loss import Loss
from src.model import HistogramMF
from src.runner import Runner
from src.create_dataset import create_dataset
from src.data_processor import DataProcessor
from src.data_encoder import DataEncoder

DF_PATH = f"{DATA_DIR}/MovieLens/ratings_500k_with_histogram_features.csv"

In [13]:
columns = ["user_id", "item_id", "rating", "original_mass", "total_mass"]
original_df = read_csv(DF_PATH, skipinitialspace=True, usecols=columns)
original_df.head()
print(len(original_df))

17464


In [8]:
data_encoder = DataEncoder(original_df=original_df)
data_processor = DataProcessor(original_df=original_df)

n_users = original_df.user_id.nunique()
n_items = original_df.item_id.nunique()

min_rating = min(original_df.rating.values)
max_rating = max(original_df.rating.values)

model = HistogramMF(
    n_users=n_users,
    n_items=n_items,
    data_encoder=data_encoder,
    data_processor=data_processor,
    min_rating=min_rating,
    max_rating=max_rating,
)

if os.path.exists(f"{MODELS_DIR}/MovieLens/model.pt"):
    model.load_state_dict(torch.load(f"{MODELS_DIR}/MovieLens/model.pt"))

epochs = 30

criterion = Loss()
optimizer = SGD(model.parameters(), lr=5, weight_decay=1e-7)
runner = Runner(model=model, criterion=criterion, optimizer=optimizer)

train_set = create_dataset(data_encoder=data_encoder)
train_load = DataLoader(train_set, batch_size=1000, shuffle=True)

with SummaryWriter(f"runs/MovieLens/dev") as writer:
    for epoch in range(epochs):
        epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
        print(f"epoch={epoch + 1}, loss={epoch_loss}")
        torch.save(model.state_dict(), f"{MODELS_DIR}/MovieLens/model.pt")

100%|██████████| 18/18 [00:03<00:00,  5.24batch/s, train_loss=0.00535]


epoch=1, loss=0.06736982653058808


100%|██████████| 18/18 [00:02<00:00,  8.20batch/s, train_loss=0.0017] 


epoch=2, loss=0.05853080012161157


100%|██████████| 18/18 [00:02<00:00,  7.88batch/s, train_loss=0.00295]


epoch=3, loss=0.05119853867127977


100%|██████████| 18/18 [00:02<00:00,  8.10batch/s, train_loss=0.003]  


epoch=4, loss=0.04943895403270064


100%|██████████| 18/18 [00:02<00:00,  7.97batch/s, train_loss=0.00192]


epoch=5, loss=0.045490836739540096


100%|██████████| 18/18 [00:02<00:00,  8.26batch/s, train_loss=0.00293]


epoch=6, loss=0.038760292840415024


100%|██████████| 18/18 [00:02<00:00,  7.95batch/s, train_loss=0.00282]


epoch=7, loss=0.03588474225175792


100%|██████████| 18/18 [00:02<00:00,  8.24batch/s, train_loss=0.00151]


epoch=8, loss=0.0324409078143794


100%|██████████| 18/18 [00:02<00:00,  7.82batch/s, train_loss=0.00233] 


epoch=9, loss=0.029478583956586903


100%|██████████| 18/18 [00:02<00:00,  8.24batch/s, train_loss=0.00138] 


epoch=10, loss=0.026924630960513805


100%|██████████| 18/18 [00:02<00:00,  8.02batch/s, train_loss=0.000865]


epoch=11, loss=0.025094084183717596


100%|██████████| 18/18 [00:02<00:00,  8.21batch/s, train_loss=0.00104] 


epoch=12, loss=0.023570487158051857


100%|██████████| 18/18 [00:02<00:00,  8.00batch/s, train_loss=0.000587]


epoch=13, loss=0.024425956905401985


100%|██████████| 18/18 [00:02<00:00,  8.21batch/s, train_loss=0.00143] 


epoch=14, loss=0.0253197464182459


100%|██████████| 18/18 [00:02<00:00,  8.02batch/s, train_loss=0.00133] 


epoch=15, loss=0.021932037913593754


100%|██████████| 18/18 [00:02<00:00,  8.21batch/s, train_loss=0.000957]


epoch=16, loss=0.022406159337738464


100%|██████████| 18/18 [00:02<00:00,  8.03batch/s, train_loss=0.000697]


epoch=17, loss=0.022719374292369542


100%|██████████| 18/18 [00:02<00:00,  8.02batch/s, train_loss=0.00224] 


epoch=18, loss=0.022885145296310555


100%|██████████| 18/18 [00:02<00:00,  8.26batch/s, train_loss=0.00179] 


epoch=19, loss=0.023264763799206965


100%|██████████| 18/18 [00:02<00:00,  8.00batch/s, train_loss=0.00155] 


epoch=20, loss=0.022623248874113478


100%|██████████| 18/18 [00:02<00:00,  8.03batch/s, train_loss=0.00181] 


epoch=21, loss=0.021751703574739656


100%|██████████| 18/18 [00:02<00:00,  7.75batch/s, train_loss=0.00162] 


epoch=22, loss=0.021204645848479763


100%|██████████| 18/18 [00:02<00:00,  6.96batch/s, train_loss=0.0042]  


epoch=23, loss=0.022033123145714914


100%|██████████| 18/18 [00:02<00:00,  7.69batch/s, train_loss=0.000875]


epoch=24, loss=0.020357267358179748


100%|██████████| 18/18 [00:02<00:00,  8.16batch/s, train_loss=0.00207] 


epoch=25, loss=0.020547396114160277


100%|██████████| 18/18 [00:02<00:00,  7.69batch/s, train_loss=0.00213] 


epoch=26, loss=0.02065147405040675


100%|██████████| 18/18 [00:02<00:00,  8.02batch/s, train_loss=0.00124] 


epoch=27, loss=0.018924417327190267


100%|██████████| 18/18 [00:02<00:00,  7.89batch/s, train_loss=0.000786]


epoch=28, loss=0.01930408071546719


100%|██████████| 18/18 [00:02<00:00,  8.00batch/s, train_loss=0.000808]


epoch=29, loss=0.018962144115875507


100%|██████████| 18/18 [00:02<00:00,  7.85batch/s, train_loss=0.000987]

epoch=30, loss=0.018186843599738746





In [9]:
from pandas import DataFrame
from collections import namedtuple

Row = namedtuple("Row", "user_id item_id rating")

data_encoder = DataEncoder(original_df=original_df)
dataframe_after_mf = []

with torch.no_grad():
    for (index, user_id, item_id, rating, _, _) in original_df.itertuples():
        encoded_user_id = data_encoder.get_encoded_user_id(original_id=user_id)
        encoded_item_id = data_encoder.get_encoded_item_id(original_id=item_id)

        user_id_as_tensor = torch.LongTensor([encoded_user_id])
        item_id_as_tensor = torch.LongTensor([encoded_item_id])
        output = model(users=user_id_as_tensor, items=item_id_as_tensor,).squeeze()[0]
        predicted_rating = torch.round(output).item()

        dataframe_after_mf.append(Row(user_id=user_id, item_id=item_id, rating=predicted_rating))

df_after_mf = DataFrame(dataframe_after_mf, columns=["user_id", "item_id", "rating"])
df_after_mf.head()

Unnamed: 0,user_id,item_id,rating
0,6010bbc8e7ef4b21fa38f9c3a9754ef3,2,2.0
1,3c888e77b992ae3cd2adfe16774e23b9,2,3.0
2,2afd218c3aecb6828d2be327f8b9c46f,2,3.0
3,fd5b08ce362d855ca9152a894348130c,2,4.0
4,9c8073214a052e414811b76012df8847,2,2.0


In [10]:
compare = original_df[["user_id", "item_id", "rating"]].reset_index(drop=True)
df_after_mf = df_after_mf.reset_index(drop=True)
mask = (compare["rating"] == df_after_mf["rating"])
changes = compare[mask].copy()
changes["New rating"] = df_after_mf.rating
print(f"Number of hits: {len(changes)} / {len(original_df)}")
print(f"Hit ratio: {(len(changes) / len(original_df)) * 100}")
changes.head(len(changes))

Number of hits: 17395 / 17464
Hit ratio: 99.60490151168118


Unnamed: 0,user_id,item_id,rating,New rating
0,6010bbc8e7ef4b21fa38f9c3a9754ef3,2,2,2.0
1,3c888e77b992ae3cd2adfe16774e23b9,2,3,3.0
2,2afd218c3aecb6828d2be327f8b9c46f,2,3,3.0
3,fd5b08ce362d855ca9152a894348130c,2,4,4.0
4,9c8073214a052e414811b76012df8847,2,2,2.0
...,...,...,...,...
17459,607f6e34a0b5923333f6b16d3a59cc98,2000,5,5.0
17460,78b5e9744073532cc376976b5fc6b2fc,2000,7,7.0
17461,7cecbffe1da5ae974952db6c13695afe,2000,5,5.0
17462,ed7ed76453bd846859f5e6b9149df276,2000,7,7.0
