In [1]:
import os.path

import recmetrics as recmetrics
from pandas import read_csv
from torch.optim import SGD
from torch.utils.data import DataLoader

from config import DATA_DIR
from src.loss import MiningOutliersLoss
from src.model import MF
from src.runner import Runner
from src.utils import create_dataset, DataConverter, DataProcessor

DF_PATH = f"{DATA_DIR}/MovieLens/ratings.csv"

In [16]:
columns = ["user_id", "item_id", "rating"]
original_df = read_csv(DF_PATH, skipinitialspace=True, usecols=["userId", "movieId", "rating"], nrows=50000)
original_df.columns = ["user_id", "item_id", "rating"]

original_df = original_df.astype({"user_id": "int32"})
original_df = original_df.astype({"item_id": str})
original_df = original_df.astype({"rating": "float64"})

data_converter = DataConverter(original_df=original_df)
data_processor = DataProcessor(original_df=data_converter.original_df)

print(original_df.describe())
original_df.head()

           user_id        rating
count  50000.00000  50000.000000
mean     195.32608      3.592330
std      116.36048      1.056803
min        1.00000      0.500000
25%       92.00000      3.000000
50%      187.00000      4.000000
75%      294.00000      4.500000
max      406.00000      5.000000


Unnamed: 0,user_id,item_id,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [17]:
from tensorboardX import SummaryWriter
from torch.nn import MSELoss
from config import MODELS_DIR
import torch

model = MF(
    n_users=data_converter.n_users,
    n_items=data_converter.n_item,
    include_bias=True
)

if os.path.exists(f"{MODELS_DIR}/MovieLens/model.pt"):
    model.load_state_dict(torch.load(f"{MODELS_DIR}/MovieLens/model.pt"))

# criterion = MSELoss()
# optimizer = SGD(model.parameters(), lr=5, weight_decay=1e-7)
# epochs = 50
#
# runner = Runner(
#     model=model,
#     criterion=criterion,
#     optimizer=optimizer,
#     epochs=epochs
# )
#
# train_set = create_dataset(data_frame=data_converter.encoded_df)
# train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
# with SummaryWriter("runs/dev/MovieLens") as writer:
#     for epoch in range(epochs):
#         epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
#         print(f"epoch={epoch + 1}, loss={epoch_loss}")
#         torch.save(model.state_dict(), f"{MODELS_DIR}/MovieLens/model.pt")

In [6]:
# items_group_by_users = data_converter.original_df.groupby("user_id")
# outliers = dict(sorted(outliers.items(), key=lambda item: item[1]))
# for user_id, item_id in outliers.items():
#     number_of_items = len(items_group_by_users.get_group(user_id))
#     print(f"user: {user_id}, dist: {item_id}, #items: {number_of_items}")

In [18]:
from collections import namedtuple
from pandas import DataFrame
from src.utils import ProcColumn

Row = namedtuple("Row", "user_id item_id rating")
working_df = original_df.copy()

user_original_id_to_encoded_id = ProcColumn(working_df.user_id)
item_original_id_to_encoded_id = ProcColumn(working_df.item_id)
working_df.user_id = user_original_id_to_encoded_id.encoded_col
working_df.item_id = item_original_id_to_encoded_id.encoded_col

df_after_mf = []
with torch.no_grad():
    for (index, worker_id, song_id, rating) in working_df.itertuples():
        user_id_as_tensor = torch.LongTensor([worker_id])
        item_id_as_tensor = torch.LongTensor([song_id])
        predicted_rating = model(users=user_id_as_tensor, items=item_id_as_tensor,).item()

        original_worker_id = user_original_id_to_encoded_id.get_name(index=worker_id)
        original_item_id = item_original_id_to_encoded_id.get_name(index=song_id)
        df_after_mf.append(
            Row(
                user_id=original_worker_id,
                item_id=original_item_id,
                rating=predicted_rating,
            )
        )

df_after_mf = DataFrame(
    df_after_mf, columns=["user_id", "item_id", "rating"]
)
df_after_mf.head()

Unnamed: 0,user_id,item_id,rating
0,1,296,4.868937
1,1,306,3.688776
2,1,307,4.924071
3,1,665,4.937776
4,1,899,3.438493


In [10]:
compare = original_df[["user_id", "item_id", "rating"]].reset_index(drop=True)
print(compare)
df_after_mf = df_after_mf.reset_index(drop=True)
mask = (compare["rating"] == df_after_mf["rating"])
changes = compare[mask].copy()
changes["New rating"] = df_after_mf.rating
print(f"Number of hits: {len(changes)} / {len(original_df)}")
print(f"Hit ratio: {(len(changes) / len(original_df)) * 100}")
changes.head(len(changes))

       user_id item_id  rating
0            1     296     5.0
1            1     306     3.5
2            1     307     5.0
3            1     665     5.0
4            1     899     3.5
...        ...     ...     ...
49995      406   99007     4.5
49996      406   99728     0.5
49997      406  101577     4.5
49998      406  101962     5.0
49999      406  102407     4.5

[50000 rows x 3 columns]
Number of hits: 50000 / 50000
Hit ratio: 100.0


Unnamed: 0,user_id,item_id,rating,New rating
0,,,,4.868937
1,,,,3.688776
2,,,,4.924071
3,,,,4.937776
4,,,,3.438493
...,...,...,...,...
49995,,,,4.495297
49996,,,,0.479298
49997,,,,4.500709
49998,,,,4.994490


In [20]:
import recmetrics
original_df["cf_prediction"] = df_after_mf.rating
print("MSE: ", recmetrics.mse(original_df.rating, df_after_mf.rating))
print("RMSE: ", recmetrics.rmse(original_df.rating, df_after_mf.rating))
original_df.head(100)

MSE:  0.02465608016032059
RMSE:  0.1570225466623204


Unnamed: 0,user_id,item_id,rating,cf_predictions,cf_prediction
0,1,296,5.0,4.868937,4.868937
1,1,306,3.5,3.688776,3.688776
2,1,307,5.0,4.924071,4.924071
3,1,665,5.0,4.937776,4.937776
4,1,899,3.5,3.438493,3.438493
...,...,...,...,...,...
95,2,733,4.5,4.327505,4.327505
96,2,858,3.5,3.719010,3.719010
97,2,914,4.0,3.938428,3.938428
98,2,953,4.5,4.450279,4.450279
