In [2]:
import os.path

import numpy as np
import torch

from pandas import read_csv
from torch.optim import SGD
from torch.nn import MSELoss
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
from torch import randperm
from config import DATA_DIR, MODELS_DIR
from src.data_set import RatingsDataset
from src.loss import MiningOutliersLoss
from src.model import MF
from src.runner import Runner
from src.utils import (
    create_dataset,
    mine_outliers_scipy,
    mine_outliers_sklearn,
    mine_outliers_torch,
    DataConverter,
    DataProcessor,
)

In [3]:
# Column names applied to the histogram-feature ratings file.
columns = ["user_id", "item_id", "rating", "original_mass", "total_mass"]

original_df = (
    read_csv(
        f"{DATA_DIR}/MovieLens100k/ua_with_histogram_features.csv",
        skipinitialspace=True,
        names=columns,
        low_memory=False,
    )
    # Discard the first parsed row (presumably the file's own header line,
    # since `names=` is passed without `header=` -- TODO confirm against the CSV).
    .iloc[1:, :]
    # Cast every column in a single pass; item ids are kept as strings.
    .astype(
        {
            "user_id": "float64",
            "item_id": str,
            "rating": "float64",
            "original_mass": "float64",
            "total_mass": "float64",
        }
    )
    # Only the rating triple is used by the rest of the notebook.
    .loc[:, ["user_id", "item_id", "rating"]]
)

original_df.head(200)

Unnamed: 0,user_id,item_id,rating
0.0,1.0,1.0,5.0
1.0,1.0,2.0,3.0
2.0,1.0,3.0,4.0
3.0,1.0,4.0,3.0
4.0,1.0,5.0,3.0
...,...,...,...
195.0,1.0,205.0,3.0
196.0,1.0,206.0,4.0
197.0,1.0,207.0,5.0
198.0,1.0,208.0,5.0


In [4]:
# Build the id encoder and a matrix-factorisation model sized from it.
data_converter = DataConverter(original_df=original_df)

model = MF(
    n_users=data_converter.n_users,
    n_items=data_converter.n_item,
    include_bias=True
)

criterion = MSELoss()
optimizer = SGD(model.parameters(), lr=5, weight_decay=1e-7)
epochs = 50

# Restore saved weights when a checkpoint exists.
# map_location="cpu": the model is built and queried on CPU in this notebook,
# so the checkpoint must be loadable even if it was saved from a CUDA device.
# NOTE: when no checkpoint is found the model keeps its initial parameters,
# because the training loop below is commented out.
model_path = f"{MODELS_DIR}/MovieLens100k/model.pt"
if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path, map_location="cpu"))

# Training loop, kept for reference (it writes the checkpoint loaded above).
# runner = Runner(
#     model=model,
#     criterion=criterion,
#     optimizer=optimizer,
#     epochs=epochs
# )
#
# train_set = create_dataset(data_frame=data_converter.encoded_df)
# train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
# with SummaryWriter("runs/MovieLens100k") as writer:
#     for epoch in range(epochs):
#         epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
#         print(f"epoch={epoch + 1}, loss={epoch_loss}")
#         torch.save(model.state_dict(), f"{MODELS_DIR}/MovieLens100k/model.pt")

In [5]:
from pandas import DataFrame
from collections import namedtuple

# One record per scored (user, item) pair; `rating` holds the model output.
Row = namedtuple("Row", "user_id item_id rating")

dataframe_after_mf = []

# Scoring only, so gradient tracking is switched off.
with torch.no_grad():
    for row_label, raw_user, raw_item, _true_rating in original_df.itertuples():
        # Translate the original ids into the encoded ids the model is called with.
        user_index = torch.LongTensor(
            [data_converter.get_encoded_user_id(original_id=raw_user)]
        )
        item_index = torch.LongTensor(
            [data_converter.get_encoded_item_id(original_id=raw_item)]
        )

        # Single-pair forward pass, reduced to a plain Python number.
        predicted = model(users=user_index, items=item_index).squeeze().item()

        dataframe_after_mf.append(Row(raw_user, raw_item, predicted))

after_histogram_mf_dataframe = DataFrame(dataframe_after_mf, columns=list(Row._fields))

In [6]:
# Preview the first rows of the model's predicted ratings.
after_histogram_mf_dataframe.head()

Unnamed: 0,user_id,item_id,rating
0,1.0,1.0,5.06321
1,1.0,2.0,3.463972
2,1.0,3.0,3.896021
3,1.0,4.0,4.021567
4,1.0,5.0,2.638696


In [7]:
import recmetrics

# Compare the stored ratings with the model's predictions for the same rows.
# NOTE(review): the predictions were produced for the rows of original_df itself;
# if the checkpoint was trained on this data (as the commented-out training cell
# suggests), these figures are training error, not held-out error -- confirm.
print(f"MSE: {recmetrics.mse(original_df.rating, after_histogram_mf_dataframe.rating)}, RMSE: {recmetrics.rmse(original_df.rating, after_histogram_mf_dataframe.rating)}")

MSE: 0.10570819181547442, RMSE: 0.3251279622171468


In [8]:
# Ratings and predictions side by side, one row per (user, item) pair.
data_with_predictions = original_df.copy()
# Attach predictions by position rather than by index label: the prediction frame
# was built row-by-row from original_df in the same order but has its own fresh
# index, so label alignment could silently introduce NaN if the labels differ.
# A length mismatch now raises instead of being padded with NaN.
data_with_predictions["prediction"] = after_histogram_mf_dataframe["rating"].to_numpy()

data_with_predictions.head()

Unnamed: 0,user_id,item_id,rating,prediction
0.0,1.0,1.0,5.0,5.06321
1.0,1.0,2.0,3.0,3.463972
2.0,1.0,3.0,4.0,3.896021
3.0,1.0,4.0,3.0,4.021567
4.0,1.0,5.0,3.0,2.638696


In [12]:
import pandas as pd
import numpy as np

# Confusion counts and precision / recall / F1 for a sweep of rating thresholds.
# A row counts as "positive" when its value is >= threshold, for both the true
# rating and the prediction.
true_ratings = data_with_predictions["rating"].to_numpy()
predicted_ratings = data_with_predictions["prediction"].to_numpy()

statistics = []
for threshold in np.arange(0, 5.5, 0.5):
    actual_positive = true_ratings >= threshold
    predicted_positive = predicted_ratings >= threshold

    # Vectorised counts replace the per-row Python loop.
    tp = int(np.count_nonzero(actual_positive & predicted_positive))
    fn = int(np.count_nonzero(actual_positive & ~predicted_positive))
    fp = int(np.count_nonzero(~actual_positive & predicted_positive))
    tn = int(np.count_nonzero(~actual_positive & ~predicted_positive))

    # Metrics are derived once per threshold, after all rows are counted.
    # With no true positives every metric is reported as 0, which also avoids
    # dividing by zero.
    if tp == 0:
        precision = 0
        recall = 0
        f1 = 0
    else:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)

    statistics.append([threshold, tp, fp, tn, fn, precision, recall, f1])

results = pd.DataFrame(
    statistics,
    columns=['threshold', 'tp', 'fp', 'tn', 'fn', 'Precision', 'Recall', 'F1'],
)

In [15]:
# Show up to 11 rows of the threshold table (the sweep yields one row per threshold).
results.head(11)

Unnamed: 0,threshold,tp,fp,tn,fn,Precision,Recall,F1
0,0.0,90570,0,0,0,1.0,1.0,1.0
1,0.5,90547,0,0,23,1.0,0.999746,0.999873
2,1.0,89200,0,0,1370,1.0,0.984874,0.992379
3,1.5,84902,743,4825,100,0.991325,0.998824,0.99506
4,2.0,82022,58,5510,2980,0.999293,0.964942,0.981817
5,2.5,73952,1377,14566,675,0.98172,0.990955,0.986316
6,3.0,64463,106,15837,10164,0.998358,0.863803,0.926219
7,3.5,47892,2090,38574,2014,0.958185,0.959644,0.958914
8,4.0,32439,108,40556,17467,0.996682,0.650002,0.786848
9,4.5,16437,1183,70339,2611,0.93286,0.862925,0.896531


In [22]:
from collections import defaultdict

def precision_recall_at_k(k: int, threshold=2.5, data=None):
    """Compute per-user precision@k and recall@k.

    An item is "relevant" when its true rating is >= ``threshold`` and
    "recommended" when its predicted rating is >= ``threshold`` and it is among
    the user's ``k`` highest predictions.

    Args:
        k: Number of top-predicted items considered per user.
        threshold: Cut-off applied to both true and predicted ratings.
        data: Frame with ``user_id``, ``rating`` and ``prediction`` columns.
            Defaults to the notebook-level ``data_with_predictions`` so existing
            calls keep working.

    Returns:
        ``(precisions, recalls)``: two dicts keyed by user id.
    """
    if data is None:
        data = data_with_predictions

    # Group (prediction, true rating) pairs per user. Columns are selected by
    # name, so extra columns in the frame do not break the unpacking.
    user_est_true = defaultdict(list)
    for user_id, prediction, rating in zip(
        data["user_id"], data["prediction"], data["rating"]
    ):
        user_est_true[user_id].append((prediction, rating))

    precisions, recalls = dict(), dict()
    for uid, user_ratings in user_est_true.items():
        # Highest predictions first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold)) for (est, true_r) in top_k
        )

        # Precision@K: proportion of recommended items that are relevant.
        # Undefined (nothing recommended) is reported as 1, which raises the
        # average for such users; kept so existing results do not change.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        # Recall@K: proportion of relevant items that are recommended.
        # Undefined (nothing relevant) is likewise reported as 1.
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [23]:
# Mean precision@K and recall@K over all users, for K = 2..10.
# NOTE: this rebinds `results`, which an earlier cell used for the
# threshold-statistics DataFrame.
results = []
for top_k in range(2, 11):
    per_user_precision, per_user_recall = precision_recall_at_k(k=top_k)
    # Unweighted average across users.
    mean_precision = sum(per_user_precision.values()) / len(per_user_precision)
    mean_recall = sum(per_user_recall.values()) / len(per_user_recall)
    results.append(dict(K=top_k, Precision=mean_precision, Recall=mean_recall))
results

[{'K': 2, 'Precision': 1.0, 'Recall': 0.07324856850525181},
 {'K': 3, 'Precision': 1.0, 'Recall': 0.10987285275787743},
 {'K': 4, 'Precision': 1.0, 'Recall': 0.14579017341912856},
 {'K': 5, 'Precision': 1.0, 'Recall': 0.18144238273361327},
 {'K': 6, 'Precision': 1.0, 'Recall': 0.21645832481586091},
 {'K': 7, 'Precision': 1.0, 'Recall': 0.2502370806132021},
 {'K': 8, 'Precision': 0.9997348886532343, 'Recall': 0.28234942223087445},
 {'K': 9, 'Precision': 0.9997348886532343, 'Recall': 0.3128710957679508},
 {'K': 10, 'Precision': 0.9996288441145281, 'Recall': 0.3406827422047578}]