In [1]:
from pandas import read_csv
from tensorboardX import SummaryWriter
from torch import manual_seed
from torch import randperm
from torch.nn import MSELoss
from torch.optim import SGD
from torch.utils.data import DataLoader

from config import DATA_DIR
from src.data_set import RatingsDataset
from src.loss import MiningOutliersLoss
from src.model import MF
from src.runner import Runner
from src.utils import (
    create_dataset,
    mine_outliers_scipy,
    mine_outliers_sklearn,
    mine_outliers_torch,
    DataConverter,
    DataProcessor,
    mean_centralised
)

# Location of the Book-Crossing ratings file under the project-configured DATA_DIR.
DF_PATH = f"{DATA_DIR}/BookCrossing/BX-Book-Ratings.csv"

In [2]:
# Column names imposed on the file; passing `names=` makes pandas read the
# file's own header line as an ordinary data row, which is dropped below.
columns = ["user_id", "item_id", "rating"]

original_df = (
    read_csv(
        DF_PATH,
        skipinitialspace=True,
        sep=";",
        names=columns,
        encoding="latin-1",
        low_memory=False,
    )
    # Skip the first row (the header line that was parsed as data).
    .iloc[1:, :]
    # Cast every column to its working dtype in a single pass.
    .astype({"user_id": "int32", "item_id": str, "rating": "int32"})
)

# The converter is built from the typed, not-yet-centralised ratings.
data_converter = DataConverter(
    original_df=original_df,
    n_random_users=1,
    n_ratings_per_random_user=200,
)

# From here on `original_df` refers to the output of `mean_centralised`;
# `data_converter` was handed the frame as it was before this call.
original_df = mean_centralised(dataframe=original_df)

In [None]:
# Seed torch's global RNG before any stochastic object is created so that a
# fresh "Restart & Run All" reproduces the same run. DataLoader(shuffle=True)
# derives its shuffling seed from this global RNG; the model's initial weights
# presumably do as well (MF is defined in src.model — confirm there).
SEED = 42
manual_seed(SEED)

# Model sized from the user/item counts exposed by the data converter.
model = MF(
    n_users=data_converter.n_users,
    n_items=data_converter.n_item,
)

# Training configuration: MSE objective optimised with plain SGD.
criterion = MSELoss()
optimizer = SGD(model.parameters(), lr=5, weight_decay=1e-5)
epochs = 5

runner = Runner(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    epochs=epochs
)

# Build the training set from the converter and iterate it in shuffled
# mini-batches of 1000 samples.
train_set = create_dataset(data_converter=data_converter)
train_load = DataLoader(train_set, batch_size=1000, shuffle=True)

# One runner.train call per epoch; the writer logs under runs/Book-Crossing
# and is closed automatically when the `with` block exits.
with SummaryWriter("runs/Book-Crossing") as writer:
    for epoch in range(epochs):
        epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
        # `epoch` is zero-based, so report it one-based.
        print(f"epoch={epoch + 1}, loss={epoch_loss}")

# Mine outliers from the trained model; the next cell consumes this as a
# mapping whose values it sorts by and prints as "dist".
outliers = mine_outliers_sklearn(model=model, data_converter=data_converter)

100%|██████████| 1150/1150 [06:29<00:00,  2.96batch/s, train_loss=0.0166]


epoch=1, loss=21.02686258271278


100%|██████████| 1150/1150 [07:30<00:00,  2.55batch/s, train_loss=0.0169]


epoch=2, loss=17.49854668634768


100%|██████████| 1150/1150 [06:09<00:00,  3.12batch/s, train_loss=0.0147]


epoch=3, loss=16.196321662863905


100%|██████████| 1150/1150 [05:15<00:00,  3.64batch/s, train_loss=0.0133]


epoch=4, loss=15.167109014939298


100%|██████████| 1150/1150 [05:14<00:00,  3.65batch/s, train_loss=0.0118]


epoch=5, loss=14.362449028365475


In [None]:
# Group the converter's ratings by user and count the rows of each group once,
# instead of materialising a per-user sub-frame for every outlier.
items_group_by_users = data_converter.original_df.groupby("user_id")
ratings_per_user = items_group_by_users.size()

# Re-order the outliers ascending by their value (printed below as "dist").
outliers = dict(sorted(outliers.items(), key=lambda entry: entry[1]))

for user_id, distance in outliers.items():
    # Label-based lookup: raises KeyError for an unknown user, like get_group.
    number_of_items = ratings_per_user.loc[user_id]
    print(f"user: {user_id}, dist: {distance}, #items: {number_of_items}")