In [1]:
from pandas import read_csv
from torch.optim import SGD
from torch.utils.data import DataLoader

from config import DATA_DIR
from src.loss import MiningOutliersLoss
from src.model import MF
from src.runner import Runner
from src.utils import create_dataset, mine_outliers, DataConverter, DataProcessor

# Location of the BookCrossing ratings CSV, built under the DATA_DIR imported from config.
DF_PATH = f"{DATA_DIR}/BookCrossing/BX-Book-Ratings.csv"

In [2]:
columns = ["user_id", "item_id", "rating"]

# Load the ratings file, discard its first parsed row, then fix all column dtypes in one pass.
# With explicit `names`, read_csv treats every line as data, so row 0 is presumably the
# file's own header line — TODO confirm.
original_df = (
    read_csv(
        DF_PATH,
        names=columns,
        sep=";",
        encoding="latin-1",
        skipinitialspace=True,
        low_memory=False,
    )
    .iloc[1:, :]
    .astype({"user_id": "int32", "item_id": str, "rating": "int32"})
)

# The processor is fed the converter's own `original_df` attribute rather than the
# frame loaded above.
data_converter = DataConverter(
    original_df=original_df,
    n_random_users=1,
    n_ratings_per_random_user=200,
)
data_processor = DataProcessor(original_df=data_converter.original_df)

In [3]:
# Train the MF model on the converted ratings, then mine outliers from the trained model.
# The model is sized from the user and item counts exposed by the data converter.
model = MF(
    n_users=data_converter.n_users,
    n_items=data_converter.n_item,
)

# The loss is constructed from both the converter and the processor built in the loading cell.
criterion = MiningOutliersLoss(data_converter=data_converter, data_processor=data_processor)
# NOTE(review): lr=5 is unusually large for plain SGD — confirm it is intentional for this loss.
optimizer = SGD(model.parameters(), lr=5, weight_decay=1e-5)
runner = Runner(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
)

# Batches of 1000 samples, reshuffled on every pass over the loader.
train_set = create_dataset(data_converter=data_converter)
train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
epochs = 10
for epoch in range(epochs):
    # One call to runner.train per epoch; its return value is reported as that epoch's loss.
    epoch_loss = runner.train(train_loader=train_load)
    # `epoch` is zero-based, so it is printed shifted by one.
    print(f"epoch={epoch + 1}, loss={epoch_loss}")

# `outliers` is only assigned once the loop above has finished; the output saved with this
# cell shows a KeyboardInterrupt during training, so that run never reached this line.
outliers = mine_outliers(model=model, data_converter=data_converter)

 10%|▉         | 112/1150 [00:23<03:39,  4.72batch/s, train_loss=0.0208]


KeyboardInterrupt: 

In [None]:
# Report every mined outlier in ascending order of its value, together with the number of
# rows (ratings) that user has in the converter's frame.
items_group_by_users = data_converter.original_df.groupby("user_id")
# Count rows per user once up front instead of materialising a sub-frame per user
# through get_group() just to take its length.
items_per_user = items_group_by_users.size()

# Re-key the mapping so iteration follows ascending value (the quantity printed as "dist").
outliers = dict(sorted(outliers.items(), key=lambda item: item[1]))
for user_id, dist in outliers.items():
    # Label lookup: an unknown user_id raises KeyError, exactly as get_group() did.
    number_of_items = int(items_per_user.loc[user_id])
    print(f"user: {user_id}, dist: {dist}, #items: {number_of_items}")