In [1]:
import torch
from pandas import read_csv
from torch.optim import SGD
from torch.utils.data import DataLoader

from config import DATA_DIR
from src.loss import MiningOutliersLoss
from src.model import MF
from src.runner import Runner
from src.utils import create_dataset, mine_outliers, DataConverter, DataProcessor

# Location of the Book-Crossing ratings CSV, resolved under the project's DATA_DIR.
DF_PATH = f"{DATA_DIR}/BookCrossing/BX-Book-Ratings.csv"

In [7]:
# Column names imposed on the semicolon-separated ratings file.
columns = ["user_id", "item_id", "rating"]

# Because `names=` is supplied, the file's own header line is read as the
# first data row; it is dropped with `.iloc[1:]` before the columns are
# cast to their final dtypes in a single `astype` call.
original_df = (
    read_csv(
        DF_PATH,
        sep=";",
        names=columns,
        encoding="latin-1",
        skipinitialspace=True,
        low_memory=False,
    )
    .iloc[1:]
    .astype({"user_id": "int32", "item_id": str, "rating": "int32"})
)

# Project helpers built on top of the ratings frame. The processor is fed
# the frame held by the converter rather than the local `original_df`.
data_converter = DataConverter(
    original_df=original_df,
    n_random_users=0,
    n_ratings_per_random_user=200,
)
data_processor = DataProcessor(original_df=data_converter.original_df)

In [None]:
# Seed torch's global RNG before anything random happens in this cell so
# that Restart & Run All reproduces the same batch order (the DataLoader
# below shuffles) and, presumably, the same initial model parameters.
# NOTE(review): only torch is seeded; if the project code draws from
# `random` or numpy, those generators need seeding as well - confirm.
SEED = 42
torch.manual_seed(SEED)

# Matrix-factorisation model sized from the converter's user/item counts.
# NOTE(review): the item count attribute is spelled `n_item` (singular)
# while the user count is `n_users` - confirm this matches DataConverter.
model = MF(
    n_users=data_converter.n_users,
    n_items=data_converter.n_item,
)

criterion = MiningOutliersLoss(data_converter=data_converter, data_processor=data_processor)
# lr=5 is unusually large for plain SGD; the recorded run shows the epoch
# loss decreasing every epoch with this value, so it is kept unchanged.
optimizer = SGD(model.parameters(), lr=5, weight_decay=1e-5)
runner = Runner(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
)

train_set = create_dataset(data_converter=data_converter)
train_load = DataLoader(train_set, batch_size=1000, shuffle=True)

# One pass over the loader per epoch; the value returned by `runner.train`
# is reported as that epoch's loss.
epochs = 10
for epoch in range(epochs):
    epoch_loss = runner.train(train_loader=train_load)
    print(f"epoch={epoch + 1}, loss={epoch_loss}")

# Extract outliers from the trained model; the result is sorted by value
# and printed per user in the following cell.
outliers = mine_outliers(model=model, data_converter=data_converter)

100%|██████████| 1150/1150 [09:28<00:00,  2.02batch/s, train_loss=993]    


epoch=1, loss=1390464.8779907227


100%|██████████| 1150/1150 [09:38<00:00,  1.99batch/s, train_loss=868]    


epoch=2, loss=1332462.1447143555


100%|██████████| 1150/1150 [09:12<00:00,  2.08batch/s, train_loss=867]    


epoch=3, loss=1289148.1022949219


100%|██████████| 1150/1150 [07:09<00:00,  2.67batch/s, train_loss=885]    


epoch=4, loss=1252073.9248046875


100%|██████████| 1150/1150 [08:28<00:00,  2.26batch/s, train_loss=807]    


epoch=5, loss=1212279.6314086914


100%|██████████| 1150/1150 [07:59<00:00,  2.40batch/s, train_loss=771]    


epoch=6, loss=1164635.2467041016


100%|██████████| 1150/1150 [09:11<00:00,  2.08batch/s, train_loss=759]    


epoch=7, loss=1104683.3448486328


100%|██████████| 1150/1150 [08:40<00:00,  2.21batch/s, train_loss=737]


epoch=8, loss=1035243.0542602539


100%|██████████| 1150/1150 [08:48<00:00,  2.18batch/s, train_loss=645]


epoch=9, loss=960779.3254394531


100%|██████████| 1150/1150 [08:40<00:00,  2.21batch/s, train_loss=612]


epoch=10, loss=883090.5211181641


In [None]:
# Group the ratings by user and count the rows of every group once, instead
# of materialising a per-user DataFrame inside the loop just to take its length.
items_group_by_users = data_converter.original_df.groupby("user_id")
ratings_per_user = items_group_by_users.size()

# Re-order the mapping ascending by its value. That value is what the report
# prints as `dist`, so the loop variable is named accordingly.
outliers = dict(sorted(outliers.items(), key=lambda item: item[1]))
for user_id, dist in outliers.items():
    # Raises KeyError for a user with no ratings, as `get_group` did.
    number_of_items = ratings_per_user.loc[user_id]
    print(f"user: {user_id}, dist: {dist}, #items: {number_of_items}")