In [2]:
from pandas import read_csv
from torch.optim import SGD
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
from torch import randperm
from config import DATA_DIR
from src.data_set import RatingsDataset
from src.loss import MiningOutliersLoss
from src.model import MF
from src.runner import Runner
from src.utils import create_dataset, mine_outliers, DataConverter, DataProcessor

"""
The DEAM dataset is based on the Arousal-Valence 2D emotional model.
The Valence/Arousal ratings were collected using the Amazon Mechanical Turk service.
Each worker from the collected crowd was asked to mark their own emotion for the current song on the 2D Arousal/Valence plane.
For more information, please read: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0173392
"""

# Location of the per-rater, song-level static annotations CSV, rooted at DATA_DIR.
DF_PATH = (
    f"{DATA_DIR}"
    f"/DEAM/annotations/annotations per each rater/"
    f"song_level/static_annotations_songs_1_2000.csv"
)

def select_n_random(trainset: "RatingsDataset", n: int = 100):
    """
    Selects n random data points and their corresponding labels from a dataset

    :param trainset: dataset supporting ``len()`` and indexing with a tensor of indices.
    :param n: number of points to sample; if the dataset is smaller than ``n``,
        every point is returned (in random order).
    :return: whatever ``trainset[indices]`` yields for the sampled indices
        (the notebook unpacks it as ``users, items, ratings``).
    """
    # Slice the permutation *before* indexing the dataset. Slicing the result of
    # ``trainset[perm]`` instead would cut the returned tuple of fields, not the
    # samples, and so would hand back the entire (shuffled) dataset.
    indices = randperm(len(trainset))[:n]
    return trainset[indices]

In [3]:
"""
This block of code trains the outlier-mining model along the valence axis
"""
# Map each CSV header to the column name used by the rest of the pipeline.
# Renaming by name (rather than assigning `.columns` positionally) keeps the
# labels correct whatever order the CSV stores its columns in: `read_csv`
# returns the `usecols` columns in file order, not in the order listed here.
column_renames = {"workerID": "user_id", "SongId": "item_id", "Valence": "rating"}
columns = list(column_renames)
original_df = read_csv(DF_PATH, skipinitialspace=True, usecols=columns)
original_df = original_df.rename(columns=column_renames)[list(column_renames.values())]

# NOTE(review): n_random_users / n_ratings_per_random_user presumably inject
# synthetic raters into the data (cf. the `random_guy_*` users in the printed
# results) -- confirm against DataConverter.
data_converter = DataConverter(
    original_df=original_df, n_random_users=10, n_ratings_per_random_user=50
)
# The processor works on the converter's frame, not on the raw `original_df`.
data_processor = DataProcessor(original_df=data_converter.original_df)

valence_model = MF(
    n_users=data_converter.n_users,
    n_items=data_converter.n_item,
)
epochs = 25

criterion = MiningOutliersLoss(data_converter=data_converter, data_processor=data_processor)
# NOTE(review): lr=5 is unusually large for SGD -- confirm it is intentional for this loss.
optimizer = SGD(valence_model.parameters(), lr=5, weight_decay=1e-3)
runner = Runner(
    model=valence_model,
    criterion=criterion,
    optimizer=optimizer,
    epochs=epochs
)

train_set = create_dataset(data_converter=data_converter)
train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
# Sample inputs are only needed to trace the model graph for TensorBoard.
users, items, ratings = select_n_random(train_set)

with SummaryWriter("runs/DEAM/valence") as writer:
    writer.add_graph(valence_model, (users, items))

    # One `runner.train` call per epoch; it returns the loss that is printed below.
    for epoch in range(epochs):
        epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
        print(f"epoch={epoch + 1}, loss={epoch_loss}")

100%|██████████| 18/18 [01:55<00:00,  6.42s/batch, train_loss=0.52] 


epoch=1, loss=12.52741659102697


100%|██████████| 18/18 [00:57<00:00,  3.19s/batch, train_loss=0.462]


epoch=2, loss=8.95700982387432


100%|██████████| 18/18 [00:57<00:00,  3.20s/batch, train_loss=0.401]


epoch=3, loss=7.638953863373436


100%|██████████| 18/18 [00:55<00:00,  3.11s/batch, train_loss=0.387]


epoch=4, loss=6.8531454209450375


100%|██████████| 18/18 [00:54<00:00,  3.02s/batch, train_loss=0.331]


epoch=5, loss=6.273051167721569


100%|██████████| 18/18 [00:54<00:00,  3.01s/batch, train_loss=0.299]


epoch=6, loss=5.864035482826074


100%|██████████| 18/18 [00:56<00:00,  3.13s/batch, train_loss=0.285]


epoch=7, loss=5.520366364538422


100%|██████████| 18/18 [00:52<00:00,  2.94s/batch, train_loss=0.28] 


epoch=8, loss=5.301010321526112


100%|██████████| 18/18 [00:52<00:00,  2.91s/batch, train_loss=0.323]


epoch=9, loss=5.116883718973373


100%|██████████| 18/18 [00:52<00:00,  2.91s/batch, train_loss=0.298]


epoch=10, loss=4.931212429948863


100%|██████████| 18/18 [00:52<00:00,  2.90s/batch, train_loss=0.272]


epoch=11, loss=4.788327869336141


100%|██████████| 18/18 [00:52<00:00,  2.91s/batch, train_loss=0.24] 


epoch=12, loss=4.63465940451721


100%|██████████| 18/18 [00:52<00:00,  2.91s/batch, train_loss=0.275]


epoch=13, loss=4.518484907284812


100%|██████████| 18/18 [00:52<00:00,  2.90s/batch, train_loss=0.236]


epoch=14, loss=4.395158682225651


100%|██████████| 18/18 [00:52<00:00,  2.90s/batch, train_loss=0.264]


epoch=15, loss=4.280674747577841


100%|██████████| 18/18 [00:52<00:00,  2.91s/batch, train_loss=0.245]


epoch=16, loss=4.204324508666993


100%|██████████| 18/18 [00:52<00:00,  2.93s/batch, train_loss=0.234]


epoch=17, loss=4.09930317175141


100%|██████████| 18/18 [00:53<00:00,  2.96s/batch, train_loss=0.215]


epoch=18, loss=3.988694556145252


100%|██████████| 18/18 [00:53<00:00,  2.96s/batch, train_loss=0.234]


epoch=19, loss=3.9257494881024497


100%|██████████| 18/18 [00:53<00:00,  2.97s/batch, train_loss=0.216]


epoch=20, loss=3.8177073989171713


100%|██████████| 18/18 [00:53<00:00,  2.97s/batch, train_loss=0.227]


epoch=21, loss=3.781229142564955


100%|██████████| 18/18 [00:53<00:00,  2.94s/batch, train_loss=0.219]


epoch=22, loss=3.696437201978755


100%|██████████| 18/18 [00:53<00:00,  2.95s/batch, train_loss=0.211]


epoch=23, loss=3.6415017993974486


100%|██████████| 18/18 [00:52<00:00,  2.94s/batch, train_loss=0.209]


epoch=24, loss=3.581915304286847


100%|██████████| 18/18 [00:52<00:00,  2.90s/batch, train_loss=0.184]


epoch=25, loss=3.53232809530551


In [None]:
"""
This block of code trains the outlier-mining model along the arousal axis
"""
# Map each CSV header to the column name used by the rest of the pipeline.
# Renaming by name (rather than assigning `.columns` positionally) keeps the
# labels correct whatever order the CSV stores its columns in: `read_csv`
# returns the `usecols` columns in file order, not in the order listed here.
column_renames = {"workerID": "user_id", "SongId": "item_id", "Arousal": "rating"}
columns = list(column_renames)
original_df = read_csv(DF_PATH, skipinitialspace=True, usecols=columns)
original_df = original_df.rename(columns=column_renames)[list(column_renames.values())]

# NOTE(review): this rebinds the notebook-global `data_converter` (and
# `data_processor`, `criterion`, ...). Any cell executed afterwards that reads
# `data_converter` gets this arousal instance -- confirm that is what is wanted.
data_converter = DataConverter(
    original_df=original_df, n_random_users=10, n_ratings_per_random_user=50
)
# The processor works on the converter's frame, not on the raw `original_df`.
data_processor = DataProcessor(original_df=data_converter.original_df)

arousal_model = MF(
    n_users=data_converter.n_users,
    n_items=data_converter.n_item,
)
# Single source of truth for the epoch count: used by both Runner and the loop below.
epochs = 25

criterion = MiningOutliersLoss(data_converter=data_converter, data_processor=data_processor)
# NOTE(review): lr=5 is unusually large for SGD -- confirm it is intentional for this loss.
optimizer = SGD(arousal_model.parameters(), lr=5, weight_decay=1e-3)
runner = Runner(
    model=arousal_model,
    criterion=criterion,
    optimizer=optimizer,
    epochs=epochs,
)

train_set = create_dataset(data_converter=data_converter)
train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
# Sample inputs are only needed to trace the model graph for TensorBoard.
users, items, ratings = select_n_random(train_set)

with SummaryWriter("runs/DEAM/arousal") as writer:
    writer.add_graph(arousal_model, (users, items))

    # One `runner.train` call per epoch; it returns the loss that is printed below.
    for epoch in range(epochs):
        epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
        print(f"epoch={epoch + 1}, loss={epoch_loss}")

In [5]:
# Score every user with each trained model.
# NOTE(review): both calls receive the same `data_converter`, i.e. whichever
# training cell assigned it last; `valence_model` was built from a different
# DataConverter instance -- confirm the two share the same user/item indexing.
valence_outliers = mine_outliers(model=valence_model, data_converter=data_converter)
arousal_outliers = mine_outliers(model=arousal_model, data_converter=data_converter)

# Grouped view used below to count how many ratings each user contributed.
items_group_by_users = data_converter.original_df.groupby("user_id")

# Per-user sum of the valence and arousal scores; raises KeyError if a user
# scored on valence is missing from the arousal results.
combined_outliers = {
    user_id: valence_dist + arousal_outliers[user_id]
    for user_id, valence_dist in valence_outliers.items()
}

# Re-key the dict in ascending order of combined score.
ranked_pairs = sorted(combined_outliers.items(), key=lambda pair: pair[1])
combined_outliers = dict(ranked_pairs)

# `item_id` holds the combined score here (it is printed under the "dist" label).
for user_id, item_id in combined_outliers.items():
    user_rows = items_group_by_users.get_group(user_id)
    number_of_items = len(user_rows)
    print(f"user: {user_id}, dist: {item_id}, #items: {number_of_items}")

user: 65794ea9f5122952403585a237bc5e52, dist: -59.39469909667969, #items: 3
user: random_guy_189, dist: -51.721527099609375, #items: 50
user: 2a6b63b7690efa2390c8d9fee11b1407, dist: -50.432456970214844, #items: 3
user: ad3b997c4f2382a66e49f035cacfa682, dist: -35.1258544921875, #items: 3
user: 374a5659c02e12b01db6319436f17a7d, dist: -30.808713912963867, #items: 3
user: fd5b08ce362d855ca9152a894348130c, dist: -30.02642059326172, #items: 222
user: d88c800327bffffea5562e23c276ede3, dist: -28.628963470458984, #items: 2
user: random_guy_191, dist: -27.281002044677734, #items: 50
user: 623681f76a3eab5d9c86fbc0e1ca264b, dist: -11.554872512817383, #items: 12
user: random_guy_187, dist: -10.641124725341797, #items: 50
user: random_guy_190, dist: -9.69948673248291, #items: 50
user: da37d1548ffd0631809f7be341e4fe4d, dist: -6.105973243713379, #items: 3
user: ccfcf36a939a8af15a987fa562a49207, dist: -4.857481002807617, #items: 7
user: bb50b45a1874ede476874bd57e4cabb4, dist: -1.272186279296875, #items