In [1]:
from pandas import read_csv
from torch.optim import SGD
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
from torch import randperm
from config import DATA_DIR
from src.data_set import RatingsDataset
from src.loss import MiningOutliersLoss
from src.model import MF
from src.runner import Runner
from src.utils import create_dataset, mine_outliers, DataConverter, DataProcessor

"""
The DEAM dataset is based on the two-dimensional Arousal-Valence model of emotion.
The Valence/Arousal ratings were collected using the Amazon Mechanical Turk service.
Each worker from the crowd was asked to mark their own emotion for the current song on the 2D Arousal/Valence plane.
For more information please read: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0173392
"""

# Location of the per-rater, song-level static annotations CSV, relative to DATA_DIR.
# Adjacent string literals are concatenated at compile time; only the first
# fragment needs to be an f-string because it is the only one with a placeholder.
DF_PATH = (
    f"{DATA_DIR}"
    "/DEAM/annotations/annotations per each rater/"
    "song_level/static_annotations_songs_1_2000.csv"
)

def select_n_random(trainset: "RatingsDataset", n: int = 100):
    """
    Selects n random data points and their corresponding labels from a dataset.

    The samples are drawn without replacement: a random permutation of all
    dataset indices is generated and its first ``n`` entries are used to index
    the dataset.

    :param trainset: dataset supporting ``len()`` and indexing with a tensor of
        indices (this is how the function has always indexed it).
    :param n: maximal number of samples to select; when the dataset holds fewer
        than ``n`` entries, all of them are returned (in random order).
    :return: whatever ``trainset[indices]`` returns for the selected indices;
        the callers in this notebook unpack it as ``users, items, ratings``.
    """
    perm = randperm(len(trainset))
    # Slice the permutation *before* indexing. Slicing the indexed result
    # (``trainset[perm][:n]``) only truncates the outer 3-element container
    # and therefore returned every sample of the dataset instead of ``n``.
    return trainset[perm[:n]]

In [2]:
"""
This block of code trains the model that is later used to mine the outliers along the valence axis
"""
# Raw DEAM column name -> generic column name used by the rest of the pipeline.
column_names = {"workerID": "user_id", "SongId": "item_id", "Valence": "rating"}
columns = list(column_names)

# Rename by name instead of assigning ``.columns`` positionally: ``read_csv``
# ignores the order of ``usecols`` and returns the columns in file order, so a
# positional assignment silently mislabels columns if the file order differs.
original_df = (
    read_csv(DF_PATH, skipinitialspace=True, usecols=columns)
    .rename(columns=column_names)
    .loc[:, list(column_names.values())]
)

# NOTE(review): presumably DataConverter injects ``n_random_users`` synthetic
# raters with ``n_ratings_per_random_user`` ratings each -- confirm in src.utils.
data_converter = DataConverter(
    original_df=original_df, n_random_users=10, n_ratings_per_random_user=50
)
# The processor is built from the converter's frame, not from the raw one above.
data_processor = DataProcessor(original_df=data_converter.original_df)

valence_model = MF(
    n_users=data_converter.n_users,
    n_items=data_converter.n_item,
)

criterion = MiningOutliersLoss(data_converter=data_converter, data_processor=data_processor)
optimizer = SGD(valence_model.parameters(), lr=5, weight_decay=1e-3)
runner = Runner(
    model=valence_model,
    criterion=criterion,
    optimizer=optimizer,
)

train_set = create_dataset(data_converter=data_converter)
train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
# A random sample is only needed as example input for the graph export below.
users, items, ratings = select_n_random(train_set)

epochs = 25
with SummaryWriter("runs/DEAM/valence") as writer:
    # Export the model graph once, using the sampled (users, items) as example input.
    writer.add_graph(valence_model, (users, items))

    for epoch in range(epochs):
        epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
        print(f"epoch={epoch + 1}, loss={epoch_loss}")

100%|██████████| 18/18 [00:16<00:00,  1.08batch/s, train_loss=0.544]


epoch=1, loss=12.566120662198522


100%|██████████| 18/18 [00:13<00:00,  1.29batch/s, train_loss=0.466]


epoch=2, loss=9.022809059871182


100%|██████████| 18/18 [00:13<00:00,  1.30batch/s, train_loss=0.445]


epoch=3, loss=7.739140686541672


100%|██████████| 18/18 [00:14<00:00,  1.21batch/s, train_loss=0.31] 


epoch=4, loss=6.902965693936803


100%|██████████| 18/18 [00:13<00:00,  1.33batch/s, train_loss=0.355]


epoch=5, loss=6.27783614339868


100%|██████████| 18/18 [00:13<00:00,  1.35batch/s, train_loss=0.308]


epoch=6, loss=5.838185322488491


100%|██████████| 18/18 [00:15<00:00,  1.13batch/s, train_loss=0.305]


epoch=7, loss=5.512302591157652


100%|██████████| 18/18 [00:14<00:00,  1.26batch/s, train_loss=0.304]


epoch=8, loss=5.290005278654614


100%|██████████| 18/18 [00:14<00:00,  1.23batch/s, train_loss=0.273]


epoch=9, loss=5.093792617101393


100%|██████████| 18/18 [00:14<00:00,  1.26batch/s, train_loss=0.266]


epoch=10, loss=4.897748326218475


100%|██████████| 18/18 [00:14<00:00,  1.27batch/s, train_loss=0.272]


epoch=11, loss=4.7546366201060435


100%|██████████| 18/18 [00:14<00:00,  1.27batch/s, train_loss=0.23] 


epoch=12, loss=4.602937892296501


100%|██████████| 18/18 [00:17<00:00,  1.03batch/s, train_loss=0.265]


epoch=13, loss=4.4650772993159


100%|██████████| 18/18 [00:15<00:00,  1.13batch/s, train_loss=0.235]


epoch=14, loss=4.3514051982199


100%|██████████| 18/18 [00:18<00:00,  1.02s/batch, train_loss=0.258]


epoch=15, loss=4.239231139756832


100%|██████████| 18/18 [00:14<00:00,  1.25batch/s, train_loss=0.219]


epoch=16, loss=4.138110883847312


100%|██████████| 18/18 [00:15<00:00,  1.18batch/s, train_loss=0.222]


epoch=17, loss=4.044411213221886


100%|██████████| 18/18 [00:15<00:00,  1.17batch/s, train_loss=0.21] 


epoch=18, loss=3.9646255717930456


100%|██████████| 18/18 [00:13<00:00,  1.35batch/s, train_loss=0.209]


epoch=19, loss=3.8832749200718037


100%|██████████| 18/18 [00:15<00:00,  1.15batch/s, train_loss=0.218]


epoch=20, loss=3.7875751688470487


100%|██████████| 18/18 [00:14<00:00,  1.24batch/s, train_loss=0.196]


epoch=21, loss=3.7328273198931052


100%|██████████| 18/18 [00:13<00:00,  1.29batch/s, train_loss=0.21] 


epoch=22, loss=3.6709469641847727


100%|██████████| 18/18 [00:13<00:00,  1.31batch/s, train_loss=0.212]


epoch=23, loss=3.598860358147206


100%|██████████| 18/18 [00:15<00:00,  1.19batch/s, train_loss=0.2]  


epoch=24, loss=3.5515914362357366


100%|██████████| 18/18 [00:15<00:00,  1.18batch/s, train_loss=0.19] 


epoch=25, loss=3.495545592248687


In [3]:
"""
This block of code trains the model that is later used to mine the outliers along the arousal axis
"""
# Raw DEAM column name -> generic column name used by the rest of the pipeline.
column_names = {"workerID": "user_id", "SongId": "item_id", "Arousal": "rating"}
columns = list(column_names)

# Rename by name instead of assigning ``.columns`` positionally: ``read_csv``
# ignores the order of ``usecols`` and returns the columns in file order, so a
# positional assignment silently mislabels columns if the file order differs.
original_df = (
    read_csv(DF_PATH, skipinitialspace=True, usecols=columns)
    .rename(columns=column_names)
    .loc[:, list(column_names.values())]
)

# NOTE(review): presumably DataConverter injects ``n_random_users`` synthetic
# raters with ``n_ratings_per_random_user`` ratings each -- confirm in src.utils.
data_converter = DataConverter(
    original_df=original_df, n_random_users=10, n_ratings_per_random_user=50
)
# The processor is built from the converter's frame, not from the raw one above.
data_processor = DataProcessor(original_df=data_converter.original_df)

arousal_model = MF(
    n_users=data_converter.n_users,
    n_items=data_converter.n_item,
)

criterion = MiningOutliersLoss(data_converter=data_converter, data_processor=data_processor)
optimizer = SGD(arousal_model.parameters(), lr=5, weight_decay=1e-3)
runner = Runner(
    model=arousal_model,
    criterion=criterion,
    optimizer=optimizer,
)

train_set = create_dataset(data_converter=data_converter)
train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
# A random sample is only needed as example input for the graph export below.
users, items, ratings = select_n_random(train_set)

epochs = 25
with SummaryWriter("runs/DEAM/arousal") as writer:
    # Export the model graph once, using the sampled (users, items) as example input.
    writer.add_graph(arousal_model, (users, items))

    for epoch in range(epochs):
        epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
        print(f"epoch={epoch + 1}, loss={epoch_loss}")

100%|██████████| 18/18 [00:15<00:00,  1.16batch/s, train_loss=0.559]


epoch=1, loss=13.253642229642113


100%|██████████| 18/18 [00:14<00:00,  1.28batch/s, train_loss=0.495]


epoch=2, loss=9.94417393955848


100%|██████████| 18/18 [00:15<00:00,  1.13batch/s, train_loss=0.487]


epoch=3, loss=8.581540510502098


100%|██████████| 18/18 [00:15<00:00,  1.15batch/s, train_loss=0.436]


epoch=4, loss=7.651847515438602


100%|██████████| 18/18 [00:14<00:00,  1.26batch/s, train_loss=0.382]


epoch=5, loss=7.0083750661003155


100%|██████████| 18/18 [00:14<00:00,  1.26batch/s, train_loss=0.353]


epoch=6, loss=6.538311951443367


100%|██████████| 18/18 [00:14<00:00,  1.27batch/s, train_loss=0.332]


epoch=7, loss=6.17285811377561


100%|██████████| 18/18 [00:14<00:00,  1.28batch/s, train_loss=0.306]


epoch=8, loss=5.907209679947849


100%|██████████| 18/18 [00:15<00:00,  1.17batch/s, train_loss=0.266]


epoch=9, loss=5.682115655036388


100%|██████████| 18/18 [00:14<00:00,  1.20batch/s, train_loss=0.296]


epoch=10, loss=5.46839219773162


100%|██████████| 18/18 [00:15<00:00,  1.18batch/s, train_loss=0.294]


epoch=11, loss=5.268947287658438


100%|██████████| 18/18 [00:16<00:00,  1.10batch/s, train_loss=0.294]


epoch=12, loss=5.080242905106287


100%|██████████| 18/18 [00:15<00:00,  1.13batch/s, train_loss=0.256]


epoch=13, loss=4.9524977768387535


100%|██████████| 18/18 [00:17<00:00,  1.02batch/s, train_loss=0.274]


epoch=14, loss=4.816632916335743


100%|██████████| 18/18 [00:13<00:00,  1.29batch/s, train_loss=0.244]


epoch=15, loss=4.682187159937941


100%|██████████| 18/18 [00:14<00:00,  1.27batch/s, train_loss=0.237]


epoch=16, loss=4.571688019178715


100%|██████████| 18/18 [00:14<00:00,  1.26batch/s, train_loss=0.24] 


epoch=17, loss=4.449817425486457


100%|██████████| 18/18 [00:14<00:00,  1.25batch/s, train_loss=0.262]


epoch=18, loss=4.337310405177201


100%|██████████| 18/18 [00:14<00:00,  1.26batch/s, train_loss=0.246]


epoch=19, loss=4.245567693876528


100%|██████████| 18/18 [00:14<00:00,  1.27batch/s, train_loss=0.223]


epoch=20, loss=4.148849510414473


100%|██████████| 18/18 [00:14<00:00,  1.28batch/s, train_loss=0.238]


epoch=21, loss=4.070344209916364


100%|██████████| 18/18 [00:13<00:00,  1.32batch/s, train_loss=0.203]


epoch=22, loss=3.9778268076789827


100%|██████████| 18/18 [00:13<00:00,  1.30batch/s, train_loss=0.235]


epoch=23, loss=3.9035234063492767


100%|██████████| 18/18 [00:14<00:00,  1.28batch/s, train_loss=0.193]


epoch=24, loss=3.8419016557748873


100%|██████████| 18/18 [00:13<00:00,  1.29batch/s, train_loss=0.195]

epoch=25, loss=3.7851556667470336





In [4]:
# Mine the per-user outlier scores of both trained models.
# NOTE(review): ``data_converter`` at this point is the instance created in the
# arousal cell (the name was rebound there), yet it is used for the valence
# model as well -- confirm the two converters are interchangeable for
# mine_outliers, or keep a separate converter per axis.
valence_outliers = mine_outliers(model=valence_model, data_converter=data_converter)
arousal_outliers = mine_outliers(model=arousal_model, data_converter=data_converter)

items_group_by_users = data_converter.original_df.groupby("user_id")

# Combined score per user = valence score + arousal score.
# Every user present in the valence result is expected in the arousal result
# as well (a missing user raises KeyError, as before).
combined_outliers = {
    user_id: valence_dist + arousal_outliers[user_id]
    for user_id, valence_dist in valence_outliers.items()
}

# Report the users ordered by ascending combined score, together with the
# number of rows each user has in the converter's frame.
combined_outliers = dict(sorted(combined_outliers.items(), key=lambda item: item[1]))
for user_id, combined_dist in combined_outliers.items():
    number_of_items = len(items_group_by_users.get_group(user_id))
    print(f"user: {user_id}, dist: {combined_dist}, #items: {number_of_items}")

user: 65794ea9f5122952403585a237bc5e52, dist: -64.40780639648438, #items: 3
user: 2a6b63b7690efa2390c8d9fee11b1407, dist: -63.590091705322266, #items: 3
user: random_guy_187, dist: -51.64360809326172, #items: 50
user: random_guy_189, dist: -38.11405563354492, #items: 50
user: ad3b997c4f2382a66e49f035cacfa682, dist: -32.63254928588867, #items: 3
user: fd5b08ce362d855ca9152a894348130c, dist: -31.736291885375977, #items: 222
user: d88c800327bffffea5562e23c276ede3, dist: -31.23208236694336, #items: 2
user: random_guy_190, dist: -23.735185623168945, #items: 50
user: random_guy_193, dist: -21.650558471679688, #items: 50
user: random_guy_191, dist: -20.967144012451172, #items: 50
user: 374a5659c02e12b01db6319436f17a7d, dist: -15.478662490844727, #items: 3
user: 623681f76a3eab5d9c86fbc0e1ca264b, dist: -10.94134521484375, #items: 12
user: 80db3788bc598d1b32979bea958d9358, dist: -6.76093864440918, #items: 6
user: da37d1548ffd0631809f7be341e4fe4d, dist: -5.815742492675781, #items: 3
user: 615d836