In [10]:
from os import makedirs
from os.path import exists

import torch
from pandas import read_csv
from tensorboardX import SummaryWriter
from torch import randperm
from torch.nn import MSELoss
from torch.optim import SGD
from torch.utils.data import DataLoader

from config import DATA_DIR, MODELS_DIR
from src.consistency import direct_consistency_calculation, mf_consistency_calculation
from src.data_set import RatingsDataset
from src.model import MF, SingleMF
from src.runner import Runner, SingleMFRunner
from src.utils import (
    create_dataset,
    mine_outliers_scipy,
    DataConverter,
)


"""
The DEAM dataset is based on the Arousal-Valence 2D emotional model.
The Valence/Arousal ratings were collected using the Amazon Mechanical Turk service.
Each worker from the collected crowd was asked to mark their own emotion for the current song on a 2D Arousal/Valence plane.
For more information please read: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0173392
"""

# Location of the per-rater, song-level static annotations CSV under DATA_DIR.
DF_PATH = (
    f"{DATA_DIR}/DEAM/annotations/annotations per each rater/"
    "song_level/static_annotations_songs_1_2000.csv"
)

def select_n_random(trainset: "RatingsDataset", n: int = 100):
    """
    Selects n random data points and their corresponding labels from a dataset.

    :param trainset: dataset supporting ``len()`` and indexing with a tensor of indices.
    :param n: maximum number of samples to draw; defaults to 100, the value that
        used to be hard-coded.
    :return: the result of ``trainset[indices]`` for ``min(n, len(trainset))``
        distinct random indices. The callers in this notebook unpack it as
        ``(users, items, ratings)``.
    """
    perm = randperm(len(trainset))
    # Slice the permutation *before* indexing the dataset. The previous form,
    # ``trainset[perm][:100]``, indexed with the full permutation and then sliced
    # the returned object. Callers unpack that object into three values, so the
    # slice left all three untouched and the whole (shuffled) dataset came back.
    return trainset[perm[:n]]

In [11]:
"""
This block analyzes the raw data consistency using the direct calculation defined by:
consistency += row.rating - row.song.mean() for all rows in the dataset
"""
# read_csv ignores the order of `usecols` and returns columns in file order, so
# assigning new names positionally silently mislabels the data if the CSV layout
# differs from the list. Select in the intended order and rename by name instead.
columns = ["workerID", "SongId", "Valence"]
valence_df = read_csv(DF_PATH, skipinitialspace=True, usecols=columns)[columns].rename(
    columns={"workerID": "user_id", "SongId": "item_id", "Valence": "rating"}
)
valence_consistency = direct_consistency_calculation(data_frame=valence_df)

columns = ["workerID", "SongId", "Arousal"]
arousal_df = read_csv(DF_PATH, skipinitialspace=True, usecols=columns)[columns].rename(
    columns={"workerID": "user_id", "SongId": "item_id", "Arousal": "rating"}
)
arousal_consistency = direct_consistency_calculation(data_frame=arousal_df)

# The reported figure is the sum of the per-axis consistency values.
print(f"Raw data consistency according to direct calculation is: \x1b[33m{valence_consistency + arousal_consistency}\x1b[32m")

  0%|          | 0/17464 [00:00<?, ?it/s]

  0%|          | 0/17464 [00:00<?, ?it/s]

Raw data consistency according to direct calculation is: [33m4.11226608321158e-13[32m


In [12]:
"""
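This block loads the cached matrix-factorization model for the Valence axis, or trains and saves it if no
cached weights exist. The user embeddings of this model are used for outlier mining in a later block.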
"""
# read_csv ignores the order of `usecols` and returns columns in file order, so
# select in the intended order and rename by name rather than by position.
columns = ["workerID", "SongId", "Valence"]
original_df = read_csv(DF_PATH, skipinitialspace=True, usecols=columns)[columns].rename(
    columns={"workerID": "user_id", "SongId": "item_id", "Valence": "rating"}
)

valence_data_converter = DataConverter(
    original_df=original_df, n_random_users=0, n_ratings_per_random_user=0
)

valence_model = MF(
    n_users=valence_data_converter.n_users,
    n_items=valence_data_converter.n_item,
    include_bias=True,
)

valence_model_dir = f"{MODELS_DIR}/DEAM/raw"
valence_model_path = f"{valence_model_dir}/valence.pt"

if exists(valence_model_path):
    # Cached weights found: skip training entirely.
    valence_model.load_state_dict(torch.load(valence_model_path))
else:
    # Create the output directory up front so a missing or unwritable directory
    # fails now, not at torch.save after all epochs have run.
    makedirs(valence_model_dir, exist_ok=True)

    epochs = 100

    criterion = MSELoss()
    optimizer = SGD(valence_model.parameters(), lr=5, weight_decay=1e-3)
    runner = Runner(
        model=valence_model,
        criterion=criterion,
        optimizer=optimizer,
        epochs=epochs,
    )

    train_set = create_dataset(data_frame=valence_data_converter.encoded_df)
    train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
    # Sample inputs used only to trace the model graph for TensorBoard.
    users, items, ratings = select_n_random(train_set)

    with SummaryWriter("runs/DEAM/raw/valence") as writer:
        writer.add_graph(valence_model, (users, items))

        for epoch in range(epochs):
            epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)

            print(f"epoch={epoch + 1}, loss={epoch_loss}")

    torch.save(valence_model.state_dict(), valence_model_path)

In [13]:
"""
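This block loads the cached matrix-factorization model for the Arousal axis, or trains and saves it if no
cached weights exist. The user embeddings of this model are used for outlier mining in a later block.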
"""
# read_csv ignores the order of `usecols` and returns columns in file order, so
# select in the intended order and rename by name rather than by position.
columns = ["workerID", "SongId", "Arousal"]
original_df = read_csv(DF_PATH, skipinitialspace=True, usecols=columns)[columns].rename(
    columns={"workerID": "user_id", "SongId": "item_id", "Arousal": "rating"}
)

# n_ratings_per_random_user was 9 here but 0 in the Valence cell. No random users
# are requested (n_random_users=0), so the value is presumably unused; it is set
# to 0 to match the Valence cell. TODO confirm against DataConverter.
arousal_data_converter = DataConverter(
    original_df=original_df, n_random_users=0, n_ratings_per_random_user=0
)

arousal_model = MF(
    n_users=arousal_data_converter.n_users,
    n_items=arousal_data_converter.n_item,
    include_bias=True,
)

arousal_model_dir = f"{MODELS_DIR}/DEAM/raw"
arousal_model_path = f"{arousal_model_dir}/arousal.pt"

if exists(arousal_model_path):
    # Cached weights found: skip training entirely.
    arousal_model.load_state_dict(torch.load(arousal_model_path))
else:
    # Create the output directory up front so a missing or unwritable directory
    # fails now, not at torch.save after all epochs have run.
    makedirs(arousal_model_dir, exist_ok=True)

    epochs = 100

    criterion = MSELoss()
    optimizer = SGD(arousal_model.parameters(), lr=5, weight_decay=1e-3)
    runner = Runner(
        model=arousal_model,
        criterion=criterion,
        optimizer=optimizer,
        epochs=epochs,
    )

    train_set = create_dataset(data_frame=arousal_data_converter.encoded_df)
    train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
    # Sample inputs used only to trace the model graph for TensorBoard.
    users, items, ratings = select_n_random(train_set)

    with SummaryWriter("runs/DEAM/raw/arousal") as writer:
        writer.add_graph(arousal_model, (users, items))

        for epoch in range(epochs):
            epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
            print(f"epoch={epoch + 1}, loss={epoch_loss}")

    torch.save(arousal_model.state_dict(), arousal_model_path)

In [14]:
# Score every user on each axis from the trained user-factor table, keyed by the
# user's original (un-encoded) id.
valence_embeddings = list(valence_model.user_factors.parameters())[0].detach().cpu()
valence_similarities = mine_outliers_scipy(embeddings=valence_embeddings)
valence_outliers = {
    valence_data_converter.get_original_user_id(encoded_id): score
    for encoded_id, score in enumerate(valence_similarities)
}

arousal_embeddings = list(arousal_model.user_factors.parameters())[0].detach().cpu()
arousal_similarities = mine_outliers_scipy(embeddings=arousal_embeddings)
arousal_outliers = {
    arousal_data_converter.get_original_user_id(encoded_id): score
    for encoded_id, score in enumerate(arousal_similarities)
}

items_group_by_users = valence_data_converter.original_df.groupby("user_id")

# Combined score per user = valence score + arousal score.
combined_outliers = {}
for user_id, valence_dist in valence_outliers.items():
    arousal_dist = arousal_outliers[user_id]
    combined_outliers[user_id] = valence_dist + arousal_dist

# Report users in ascending order of combined score, with their rating counts.
combined_outliers = dict(sorted(combined_outliers.items(), key=lambda pair: pair[1]))
for user_id, dist in combined_outliers.items():
    number_of_items = len(items_group_by_users.get_group(user_id))
    print(f"user: {user_id}, dist: {dist}, #items: {number_of_items}")

user: 2a6b63b7690efa2390c8d9fee11b1407, dist: -19.4898900560531, #items: 3
user: ad3b997c4f2382a66e49f035cacfa682, dist: -15.605563848582392, #items: 3
user: 65794ea9f5122952403585a237bc5e52, dist: 4.922030200321515, #items: 3
user: 374a5659c02e12b01db6319436f17a7d, dist: 8.385318858240886, #items: 3
user: bb50b45a1874ede476874bd57e4cabb4, dist: 21.656517129592743, #items: 178
user: 80db3788bc598d1b32979bea958d9358, dist: 22.78875095120557, #items: 6
user: fd5b08ce362d855ca9152a894348130c, dist: 24.100798041009064, #items: 222
user: a30d244141cb2f51e0803e79bc4bd147, dist: 26.603400757398834, #items: 985
user: 485d8e33a731a830ef0aebd71b016d08, dist: 27.973517988528098, #items: 6
user: 615d836ba25132081e0ebd2182221a59, dist: 29.532718413539705, #items: 6
user: 623681f76a3eab5d9c86fbc0e1ca264b, dist: 30.02031994710819, #items: 12
user: 607f6e34a0b5923333f6b16d3a59cc98, dist: 31.420089077718337, #items: 955
user: 807f0025a626896f04566aa37cfbce0d, dist: 33.33688205152358, #items: 3
user: 46

In [15]:
"""
This block inserts a new outlier user into the dataset and then tries to optimize the new user's embedding against
the already-optimized item embeddings of the model.
We want to check whether an outlier can be detected in the post-training phase.
This block works on the Valence axis.
"""
# NOTE(review): the keyword is `optimized_item_factors` but the value passed is
# the model's *user* factor table. Confirm against SingleMF that this is intended.
valence_single_mf_model = SingleMF(optimized_item_factors=valence_model.user_factors)
criterion = MSELoss()
optimizer = SGD(valence_single_mf_model.parameters(), lr=.1, weight_decay=1e-7)

# Use the Valence frame held by the converter. The global `original_df` was
# rebound to the Arousal ratings by the Arousal training cell, so passing it
# here built the "Valence" outlier set from Arousal data.
outlier_dataframe = valence_data_converter.create_outlier_dataset(
    original_df=valence_data_converter.original_df,
    number_of_users_to_add=1,
    n_ratings_per_random_user=10,
)

outlier_data_converter = DataConverter(original_df=outlier_dataframe)
outlier_dataset = create_dataset(data_frame=outlier_data_converter.encoded_df)
# One batch per epoch: the batch size equals the whole outlier dataset.
outlier_train_load = DataLoader(outlier_dataset, batch_size=len(outlier_dataset), shuffle=True)

single_mf_runner = SingleMFRunner(
    model=valence_single_mf_model, criterion=criterion, optimizer=optimizer
)

epochs = 1000
for epoch in range(epochs):
    epoch_loss = single_mf_runner.train(train_loader=outlier_train_load)
    # Log every 100th epoch only.
    if epoch % 100 == 0:
        print(f"epoch={epoch + 1}, loss={epoch_loss}")

epoch=1, loss=6.605821228027343
epoch=101, loss=0.031595855951309204
epoch=201, loss=0.020129847526550292
epoch=301, loss=0.013532790541648864
epoch=401, loss=0.009189076721668243
epoch=501, loss=0.0062507688999176025
epoch=601, loss=0.004253362864255905
epoch=701, loss=0.0028943920508027075
epoch=801, loss=0.0019696403294801714
epoch=901, loss=0.0013403533026576041


In [16]:
"""
This block inserts a new outlier user into the dataset and then tries to optimize the new user's embedding against
the already-optimized item embeddings of the model.
We want to check whether an outlier can be detected in the post-training phase.
This block works on the Arousal axis.
"""
# NOTE(review): the keyword is `optimized_item_factors` but the value passed is
# the model's *user* factor table. Confirm against SingleMF that this is intended.
arousal_single_mf_model = SingleMF(optimized_item_factors=arousal_model.user_factors)
criterion = MSELoss()
optimizer = SGD(arousal_single_mf_model.parameters(), lr=.1, weight_decay=1e-7)

# Take the Arousal frame from the converter instead of the global `original_df`.
# The global holds Arousal ratings only while the Arousal training cell was the
# last one to assign it; re-running the Valence cell would silently swap it.
outlier_dataframe = arousal_data_converter.create_outlier_dataset(
    original_df=arousal_data_converter.original_df,
    number_of_users_to_add=1,
    n_ratings_per_random_user=10,
)

outlier_data_converter = DataConverter(original_df=outlier_dataframe)
outlier_dataset = create_dataset(data_frame=outlier_data_converter.encoded_df)
# One batch per epoch: the batch size equals the whole outlier dataset.
outlier_train_load = DataLoader(outlier_dataset, batch_size=len(outlier_dataset), shuffle=True)

single_mf_runner = SingleMFRunner(
    model=arousal_single_mf_model, criterion=criterion, optimizer=optimizer
)

epochs = 1000
for epoch in range(epochs):
    epoch_loss = single_mf_runner.train(train_loader=outlier_train_load)
    # Log every 100th epoch only.
    if epoch % 100 == 0:
        print(f"epoch={epoch + 1}, loss={epoch_loss}")

epoch=1, loss=3.796558380126953
epoch=101, loss=0.04127430319786072
epoch=201, loss=0.01450343132019043
epoch=301, loss=0.006760402023792267
epoch=401, loss=0.0036561232060194015
epoch=501, loss=0.0020927783101797105
epoch=601, loss=0.0012206911109387875
epoch=701, loss=0.0007162551395595073
epoch=801, loss=0.00042104823514819143
epoch=901, loss=0.000247656786814332


In [17]:
"""
This block appends the new outlier user's embeddings to the existing user embeddings and tries to detect whether that user is an outlier or not.
"""
# Append the separately-optimized outlier embedding as the last row of each
# axis' user-embedding matrix, then re-score all rows.
valence_embeddings = list(valence_model.user_factors.parameters())[0].detach().cpu()
outlier_valence_embeddings = list(valence_single_mf_model.user_factors.parameters())[0].detach().cpu()
valence_embeddings = torch.cat((valence_embeddings, outlier_valence_embeddings), 0)

arousal_embeddings = list(arousal_model.user_factors.parameters())[0].detach().cpu()
outlier_arousal_embeddings = list(arousal_single_mf_model.user_factors.parameters())[0].detach().cpu()
arousal_embeddings = torch.cat((arousal_embeddings, outlier_arousal_embeddings), 0)

valence_similarities = mine_outliers_scipy(embeddings=valence_embeddings)
arousal_similarities = mine_outliers_scipy(embeddings=arousal_embeddings)

# The last row is the injected outlier; its id comes from the outlier converter
# (encoded id 0). Every other row is resolved through the Valence converter.
outlier_index = len(arousal_similarities) - 1
combined_outliers = {}
for i, (valence_dist, arousal_dist) in enumerate(zip(valence_similarities, arousal_similarities)):
    if i == outlier_index:
        user_id = outlier_data_converter.get_original_user_id(encoded_id=0)
    else:
        user_id = valence_data_converter.get_original_user_id(encoded_id=i)
    combined_outliers[user_id] = valence_dist + arousal_dist


items_group_by_users = valence_data_converter.original_df.groupby("user_id")
outlier_items_group_by_users = outlier_data_converter.original_df.groupby("user_id")

combined_outliers = dict(sorted(combined_outliers.items(), key=lambda item: item[1]))
for user_id, dist in combined_outliers.items():
    try:
        number_of_items = len(items_group_by_users.get_group(user_id))
    except KeyError:
        # Not in the original ratings, so this is the injected outlier: count
        # its rows in the outlier frame. The count used to be computed and then
        # discarded in favour of a hard-coded 10.
        number_of_items = len(outlier_items_group_by_users.get_group(user_id))
    print(f"user: {user_id}, dist: {dist}, #items: {number_of_items}")

user: 2a6b63b7690efa2390c8d9fee11b1407, dist: -19.707501784668843, #items: 3
user: ad3b997c4f2382a66e49f035cacfa682, dist: -15.448985258073254, #items: 3
user: 65794ea9f5122952403585a237bc5e52, dist: 5.291826742580714, #items: 3
user: 374a5659c02e12b01db6319436f17a7d, dist: 8.278991187635402, #items: 3
user: bb50b45a1874ede476874bd57e4cabb4, dist: 21.678874842457095, #items: 178
user: 80db3788bc598d1b32979bea958d9358, dist: 22.665842619475804, #items: 6
user: fd5b08ce362d855ca9152a894348130c, dist: 24.46144392547742, #items: 222
user: a30d244141cb2f51e0803e79bc4bd147, dist: 26.68188571789692, #items: 985
user: 485d8e33a731a830ef0aebd71b016d08, dist: 27.60853836693581, #items: 6
user: 615d836ba25132081e0ebd2182221a59, dist: 29.547430017907757, #items: 6
user: 623681f76a3eab5d9c86fbc0e1ca264b, dist: 30.051473652781745, #items: 12
user: 607f6e34a0b5923333f6b16d3a59cc98, dist: 31.33470478166948, #items: 955
user: 807f0025a626896f04566aa37cfbce0d, dist: 33.3339832583648, #items: 3
user: 46a

In [18]:
"""
This block measures the consistency of the dataset after matrix factorization (MF).
"""
# Per-axis consistency computed through the trained MF models.
valence_consistency = mf_consistency_calculation(
    data_frame=valence_data_converter.original_df,
    model=valence_model,
)
arousal_consistency = mf_consistency_calculation(
    data_frame=arousal_data_converter.original_df,
    model=arousal_model,
)

# The reported figure is the sum over both axes.
mf_total_consistency = valence_consistency + arousal_consistency
print(f"Raw data consistency according to matrix factorization calculation is: \x1b[33m{mf_total_consistency}\x1b[32m")

mf_calculation: 100%|██████████| 17464/17464 [00:01<00:00, 8969.30it/s]
mf_calculation: 100%|██████████| 17464/17464 [00:01<00:00, 9944.98it/s] 

Raw data consistency according to matrix factorization calculation is: [33m6322.026951014996[32m



