In [19]:
from os import makedirs
from os.path import exists

import torch
from pandas import read_csv
from tensorboardX import SummaryWriter
from torch import randperm
from torch.nn import MSELoss
from torch.optim import SGD
from torch.utils.data import DataLoader

from config import DATA_DIR, MODELS_DIR
from src.consistency import direct_consistency_calculation, mf_consistency_calculation
from src.data_set import RatingsDataset
from src.model import MF, SingleMF
from src.runner import Runner, SingleMFRunner
from src.utils import (
    create_dataset,
    mine_outliers_scipy,
    DataConverter,
)


"""
In this example we run the same experiments as in deam_raw.ipynb, with one difference:
- during pre-processing, the data set is normalized around zero.
The normalization is done with:
rating[i] = rating[i] - sum(song[j].ratings) / total_annotations_by_user[i]
"""

# Path (under DATA_DIR) of the mean-centralised annotations CSV that the cells below load with read_csv.
DF_PATH = (
    f"{DATA_DIR}"
    "/DEAM/annotations/annotations per each rater/"
    "song_level/static_annotations_songs_1_2000_mean_centralised.csv"
)

def select_n_random(trainset: "RatingsDataset", n: int = 100):
    """
    Selects up to n random data points from a dataset.

    A random permutation of all indices is drawn and its first n entries are used to index
    the dataset once, so the result has whatever structure ``trainset[index_tensor]`` returns
    (the training cells unpack it as ``users, items, ratings``).

    The permutation is sliced *before* indexing: slicing the indexed result instead would cut
    the returned tuple rather than the samples, and every sample would be returned.

    :param trainset: dataset supporting ``len()`` and indexing with a tensor of indices.
    :param n: maximum number of samples to return; defaults to 100. When the dataset holds
        fewer than n samples, all of them are returned (in random order).
    :return: the result of indexing ``trainset`` with the selected random indices.
    """
    perm = randperm(len(trainset))
    return trainset[perm[:n]]

In [20]:
"""
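This block of code loads the matrix-factorization model for the valence axis, or trains and saves it when no checkpoint exists.
The outliers themselves are mined from its user embeddings in a later block.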
"""
columns = ["user_id", "item_id", "rating"]
original_df = read_csv(DF_PATH, skipinitialspace=True, usecols=columns)

valence_data_converter = DataConverter(
    original_df=original_df, n_random_users=0, n_ratings_per_random_user=0
)

valence_model = MF(
    n_users=valence_data_converter.n_users,
    n_items=valence_data_converter.n_item,
    include_bias=True,
)

# Single definition of the checkpoint location, shared by the exists/load/save calls below.
valence_model_dir = f"{MODELS_DIR}/DEAM/mean_centralised"
valence_model_path = f"{valence_model_dir}/valence.pt"

if exists(valence_model_path):
    # map_location lets a checkpoint written from a GPU run be loaded on a CPU-only machine;
    # load_state_dict then copies the values into the model's existing parameters.
    valence_model.load_state_dict(torch.load(valence_model_path, map_location="cpu"))
else:
    # Create the checkpoint directory before training: torch.save does not create missing
    # folders, and failing only after all epochs have run would throw the training away.
    makedirs(valence_model_dir, exist_ok=True)

    epochs = 100

    criterion = MSELoss()
    optimizer = SGD(valence_model.parameters(), lr=5, weight_decay=1e-3)
    runner = Runner(
        model=valence_model,
        criterion=criterion,
        optimizer=optimizer,
        epochs=epochs,
    )

    train_set = create_dataset(data_frame=valence_data_converter.encoded_df)
    train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
    # Sample batch used only to trace the model graph for TensorBoard.
    users, items, ratings = select_n_random(train_set)

    with SummaryWriter("runs/DEAM/mean_centralised/valence") as writer:
        writer.add_graph(valence_model, (users, items))

        for epoch in range(epochs):
            epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)

            print(f"epoch={epoch + 1}, loss={epoch_loss}")

    torch.save(valence_model.state_dict(), valence_model_path)

In [21]:
"""
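This block of code loads the matrix-factorization model for the Arousal axis, or trains and saves it when no checkpoint exists.
The outliers themselves are mined from its user embeddings in a later block.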
"""
# NOTE(review): this reads the same DF_PATH and the same "rating" column as the valence cell,
# so both models are fitted to identical data — confirm the file really holds arousal ratings.
columns = ["user_id", "item_id", "rating"]
original_df = read_csv(DF_PATH, skipinitialspace=True, usecols=columns)

arousal_data_converter = DataConverter(
    original_df=original_df, n_random_users=0, n_ratings_per_random_user=0
)

arousal_model = MF(
    n_users=arousal_data_converter.n_users,
    n_items=arousal_data_converter.n_item,
    include_bias=True,
)

# Single definition of the checkpoint location, shared by the exists/load/save calls below.
arousal_model_dir = f"{MODELS_DIR}/DEAM/mean_centralised"
arousal_model_path = f"{arousal_model_dir}/arousal.pt"

if exists(arousal_model_path):
    # map_location lets a checkpoint written from a GPU run be loaded on a CPU-only machine;
    # load_state_dict then copies the values into the model's existing parameters.
    arousal_model.load_state_dict(torch.load(arousal_model_path, map_location="cpu"))
else:
    # Create the checkpoint directory before training: torch.save does not create missing
    # folders, and failing only after all epochs have run would throw the training away.
    makedirs(arousal_model_dir, exist_ok=True)

    epochs = 100

    criterion = MSELoss()
    optimizer = SGD(arousal_model.parameters(), lr=5, weight_decay=1e-3)
    runner = Runner(
        model=arousal_model,
        criterion=criterion,
        optimizer=optimizer,
        epochs=epochs,
    )

    train_set = create_dataset(data_frame=arousal_data_converter.encoded_df)
    train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
    # Sample batch used only to trace the model graph for TensorBoard.
    users, items, ratings = select_n_random(train_set)

    with SummaryWriter("runs/DEAM/mean_centralised/arousal") as writer:
        writer.add_graph(arousal_model, (users, items))

        for epoch in range(epochs):
            epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
            print(f"epoch={epoch + 1}, loss={epoch_loss}")

    torch.save(arousal_model.state_dict(), arousal_model_path)

In [22]:
# Score every user on each axis from that axis' learned user embeddings, keyed by original user id.
valence_embeddings = list(valence_model.user_factors.parameters())[0].detach().cpu()
valence_similarities = mine_outliers_scipy(embeddings=valence_embeddings)
valence_outliers = {
    valence_data_converter.get_original_user_id(encoded_id): score
    for encoded_id, score in enumerate(valence_similarities)
}

arousal_embeddings = list(arousal_model.user_factors.parameters())[0].detach().cpu()
arousal_similarities = mine_outliers_scipy(embeddings=arousal_embeddings)
arousal_outliers = {
    arousal_data_converter.get_original_user_id(encoded_id): score
    for encoded_id, score in enumerate(arousal_similarities)
}

items_group_by_users = valence_data_converter.original_df.groupby("user_id")

# Combined score per user = valence score + arousal score.
combined_outliers = {
    user_id: valence_score + arousal_outliers[user_id]
    for user_id, valence_score in valence_outliers.items()
}

# Order ascending by combined score, so the lowest scores are listed first.
combined_outliers = dict(sorted(combined_outliers.items(), key=lambda entry: entry[1]))
for user_id, dist in combined_outliers.items():
    number_of_items = len(items_group_by_users.get_group(user_id))
    print(f"user: {user_id}, dist: {dist}, #items: {number_of_items}")

user: 2f790705ae66e70e81cc0f11ce0f4b9b, dist: -95.75972818550389, #items: 2
user: ff18a27328ffd40ef52b7ebb7a0ded94, dist: -90.60384847646597, #items: 20
user: c3c21239b85dcdd6679fc212afd02a49, dist: -83.32957716813507, #items: 9
user: 19fee46f2810f34a8b69a7768d897a59, dist: -69.6422728128787, #items: 1
user: 3111e02887b600ee085c72c0a3df33e8, dist: -66.910503722715, #items: 1
user: 54cc66fe7cbb01775a6b7c7d703cdeff, dist: -59.22067829593945, #items: 3
user: 38531641e6c0628757776b0088bcc854, dist: -53.823952958916166, #items: 7
user: a34913ea1010b5812a14d1fef9586a4f, dist: -38.976865918449754, #items: 3
user: 4dedf223de3f8ebde100df78a5428251, dist: -34.870674058252405, #items: 3
user: fc2fa5656d42b49f3caf01f663085069, dist: -29.021478974624625, #items: 6
user: a186cdd58a92051b7c73adc9bd6e65ca, dist: -24.32363059285435, #items: 7
user: fd5b08ce362d855ca9152a894348130c, dist: -23.86235196700427, #items: 222
user: ad3b997c4f2382a66e49f035cacfa682, dist: -22.760694546369205, #items: 3
user: b

In [23]:
"""
This block of code inserts a new outlier user into the dataset, and then tries to optimize the new user's embedding according
to the optimized item embedding of the model.
We want to check whether we can detect an outlier in the post-training phase.
This block works on the Valence axis.
"""
# NOTE(review): the cell description speaks of the optimized *item* embedding, yet the model's
# user_factors are what is passed as optimized_item_factors here — confirm this is intended.
valence_single_mf_model = SingleMF(optimized_item_factors=valence_model.user_factors)
criterion = MSELoss()
optimizer = SGD(
    valence_single_mf_model.parameters(),
    lr=0.1,
    weight_decay=1e-7,
)

# Ask the converter for an outlier dataset: one added user, 10 ratings for that user.
# NOTE(review): original_df is whatever the most recently run loading cell left behind.
outlier_dataframe = valence_data_converter.create_outlier_dataset(
    original_df=original_df,
    number_of_users_to_add=1,
    n_ratings_per_random_user=10,
)

outlier_data_converter = DataConverter(original_df=outlier_dataframe)
outlier_dataset = create_dataset(data_frame=outlier_data_converter.encoded_df)
# Full-batch loader: one batch holds the whole outlier dataset.
outlier_train_load = DataLoader(
    outlier_dataset,
    batch_size=len(outlier_dataset),
    shuffle=True,
)

single_mf_runner = SingleMFRunner(
    model=valence_single_mf_model,
    criterion=criterion,
    optimizer=optimizer,
)

epochs = 1000
for epoch in range(epochs):
    epoch_loss = single_mf_runner.train(train_loader=outlier_train_load)
    if epoch % 100:
        continue
    # Only every 100th epoch (starting with the first) is reported.
    print(f"epoch={epoch + 1}, loss={epoch_loss}")

epoch=1, loss=8.602796173095703
epoch=101, loss=0.16815229654312133
epoch=201, loss=0.13902761936187744
epoch=301, loss=0.12419184446334838
epoch=401, loss=0.1157263159751892
epoch=501, loss=0.11013238430023194
epoch=601, loss=0.10586302280426026
epoch=701, loss=0.10223348140716552
epoch=801, loss=0.0989396870136261
epoch=901, loss=0.09584566354751586


In [24]:
"""
This block of code inserts a new outlier user into the dataset, and then tries to optimize the new user's embedding according
to the optimized item embedding of the model.
We want to check whether we can detect an outlier in the post-training phase.
This block works on the Arousal axis.
"""
# NOTE(review): the cell description speaks of the optimized *item* embedding, yet the model's
# user_factors are what is passed as optimized_item_factors here — confirm this is intended.
arousal_single_mf_model = SingleMF(optimized_item_factors=arousal_model.user_factors)
criterion = MSELoss()
optimizer = SGD(
    arousal_single_mf_model.parameters(),
    lr=0.1,
    weight_decay=1e-7,
)

# Ask the converter for an outlier dataset: one added user, 10 ratings for that user.
# NOTE(review): outlier_dataframe / outlier_data_converter / outlier_dataset are the same global
# names the valence cell assigns, so this cell overwrites them with a freshly created dataset.
outlier_dataframe = arousal_data_converter.create_outlier_dataset(
    original_df=original_df,
    number_of_users_to_add=1,
    n_ratings_per_random_user=10,
)

outlier_data_converter = DataConverter(original_df=outlier_dataframe)
outlier_dataset = create_dataset(data_frame=outlier_data_converter.encoded_df)
# Full-batch loader: one batch holds the whole outlier dataset.
outlier_train_load = DataLoader(
    outlier_dataset,
    batch_size=len(outlier_dataset),
    shuffle=True,
)

single_mf_runner = SingleMFRunner(
    model=arousal_single_mf_model,
    criterion=criterion,
    optimizer=optimizer,
)

epochs = 1000
for epoch in range(epochs):
    epoch_loss = single_mf_runner.train(train_loader=outlier_train_load)
    if epoch % 100:
        continue
    # Only every 100th epoch (starting with the first) is reported.
    print(f"epoch={epoch + 1}, loss={epoch_loss}")

epoch=1, loss=1.1504293441772462
epoch=101, loss=0.2887500524520874
epoch=201, loss=0.2813475847244263
epoch=301, loss=0.2741649627685547
epoch=401, loss=0.2671810626983643
epoch=501, loss=0.2603828191757202
epoch=601, loss=0.25376152992248535
epoch=701, loss=0.24731061458587647
epoch=801, loss=0.24102475643157958
epoch=901, loss=0.23489916324615479


In [25]:
"""
This block adds the new outlier user's embeddings to the existing user embeddings and tries to detect whether that user is an outlier or not.
"""
# Append the injected user's embedding as the LAST row of each axis' user-embedding matrix.
valence_embeddings = list(valence_model.user_factors.parameters())[0].detach().cpu()
outlier_valence_embeddings = list(valence_single_mf_model.user_factors.parameters())[0].detach().cpu()
valence_embeddings = torch.cat((valence_embeddings, outlier_valence_embeddings), 0)

arousal_embeddings = list(arousal_model.user_factors.parameters())[0].detach().cpu()
outlier_arousal_embeddings = list(arousal_single_mf_model.user_factors.parameters())[0].detach().cpu()
arousal_embeddings = torch.cat((arousal_embeddings, outlier_arousal_embeddings), 0)

valence_similarities = mine_outliers_scipy(embeddings=valence_embeddings)
arousal_similarities = mine_outliers_scipy(embeddings=arousal_embeddings)

# NOTE(review): outlier_data_converter is assigned by both the valence and the arousal outlier
# cells, so here it is whichever of them ran last — confirm the two appended embeddings are
# meant to represent the same injected user.
combined_outliers = {}
last_index = len(arousal_similarities) - 1
for i, (valence_dist, arousal_dist) in enumerate(zip(valence_similarities, arousal_similarities)):
    if i == last_index:
        # The last row is the injected user; its id is looked up in the outlier converter.
        user_id = outlier_data_converter.get_original_user_id(encoded_id=0)
    else:
        user_id = valence_data_converter.get_original_user_id(encoded_id=i)
    combined_outliers[user_id] = valence_dist + arousal_dist


items_group_by_users = valence_data_converter.original_df.groupby("user_id")
outlier_items_group_by_users = outlier_data_converter.original_df.groupby("user_id")

combined_outliers = dict(sorted(combined_outliers.items(), key=lambda item: item[1]))
for user_id, dist in combined_outliers.items():
    try:
        number_of_items = len(items_group_by_users.get_group(user_id))
    except KeyError:
        # Not present in the original frame, i.e. the injected outlier: count its ratings in
        # the outlier frame instead of printing a hard-coded number.
        number_of_items = len(outlier_items_group_by_users.get_group(user_id))
    print(f"user: {user_id}, dist: {dist}, #items: {number_of_items}")

user: 2f790705ae66e70e81cc0f11ce0f4b9b, dist: -96.25773126664771, #items: 2
user: ff18a27328ffd40ef52b7ebb7a0ded94, dist: -90.91353693956026, #items: 20
user: c3c21239b85dcdd6679fc212afd02a49, dist: -83.54034075772954, #items: 9
user: 19fee46f2810f34a8b69a7768d897a59, dist: -70.03113199386684, #items: 1
user: 3111e02887b600ee085c72c0a3df33e8, dist: -67.48582897594557, #items: 1
user: 54cc66fe7cbb01775a6b7c7d703cdeff, dist: -59.12530530839426, #items: 3
user: 38531641e6c0628757776b0088bcc854, dist: -53.942987479484295, #items: 7
user: a34913ea1010b5812a14d1fef9586a4f, dist: -39.07753250972723, #items: 3
user: 4dedf223de3f8ebde100df78a5428251, dist: -35.00141894799186, #items: 3
user: fc2fa5656d42b49f3caf01f663085069, dist: -29.024042440330387, #items: 6
user: a186cdd58a92051b7c73adc9bd6e65ca, dist: -24.308154194818187, #items: 7
user: fd5b08ce362d855ca9152a894348130c, dist: -23.681053571218328, #items: 222
user: ad3b997c4f2382a66e49f035cacfa682, dist: -23.24379224790102, #items: 3
user:

In [28]:
"""
This block analyzes the raw data consistency using the direct calculation defined by:
consistency += row.rating - row.song.mean() for all rows in dataset.
In addition, we measure the consistency of the dataset after dropping the outliers, using the same direct calculation.
"""

# The 20 entries with the lowest combined score are the ones treated as outliers.
outliers = dict(sorted(combined_outliers.items(), key=lambda entry: entry[1])[:20])
outliers_names = outliers.keys()

columns = ["user_id", "item_id", "rating"]
# NOTE(review): both frames are read from the same DF_PATH with the same columns — confirm intended.
valence_df = read_csv(DF_PATH, skipinitialspace=True, usecols=columns)
arousal_df = read_csv(DF_PATH, skipinitialspace=True, usecols=columns)

# Consistency over the full data.
valence_consistency = direct_consistency_calculation(data_frame=valence_df)
arousal_consistency = direct_consistency_calculation(data_frame=arousal_df)

# NOTE(review): the trailing escape codes in both messages select another colour rather than
# resetting the terminal style (\x1b[0m) — confirm that is intended.
print(f"Raw data consistency with outliers according to direct calculation is: \x1b[33m{valence_consistency + arousal_consistency}\x1b[32m")

# Keep only the rows whose user is not among the outliers.
valence_rows_from_outliers = valence_df.user_id.isin(outliers_names)
valence_df_without_outliers = valence_df[~valence_rows_from_outliers]
arousal_rows_from_outliers = arousal_df.user_id.isin(outliers_names)
arousal_df_without_outliers = arousal_df[~arousal_rows_from_outliers]

# Consistency over the filtered data.
valence_consistency = direct_consistency_calculation(data_frame=valence_df_without_outliers)
arousal_consistency = direct_consistency_calculation(data_frame=arousal_df_without_outliers)

print(f"Raw data consistency without outliers according to direct calculation is: \x1b[43m{valence_consistency + arousal_consistency}\x1b[42m")

  0%|          | 0/17464 [00:00<?, ?it/s]

  0%|          | 0/17464 [00:00<?, ?it/s]

Raw data consistency with outliers according to direct calculation is: [33m2.0250467969162855e-13[32m


  0%|          | 0/16657 [00:00<?, ?it/s]

  0%|          | 0/16657 [00:00<?, ?it/s]

Raw data consistency without outliers according to direct calculation is: [43m8.970602038971265e-14[42m


In [29]:
"""
This block tries to identify the consistency in the dataset after MF.
The MF consistency is defined by:
consistency += row.rating - model.prediction(row.user, row.item) for all rows in dataset.
First we calculate the consistency with the outliers included; afterwards we remove
the outliers and re-run the calculation.
"""
# Consistency over the full frames, one call per axis with that axis' trained model.
valence_consistency = mf_consistency_calculation(data_frame=valence_df, model=valence_model)
arousal_consistency = mf_consistency_calculation(data_frame=arousal_df, model=arousal_model)

# The reported figure is the sum of the two per-axis values.
# NOTE(review): the trailing escape code selects another colour rather than resetting the
# terminal style (\x1b[0m) — confirm that is intended.
print(f"Raw data consistency with outliers according to matrix factorization calculation is: \x1b[33m{valence_consistency + arousal_consistency}\x1b[32m")

# Same calculation on the frames with the outlier users' rows removed. The models are the same
# ones used above; they are not re-trained on the filtered data in this cell.
valence_consistency = mf_consistency_calculation(data_frame=valence_df_without_outliers, model=valence_model)
arousal_consistency = mf_consistency_calculation(data_frame=arousal_df_without_outliers, model=arousal_model)

print(f"Raw data consistency without outliers according to matrix factorization calculation is: \x1b[41m{valence_consistency + arousal_consistency}\x1b[42m")

mf_calculation: 100%|██████████| 17464/17464 [00:01<00:00, 8766.90it/s]
mf_calculation: 100%|██████████| 17464/17464 [00:01<00:00, 9527.35it/s]


Raw data consistency with outliers according to matrix factorization calculation is: [33m187.95970437093638[32m


mf_calculation: 100%|██████████| 16657/16657 [00:01<00:00, 9006.80it/s]
mf_calculation: 100%|██████████| 16657/16657 [00:02<00:00, 7762.33it/s]

Raw data consistency without outliers according to matrix factorization calculation is: [41m-54.19022360781673[42m



