In [99]:
"""
The Deam dataset is based on Arousal-Valence 2D emotional model.
The Valence/Arousal ratings were collected using Amazon Mechanical Turks service.
Each turk from the collected crowd were asked to mark his own emotion for the current song on a 2D plane, Arousal/Valence.
For more information please read: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0173392
"""
import os
import torch
import pandas
import plotly.express as px
import numpy as np
from pandas import read_csv
from torch.optim import SGD
from torch.nn import MSELoss
from torch.utils.data import DataLoader
from config import DATA_DIR, MODELS_DIR
from src.model import MF, SingleMF
from src.runner import SingleMFRunner
from src.utils import (
    create_dataset,
    mine_outliers_scipy,
    mine_outliers_sklearn,
    DataConverter,
)
from tensorboardX import SummaryWriter
from src.consistency import direct_consistency_calculation, mf_consistency_calculation
from sklearn.manifold import TSNE
from pandas import DataFrame
from src.utils import ProcColumn
from src.runner import Runner
from src.data_set import RatingsDataset
from collections import namedtuple
from src.consistency import clac_cronbach_alpha
from colorama import Fore, Style

Row = namedtuple("Row", "workerID SongId Valence Arousal Emotion")

# experiment = "mean_centralised"
# experiment = "standardized"
experiment = "raw"
include_bias = experiment == "raw"

DF_PATH = f"{DATA_DIR}" \
          f"/DEAM/annotations/annotations per each rater/" \
          f"song_level/static_annotations_songs_1_2000_{experiment}.csv"


def get_emotion(valence: int, arousal: int, valence_mean: float, arousal_mean: float) -> int:
    """
    Selects emotion based on Valence/Arousal.
    """
    if arousal <= arousal_mean and valence <= valence_mean:
        return 3

    if arousal >= arousal_mean and valence <= valence_mean:
        return 2

    if arousal <= arousal_mean and valence >= valence_mean:
        return 4

    if arousal >= arousal_mean and valence >= valence_mean:
        return 1

In [100]:
def select_n_random(trainset: RatingsDataset):
    """
    Selects n random data points and their corresponding labels from a dataset
    """
    perm = torch.randperm(len(trainset))
    return trainset[perm][:100]

In [101]:
columns = ["workerID", "SongId", "Valence", "Arousal"]
original_df = pandas.read_csv(DF_PATH, skipinitialspace=True, usecols=columns)
original_df.head()

Unnamed: 0,workerID,SongId,Valence,Arousal
0,6010bbc8e7ef4b21fa38f9c3a9754ef3,2,5,2
1,3c888e77b992ae3cd2adfe16774e23b9,2,2,3
2,2afd218c3aecb6828d2be327f8b9c46f,2,3,3
3,fd5b08ce362d855ca9152a894348130c,2,4,4
4,9c8073214a052e414811b76012df8847,2,2,2


In [102]:
"""
This block of code calculates the outliers alongside the valence axis
"""
valence_df = original_df[["workerID", "SongId", "Valence"]]
valence_df.columns = ["user_id", "item_id", "rating"]

valence_data_converter = DataConverter(original_df=valence_df)

valence_model = MF(
        n_users=valence_data_converter.n_users,
        n_items=valence_data_converter.n_item,
        include_bias=include_bias,
        n_factors=300,
)
if os.path.exists(f"{MODELS_DIR}/DEAM/{experiment}/valence.pt"):
    valence_model.load_state_dict(torch.load(f"{MODELS_DIR}/DEAM/{experiment}/valence.pt"))
else:
    epochs = 300

    criterion = MSELoss()
    optimizer = SGD(valence_model.parameters(), lr=5, weight_decay=1e-3)
    runner = Runner(
        model=valence_model,
        criterion=criterion,
        optimizer=optimizer,
        epochs=epochs
    )

    train_set = create_dataset(data_frame=valence_data_converter.encoded_df)
    train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
    users, items, ratings = select_n_random(train_set)

    with SummaryWriter(f"runs/DEAM/{experiment}/valence") as writer:
        writer.add_graph(valence_model, (users, items))

        for epoch in range(epochs):
            epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
            print(f"epoch={epoch + 1}, loss={epoch_loss}")

    torch.save(valence_model.state_dict(), f"{MODELS_DIR}/DEAM/{experiment}/valence.pt")

In [103]:
"""
This block of code calculates the outliers alongside the Arousal axis
"""
arousal_df = original_df[["workerID", "SongId", "Arousal"]]
arousal_df.columns = ["user_id", "item_id", "rating"]

arousal_data_converter = DataConverter(original_df=arousal_df)

arousal_model = MF(
        n_users=arousal_data_converter.n_users,
        n_items=arousal_data_converter.n_item,
        include_bias=include_bias,
        n_factors=300,
)
if os.path.exists(f"{MODELS_DIR}/DEAM/{experiment}/arousal.pt"):
    arousal_model.load_state_dict(torch.load(f"{MODELS_DIR}/DEAM/{experiment}/arousal.pt"))
else:
    epochs = 300

    criterion = MSELoss()
    optimizer = SGD(arousal_model.parameters(), lr=5, weight_decay=1e-3)
    runner = Runner(
        model=arousal_model,
        criterion=criterion,
        optimizer=optimizer,
        epochs=epochs,
    )

    train_set = create_dataset(data_frame=arousal_data_converter.encoded_df)
    train_load = DataLoader(train_set, batch_size=1000, shuffle=True)
    users, items, ratings = select_n_random(train_set)

    with SummaryWriter(f"runs/DEAM/{experiment}/arousal") as writer:
        writer.add_graph(arousal_model, (users, items))

        for epoch in range(epochs):
            epoch_loss = runner.train(train_loader=train_load, epoch=epoch, writer=writer)
            print(f"epoch={epoch + 1}, loss={epoch_loss}")

    torch.save(arousal_model.state_dict(), f"{MODELS_DIR}/DEAM/{experiment}/arousal.pt")

In [107]:
valence_embeddings = list(valence_model.user_factors.parameters())[0].detach().cpu()
valence_similarities = mine_outliers_sklearn(embeddings=valence_embeddings)
valence_outliers = {valence_data_converter.get_original_user_id(i): score for i, score in enumerate(valence_similarities)}

arousal_embeddings = list(arousal_model.user_factors.parameters())[0].detach().cpu()
arousal_similarities = mine_outliers_sklearn(embeddings=arousal_embeddings)
arousal_outliers = {arousal_data_converter.get_original_user_id(i): score for i, score in enumerate(arousal_similarities)}


items_group_by_users = valence_data_converter.original_df.groupby("user_id")
combined_outliers = {}
for user_id, valence_dist in valence_outliers.items():
    arousal_dist = arousal_outliers[user_id]
    combined_outliers[user_id] = valence_dist + arousal_dist

Outlier = namedtuple("Outlier", ["user_id" ,"cosine_similarity" ,"annotations"])
combined_outliers = dict(sorted(combined_outliers.items(), key=lambda item: item[1])[:10])
outliers = []
for user_id, dist in combined_outliers.items():
    print(f"'{user_id}',")
    number_of_items = len(items_group_by_users.get_group(user_id))
    outliers.append(Outlier(user_id=user_id[:5], cosine_similarity="{:.2f}".format(dist), annotations=number_of_items))

outliers = pandas.DataFrame.from_dict(outliers)
outliers.index += 1
print(outliers.to_latex())

'2a6b63b7690efa2390c8d9fee11b1407',
'ad3b997c4f2382a66e49f035cacfa682',
'65794ea9f5122952403585a237bc5e52',
'fd5b08ce362d855ca9152a894348130c',
'374a5659c02e12b01db6319436f17a7d',
'bb50b45a1874ede476874bd57e4cabb4',
'485d8e33a731a830ef0aebd71b016d08',
'615d836ba25132081e0ebd2182221a59',
'da37d1548ffd0631809f7be341e4fe4d',
'a30d244141cb2f51e0803e79bc4bd147',
\begin{tabular}{lllr}
\toprule
{} & user\_id & cosine\_similarity &  annotations \\
\midrule
1  &   2a6b6 &             12.04 &            3 \\
2  &   ad3b9 &             19.67 &            3 \\
3  &   65794 &             19.96 &            3 \\
4  &   fd5b0 &             20.85 &          222 \\
5  &   374a5 &             21.09 &            3 \\
6  &   bb50b &             31.69 &          178 \\
7  &   485d8 &             34.89 &            6 \\
8  &   615d8 &             35.81 &            6 \\
9  &   da37d &             38.08 &            3 \\
10 &   a30d2 &             41.99 &          985 \\
\bottomrule
\end{tabular}



  print(outliers.to_latex())


In [25]:
"""
This block of code inserting new outlier user into the dataset, and then tries to optimize the new user embedding according
to the optimized item embedding of the model.
We want to check if we can detect outlier in post training phase.
This block works on the Valence axis
"""

valence_single_mf_model = SingleMF(optimized_item_factors=valence_model.user_factors, n_factors=300)
criterion = MSELoss()
optimizer = SGD(valence_single_mf_model.parameters(), lr=.1, weight_decay=1e-7)
outlier_dataframe = valence_data_converter.create_outlier_dataset(
    original_df=valence_df, number_of_users_to_add=1, n_ratings_per_random_user=10
)

outlier_data_converter = DataConverter(original_df=outlier_dataframe)
outlier_dataset = create_dataset(data_frame=outlier_data_converter.encoded_df)
outlier_train_load = DataLoader(outlier_dataset, batch_size=len(outlier_dataset), shuffle=True)

single_mf_runner = SingleMFRunner(
    model=valence_single_mf_model, criterion=criterion, optimizer=optimizer
)

epochs = 1000
for epoch in range(epochs):
    epoch_loss = single_mf_runner.train(train_loader=outlier_train_load)
    if epoch % 100 == 0:
        print(f"epoch={epoch + 1}, loss={epoch_loss}")

epoch=1, loss=4.308509826660156
epoch=101, loss=0.07582364082336426
epoch=201, loss=0.04621601998806
epoch=301, loss=0.02899077534675598
epoch=401, loss=0.018272922933101655
epoch=501, loss=0.011526497453451157
epoch=601, loss=0.007271832972764969
epoch=701, loss=0.004587733745574951
epoch=801, loss=0.0028943827375769613
epoch=901, loss=0.0018260590732097626


In [26]:
"""
This block of code inserting new outlier user into the dataset, and then tries to optimize the new user embedding according
to the optimized item embedding of the model.
We want to check if we can detect outlier in post training phase.
This block works on the Arousal axis
"""

arousal_single_mf_model = SingleMF(optimized_item_factors=arousal_model.user_factors, n_factors=300)
criterion = MSELoss()
optimizer = SGD(arousal_single_mf_model.parameters(), lr=.1, weight_decay=1e-7)
outlier_dataframe = arousal_data_converter.create_outlier_dataset(
    original_df=arousal_df, number_of_users_to_add=1, n_ratings_per_random_user=10
)

outlier_data_converter = DataConverter(original_df=outlier_dataframe)
outlier_dataset = create_dataset(data_frame=outlier_data_converter.encoded_df)
outlier_train_load = DataLoader(outlier_dataset, batch_size=len(outlier_dataset), shuffle=True)

single_mf_runner = SingleMFRunner(
    model=arousal_single_mf_model, criterion=criterion, optimizer=optimizer
)

epochs = 1000
for epoch in range(epochs):
    epoch_loss = single_mf_runner.train(train_loader=outlier_train_load)
    if epoch % 100 == 0:
        print(f"epoch={epoch + 1}, loss={epoch_loss}")

epoch=1, loss=3.8076892852783204
epoch=101, loss=0.023093771934509278
epoch=201, loss=0.008550535887479782
epoch=301, loss=0.00434623584151268
epoch=401, loss=0.0025340836495161057
epoch=501, loss=0.0015426399186253548
epoch=601, loss=0.0009504844434559345
epoch=701, loss=0.0005875375121831894
epoch=801, loss=0.0003635013941675425
epoch=901, loss=0.00022494681179523467


In [27]:
"""
This block add the new outlier user embeddings into the existing user embeddings and tries to detect whether his outlier or not.
"""

valence_embeddings = list(valence_model.user_factors.parameters())[0].detach().cpu()
outlier_valence_embeddings = list(valence_single_mf_model.user_factors.parameters())[0].detach().cpu()
valence_embeddings = torch.cat((valence_embeddings, outlier_valence_embeddings), 0)

arousal_embeddings = list(arousal_model.user_factors.parameters())[0].detach().cpu()
outlier_arousal_embeddings = list(arousal_single_mf_model.user_factors.parameters())[0].detach().cpu()
arousal_embeddings = torch.cat((arousal_embeddings, outlier_arousal_embeddings), 0)

valence_similarities = mine_outliers_sklearn(embeddings=valence_embeddings)
arousal_similarities = mine_outliers_sklearn(embeddings=arousal_embeddings)

combined_outliers = {}
for i, (valence_dist, arousal_dist) in enumerate(zip(valence_similarities, arousal_similarities)):
    if i == len(arousal_similarities) - 1:
        outlier_id = outlier_data_converter.get_original_user_id(encoded_id=0)
        combined_outliers[outlier_id] = valence_dist + arousal_dist
        continue

    user_id = valence_data_converter.get_original_user_id(encoded_id=i)
    combined_outliers[user_id] = valence_dist + arousal_dist

items_group_by_users = valence_data_converter.original_df.groupby("user_id")
outlier_items_group_by_users = outlier_data_converter.original_df.groupby("user_id")

combined_outliers = dict(sorted(combined_outliers.items(), key=lambda item: item[1])[:10])
outliers = []
for user_id, dist in combined_outliers.items():
    try:
        number_of_items = len(items_group_by_users.get_group(user_id))
        outliers.append(Outlier(user_id=user_id[:5], cosine_similarity="{:.2f}".format(dist), annotations=number_of_items))
        # print(f"user: {user_id}, dist: {dist}, #items: {number_of_items}")
    except KeyError:
        # handle outlier
        number_of_items = len(outlier_items_group_by_users.get_group(user_id))
        outliers.append(Outlier(user_id=user_id, cosine_similarity="{:.2f}".format(dist), annotations=number_of_items))
        # print(f"user: {user_id}, dist: {dist}, #items: {10}")
outliers = pandas.DataFrame.from_dict(outliers)
outliers.index += 1
print(outliers.to_latex())

\begin{tabular}{lllr}
\toprule
{} &           user\_id & cosine\_similarity &  annotations \\
\midrule
1  &             2a6b6 &             12.12 &            3 \\
2  &             ad3b9 &             19.60 &            3 \\
3  &             65794 &             20.01 &            3 \\
4  &             fd5b0 &             20.96 &          222 \\
5  &             374a5 &             21.01 &            3 \\
6  &  random annotator &             23.33 &           10 \\
7  &             bb50b &             31.59 &          178 \\
8  &             485d8 &             34.93 &            6 \\
9  &             615d8 &             35.84 &            6 \\
10 &             da37d &             38.04 &            3 \\
\bottomrule
\end{tabular}



  print(outliers.to_latex())


In [50]:
"""
This block analyze raw data consistency using the direct calculation defined by:
consistency += row.rating - row.song.mean() for all rows in dataset.
In addition we try to identify the consistency in the dataset after dropping the outliers using the direct calculation.
"""

outliers_names = combined_outliers.keys()

valence_consistency = direct_consistency_calculation(data_frame=valence_data_converter.original_df)
arousal_consistency = direct_consistency_calculation(data_frame=arousal_data_converter.original_df)

print(f"Raw data consistency with outliers according to direct calculation is: \x1b[33m{valence_consistency + arousal_consistency}\x1b[32m")

valence_df_without_outliers = valence_df[~valence_df.user_id.isin(outliers_names)]
arousal_df_without_outliers = arousal_df[~arousal_df.user_id.isin(outliers_names)]

valence_consistency = direct_consistency_calculation(data_frame=valence_df_without_outliers)
arousal_consistency = direct_consistency_calculation(data_frame=arousal_df_without_outliers)

print(f"Raw data consistency without outliers according to direct calculation is: \x1b[43m{valence_consistency + arousal_consistency}\x1b[42m")

  0%|          | 0/17464 [00:00<?, ?it/s]

  0%|          | 0/17464 [00:00<?, ?it/s]

Raw data consistency with outliers according to direct calculation is: [33m-4.4853010194856324e-14[32m


  0%|          | 0/16429 [00:00<?, ?it/s]

  0%|          | 0/16429 [00:00<?, ?it/s]

Raw data consistency without outliers according to direct calculation is: [43m-1.0091927293842673e-13[42m


In [36]:
"""
This block tries to identify the consistency in the dataset after MF.
The mf consistency is defined by:
consistency += row.rating - model.prediction(row.user, row.item) for all rows in dataset.
First we are trying to identify the consistency with outliers, afterwards we are removing
the outliers and re-run the calculation.
"""

valence_consistency = mf_consistency_calculation(data_frame=valence_df, model=valence_model, outliers={}, round_prediction=False)
arousal_consistency = mf_consistency_calculation(data_frame=arousal_df, model=arousal_model, outliers={}, round_prediction=False)

print(f"Raw data consistency with outliers according to matrix factorization calculation is: \x1b[33m{valence_consistency + arousal_consistency}\x1b[32m")

valence_consistency = mf_consistency_calculation(data_frame=valence_df, model=valence_model, outliers=combined_outliers, round_prediction=False)
arousal_consistency = mf_consistency_calculation(data_frame=arousal_df, model=arousal_model, outliers=combined_outliers, round_prediction=False)

print(f"Raw data consistency without outliers according to matrix factorization calculation is: \x1b[41m{valence_consistency + arousal_consistency}\x1b[42m")

mf_calculation: 100%|██████████| 17464/17464 [00:02<00:00, 7943.78it/s]
mf_calculation: 100%|██████████| 17464/17464 [00:02<00:00, 8457.65it/s]


Raw data consistency with outliers according to matrix factorization calculation is: [33m6708.077201962471[32m


mf_calculation: 100%|██████████| 17464/17464 [00:02<00:00, 7718.31it/s]
mf_calculation: 100%|██████████| 17464/17464 [00:01<00:00, 9149.14it/s]

Raw data consistency without outliers according to matrix factorization calculation is: [41m6367.561932563782[42m





In [80]:
def to_mf_df():
    working_df = pandas.read_csv(DF_PATH, skipinitialspace=True, usecols=columns)
    valence_mean = valence_df.rating.mean()
    arousal_mean = arousal_df.rating.mean()

    user_original_id_to_encoded_id = ProcColumn(working_df.workerID)
    item_original_id_to_encoded_id = ProcColumn(working_df.SongId)
    working_df.workerID = user_original_id_to_encoded_id.encoded_col
    working_df.SongId = item_original_id_to_encoded_id.encoded_col

    df = []
    for (index, worker_id, song_id, valence, arousal) in working_df.itertuples():
        user_id_as_tensor = torch.LongTensor([worker_id])
        item_id_as_tensor = torch.LongTensor([song_id])
        with torch.no_grad():
            valence_prediction = valence_model(
                users=user_id_as_tensor, items=item_id_as_tensor,
            ).item()
            arousal_prediction = arousal_model(
                users=user_id_as_tensor, items=item_id_as_tensor,
            ).item()
            emotion_predicted = get_emotion(
                valence=valence_prediction,
                arousal=arousal_prediction,
                valence_mean=valence_mean,
                arousal_mean=arousal_mean,
            )

        original_worker_id = user_original_id_to_encoded_id.get_name(index=worker_id)
        original_item_id = item_original_id_to_encoded_id.get_name(index=song_id)
        df.append(
            Row(
                workerID=original_worker_id,
                SongId=original_item_id,
                Valence=valence_prediction,
                Arousal=arousal_prediction,
                Emotion=emotion_predicted,
            )
        )

    df = DataFrame(df, columns=["workerID", "SongId", "Valence", "Arousal", "Emotion"])
    return df

In [None]:
from sklearn.preprocessing import StandardScaler


def k_means(dataframe: DataFrame, title: str):
    mat = dataframe[["Valence", "Arousal"]]
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(mat)
    scaled_features = DataFrame(scaled_features)
    scaled_features.columns = ["Valence", "Arousal"]
    scaled_features["Emotion"] = dataframe.Emotion
    km = KMeans(n_clusters=4)
    y_km = km.fit_predict(scaled_features).astype(str)
    fig = px.scatter(
        x=scaled_features["Arousal"],
        y=scaled_features["Valence"],
        color_discrete_sequence=px.colors.qualitative.G10,
        color=y_km,
        hover_data={"Item id": original_df.SongId.values},
        title=title,
    )
    fig.show()

In [None]:
from sklearn.cluster import KMeans


def plot_embeddings(dataframe: DataFrame):
    valence_item_embeddings = list(valence_model.item_factors.parameters())[0].detach().cpu()
    arousal_item_embeddings = list(arousal_model.item_factors.parameters())[0].detach().cpu()

    valence_item_embeddings = np.array(valence_item_embeddings)
    valence_tsne = TSNE(n_components=2, random_state=0).fit_transform(valence_item_embeddings)
    km = KMeans(n_clusters=2)
    v_y_km = km.fit_predict(valence_tsne).astype(str)

    arousal_item_embeddings = np.array(arousal_item_embeddings)
    arousal_tsne = TSNE(n_components=2, random_state=0).fit_transform(arousal_item_embeddings)
    a_y_km = km.fit_predict(arousal_tsne).astype(str)

    fig = px.scatter(
        valence_tsne,
        x=0,
        y=1,
        color_discrete_sequence=px.colors.qualitative.G10,
        color=v_y_km,
        labels={"color": "cluster"},
        hover_data={"song_id": dataframe.SongId.unique()},
        title="Valence Embeddings"
    )
    fig.update_traces(marker_size=8)
    fig.show()

    fig = px.scatter(
        arousal_tsne,
        x=0,
        y=1,
        color_discrete_sequence=px.colors.qualitative.G10,
        color=a_y_km,
        labels={"color": "cluster"},
        hover_data={"song_id": dataframe.SongId.unique()},
        title="Arousal Embeddings"
    )
    fig.update_traces(marker_size=8)
    fig.show()

In [None]:
"""
This block calculates the cronbach alpha parameter for Valence/Arousal.
The first experiments is on the original dataset, the second one is on the dataset without outliers.
"""

valence_cronbach_alpha = clac_cronbach_alpha(data_frame=valence_data_converter.encoded_df)
arousal_cronbach_alpha = clac_cronbach_alpha(data_frame=arousal_data_converter.encoded_df)

print(f"valence_cronbach_alpha with outliers: {Fore.GREEN}{valence_cronbach_alpha}{Style.RESET_ALL}")
print(f"arousal_cronbach_alpha with outliers: {Fore.GREEN}{arousal_cronbach_alpha}{Style.RESET_ALL}")

valence_df_without_outliers_data_converter = DataConverter(original_df=valence_df_without_outliers)
arousal_df_without_outliers_data_converter = DataConverter(original_df=arousal_df_without_outliers)

valence_cronbach_alpha = clac_cronbach_alpha(data_frame=valence_df_without_outliers_data_converter.encoded_df)
arousal_cronbach_alpha = clac_cronbach_alpha(data_frame=arousal_df_without_outliers_data_converter.encoded_df)

print(f"valence_cronbach_alpha {Fore.BLUE}without outliers{Style.RESET_ALL}: {Fore.GREEN}{valence_cronbach_alpha}{Style.RESET_ALL}")
print(f"arousal_cronbach_alpha {Fore.BLUE}without outliers{Style.RESET_ALL}: {Fore.GREEN}{arousal_cronbach_alpha}{Style.RESET_ALL}")

In [97]:
df_after_mf = to_mf_df()

valence_df_after_mf = df_after_mf[["workerID", "SongId", "Valence"]]
valence_df_after_mf.columns = ["user_id", "item_id", "rating"]

arousal_df_after_mf = df_after_mf[["workerID", "SongId", "Arousal"]]
arousal_df_after_mf.columns = ["user_id", "item_id", "rating"]

outliers_names = combined_outliers.keys()
arousal_df_after_mf.head()

Unnamed: 0,user_id,item_id,rating
0,6010bbc8e7ef4b21fa38f9c3a9754ef3,2,-2.323776
1,3c888e77b992ae3cd2adfe16774e23b9,2,-0.615398
2,2afd218c3aecb6828d2be327f8b9c46f,2,-1.437036
3,fd5b08ce362d855ca9152a894348130c,2,-0.822231
4,9c8073214a052e414811b76012df8847,2,-2.641157


In [98]:
valence_consistency = direct_consistency_calculation(data_frame=valence_df_after_mf)
arousal_consistency = direct_consistency_calculation(data_frame=arousal_df_after_mf)

print(f"Raw data consistency with outliers according to direct calculation is: \x1b[33m{valence_consistency + arousal_consistency}\x1b[32m")

valence_df_without_outliers = valence_df_after_mf[~valence_df_after_mf.user_id.isin(outliers_names)]
arousal_df_without_outliers = arousal_df_after_mf[~arousal_df_after_mf.user_id.isin(outliers_names)]

valence_consistency = direct_consistency_calculation(data_frame=valence_df_without_outliers)
arousal_consistency = direct_consistency_calculation(data_frame=arousal_df_without_outliers)

print(f"Raw data consistency without outliers according to direct calculation is: \x1b[43m{valence_consistency + arousal_consistency}\x1b[42m")

  0%|          | 0/17464 [00:00<?, ?it/s]

  0%|          | 0/17464 [00:00<?, ?it/s]

Raw data consistency with outliers according to direct calculation is: [33m4.707345624410664e-14[32m


  0%|          | 0/16429 [00:00<?, ?it/s]

  0%|          | 0/16429 [00:00<?, ?it/s]

Raw data consistency without outliers according to direct calculation is: [43m-1.1546319456101628e-14[42m


In [None]:
# k_means(dataframe=original_df, title="Original Dataset")
k_means(dataframe=df_after_mf, title="After MF Dataset")
# plot_embeddings(dataframe=df_after_mf)

In [None]:
# """
# This block plot the rows that their emotion was changed after mf.
# """
# mask = ((original_df["Valence"] == df_after_mf["Valence"]) & (original_df["Arousal"] == df_after_mf["Arousal"]))
# changes = original_df[mask].copy()
# changes["New Valence"] = df_after_mf.Valence
# changes["New Arousal"] = df_after_mf.Arousal
# print(f"Number of hits: {len(changes)} / {len(original_df)}")
# print(f"Hit ratio: {(len(changes) / len(original_df)) * 100}")
# changes.head(len(changes))