## Inspecting features and results

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML

import utils

#### RoBERTa baseline

In [None]:
# # Apply RoBERTa classifier to test set:
# roberta_test_df = utils.apply_roberta("../data/fakedes/test.tsv")
# roberta_test_df.to_csv("../outputs/roberta_results.csv")

In [None]:
# # Get performance using RoBERTa classifier:
# roberta_test_df = pd.read_csv("../outputs/roberta_results.csv")
# roberta_test_df["predicted_label"] = roberta_test_df["predicted_label"].replace({"REAL": 0, "FAKE": 1})
# roberta_test_df["prediction"] = roberta_test_df["predicted_label"]
# roberta_test_df["label"] = roberta_test_df["CATEGORY"]
# roberta_test_df["label"] = roberta_test_df["label"].replace({True: 0, False: 1})
# roberta_test_df["source"] = roberta_test_df["SOURCE"].fillna("")
# # Get performance on test set:
# utils.test_performance(roberta_test_df)

#### Prepare the dataset and features for analysis

In [None]:
# Feature names, in the column order that the plotting cells slice by position.
feature_labels = [
    # Emotions (positions 0-7):
    "anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust",
    # Sentiment (positions 8-9):
    "positive", "negative",
    # Semantico-affective (positions 10-13):
    "valence", "arousal", "concreteness", "imageability",
    # Hyperbolic / hurtful (positions 14-15):
    "hyperbolic", "hurtful",
]

In [None]:
# Build one merged frame per split by calling utils.merge_data_outputs with
# the split name and the feature labels, in the fixed order train/dev/test.
data_df = [
    utils.merge_data_outputs(split_name, feature_labels)
    for split_name in ("train", "dev", "test")
]

# Unpack in the same order as the split names above.
train_df, dev_df, test_df = data_df

#### Describe the dataset

In [None]:
# Report (rows, columns) for each split: train, then dev, then test.
for split_frame in (train_df, dev_df, test_df):
    print(split_frame.shape)

In [None]:
# Topic distribution and size of the training split, per class
# (label 1 is printed as "fake", label 0 as "true").
for heading, label_value in (("=== Train: fake ===", 1), ("\n=== Train: true ===", 0)):
    class_rows = train_df[train_df["label"] == label_value]
    print(heading)
    print(class_rows.topic.value_counts())
    print(class_rows.shape)

In [None]:
# Topic distribution and size of the dev split, per class
# (label 1 is printed as "fake", label 0 as "true").
for heading, label_value in (("=== Dev: fake ===", 1), ("\n=== Dev: true ===", 0)):
    class_rows = dev_df[dev_df["label"] == label_value]
    print(heading)
    print(class_rows.topic.value_counts())
    print(class_rows.shape)

In [None]:
# Topic distribution and size of the test split, per class
# (label 1 is printed as "fake", label 0 as "true").
for heading, label_value in (("=== Test: fake ===", 1), ("\n=== Test: true ===", 0)):
    class_rows = test_df[test_df["label"] == label_value]
    print(heading)
    print(class_rows.topic.value_counts())
    print(class_rows.shape)

In [None]:
# From here on the analyses treat train and dev as a single training split.
# NOTE: this rebinds train_df, so running the cell a second time appends
# dev_df again; the original row index of each frame is kept as-is.
frames_to_join = [train_df, dev_df]
train_df = pd.concat(frames_to_join, sort=False)

In [None]:
# Show the contents of the first row of the test split (positional row 0);
# as the last expression of the cell, the row is rendered as the cell output.
test_df.iloc[0]

#### Explore features position (figure 2)

In [None]:
def moving_average(x, w):
    """Return the simple moving average of ``x`` over windows of ``w`` samples.

    Only windows that lie fully inside ``x`` are kept ('valid' convolution),
    so for ``w <= len(x)`` the result has ``len(x) - w + 1`` entries.
    """
    kernel = np.ones(w)
    window_sums = np.convolve(x, kernel, mode='valid')
    return window_sums / w

In [None]:
## EMOTIONS
## ============================================================

topic = "all" # "all", "covid-19", "politics", "entertainment"
dsplit = "all" # "all", "train", "test"

# Select the rows named by `dsplit`, so that the file name written below
# (which embeds `dsplit`) describes the data that was actually plotted.
# "train" is whatever train_df currently holds (train + dev once the join
# cell has been run).
if dsplit == "all":
    tmp_flow_df = pd.concat([train_df, test_df], sort=True)
elif dsplit == "train":
    tmp_flow_df = train_df
elif dsplit == "test":
    tmp_flow_df = test_df
else:
    raise ValueError("dsplit must be 'all', 'train' or 'test', got " + repr(dsplit))

fake_df = tmp_flow_df[tmp_flow_df["label"] == 1]
true_df = tmp_flow_df[tmp_flow_df["label"] == 0]

if topic != "all":
    # Only per topic:
    fake_df = fake_df[fake_df["topic"] == topic]
    true_df = true_df[true_df["topic"] == topic]

# Average the per-document feature arrays over documents (axis 0), once per class.
# NOTE(review): assumes every `features` entry is a (segments x features) array
# whose columns follow `feature_labels` - confirm against utils.merge_data_outputs.
fake_all_features = np.array(fake_df.features.values.tolist()).mean(0)
true_all_features = np.array(true_df.features.values.tolist()).mean(0)

moving_average_window = 2
# Dotted reference curves: mean over ALL features per row, smoothed with the
# same window as the individual feature curves.
fake_segment_mean = moving_average(fake_all_features.mean(axis=1), moving_average_window)
true_segment_mean = moving_average(true_all_features.mean(axis=1), moving_average_window)

# Keep only emotions (eight first features):
fake_features_array = fake_all_features[:, :8]
true_features_array = true_all_features[:, :8]
current_features = feature_labels[:8]

# Plot features (one figure per feature):
for feat_i, feature_name in enumerate(current_features):
    segment_features_true = moving_average(true_features_array[:, feat_i], moving_average_window)
    segment_features_fake = moving_average(fake_features_array[:, feat_i], moving_average_window)
    # One x position per smoothed value, numbered from 1.
    x_axis = list(range(1, len(segment_features_true) + 1))
    plt.figure(figsize=(9,2))
    plt.plot(x_axis, segment_features_fake, color="red", label="fake")
    plt.plot(x_axis, segment_features_true, color="green", label="true")
    plt.plot(x_axis, fake_segment_mean, color="lightcoral", label="fake avg", linestyle="dotted")
    plt.plot(x_axis, true_segment_mean, color="lightseagreen", label="true avg", linestyle="dotted")
    plt.xticks(x_axis)
    plt.title(feature_name.title())
    plt.legend(loc="upper right")
    plt.savefig("figures/features_x_segment_" + feature_name + "_" + dsplit + "_" + topic + ".png", dpi=300, format="png", bbox_inches="tight")
    plt.show()

In [None]:
## SEMANTICO-AFFECTIVE
## ============================================================

topic = "all" # "all", "covid-19", "politics", "entertainment"
dsplit = "all" # "all", "train", "test"

# Select the rows named by `dsplit`, so that the file name written below
# (which embeds `dsplit`) describes the data that was actually plotted.
# "train" is whatever train_df currently holds (train + dev once the join
# cell has been run).
if dsplit == "all":
    tmp_flow_df = pd.concat([train_df, test_df], sort=True)
elif dsplit == "train":
    tmp_flow_df = train_df
elif dsplit == "test":
    tmp_flow_df = test_df
else:
    raise ValueError("dsplit must be 'all', 'train' or 'test', got " + repr(dsplit))

fake_df = tmp_flow_df[tmp_flow_df["label"] == 1]
true_df = tmp_flow_df[tmp_flow_df["label"] == 0]

if topic != "all":
    # Only per topic:
    fake_df = fake_df[fake_df["topic"] == topic]
    true_df = true_df[true_df["topic"] == topic]

# Average the per-document feature arrays over documents (axis 0), once per class.
# NOTE(review): assumes every `features` entry is a (segments x features) array
# whose columns follow `feature_labels` - confirm against utils.merge_data_outputs.
fake_all_features = np.array(fake_df.features.values.tolist()).mean(0)
true_all_features = np.array(true_df.features.values.tolist()).mean(0)

moving_average_window = 2
# Dotted reference curves: mean over ALL features per row, smoothed with the
# same window as the individual feature curves.
fake_segment_mean = moving_average(fake_all_features.mean(axis=1), moving_average_window)
true_segment_mean = moving_average(true_all_features.mean(axis=1), moving_average_window)

# Keep only semantico-affective features (features 11 to 14):
fake_features_array = fake_all_features[:, 10:14]
true_features_array = true_all_features[:, 10:14]
current_features = feature_labels[10:14]

# Plot features (one figure per feature):
for feat_i, feature_name in enumerate(current_features):
    segment_features_true = moving_average(true_features_array[:, feat_i], moving_average_window)
    segment_features_fake = moving_average(fake_features_array[:, feat_i], moving_average_window)
    # One x position per smoothed value, numbered from 1.
    x_axis = list(range(1, len(segment_features_true) + 1))
    plt.figure(figsize=(9,2))
    plt.plot(x_axis, segment_features_fake, color="red", label="fake")
    plt.plot(x_axis, segment_features_true, color="green", label="true")
    plt.plot(x_axis, fake_segment_mean, color="lightcoral", label="fake avg", linestyle="dotted")
    plt.plot(x_axis, true_segment_mean, color="lightseagreen", label="true avg", linestyle="dotted")
    plt.xticks(x_axis)
    plt.title(feature_name.title())
    plt.legend(loc="upper right")
    plt.savefig("figures/features_x_segment_" + feature_name + "_" + dsplit + "_" + topic + ".png", dpi=300, format="png", bbox_inches="tight")
    plt.show()

In [None]:
## SENTIMENT
## ============================================================

topic = "all" # "all", "covid-19", "politics", "entertainment"
dsplit = "all" # "all", "train", "test"

# Select the rows named by `dsplit`, so that the file name written below
# (which embeds `dsplit`) describes the data that was actually plotted.
# "train" is whatever train_df currently holds (train + dev once the join
# cell has been run).
if dsplit == "all":
    tmp_flow_df = pd.concat([train_df, test_df], sort=True)
elif dsplit == "train":
    tmp_flow_df = train_df
elif dsplit == "test":
    tmp_flow_df = test_df
else:
    raise ValueError("dsplit must be 'all', 'train' or 'test', got " + repr(dsplit))

fake_df = tmp_flow_df[tmp_flow_df["label"] == 1]
true_df = tmp_flow_df[tmp_flow_df["label"] == 0]

if topic != "all":
    # Only per topic:
    fake_df = fake_df[fake_df["topic"] == topic]
    true_df = true_df[true_df["topic"] == topic]

# Average the per-document feature arrays over documents (axis 0), once per class.
# NOTE(review): assumes every `features` entry is a (segments x features) array
# whose columns follow `feature_labels` - confirm against utils.merge_data_outputs.
fake_all_features = np.array(fake_df.features.values.tolist()).mean(0)
true_all_features = np.array(true_df.features.values.tolist()).mean(0)

moving_average_window = 2
# Dotted reference curves: mean over ALL features per row, smoothed with the
# same window as the individual feature curves.
fake_segment_mean = moving_average(fake_all_features.mean(axis=1), moving_average_window)
true_segment_mean = moving_average(true_all_features.mean(axis=1), moving_average_window)

# Keep only sentiment features (features 9 and 10):
fake_features_array = fake_all_features[:, 8:10]
true_features_array = true_all_features[:, 8:10]
current_features = feature_labels[8:10]

# Plot features (one figure per feature):
for feat_i, feature_name in enumerate(current_features):
    segment_features_true = moving_average(true_features_array[:, feat_i], moving_average_window)
    segment_features_fake = moving_average(fake_features_array[:, feat_i], moving_average_window)
    # One x position per smoothed value, numbered from 1.
    x_axis = list(range(1, len(segment_features_true) + 1))
    plt.figure(figsize=(9,2))
    plt.plot(x_axis, segment_features_fake, color="red", label="fake")
    plt.plot(x_axis, segment_features_true, color="green", label="true")
    plt.plot(x_axis, fake_segment_mean, color="lightcoral", label="fake avg", linestyle="dotted")
    plt.plot(x_axis, true_segment_mean, color="lightseagreen", label="true avg", linestyle="dotted")
    plt.xticks(x_axis)
    plt.title(feature_name.title())
    plt.legend(loc="upper right")
    plt.savefig("figures/features_x_segment_" + feature_name + "_" + dsplit + "_" + topic + ".png", dpi=300, format="png", bbox_inches="tight")
    plt.show()

In [None]:
## HYPERBOLIC-HURTFUL
## ============================================================

topic = "all" # "all", "covid-19", "politics", "entertainment"
dsplit = "all" # "all", "train", "test"

# Select the rows named by `dsplit`, so that the file name written below
# (which embeds `dsplit`) describes the data that was actually plotted.
# "train" is whatever train_df currently holds (train + dev once the join
# cell has been run).
if dsplit == "all":
    tmp_flow_df = pd.concat([train_df, test_df], sort=True)
elif dsplit == "train":
    tmp_flow_df = train_df
elif dsplit == "test":
    tmp_flow_df = test_df
else:
    raise ValueError("dsplit must be 'all', 'train' or 'test', got " + repr(dsplit))

fake_df = tmp_flow_df[tmp_flow_df["label"] == 1]
true_df = tmp_flow_df[tmp_flow_df["label"] == 0]

if topic != "all":
    # Only per topic:
    fake_df = fake_df[fake_df["topic"] == topic]
    true_df = true_df[true_df["topic"] == topic]

# Average the per-document feature arrays over documents (axis 0), once per class.
# NOTE(review): assumes every `features` entry is a (segments x features) array
# whose columns follow `feature_labels` - confirm against utils.merge_data_outputs.
fake_all_features = np.array(fake_df.features.values.tolist()).mean(0)
true_all_features = np.array(true_df.features.values.tolist()).mean(0)

moving_average_window = 2
# Dotted reference curves: mean over ALL features per row, smoothed with the
# same window as the individual feature curves.
fake_segment_mean = moving_average(fake_all_features.mean(axis=1), moving_average_window)
true_segment_mean = moving_average(true_all_features.mean(axis=1), moving_average_window)

# Keep only the hyperbolic and hurtful features (features 15 and 16):
fake_features_array = fake_all_features[:, 14:16]
true_features_array = true_all_features[:, 14:16]
current_features = feature_labels[14:16]

# Plot features (one figure per feature):
for feat_i, feature_name in enumerate(current_features):
    segment_features_true = moving_average(true_features_array[:, feat_i], moving_average_window)
    segment_features_fake = moving_average(fake_features_array[:, feat_i], moving_average_window)
    # One x position per smoothed value, numbered from 1.
    x_axis = list(range(1, len(segment_features_true) + 1))
    plt.figure(figsize=(9,2))
    plt.plot(x_axis, segment_features_fake, color="red", label="fake")
    plt.plot(x_axis, segment_features_true, color="green", label="true")
    plt.plot(x_axis, fake_segment_mean, color="lightcoral", label="fake avg", linestyle="dotted")
    plt.plot(x_axis, true_segment_mean, color="lightseagreen", label="true avg", linestyle="dotted")
    plt.xticks(x_axis)
    plt.title(feature_name.title())
    plt.legend(loc="upper right")
    plt.savefig("figures/features_x_segment_" + feature_name + "_" + dsplit + "_" + topic + ".png", dpi=300, format="png", bbox_inches="tight")
    plt.show()

#### Radar plot per topic (figure 3)

In [None]:
# Collect, per class (true/fake) and per split (train/test), the mean value of
# each radar feature for every topic.
true_fake = [0, 1]
topics = ["politics", "entertainment", "covid-19"]
features_for_radar = ["anger", "anticipation", "disgust", "fear",
                  "joy", "sadness", "surprise", "trust", "hurtful"]

# Both dictionaries map a split name to [topics, features, values], where
# values holds one list of feature means per topic (same order as `topics`).
dDataForRadar_true = dict()
dDataForRadar_fake = dict()
for type_news in true_fake:
    # Label 0 fills the "true" dictionary, label 1 the "fake" one.
    radar_target = dDataForRadar_true if type_news == 0 else dDataForRadar_fake
    for dsplit in ["train", "test"]:
        tmp_df = train_df if dsplit == "train" else test_df
        tmp_df = tmp_df[tmp_df["label"] == type_news]
        data_for_radar_tmp = []
        for topic in topics:
            tmp_df_tmp = tmp_df[tmp_df["topic"] == topic]
            if tmp_df_tmp.shape[0] < 10:
                # Fewer than 10 rows: report it and plot zeros for this topic.
                print("No data for:", dsplit, "-", type_news, "-", topic)
                new_list = [0.00 for f in features_for_radar]
            else:
                new_list = [round(np.mean(tmp_df_tmp[f]), 4) for f in features_for_radar]
            data_for_radar_tmp.append(new_list)
        radar_target[dsplit] = [topics, features_for_radar, data_for_radar_tmp]

In [None]:
def example_data():
    """Bundle the radar inputs for the class currently held in ``data_to_plot``.

    Reads the notebook-level ``data_to_plot`` dictionary and returns a list
    whose first item is the spoke (feature) labels, followed by one
    ``(title, per-topic values)`` pair for the training split and one for
    the test split.
    """
    train_entry = data_to_plot["train"]
    test_entry = data_to_plot["test"]
    return [
        train_entry[1],
        ('Training set', train_entry[2]),
        ('Test set', test_entry[2]),
    ]

# One colour per plotted series; zip() below pairs them with the per-topic lists.
colors = ['g', 'r', 'c']

# NOTE: `dsplit` holds the class name ("true"/"fake") in this cell, not a data split.
for dsplit in ["true", "fake"]:
    print(dsplit)

    data_to_plot = dDataForRadar_true if dsplit == "true" else dDataForRadar_fake

    # radar_factory is called before plt.subplots, which requests the 'radar' projection.
    N = len(data_to_plot["train"][1])
    theta = utils.radar_factory(N, frame='polygon')
    spoke_labels, *data = example_data()

    print(data)
    print(spoke_labels)

    fig, axs = plt.subplots(
        figsize=(9, 4.5),
        ncols=2,
        nrows=1,
        subplot_kw=dict(projection='radar'),
    )
    fig.subplots_adjust(wspace=0.35, hspace=0.0, top=0.95, bottom=0.05)

    # Left panel: training set; right panel: test set.
    for ax, (title, case_data) in zip(axs.flat, data):
        ax.set_rgrids([0.02, 0.04, 0.06, 0.08, 0.1])
        ax.set_title(
            title,
            weight='bold',
            size='medium',
            position=(0.5, 1.1),
            horizontalalignment='center',
            verticalalignment='center',
        )
        ax.set_ylim(0, 0.09)
        for d, color in zip(case_data, colors):
            ax.plot(theta, d, color=color)
            ax.fill(theta, d, facecolor=color, alpha=0.25, label='_nolegend_')
        ax.set_varlabels(spoke_labels)

    # The legend entries are the topic names stored alongside the data.
    labels = data_to_plot["train"][0]
    legend = axs[0].legend(labels, loc=(0.95, 0.95), labelspacing=0.1, fontsize='small')

    plt.savefig("figures/radar_" + dsplit + "_sum.png", dpi=300, format="png", bbox_inches="tight")
    plt.show()
    print()

#### Inspect segment attention (figure 4)

In [None]:
# Mean self-attention matrix over test articles that are labelled 0 ("true")
# and were also predicted 0.
mask = (test_df["prediction"] == 0) & (test_df["label"] == 0)
results_both_w_attn_df_pred0 = test_df[mask]
attention_array_both = np.array(results_both_w_attn_df_pred0.attention_scores.values.tolist()).astype(float)
attention_mean_matrix_both = attention_array_both.mean(axis=0)

# Derive the axis range and ticks from the matrix itself instead of reusing an
# `x_axis` variable left behind by an earlier plotting cell: that made this
# cell fail when run on its own and produced ticks that did not match the extent.
# For a 10 x 10 matrix the extent is [1, 10, 10, 1], as before.
n_rows, n_cols = attention_mean_matrix_both.shape
segment_ticks = list(range(1, n_cols + 1))

plt.imshow(attention_mean_matrix_both, interpolation='none', extent=[1, n_cols, n_rows, 1])
# Fixed colour limits so the heat maps of both classes share one scale.
plt.clim(0.0991, 0.1015)
plt.colorbar()
plt.xlabel("Segment")
plt.ylabel("Segment")
plt.xticks(segment_ticks)
plt.savefig("figures/self_attention_true_label.png", dpi=300, format="png", bbox_inches="tight")
plt.show()

# Column means of the attention matrix, summarised over the first three,
# the middle four and the remaining positions.
in_parts = attention_mean_matrix_both.mean(axis=0)
print(in_parts[:3].mean())
print(in_parts[3:7].mean())
print(in_parts[7:].mean())

In [None]:
# Mean self-attention matrix over test articles that are labelled 1 ("fake")
# and were also predicted 1.
mask = (test_df["prediction"] == 1) & (test_df["label"] == 1)
results_both_w_attn_df_pred1 = test_df[mask]
attention_array_both = np.array(results_both_w_attn_df_pred1.attention_scores.values.tolist()).astype(float)
attention_mean_matrix_both = attention_array_both.mean(axis=0)

# Derive the axis range and ticks from the matrix itself instead of reusing an
# `x_axis` variable left behind by an earlier plotting cell: that made this
# cell fail when run on its own and produced ticks that did not match the extent.
# For a 10 x 10 matrix the extent is [1, 10, 10, 1], as before.
n_rows, n_cols = attention_mean_matrix_both.shape
segment_ticks = list(range(1, n_cols + 1))

plt.imshow(attention_mean_matrix_both, interpolation='none', extent=[1, n_cols, n_rows, 1])
# Fixed colour limits so the heat maps of both classes share one scale.
plt.clim(0.0991, 0.1015)
plt.colorbar()
plt.xlabel("Segment")
plt.ylabel("Segment")
plt.xticks(segment_ticks)
# Saved under its own name: writing to "self_attention_true_label.png" here
# overwrote the figure produced for the true-label articles.
plt.savefig("figures/self_attention_fake_label.png", dpi=300, format="png", bbox_inches="tight")
plt.show()

# Column means of the attention matrix, summarised over the first three,
# the middle four and the remaining positions.
in_parts = attention_mean_matrix_both.mean(axis=0)
print(in_parts[:3].mean())
print(in_parts[3:7].mean())
print(in_parts[7:].mean())

#### Visualising an article

In [None]:
# Pick one test article by position and prepare its text and attention
# values for the HTML rendering done in the following cells.
tmp_df = test_df.copy()
doc_row = tmp_df.iloc[369]
text_doc = doc_row.text

# Split the whitespace-normalised words into 10 chunks and keep the first 5
# as the segments to display.
article_words = text_doc.replace("\n", " ").replace("  ", " ").split(" ")
word_chunks = np.array_split(article_words, 10)
text_segments = [" ".join(chunk.tolist()) for chunk in word_chunks][:5]

# One line per segment, each truncated to 150 characters; leading whitespace removed.
segment_previews = [segment[:150] for segment in text_segments]
text_for_viz = ("\n" + "\n".join(segment_previews) + "\n").lstrip()

text_attention = np.array(doc_row["attention_scores"]).astype(float)
emotions_to_highlight, sem_aff_features = utils.highlight_emotions(text_segments)

# Min-max scale the mean attention to [0, 0.9], then turn the first five
# values into integer indices (tenths, i.e. 0..9).
text_attention_vector = text_attention.mean(axis=0)
lowest_attention = np.min(text_attention_vector)
highest_attention = np.max(text_attention_vector)
scaled_mat = (text_attention_vector - lowest_attention) / (highest_attention - lowest_attention) * 0.9
attn_grey_indices = [int(round(value, 1)*10) for value in list(scaled_mat)][:5]

In [None]:
# Display the integer attention indices computed for the selected article.
attn_grey_indices

In [None]:
# Apply utils.color_emotions to the preview text, passing the emotion
# highlights computed for its segments.
original_text = utils.color_emotions(text_for_viz, emotions_to_highlight)

# Build the HTML from the coloured text and the per-segment attention indices.
html_content = utils.create_colored_html_with_rectangle(original_text, attn_grey_indices)

# Render the HTML content in the notebook output.
display(HTML(html_content))

#### Feature mean scores

In [None]:
# Politics-only subsets, keyed by the "<split>, <class>:" tag that is printed
# in front of each mean (label 0 is "true", label 1 is "fake").
politics_subsets = {}
for split_name, split_frame in (("train", train_df), ("test", test_df)):
    for class_name, label_value in (("true", 0), ("fake", 1)):
        selector = (split_frame["label"] == label_value) & (split_frame["topic"] == "politics")
        politics_subsets[split_name + ", " + class_name + ":"] = split_frame[selector]

# For every feature, print its mean (3 decimals) in each subset.
for emotion in feature_labels:
    print(emotion)
    for tag, subset in politics_subsets.items():
        print(tag, round(subset[emotion].mean(), 3))
    print()