## Metrics analysis

This script extracts insights from the previously calculated metrics: it detects potentially relevant words and determines their contribution to each group of results.

In [None]:
from pathlib import Path
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np

# Default figure size and resolution (a dpi of 200 looks fine, but renders slower).
plt.rcParams['figure.figsize'] = [14.4, 10.8]
plt.rcParams['figure.dpi'] = 200

# Checkpoint name handed to the tokenizer loader, and the folder holding the
# metrics files that are read and written below.
CHECKPOINT = 'bert-base-multilingual-cased'
FOLDER = './outputs/DS1/bert-base-multilingual-cased'

# Make sure the output folders for word lists and figures exist.
Path(f'{FOLDER}/words/').mkdir(parents=True, exist_ok=True)
Path(f'{FOLDER}/graphics/').mkdir(parents=True, exist_ok=True)

# Number of top-ranked words displayed in the first chart.
TOP_N = 30

# Cut-off sizes (top 50, top 100, ..., top 500) used for the percentage tables.
TOPS = list(range(50, 501, 50))

# Bar colors: first entry for the "correct" series, second for the "incorrect" one.
R_W_COLORS = ['#2d93ad', '#ff6978']

# Group identifiers as they appear in the column names and count keys.
GROUPS = ['antichina', 'antivacina', 'provacina']

# Display name of each group, used as chart titles.
GROUPS_NAMES = {
    'antichina': 'Anti-sinovaxxers',
    'antivacina': 'Anti-vaxxers',
    'provacina': 'Pro-vaxxers',
}

# Map every row of the counts file from its 'name' to its 'count'.
counts_df = pd.read_csv(f'{FOLDER}/result_counts.csv')
COUNTS = dict(zip(counts_df['name'], counts_df['count']))
print(COUNTS)

# Per-word metrics table that every later cell works on.
words_metrics = pd.read_csv(f'{FOLDER}/words_metrics.csv')
words_metrics

### 1) Words with greatest attention (in general)

In [None]:
# Overall share of correctly predicted results across all groups. A word whose
# attention on correct predictions falls below this share of its total
# attention gets a dashed threshold marker on its bar.
PROPORTION = sum(COUNTS[f'{group}_correct'] for group in GROUPS) / COUNTS['total']

fig, axes = plt.subplots(1, 1, figsize=(8, 8))

absolute_df = words_metrics.sort_values('absolute', ascending=False).head(TOP_N)
absolute_df.plot(
    kind='barh',
    ax=axes,
    x='word',
    y=['absolute_correct', 'absolute_incorrect'],
    # Derive the number from TOP_N so the title cannot drift from the data.
    title=f'Top {TOP_N} words with greatest absolute attention',
    color=R_W_COLORS[:2],
    stacked=True,
    legend=None,
)
# Label both axes explicitly after plotting: which axis pandas' `xlabel`
# keyword labels on horizontal bar charts differs between pandas versions.
axes.set_xlabel('Absolute attention')
axes.set_ylabel('Word')

# Vertical extents are given as axes fractions, so divide by the number of
# bars actually drawn (which can be smaller than TOP_N).
n_bars = len(absolute_df)
threshold_lines = []
for bar, (_, row) in enumerate(absolute_df.iterrows()):
    threshold = PROPORTION * row['absolute']
    if row['absolute_correct'] < threshold:
        threshold_lines.append(
            axes.axvline(
                threshold,
                bar / n_bars,
                (bar + 1) / n_bars,
                color='black',
                linestyle='--',
            )
        )

# Build the legend from explicit handles instead of reordering the automatic
# ones: the reordering mislabelled the two bar series whenever no threshold
# line had been drawn.
legend_handles = list(axes.containers[:2])
legend_labels = [
    'Attention on well-predicted tweets',
    'Attention on mispredicted tweets',
]
if threshold_lines:
    legend_handles.append(threshold_lines[0])
    legend_labels.append('Threshold for misprediction contribution')
axes.legend(legend_handles, legend_labels)

fig.savefig(f'{FOLDER}/graphics/general.svg', format='svg')

### 2) Words with greatest attention (by group)

In [None]:
# Number of top-ranked words displayed per group chart.
TOP_N = 20

fig, axarr = plt.subplots(1, 3, figsize=(14.4, 6.5))

# One horizontal stacked bar chart per group, side by side.
for ax, group in zip(axarr, GROUPS):
    group_df = words_metrics.sort_values(f'{group}_absolute', ascending=False).head(TOP_N)
    # Share of correctly predicted results within this group.
    PROPORTION = COUNTS[f'{group}_correct'] / COUNTS[f'{group}']

    group_df.plot(
        kind='barh',
        ax=ax,
        x='word',
        y=[f'{group}_absolute_correct', f'{group}_absolute_incorrect'],
        title=GROUPS_NAMES[group],
        color=R_W_COLORS[:2],
        stacked=True,
        legend=None,
    )
    # Label both axes explicitly after plotting: which axis pandas' `xlabel`
    # keyword labels on horizontal bar charts differs between pandas versions.
    ax.set_xlabel('Absolute attention')
    ax.set_ylabel('Word')
    # Same x range on every panel so the groups can be compared visually.
    ax.set_xlim(0, 15)

    # Mark each word whose attention on correct predictions is below the
    # group's share of correct results. Extents are axes fractions, so divide
    # by the number of bars actually drawn (which can be smaller than TOP_N).
    n_bars = len(group_df)
    for bar, (_, row) in enumerate(group_df.iterrows()):
        threshold = PROPORTION * row[f'{group}_absolute']
        if row[f'{group}_absolute_correct'] < threshold:
            ax.axvline(
                threshold,
                bar / n_bars,
                (bar + 1) / n_bars,
                color='black',
                linestyle='--',
            )

fig.subplots_adjust(wspace=0.45)
fig.savefig(f'{FOLDER}/graphics/groups.svg', format='svg')

### 3) Get _relevant_, _positively-contributing_ and _existing-in-vocabulary_ words

In [None]:
from transformers import BertTokenizer

# Load the tokenizer for the configured checkpoint and keep the entries of
# its vocabulary as a plain list.
tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)
vocab_words = list(tokenizer.vocab)

In [None]:
# For every cut-off in TOPS, summarise the words with the greatest absolute
# attention: their mean attention, and the percentage of them that are
# relevant (also in the tf-idf top), positively contributing (attention on
# correct predictions at or above the overall share of correct results) and
# present in the tokenizer vocabulary.
general_percentages = {}

# Overall share of correctly predicted results across all groups.
PROPORTION = sum(COUNTS[f'{group}_correct'] for group in GROUPS) / COUNTS['total']

# A set gives constant-time membership tests; the vocabulary list would be
# scanned linearly for every single word.
vocabulary = set(vocab_words)

# Both rankings are independent of the cut-off, so sort only once.
by_attention = words_metrics.sort_values('absolute', ascending=False)
by_tfidf = words_metrics.sort_values('tfidf', ascending=False)

for top in TOPS:
    absolute_df = by_attention.head(top)

    if top == 500:
        # Same path form as every other output (no extra './' prefix, which
        # would produce a wrong path if FOLDER were absolute).
        absolute_df.to_csv(f'{FOLDER}/words/top_{top}_words.csv', sep=';', index=None)

    # Relevant words: in the top of both the attention and the tf-idf ranking.
    tfidf = by_tfidf.head(top)
    relevant_words = set(absolute_df['word']) & set(tfidf['word'])
    # Sorted so the file content is reproducible (set order varies between
    # runs); key=str keeps sorting safe if a word was parsed as a non-string.
    pd.DataFrame({'word': sorted(relevant_words, key=str)}).to_csv(
        f'{FOLDER}/words/relevant_{top}.csv', index=None
    )

    # Positively contributing words: attention on correct predictions is at
    # least PROPORTION of the word's total attention.
    absolute_correct = absolute_df[
        absolute_df['absolute_correct'] >= PROPORTION * absolute_df['absolute']
    ]
    correct_words = set(absolute_correct['word'])
    pd.DataFrame({'word': sorted(correct_words, key=str)}).to_csv(
        f'{FOLDER}/words/correct_{top}.csv', index=None
    )

    words_present_in_vocab = sum(word in vocabulary for word in absolute_df['word'])

    general_percentages[top] = {
        'average_attention': np.mean(absolute_df['absolute'].values),
        'relevant': len(relevant_words) / top * 100,
        'correct': len(correct_words) / top * 100,
        'in_vocabulary': words_present_in_vocab / top * 100,
    }

# One row per cut-off, with the cut-off itself as the first column.
percentages_df = pd.DataFrame(general_percentages).T
percentages_df.insert(0, column='top', value=percentages_df.index)
percentages_df.to_csv(f'{FOLDER}/words/general_percentages.csv', index=None, sep=';')
percentages_df

In [None]:
# Shapiro-Wilk normality test: print the p-value obtained for each metric
# column of the percentages table.
for metric in ('average_attention', 'correct', 'in_vocabulary'):
    p_value = stats.shapiro(percentages_df[metric].values).pvalue
    print('%s: %.4f' % (metric, p_value))

In [None]:
# Spearman rank correlations between the columns of the percentages table;
# each result is printed as the coefficient followed by its p-value.
attention = percentages_df['average_attention'].values
correct = percentages_df['correct'].values
in_vocabulary = percentages_df['in_vocabulary'].values

rho, p_value = stats.spearmanr(attention, correct)
print("Corr. attention vs well-contributing: %.4f (p=%.4f)" % (rho, p_value))

rho, p_value = stats.spearmanr(correct, in_vocabulary)
print("Corr. well-contributing vs in-vocabulary: %.4f (p=%.4f)" % (rho, p_value))

In [None]:
# For every group and every cut-off in TOPS, summarise the words with the
# greatest attention within the group: their mean attention, and the
# percentage of them that are relevant (also in the group's tf-idf top) and
# positively contributing (attention on correct predictions at or above the
# group's share of correct results).
group_percentages = {}

for group in GROUPS:
    # Share of correctly predicted results within this group.
    PROPORTION = COUNTS[f'{group}_correct'] / COUNTS[f'{group}']

    # Both rankings are independent of the cut-off, so sort once per group.
    by_attention = words_metrics.sort_values(f'{group}_absolute', ascending=False)
    by_tfidf = words_metrics.sort_values(f'{group}_tfidf', ascending=False)

    for top in TOPS:
        absolute = by_attention.head(top)
        tfidf = by_tfidf.head(top)

        relevant_words = set(absolute['word']) & set(tfidf['word'])

        absolute_correct = absolute[
            absolute[f'{group}_absolute_correct'] >= PROPORTION * absolute[f'{group}_absolute']
        ]
        # Count this group's own words. The previous code intersected with
        # `absolute_df`, a stale frame left behind by an earlier cell, which
        # silently dropped words missing from that unrelated selection.
        correct_words = set(absolute_correct['word'])

        group_percentages[f'{group}_{top}'] = {
            # NOTE(review): this averages the overall 'absolute' column, not
            # f'{group}_absolute' - confirm that this is intended.
            'average_attention': np.mean(absolute['absolute'].values),
            'relevant': len(relevant_words) / top * 100,
            'correct': len(correct_words) / top * 100,
        }

# One row per group/cut-off pair, with that key as the first column.
percentages_df = pd.DataFrame(group_percentages).T
percentages_df.insert(0, column='top', value=percentages_df.index)
percentages_df.to_csv(f'{FOLDER}/words/groups_percentages.csv', index=None, sep=';')
percentages_df

In [None]:
# Spearman rank correlation between the mean attention and the percentage of
# positively contributing words in the current percentages table.
rho, p_value = stats.spearmanr(
    percentages_df['average_attention'].values,
    percentages_df['correct'].values,
)
print("Corr. attention vs well-contributing: %.4f (p=%.4f)" % (rho, p_value))