## Metrics analysis

This notebook extracts insights from the previously calculated metrics: it detects potentially relevant words and determines their contribution to each group of results.

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np

# Figure defaults: a large canvas at 200 dpi (sharp output, but slower to render).
plt.rcParams.update({
    'figure.figsize': [14.4, 10.8],
    'figure.dpi': 200,
})

# Checkpoint whose outputs are analysed here.
# Alternative checkpoint: 'neuralmind/bert-base-portuguese-cased'
CHECKPOINT = 'bert-base-multilingual-uncased'
FOLDER = './outputs/bert-base-multilingual-uncased'

# Make sure the output folders for the word lists and the figures exist.
for subfolder in ('words', 'graphics'):
    Path(f'{FOLDER}/{subfolder}/').mkdir(parents=True, exist_ok=True)

# Number of words shown in the plots, and the list sizes used for the word reports.
TOP_N = 30
TOPS = [50, 100, 150, 200, 250, 500]

# Two-colour palette: the plots use index 0 for the "correct" series and
# index 1 for the "incorrect" series.
R_W_COLORS = ['#2d93ad', '#ff6978']

COLORS = ['#C7AC92', '#48E5C2', '#3D3A4B']

# Group identifiers (used as column/key prefixes) and their display names.
GROUPS = ['antichina', 'antivacina', 'provacina']
GROUPS_NAMES = {
    'antichina': 'Anti-sinovaxxers',
    'antivacina': 'Anti-vaxxers',
    'provacina': 'Pro-vaxxers',
}

# Map every entry of result_counts.csv from its 'name' to its 'count'.
counts_df = pd.read_csv(f'{FOLDER}/result_counts.csv')
COUNTS = dict(zip(counts_df['name'].tolist(), counts_df['count'].tolist()))
print(COUNTS)

# Per-word metrics table consumed by every cell below.
words_metrics = pd.read_csv(f'{FOLDER}/words_metrics.csv')
words_metrics

### 1) Words with greatest attention (in general)

In [None]:
# Share of all results counted as correct: the sum of the per-group
# '*_correct' counts divided by the 'total' count.
PROPORTION = (
    COUNTS['antichina_correct']
    + COUNTS['antivacina_correct']
    + COUNTS['provacina_correct']
) / COUNTS['total']

fig, axarr = plt.subplots(1, 2, figsize=(14.4, 8))
fig.subplots_adjust(wspace=0.35)

axarr[0].set_xlabel('Absolute attention')
axarr[1].set_xlabel('Relative attention')

# Left panel: words ranked by absolute attention, split into correct/incorrect parts.
absolute_df = words_metrics.sort_values('absolute', ascending=False).head(TOP_N)
absolute_df.plot(
    kind='barh',
    ax=axarr[0],
    x='word',
    y=['absolute_correct', 'absolute_incorrect'],
    # Derive the title from TOP_N so it stays right when TOP_N is changed elsewhere.
    title=f'Top {TOP_N} words with greatest absolute attention',
    color=[R_W_COLORS[0], R_W_COLORS[1]],
    stacked=True,
    legend=None,
    xlabel='Word',
)

# Draw a dashed marker at PROPORTION * total attention on every bar whose
# "correct" part falls below that value. axvline takes ymin/ymax as fractions
# of the axis height; this assumes each bar occupies an equal 1/n_bars slice,
# in row order. The real number of bars is used, which may be below TOP_N.
n_bars = len(absolute_df)
threshold_line = None
for bar, (_, row) in enumerate(absolute_df.iterrows()):
    threshold = PROPORTION * row['absolute']
    if row['absolute_correct'] < threshold:
        line = axarr[0].axvline(
            threshold, bar / n_bars, (bar + 1) / n_bars,
            color='black', linestyle='--',
        )
        if threshold_line is None:
            # One marker is enough to represent the threshold in the legend.
            threshold_line = line

# Build the legend explicitly. The threshold markers carry no label, so the
# handles returned here are only the two bar series, in plotting order. The
# threshold entry is appended only when a marker was actually drawn, so the bar
# labels can never be shifted onto the wrong series.
bar_handles, _ = axarr[0].get_legend_handles_labels()
legend_handles = list(bar_handles[:2])
legend_labels = ['Attention on well-predicted tweets', 'Attention on mispredicted tweets']
if threshold_line is not None:
    legend_handles.append(threshold_line)
    legend_labels.append('Threshold for misprediction contribution')
axarr[0].legend(legend_handles, legend_labels)

# Right panel: words ranked by relative attention (no threshold markers, no legend).
relative_df = words_metrics.sort_values('relative', ascending=False).head(TOP_N)
relative_df.plot(
    kind='barh',
    ax=axarr[1],
    x='word',
    y=['relative_correct', 'relative_incorrect'],
    title=f'Top {TOP_N} words with greatest relative attention',
    color=[R_W_COLORS[0], R_W_COLORS[1]],
    stacked=True,
    legend=None,
    xlabel='',
)

fig.savefig(f'{FOLDER}/graphics/general.svg', format='svg')

### 2) Words with greatest attention (by group)

In [None]:
# The per-group panels are narrower, so fewer words are shown in each.
TOP_N = 20

fig, axarr = plt.subplots(1, 3, figsize=(14.4, 6.5))

for i, group in enumerate(GROUPS):
    ax = axarr[i]
    total_col = f'{group}_absolute'
    correct_col = f'{group}_absolute_correct'
    incorrect_col = f'{group}_absolute_incorrect'

    ax.set_xlabel('Absolute attention')

    # Words with the greatest absolute attention for this group.
    group_df = words_metrics.sort_values(total_col, ascending=False).head(TOP_N)
    # Ratio of this group's '<group>_correct' count to its '<group>' count.
    PROPORTION = COUNTS[f'{group}_correct'] / COUNTS[f'{group}']

    # Dashed marker on every bar whose "correct" part is below
    # PROPORTION * total; ymin/ymax are fractions of the axis height.
    first_marker = True
    for slot, (_, row) in enumerate(group_df.iterrows()):
        cutoff = PROPORTION * row[total_col]
        if not row[correct_col] < cutoff:
            continue
        # Only the first marker of each panel carries the 'aa' label.
        label_kwargs = {'label': 'aa'} if first_marker else {}
        ax.axvline(
            cutoff, (slot / TOP_N), ((slot + 1) / TOP_N),
            color='black', linestyle='--', **label_kwargs,
        )
        first_marker = False

    group_df.plot(
        kind='barh',
        ax=ax,
        x='word',
        y=[correct_col, incorrect_col],
        title=GROUPS_NAMES[group],
        color=[R_W_COLORS[0], R_W_COLORS[1]],
        stacked=True,
        legend=None,
        xlabel='Word',
    )
    # Same x-range on every panel so the groups can be compared visually.
    ax.set_xlim(0, 15)

fig.subplots_adjust(wspace=0.4)
fig.savefig(f'{FOLDER}/graphics/groups.svg', format='svg')

### 3) Get _relevant_, _positive-contributing_ and _existing-in-vocabulary_ words

In [None]:
from transformers import BertTokenizer

# Load the tokenizer of the analysed checkpoint and keep the entries of its
# vocabulary mapping as a plain list.
tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)
vocab_words = list(tokenizer.vocab)

In [None]:
# For each list size in TOPS, take the words with the greatest absolute
# attention and report which percentage of them
#   * also appear among the top words by TF-IDF ('relevant'),
#   * pass the PROPORTION filter below ('correct'),
#   * exist in the tokenizer vocabulary ('in_vocabulary').
# The matching word lists are written to <FOLDER>/words/.
general_percentages = {}

# Share of all results counted as correct: the sum of the per-group
# '*_correct' counts divided by the 'total' count.
PROPORTION = (
    COUNTS['antichina_correct']
    + COUNTS['antivacina_correct']
    + COUNTS['provacina_correct']
) / COUNTS['total']

# Build the lookup set once: membership tests on the vocabulary list would
# scan the whole list for every single word.
vocab_lookup = set(vocab_words)

for top in TOPS:
    print(f'TOP {top}:')

    tfidf = words_metrics.sort_values('tfidf', ascending=False).head(top)
    absolute_df = words_metrics.sort_values('absolute', ascending=False).head(top)
    np.savetxt(f'{FOLDER}/words/general_{top}.txt', absolute_df.word.values, delimiter=',', newline='\n', fmt="%s")

    # The full metrics are exported only for the list of 500 words.
    if top == 500:
        absolute_df.to_csv(f'{FOLDER}/words/general_{top}.csv', sep=';', index=None)

    # Relevant: words that are in the top list by attention AND by TF-IDF.
    intersection = set(absolute_df.word).intersection(set(tfidf.word))
    percentage = len(intersection) / top * 100
    np.savetxt(f'{FOLDER}/words/general_relevant_{top}.txt', list(intersection), delimiter=',', newline='\n', fmt="%s")
    relevant_percentage = percentage
    print(f'Relevant words based on TFIDF: {percentage}%')

    # NOTE(review): this filter keeps the words whose "correct" attention is
    # BELOW PROPORTION * total attention. The plotting cell labels the same
    # cutoff 'Threshold for misprediction contribution', while the result is
    # reported here as contributing to correctness. Confirm the intended
    # direction of the comparison; behaviour is left unchanged.
    absolute_correct = absolute_df[absolute_df.absolute_correct < PROPORTION * absolute_df.absolute]

    # absolute_correct is a row subset of absolute_df, so its words are
    # already the intersection with the top list.
    intersection = set(absolute_correct.word)
    percentage = len(intersection) / top * 100
    np.savetxt(f'{FOLDER}/words/general_correct_{top}.txt', list(intersection), delimiter=',', newline='\n', fmt="%s")
    correct_percentage = percentage
    print(f'Words that contribute to correctness: {percentage}%')

    # In vocabulary: top words that are entries of the tokenizer vocabulary.
    words_present_in_vocab = sum(word in vocab_lookup for word in absolute_df.word)
    percentage = words_present_in_vocab / top * 100
    print(f'Words in vocabulary: {percentage}%')
    print('')

    general_percentages[f'general_{top}'] = {
        'relevant': relevant_percentage,
        'correct': correct_percentage,
        'in_vocabulary': percentage,
    }


# One row per list size; the row key is also stored as the first column, 'top'.
percentages_df = pd.DataFrame(general_percentages).T
percentages_df.insert(0, column='top', value=percentages_df.index)
percentages_df.to_csv(f'{FOLDER}/words/general_percentages.csv', index=None, sep=';')
percentages_df

In [None]:
# For each group and each list size in TOPS, take the words with the greatest
# absolute attention within the group and report which percentage of them
#   * also appear among the group's top words by TF-IDF ('relevant'),
#   * pass the group's PROPORTION filter below ('correct').
# The matching word lists are written to <FOLDER>/words/.
group_percentages = {}

for group in GROUPS:
    # Ratio of this group's '<group>_correct' count to its '<group>' count.
    PROPORTION = COUNTS[f'{group}_correct'] / COUNTS[f'{group}']

    for top in TOPS:
        tfidf = words_metrics.sort_values(f'{group}_tfidf', ascending=False).head(top)
        absolute = words_metrics.sort_values(f'{group}_absolute', ascending=False).head(top)

        # Relevant: words in the group's top list by attention AND by TF-IDF.
        intersection = set(absolute.word).intersection(set(tfidf.word))
        percentage = len(intersection) / top * 100
        np.savetxt(f'{FOLDER}/words/{group}_relevant_{top}.txt', list(intersection), delimiter=',', newline='\n', fmt="%s")
        relevant_percentage = percentage

        # Keeps the words whose "correct" attention is below
        # PROPORTION * total attention for this group.
        absolute_correct = absolute[absolute[f'{group}_absolute_correct'] < PROPORTION * absolute[f'{group}_absolute']]

        # Use this group's own selection. The previous code intersected with
        # `absolute_df`, a leftover variable holding the GENERAL top words from
        # an earlier cell, which silently dropped group words that were not in
        # that general list.
        intersection = set(absolute_correct.word)
        percentage = len(intersection) / top * 100
        np.savetxt(f'{FOLDER}/words/{group}_correct_{top}.txt', list(intersection), delimiter=',', newline='\n', fmt="%s")

        group_percentages[f'{group}_{top}'] = {
            'relevant': relevant_percentage,
            'correct': percentage,
        }

# One row per group/list size; the row key is also stored as the first column, 'top'.
percentages_df = pd.DataFrame(group_percentages).T
percentages_df.insert(0, column='top', value=percentages_df.index)
percentages_df.to_csv(f'{FOLDER}/words/groups_percentages.csv', index=None, sep=';')
percentages_df