In [1]:
from collections import defaultdict

import numpy as np
import spacy
# `IPython.display` is the public home of these helpers; importing `display`
# from `IPython.core.display` is deprecated.
from IPython.display import HTML, display
from termcolor import colored

import visualization

# Render the highlight stylesheet and script as HTML output of this cell.
# Each file is read inside a `with` block so its handle is closed right away
# instead of being left open for the lifetime of the kernel.
for asset_path in ('visualization/highlight.css', 'visualization/highlight.js'):
    with open(asset_path) as asset_file:
        display(HTML(asset_file.read()))

# spaCy pipeline used by the show_result helpers to tokenize article text.
nlp = spacy.load("ru_core_news_sm")

In [2]:
import sys

# Make the parent directory importable so `span_identification` resolves.
# The membership check keeps repeated runs of this cell from appending the
# same entry to sys.path again.
if "../" not in sys.path:
    sys.path.append("../")
from span_identification.dataset import load_data

# Span Identification

In [3]:
def load_result(file):
    """Read span offsets from a tab-separated file.

    Every non-blank line must contain exactly three tab-separated fields:
    an article id, a span start and a span end. Both offsets are parsed
    with ``int``.

    Note:
        A later cell of this notebook defines another ``load_result`` that
        expects four fields per line; whichever definition ran last is the
        one in effect.

    Args:
        file: Path of the file to read.

    Returns:
        dict: Maps each article id (kept as a string) to the list of
        ``[start, end]`` pairs read for it, in file order.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields,
            or an offset is not a valid integer.
    """
    result = {}
    with open(file, "r") as f:
        for line in f:
            # Blank lines (typically a trailing one) carry no span; without
            # this guard the three-way unpacking below would raise.
            if not line.strip():
                continue
            # The last field still ends with the newline; int() ignores
            # surrounding whitespace, so no explicit strip is needed.
            article_id, spl, spr = line.split('\t')
            result.setdefault(article_id, []).append([int(spl), int(spr)])
    return result


def show_result(result, articles_id, articles_content, task):
    """Render the spans of every article in ``result`` over its tokens.

    Articles are visited in sorted order of their id. Each article's text is
    tokenized with the module-level ``nlp`` pipeline, and every span is
    converted from a pair of offsets into a pair of token positions before
    being handed to ``visualization.render``.

    Note:
        A later cell of this notebook defines another ``show_result`` with a
        different signature; whichever definition ran last is the one in
        effect.

    Args:
        result: Mapping from article id to a sequence of spans; the first two
            items of each span are used as start and end offsets
            (presumably character offsets into the article text, since they
            are compared against spaCy ``token.idx`` values).
        articles_id: Article ids, aligned with ``articles_content``.
        articles_content: Article texts, aligned with ``articles_id``.
        task: Forwarded unchanged as the ``task`` keyword of
            ``visualization.render``.

    Raises:
        KeyError: If an id in ``result`` is not present in ``articles_id``.
        IndexError: If no token satisfies the start or end condition for a
            span.
    """
    text_by_id = dict(zip(articles_id, articles_content))

    for article_id in sorted(result):
        doc = nlp(text_by_id[article_id])
        token_starts = np.array([tok.idx for tok in doc])
        token_texts = [tok.text for tok in doc]

        token_spans = []
        for raw_span in result[article_id]:
            # Work on a copy so the caller's span objects are left untouched.
            bounds = list(raw_span)
            # First token that starts at or after the span start ...
            first_tok = np.flatnonzero(token_starts >= bounds[0])[0]
            # ... and last token that starts strictly before the span end.
            last_tok = np.flatnonzero(token_starts < bounds[1])[-1]
            token_spans.append([first_tok, last_tok])

        if not token_spans:
            continue

        print(colored(article_id, 'red'))
        # render receives a list of span groups; here there is a single group.
        visualization.render(token_texts, [token_spans], task=task)

In [6]:
# Load the dev articles (the third value returned by load_data is not used
# here), read the span file and render its spans with the 'SI' task.
articles_content, articles_id, _ = load_data(
    '../datasets/dev-articles',
    '../tools/data/russian_corpus_techniques.txt',
)
result = load_result('../results/SI_output_dev.txt')
show_result(result, articles_id, articles_content, 'SI')

[31m103[0m


[31m110[0m


[31m111[0m


[31m115[0m


[31m12[0m


[31m123[0m


[31m128[0m


[31m13[0m


[31m135[0m


[31m14[0m


[31m19[0m


[31m26[0m


[31m34[0m


[31m35[0m


[31m44[0m


[31m47[0m


[31m66[0m


[31m68[0m


[31m88[0m


[31m94[0m


[31m99[0m


In [7]:
# Same rendering as for the SI output file, but for the spans stored in
# dev-task-SI.labels (presumably the reference annotations - confirm).
# NOTE(review): load_data is called with the same arguments as in the earlier
# SI cell, so this reload looks redundant when that cell has already run.
articles_content, articles_id, _ = load_data(
    '../datasets/dev-articles',
    '../tools/data/russian_corpus_techniques.txt',
)
result = load_result('../results/dev-task-SI.labels')
show_result(result, articles_id, articles_content, 'SI')

[31m103[0m


[31m110[0m


[31m111[0m


[31m115[0m


[31m12[0m


[31m122[0m


[31m123[0m


[31m128[0m


[31m13[0m


[31m135[0m


[31m14[0m


[31m19[0m


[31m26[0m


[31m34[0m


[31m35[0m


[31m44[0m


[31m46[0m


[31m47[0m


[31m51[0m


[31m62[0m


[31m66[0m


[31m68[0m


[31m88[0m


[31m94[0m


[31m99[0m


# Technique Classification

In [4]:
def load_result(file):
    """Read labelled span offsets from a tab-separated file.

    Every non-blank line must contain exactly four tab-separated fields:
    an article id, a predicted label, a span start and a span end. Both
    offsets are parsed with ``int``.

    Note:
        This redefines the three-field ``load_result`` from the earlier
        span-identification cell; after this cell runs, three-field files
        can no longer be loaded without re-running that earlier cell.

    Args:
        file: Path of the file to read.

    Returns:
        defaultdict: Maps each article id (kept as a string) to a dict that
        maps each label to the list of ``[start, end]`` pairs read for it,
        in file order.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields,
            or an offset is not a valid integer.
    """
    result = defaultdict(dict)
    with open(file, "r") as f:
        for line in f:
            # Blank lines (typically a trailing one) carry no span; without
            # this guard the four-way unpacking below would raise.
            if not line.strip():
                continue
            # The last field still ends with the newline; int() ignores
            # surrounding whitespace, so no explicit strip is needed.
            article_id, prediction, spl, spr = line.split('\t')
            result[article_id].setdefault(prediction, []).append([int(spl), int(spr)])
    return result


def show_result(result, articles_id, articles_content, top=None):
    """Render the labelled spans of each article in ``result``.

    Articles are visited in sorted order of their id. Each article's text is
    tokenized with the module-level ``nlp`` pipeline, every span is converted
    from a pair of offsets into a pair of token positions, and the spans are
    grouped into one list per technique (indexed through the module-level
    ``inverse_mapping``) before being passed to ``visualization.render``
    with ``task='TC'``.

    ``result`` is not modified, so the function can be called repeatedly on
    the same object.

    Note:
        This redefines the ``show_result`` from the earlier
        span-identification cell, which takes a required ``task`` argument.

    Args:
        result: Mapping from article id to a mapping from technique label to
            a sequence of spans; the first two items of each span are used
            as start and end offsets (presumably character offsets into the
            article text, since they are compared against spaCy
            ``token.idx`` values).
        articles_id: Article ids, aligned with ``articles_content``.
        articles_content: Article texts, aligned with ``articles_id``.
        top: Maximum number of articles to render; ``None`` renders all.

    Raises:
        KeyError: If an id in ``result`` is not present in ``articles_id``,
            or a label is not a key of ``inverse_mapping``.
        IndexError: If no token satisfies the start or end condition for a
            span.
    """
    showed = 0
    articles = dict(zip(articles_id, articles_content))
    for article_id in sorted(result):
        # Stop before doing any tokenization once the quota is reached, so
        # that exactly `top` articles are rendered (not `top + 1`).
        if top is not None and showed >= top:
            break

        nlp_text = nlp(articles[article_id])
        tokens_idx = np.array([token.idx for token in nlp_text])
        tokens = [token.text for token in nlp_text]

        # One (initially empty) span list per technique, in `mapping` order.
        spans = [[] for _ in range(len(mapping))]
        for cluster, cluster_spans in result[article_id].items():
            for sp in cluster_spans:
                # Use fresh locals instead of writing back into `sp`: the
                # span lists belong to `result`, and overwriting them would
                # make a second call treat token positions as offsets.
                first_token = np.where(tokens_idx >= sp[0])[0][0]
                last_token = np.where(tokens_idx < sp[1])[0][-1]
                spans[inverse_mapping[cluster]].append([first_token, last_token])

        # `spans` always has len(mapping) slots, so test the slots' contents
        # rather than the length of the outer list.
        if any(spans):
            print(colored(article_id, 'red'))
            visualization.render(tokens, spans, task='TC')
            showed += 1
                
                
# Technique labels keyed by their position in this list. show_result uses
# these integer positions to pick the per-technique span list, so the order
# of the entries matters.
mapping = dict(enumerate((
    'Negative/Positive_concepts',
    '(PT)_Call',
    'Slogan',
    'Obfuscation,vagueness,obscurantism',
    'Consequential_Simplification',
    'Greenwashing',
    'Causal_Simplification',
    'Appeal_to_Hypocrisy',
    'Appeal_to_values',
    'Rumours',
    'Strawman',
    'Whataboutism',
    'Hate_speech,slang,name_calling',
    'Casting_Doubt',
    'Labelling',
    'Substitution_of_an_idea',
    'Statistical_deception',
    'Bluewashing',
    'Appeal_to_authority',
    'Guilt_by_Association',
    'Appeal_to_Time',
    'Flag_waving',
    '“you_should”',
    'Simplified_Interpretation',
    'Appeal_to_fear/prejudice',
    'Loaded_language',
    'Sensational_and/or_provocative_headings',
    '“I_am_like_you”',
    'Exaggeration/Minimization',
    'Red_Herring',
    'Appeal_to_popularity',
    'Repetition',
    'False_Dilemma',
    'Distraction_by_scapegoat',
    'Conversation_Killer',
    'Stereotypes',
)))

# Reverse lookup: technique label -> integer position.
inverse_mapping = {label: index for index, label in mapping.items()}

In [5]:
# Load the dev articles (the third value returned by load_data is not used
# here), read the labelled span file and render every article in it.
articles_content, articles_id, _ = load_data(
    '../datasets/dev-articles',
    '../tools/data/russian_corpus_techniques.txt',
)
result = load_result('../results/TC_output_dev_sc.txt')
show_result(result, articles_id, articles_content)

[31m103[0m


[31m110[0m


[31m111[0m


[31m115[0m


[31m12[0m


[31m123[0m


[31m128[0m


[31m13[0m


[31m135[0m


[31m14[0m


[31m19[0m


[31m26[0m


[31m34[0m


[31m35[0m


[31m44[0m


[31m47[0m


[31m66[0m


[31m68[0m


[31m88[0m


[31m94[0m


[31m99[0m
