In [232]:
import pandas as pd
import os
import re
from tqdm import tqdm
import numpy as np

In [2]:
# Load the article table (tab-separated file) into a DataFrame.
meta = pd.read_csv('/home/alapidus/NIS/articles_with_meta.tsv', sep='\t')

In [3]:
def slurp_lists(path):
    """Read the text file at ``path`` and return its lines as a list.

    Line terminators are left attached to every element.
    """
    with open(path, 'r') as handle:
        lines = list(handle)
    return lines

def spit(path, text):
    """Write ``text`` to the file at ``path``, creating parent directories as needed.

    Parameters
    ----------
    path : str
        Destination file; may be a bare file name without a directory part.
    text : str
        Content to write (the file is opened in text mode and truncated).

    Returns
    -------
    int
        Whatever ``file.write`` returns (the number of characters written).
    """
    parent = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w') as file_object:
        return file_object.write(text)
    
# One list element per line of the extracted-names file (line endings still attached).
names_extracted = slurp_lists('/home/alapidus/NIS/ner_data/ner_corpora_extracted.txt')

## Mark talks and lectures

* Keep file name (path)
* Source: postnauka, polit.ru (lectures)
* Genre: talks, lectures (from polit.ru)
* Text

In [4]:
def generate_filenames(roots):
    """Return the last path component of every entry in ``roots``.

    If an entry contains a backslash, everything after its last backslash is
    kept; otherwise everything after its last forward slash. An entry with no
    separator at all is returned unchanged.
    """
    def _last_component(root):
        separator = '\\' if '\\' in root else '/'
        # rpartition yields ('', '', root) when the separator is absent,
        # so element 2 is the whole string in that case.
        return root.rpartition(separator)[2]

    return [_last_component(root) for root in roots]

In [5]:
def get_texts(df, source):
    """Select the rows of ``df`` whose ``source`` column equals ``source``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``source``, ``path`` and ``text`` columns.
    source : str
        Value to match in the ``source`` column, e.g. ``'https://postnauka.ru'``
        or ``'polit.ru/lectures'``.

    Returns
    -------
    (fnames, texts)
        ``fnames`` is the list produced by ``generate_filenames`` from the
        matching ``path`` values; ``texts`` is the matching ``text`` column.
        Both are empty when no row matches.
    """
    # The former if/elif branches differed only in the literal compared against,
    # and any other value of `source` ended in UnboundLocalError at the return.
    subset = df.loc[df['source'] == source]
    fnames = generate_filenames(subset.path)
    texts = subset.text
    return fnames, texts

In [6]:
# File names and texts of the rows whose source is 'polit.ru/lectures'.
lec_names, lec_scripts = get_texts(meta, 'polit.ru/lectures')

In [7]:
# File names and texts of the rows whose source is 'https://postnauka.ru'.
post_names, post_texts = get_texts(meta, 'https://postnauka.ru')

In [11]:
# Drop trailing whitespace (including the line ending) from every name.
names_extracted = [name.rstrip() for name in names_extracted]

In [185]:
from flashtext import KeywordProcessor

In [181]:
def tag_names2(text, names:list):
    """Wrap every match of any entry of ``names`` in ``text`` as ``&<match>!&``.

    The entries are joined with ``|`` and used as a regular expression as-is,
    so the caller is responsible for escaping regex metacharacters in them.
    Empty entries are ignored; with no usable entry ``text`` is returned
    unchanged.
    """
    # An empty alternative (from [] or from a blank name) is the empty pattern:
    # it matches at every position and would sprinkle '&!&' all over the text.
    alternatives = [name for name in names if name]
    if not alternatives:
        return text
    search_pattern = '|'.join(alternatives)
    replace_pattern = r'&\g<0>!&'
    return re.sub(search_pattern, replace_pattern, text)

In [203]:
def preprocess(text):
    """Keep only runs of Cyrillic letters, hyphens and dots from ``text``.

    The surviving runs are returned joined by single spaces; everything else
    (Latin letters, digits, other punctuation, whitespace) is discarded.
    """
    kept_runs = [found.group(0) for found in re.finditer(r'[А-ЯЁа-яё\-\.]+', text)]
    return ' '.join(kept_runs)

In [207]:
def tag_names(text, processor=None):
    """Tag every known name in ``text`` as ``&name!&``.

    The text is first reduced by ``preprocess`` and split on whitespace; a word
    is tagged when ``processor.extract_keywords(word)`` returns exactly
    ``[word]``. The tagged words are returned joined by single spaces.

    Parameters
    ----------
    text : str
        Raw text to tag.
    processor : object with ``extract_keywords(str) -> list``, optional
        Keyword matcher to use. When omitted, the module-level
        ``names_processor`` is used (NameError if it does not exist), which
        keeps one-argument calls working as before.
    """
    if processor is None:
        processor = names_processor
    words = preprocess(text).split()
    # enumerate gives the position directly; the former list.index() lookup
    # rescanned the list for every tagged word.
    for position, word in enumerate(words):
        if processor.extract_keywords(word) == [word]:
            words[position] = '&' + word + '!&'
    return ' '.join(words)


def tag_texts(texts:list, names:list):
    """Tag the names from ``names`` in every text of ``texts``.

    A ``KeywordProcessor`` is built from ``names`` once and handed to
    ``tag_names`` for each text. Previously that processor was built but never
    consulted, because ``tag_names`` read a global instead.

    Returns the list of tagged texts, in input order.
    """
    names_processor = KeywordProcessor()
    names_processor.add_keywords_from_list(names)
    texts_marked = []
    for text in tqdm(texts):
        texts_marked.append(tag_names(text, names_processor))
    return texts_marked

In [175]:
# Backslash-escape the dots so the names can be used inside a regular expression.
# Raw string: '\.' in a normal literal is an invalid escape sequence and triggers a warning.
names_sub = [name.replace('.', r'\.') for name in names_extracted]

In [209]:
# Pass the plain names: flashtext matches keywords literally, so the
# regex-escaped entries of `names_sub` ('\.') can never equal a word of the text.
tagged_lectures = tag_texts(lec_scripts, names_extracted)

100%|██████████| 562/562 [00:52<00:00, 10.69it/s]


In [214]:
# Pass the plain names: flashtext matches keywords literally, so the
# regex-escaped entries of `names_sub` ('\.') can never equal a word of the text.
tagged_postn = tag_texts(post_texts, names_extracted)

100%|██████████| 4054/4054 [00:57<00:00, 71.03it/s] 


In [9]:
# Directory passed to write_to_dir as the destination of the tagged texts.
output = '/home/alapidus/NIS/ner_data/marked_talks/'

In [221]:
def write_to_dir(fnames, texts, path):
    """Write each text to its own file inside the directory ``path``.

    ``texts`` and ``fnames`` are paired positionally (``zip`` stops at the
    shorter one) and every pair is written with ``spit``. Prints ``Over!``
    when finished.

    Parameters
    ----------
    fnames : iterable of str
        File names, relative to ``path``.
    texts : iterable of str
        File contents, in the same order as ``fnames``.
    path : str
        Target directory, with or without a trailing separator.
    """
    for text, fname in tqdm(zip(texts, fnames)):
        # os.path.join rather than `path + fname`: plain concatenation glues the
        # file name onto the directory name when `path` has no trailing separator.
        spit(os.path.join(path, fname), text)
    print('Over!')

In [222]:
write_to_dir(lec_names, tagged_lectures, output)

562it [00:00, 1823.48it/s]

Over!





In [224]:
write_to_dir(post_names, tagged_postn, output)

4054it [00:00, 6631.33it/s]

Over!





## GET NE CONTEXTS

In [225]:
class Context:
    """One tagged word together with the text found around it.

    Holds four strings: the left context, the right context, the single word
    immediately to the left, and the tagged word itself.
    """

    def __init__(self, left_context: str, right_context: str,
                 left_word: str, word: str):
        (self.left_context, self.right_context,
         self.left_word, self.word) = (left_context, right_context,
                                       left_word, word)

    def __repr__(self):
        fields = (self.left_context, self.right_context,
                  self.left_word, self.word)
        return 'Context("%s", "%s", "%s", "%s")' % fields

def tokenize_text(text: str) -> [str]:
    """Split ``text`` into word tokens.

    A token is a run of word characters, dots, apostrophes and hyphens; a
    leading ``&`` and a trailing ``!``/``&`` stay attached to it, so the tag
    markers survive tokenization.
    """
    token_matches = re.finditer(r'&?[\w.\'-]+!?&?', text, flags=re.UNICODE)
    return [found.group(0) for found in token_matches]

def cleanup_tag(text: str) -> str:
    """Return ``text`` with every ``&`` and ``!`` character removed."""
    marker_removal = {ord('&'): None, ord('!'): None}
    return text.translate(marker_removal)

def get_span(tokens: [str], start: int, end: int) -> str:
    """Join ``tokens[start:end]`` with spaces and strip the tag markers.

    Ordinary slice semantics apply: out-of-range bounds are tolerated and a
    negative ``start`` counts from the end of ``tokens``.
    """
    window = tokens[start:end]
    joined = ' '.join(window)
    return cleanup_tag(joined)

def get_complete_word(index: int, tokens: [str]) -> (str, int, bool):
    """Read the word that starts at ``tokens[index]``.

    Returns ``(word, next_index, is_tag)``:

    * an untagged token is returned as-is with ``is_tag`` False;
    * a tagged word starts at a token beginning with ``&`` and runs up to and
      including the first token ending with ``!&``; its tokens are joined with
      spaces, the markers are removed, and ``is_tag`` is True.

    ``next_index`` is the position of the first token after the word.

    Raises ``ValueError`` when an opened tag is never closed.
    """
    opening = tokens[index]
    if not opening.startswith('&'):
        return opening, index + 1, False

    # The tag opens and closes on the same token.
    if opening.endswith('!&'):
        return cleanup_tag(opening), index + 1, True

    # Multi-token tag: look ahead for the first token that closes it.
    for closing_at in range(index + 1, len(tokens)):
        if tokens[closing_at].endswith('!&'):
            pieces = tokens[index:closing_at + 1]
            return cleanup_tag(' '.join(pieces)), closing_at + 1, True

    raise ValueError('No matching closing tag in tokens: "%s"' % tokens)

# Inline checks for get_complete_word: untagged tokens, a one-token tag,
# and tags spanning two and three tokens.
assert ('a', 1, False) == get_complete_word(0, ['a', 'b'])
assert ('b', 2, False) == get_complete_word(1, ['a', 'b'])
assert ('a', 1, True) == get_complete_word(0, ['&a!&', 'b']), get_complete_word(0, ['&a!&', 'b'])
assert ('a b', 2, True) == get_complete_word(0, ['&a', 'b!&'])
assert ('a b c', 3, True) == get_complete_word(0, ['&a', 'b', 'c!&'])

def extract_contexts(text: str, window_size: int) -> 'list[Context]':
    """Collect a ``Context`` for every tagged word in ``text``.

    Parameters
    ----------
    text : str
        Text in which names are marked as ``&name!&``.
    window_size : int
        Maximum number of tokens kept on each side of a tagged word.

    Returns
    -------
    list of Context
        One entry per tagged word, in order of appearance. ``left_context`` and
        ``right_context`` hold up to ``window_size`` tokens each, and
        ``left_word`` is the single token directly before the word (empty at
        the very start of the text).

    Notes
    -----
    ``window_size`` used to be ignored in favour of a hard-coded 5 tokens on
    the left and 6 on the right; both sides now follow the parameter.
    """
    result = []

    tokens = tokenize_text(text)
    index = 0

    while index < len(tokens):
        word, new_index, is_tag = get_complete_word(index, tokens)
        if is_tag:
            # Clamp at 0: a negative slice start counts from the END of the
            # list, which silently dropped the left context of any tagged word
            # closer than `window_size` tokens to the start of the text.
            left_start = max(index - window_size, 0)
            left_context = get_span(tokens, left_start, index)
            right_context = get_span(tokens, new_index, new_index + window_size)
            left_word = get_span(tokens, max(index - 1, 0), index)
            result.append(Context(left_context, right_context, left_word, word))
        index = new_index

    return result

In [227]:
def compile_contexts_dataframe(texts, window_size):
    """Build a table of all tagged-word contexts found in ``texts``.

    Parameters
    ----------
    texts : iterable of str
        Tagged texts, each passed to ``extract_contexts``.
    window_size : int
        Forwarded unchanged to ``extract_contexts``.

    Returns
    -------
    pandas.DataFrame
        One row per context with the columns ``left_context``, ``left_word``,
        ``named_entity`` and ``right_context``, in order of appearance. The
        frame has these columns even when no context was found.
    """
    columns = ['left_context', 'left_word', 'named_entity', 'right_context']
    # Collect plain row tuples and build the frame once, instead of four
    # parallel lists glued together through a fixed-width NumPy string array.
    rows = [
        (context.left_context, context.left_word,
         context.word, context.right_context)
        for text in texts
        for context in extract_contexts(text, window_size)
    ]
    return pd.DataFrame(rows, columns=columns)

In [228]:
def sort_dataframe(dataframe, column):
    """Return ``dataframe`` with its rows sorted by ``column``.

    The result comes from ``sort_values``, so the input frame is left untouched
    and the rows keep their original index labels.
    """
    sort_keys = [column]
    ordered = dataframe.sort_values(by=sort_keys, axis=0)
    return ordered

In [236]:
# Context table for the tagged lectures, ordered by the word left of each entity.
lectures_con = compile_contexts_dataframe(tagged_lectures, 5)
lectures_con = sort_dataframe(lectures_con, 'left_word')

In [238]:
# Context table for the tagged Postnauka texts, ordered by the word left of each entity.
postn_con = compile_contexts_dataframe(tagged_postn, 5)
postn_con = sort_dataframe(postn_con, 'left_word')

In [240]:
# Both tagged collections in one list: Postnauka texts first, then the lectures.
texts_concat = tagged_postn + tagged_lectures

In [242]:
len(texts_concat)

4616

In [246]:
# Context table over the combined collection, ordered by the word left of each entity.
texts_con = compile_contexts_dataframe(texts_concat, 5)
texts_con = sort_dataframe(texts_con, 'left_word')

In [248]:
# Save as a tab-separated file; the relative path is resolved against the current working directory.
texts_con.to_csv('contexts_postn_lectures.tsv', sep='\t')