In [153]:
import re
import os
import pymorphy2 
morph = pymorphy2.MorphAnalyzer()
from tqdm import tqdm

In [2]:
# Directory holding the raw input texts and directory that receives the
# tagged drafts; both are passed to make_draft_files further down.
# NOTE(review): absolute, machine-specific paths -- adjust before re-running.
test_texts_root = r'/home/nst/mount/data/share/yd/'\
                   'popular_science_texts_store/ner_markup/test_ner/test_texts/'
    
output_root = r'/home/nst/mount/data/share/yd/'\
               'popular_science_texts_store/ner_markup/test_ner/output_texts/'

In [111]:
# One sample article, used below to try out slurp (read) and spit (write).
file_name = 'nplus1.ru-news-2015-05-10-darkenergy.txt'
sample_path_in = test_texts_root + file_name
sample_path_out = output_root + file_name

In [3]:
def slurp(path, encoding='utf-8'):
    """Read a whole text file and return its contents as one string.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so that Cyrillic texts decode the same way regardless
            of the platform's locale encoding.

    Returns:
        The complete file contents as ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    with open(path, 'r', encoding=encoding) as file_object:
        return file_object.read()
    
def spit(path, text, encoding='utf-8'):
    """Write ``text`` to ``path``, creating the parent directory if needed.

    Args:
        path: Destination file path. An existing file is overwritten.
        text: String to write.
        encoding: Text encoding used for the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The value returned by ``file.write`` (number of characters written).
    """
    directory = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'w', encoding=encoding) as file_object:
        return file_object.write(text)

In [47]:
# Load the sample article into memory.
text = slurp(sample_path_in)

In [53]:
# Write the sample back out; the cell shows spit's return value
# (the result of file.write, 5021 in the recorded run).
spit(sample_path_out, text)

5021

In [4]:
def assertEqual(a, b):
    """Minimal inline test helper: fail loudly when two values differ.

    Args:
        a: Expected value.
        b: Actual value.

    Raises:
        AssertionError: If ``a != b``; the message shows both values.
    """
    differs = a != b
    if not differs:
        return
    message = 'expected "%s", actual "%s"' % (a, b)
    raise AssertionError(message)

In [5]:
def insert_tags(text, index_start, index_end):
    """Wrap the slice ``text[index_start:index_end]`` in draft tags.

    Args:
        text: Source string.
        index_start: Index of the first character to wrap.
        index_end: Index one past the last character to wrap.

    Returns:
        A new string in which the slice is preceded by ``'<>'`` and
        followed by ``'</>'``; everything else is unchanged.
    """
    before = text[:index_start]
    wrapped = text[index_start:index_end]
    after = text[index_end:]
    return ''.join((before, '<>', wrapped, '</>', after))

# Inline check: characters 7..13 of the sample are wrapped in the draft tags.
assertEqual('ученый <>Хокинг</>', 
           insert_tags('ученый Хокинг', 7, 13))

In [202]:
# Scratch check of the capitalised-word pattern: one uppercase letter followed
# by at least one lowercase letter (Cyrillic or Latin). The single-letter word
# 'Я' therefore does not match; only 'Макфа' is printed.
text = 'Я ем макароны Макфа'
pattern = re.compile(r'([А-ЯЁ][а-яё]+|[A-Z][a-z]+)')
find_ner = re.finditer(pattern, text)
n = [match.group() for match in find_ner]  
print(n)

['Макфа']


In [206]:
def tag_text(text, processor=None):
    """Wrap every known proper name in ``text`` in draft tags.

    Candidates are capitalised words (one uppercase letter followed by at
    least one lowercase letter, Cyrillic or Latin). A candidate is tagged
    when ``processor.extract_keywords(word) == [word]``.

    Args:
        text: Raw text to mark up.
        processor: Object with an ``extract_keywords(str) -> list`` method
            (e.g. a flashtext ``KeywordProcessor``). When omitted, the
            notebook-level ``names_processor`` is used, so that global must
            already exist at call time.

    Returns:
        The text with each recognised name wrapped by ``insert_tags``.
    """
    if processor is None:
        # Backward-compatible default: the notebook's global name dictionary.
        processor = names_processor
    pattern = re.compile(r'([А-ЯЁ][а-яё]+|[A-Z][a-z]+)')
    # Walk the matches right-to-left so that inserting tags never shifts the
    # spans of the matches that are still to be processed.
    for match in reversed(list(pattern.finditer(text))):
        word = match.group()
        if processor.extract_keywords(word) == [word]:
            start_index, end_index = match.span()
            text = insert_tags(text, start_index, end_index)
    return text

# Inline check of tag_text. It only passes when names_processor already
# recognises 'Иванов' and 'России'; names_processor is created in a later
# cell of this notebook, so that cell has to be run first.
assertEqual('ученый <>Иванов</> из <>России</>', 
           tag_text('ученый Иванов из России'))

In [7]:
def extract_capitals(text):
    """
    Collects every capitalised word found in `text`.

    A word qualifies when it is one uppercase letter followed by one or
    more lowercase letters, either Cyrillic or Latin. Matches are returned
    in order of appearance and duplicates are kept.
    """
    capitalised_word = r'([А-ЯЁ][а-яё]+|[A-Z][a-z]+)'
    # The pattern has a single group spanning the whole match, so findall
    # yields exactly the matched words.
    return re.findall(capitalised_word, text)

In [109]:
def make_capitals_list(input_path, output_path):
    """
    Creates a list of potential names

    Walks every file below `input_path`, collects the capitalised words
    found by `extract_capitals`, and writes the unique words to
    `output_path`, one per line, in sorted order.
    """
    capitals = set()
    for root, _dirs, files in os.walk(input_path):
        for file_name in files:
            # Join with `root` (not `input_path`) so that files in nested
            # folders resolve and a missing trailing slash is harmless.
            text = slurp(os.path.join(root, file_name))
            capitals.update(extract_capitals(text))
    # The context manager guarantees the file is flushed and closed; sorting
    # makes the written list reproducible between runs.
    with open(output_path, 'w') as file_object:
        for word in sorted(capitals):
            file_object.write("%s\n" % word)

In [131]:
def compile_lemmas_list(input_path):
    """
    Builds the list of dictionary lemmas

    Reads every file below `input_path` and collects each run of lowercase
    Cyrillic letters found in it.
    NOTE(review): uppercase letters are not part of the pattern, so an entry
    that starts with a capital contributes only its lowercase tail -- confirm
    the dictionary files are all lowercase.

    Returns a sorted list of the unique tokens.
    """
    unique_lemmas = set()
    for root, _dirs, files in os.walk(input_path):
        for file_name in files:
            # Join with `root` (not `input_path`) so that files in nested
            # folders resolve and a missing trailing slash is harmless.
            content = slurp(os.path.join(root, file_name))
            unique_lemmas.update(re.findall(r'[а-яё]+', content))
    # Sorted so the result is reproducible between runs.
    return sorted(unique_lemmas)

In [112]:
# Directory scanned by compile_lemmas_list for dictionary files.
dicts_root = '/home/nst/mount/data/share/yd/popular_science_texts_store/ner_markup/slovnik/'

In [132]:
# Unique lowercase Cyrillic tokens gathered from the files under dicts_root.
lemmas = compile_lemmas_list(dicts_root)

In [135]:
# Size of the lemma list (103258 in the recorded run).
len(lemmas)

103258

In [136]:
# Corpus directory to scan and the file that receives the capitalised-word list.
# NOTE(review): texts_path spells out the same directory as test_texts_root.
texts_path = '/home/nst/mount/data/share/yd/popular_science_texts_store/'\
'ner_markup/test_ner/test_texts/'
list_output = '/home/nst/mount/data/share/yd/popular_science_texts_store/ner_markup/capitals.txt'

In [176]:
# Write the unique capitalised words of the corpus to list_output.
make_capitals_list(texts_path, list_output)

In [177]:
# Read the word list back, one entry per line; readlines keeps each
# entry's trailing newline.
with open(list_output, 'r') as fo:
    capitals = fo.readlines()

In [178]:
# Number of capitalised-word candidates (10667 in the recorded run).
len(capitals)

10667

In [140]:
from flashtext import KeywordProcessor

In [None]:
# Membership idiom used throughout this notebook: a KeywordProcessor is treated
# as "containing" a word when processor.extract_keywords(word) == [word].
# (This cell used to hold that condition as a bare, colon-less `if`, which is a
# SyntaxError and would abort a Restart & Run All.)

In [141]:
# Load the lemma list into a flashtext KeywordProcessor; delete_common_words
# uses it to test whether a lemma is a known common word.
lemmas_processor = KeywordProcessor()
lemmas_processor.add_keywords_from_list(lemmas)

In [142]:
# Probe: in the recorded run the word comes back as the single extracted keyword.
lemmas_processor.extract_keywords('домашний')

['домашний']

In [175]:
def delete_common_words(list_output_path):
    """
    Filters the capitalised-word list down to likely proper names

    Each line of the file at `list_output_path` is lowercased, stripped and
    reduced to the normal form of its first `morph.parse` result. A word is
    dropped when `lemmas_processor` returns exactly that normal form for it;
    every other word is kept (stripped, original case).

    Prints the number of remaining words and returns them as a list of
    unique strings in arbitrary order.
    """
    with open(list_output_path, 'r') as fo:
        capitals = fo.readlines()

    def is_common(raw_line):
        # True when the word's normal form is a known dictionary lemma.
        normal_form = morph.parse(raw_line.lower().rstrip())[0].normal_form
        return lemmas_processor.extract_keywords(normal_form) == [normal_form]

    kept = [raw_line.rstrip()
            for raw_line in tqdm(capitals)
            if not is_common(raw_line)]
    potential_names = list(set(kept))
    print('Deleted common words, current size:', len(potential_names))
    return potential_names

In [179]:
# Candidates left after removing dictionary words (6257 in the recorded run).
proper_names = delete_common_words(list_output)

100%|██████████| 10667/10667 [00:03<00:00, 2692.34it/s]

Deleted common words, current size: 6257





## Make draft files ##

In [181]:
# Keyword processor holding the proper-name candidates; tag_text reads this
# global to decide which capitalised words get tagged.
names_processor = KeywordProcessor()
names_processor.add_keywords_from_list(proper_names)

In [182]:
# Probe: in the recorded run the name comes back as the single extracted keyword.
names_processor.extract_keywords('Гирц')

['Гирц']

In [207]:
def make_draft_files(test_texts_root, output_root):
    """
    Writes a tagged draft for every text file

    Every file below `test_texts_root` is read, passed through `tag_text`
    and written below `output_root` under the same relative folder, with its
    extension replaced by `.xml`.
    """
    for root, _dirs, files in os.walk(test_texts_root):
        # Mirror the input sub-folder below output_root so that equally named
        # files from different folders cannot overwrite each other.
        relative_dir = os.path.relpath(root, test_texts_root)
        target_dir = os.path.normpath(os.path.join(output_root, relative_dir))
        for file_name in files:
            # splitext copes with extensions of any length (or none), unlike
            # slicing off the last three characters.
            stem, _extension = os.path.splitext(file_name)
            # Join with `root`, the folder os.walk is currently visiting.
            raw_text = slurp(os.path.join(root, file_name))
            text = tag_text(raw_text)
            spit(os.path.join(target_dir, stem + '.xml'), text)

In [208]:
# Tag every text under test_texts_root and write the drafts to output_root.
make_draft_files(test_texts_root, output_root)

In [8]:
# Folder with the tagged drafts and folder for the final markup.
# NOTE(review): draft_texts_root spells out the same directory as output_root.
draft_texts_root = r'/home/nst/mount/data/share/yd/'\
               'popular_science_texts_store/ner_markup/test_ner/output_texts/'
    
final_texts_root = r'/home/nst/mount/data/share/yd/popular_science_texts_store/'\
                'ner_markup/final_markup/'

In [9]:
def change_tags(marked_texts_root, output_root=None):
    """
    Converts draft tags to the final markup

    Every file below `marked_texts_root` is read, each '<>' is replaced by
    '&' and each '</>' by '!&', and the result is written below
    `output_root` under the same relative folder with a `.txt` extension.

    `output_root` defaults to the notebook-level `final_texts_root`, which
    must then exist at call time. Prints a completion message when finished.
    """
    start_tag = '<>'
    final_tag = '</>'
    if output_root is None:
        # Backward-compatible default: the notebook's global target folder.
        output_root = final_texts_root
    for root, _dirs, files in os.walk(marked_texts_root):
        # Mirror the input sub-folder below output_root.
        relative_dir = os.path.relpath(root, marked_texts_root)
        target_dir = os.path.normpath(os.path.join(output_root, relative_dir))
        for file_name in files:
            # Read from the folder actually being walked; the argument used
            # to be ignored in favour of the global draft_texts_root.
            content = slurp(os.path.join(root, file_name))
            # Both tags are plain literals, so str.replace is sufficient.
            marked_text = content.replace(start_tag, '&')
            marked_text = marked_text.replace(final_tag, '!&')
            stem, _extension = os.path.splitext(file_name)
            spit(os.path.join(target_dir, stem + '.txt'), marked_text)
    print('I\'m done')

## Make final changes ##

In [12]:
# Convert every draft under draft_texts_root to the final markup.
change_tags(draft_texts_root)

I'm done
