In [1]:
import re
from collections import Counter
import pymorphy2
from tqdm import tqdm
import os
import pandas as pd

In [2]:
marked_path = '/home/nst/mount/data/share/yd/popular_science_texts_store/ner_markup/final_markup/'

In [3]:
def slurp(path):
    with open(path, 'r') as file_object:
        return file_object.read()

In [4]:
def extract_ne(text):
    pattern = re.compile(r'&(.*?)!&')
    nes = re.findall(pattern, text)
    lemmas = []
    for ne in nes:
        divided = ne.split()
        lemma = [morph.parse(word)[0].normal_form for word in divided]
        lemma = ' '.join(lemma)
        lemmas.append(lemma)
    return lemmas

In [74]:
def range_ne(texts_path, top_number):
    nes = []
    texts = []
    for root, dirs, files in os.walk(texts_path):
        for file_name in files:
            input_path = texts_path + file_name
            marked_text = slurp(input_path)
            texts.append(marked_text)
    for text in tqdm(texts):
        found_ne = extract_ne(text)
        #print(found_ne)
        nes.extend(found_ne)
    ne_dict = Counter(nes)
    ranged_ne = list(sorted(ne_dict, key = lambda x : x[1], reverse=True))
    print(top_number, "most cited scholars are:\n", ranged_ne[:top_number]) 
    return ranged_ne

## Get NE context

In [118]:
def preprocess(texts):
    #pattern = re.compile(r'[\w.-]+')
    clean_texts = []
    for text in tqdm(texts):
        split_text = re.findall(r'[\w.\'-]+', text, flags=re.UNICODE)
        joined_text = ' '.join(split_text)
        clean_texts.append(joined_text)
    preproc_texts = pd.DataFrame(clean_texts, columns=['preprocessed_texts'])
    return preproc_texts

In [145]:
def extract_entities_indices(clean_text, marked_text):
    indices = []
    entity_index_left = None
    entity_index_right = None
    pattern = re.compile(r'&(.*?)!&')
    nes = re.findall(pattern, marked_text)
   # print(nes)
    for name in nes:
        name = name.split()
        if len(name)==1:
            name = ''.join(name)
            try:
                entity_index_left = clean_text.index(name)
            except ValueError:
                print('Name:', name)
                print('Text_marked:', marked_text)
                raise 
            try:
                entity_index_right = clean_text.index(name) + len(name) 
            except ValueError:
                print('Name:', name)
                print('Text_marked:', marked_text)
                raise                             
        else:
            try:
                entity_index_left = clean_text.index(name[0])
            except ValueError:
                print('Name:', name[0])
                print('Text_marked:', marked_text)
                raise 
            try:
                entity_index_right = clean_text.index(name[-1]) + len(name[-1])
            except ValueError:
                print('Name:', name[0])
                print('Text_marked:', marked_text)
                raise 
        index_pair = (entity_index_left, entity_index_right)
        indices.append(index_pair)
    return indices 

In [146]:
def get_context(clean_text, entities_indices, window):
    left = []
    names = []
    right = []
    for index_pair in entities_indices:
        left_index = index_pair[0]
        right_index = index_pair[1] + 1
        name = clean_text[left_index:right_index]
        names.append(name)
        left_context = clean_text[:left_index].split()
        if len(left_context) < window:
            left_context = clean_text[:left_index]
        else:
            left_context = ' '.join(left_context[-window:])
        left.append(left_context)    
        right_context = clean_text[right_index:].split()
        if len(right_context) < window:
            right_context = clean_text[right_index:]
        else:
            right_context = ' '.join(right_context[:window+1])
        right.append(right_context)
    return left, names, right

In [7]:
def get_all_contexts(clean_texts, marked_texts, window):
    indices_list = [extract_entities_indices(clean_text, marked_text)
              for clean_text, marked_text in zip(clean_texts, marked_texts)]
    lefts_list = []
    names_list = []
    rights_list = []
    for text, indices in zip(clean_texts, indices_list):
        left, names, right = get_context(text, indices, window)
        lefts_list.extend(left)
        names_list.extend(names)
        rights_list.extend(right)
    dataframe = pd.DataFrame(lefts_list, columns=['left_context'])
    dataframe['named_entities'] = names_list
    dataframe['right_context'] = rights_list
    return dataframe

## Make Dataframe and get all NE contexts

### Marked & Preprocessed DataFrame

In [92]:
def make_texts_df(texts_path):
    marked_texts = []
    for root, dirs, files in os.walk(texts_path):
        for file_name in files:
            input_path = texts_path + file_name
            marked_text = slurp(input_path)
            marked_texts.append(marked_text)
    marked_df = pd.DataFrame(marked_texts, columns=['marked_texts'])
    return marked_df

In [206]:
marked_df = make_texts_df(marked_path)

In [207]:
preprocessed_texts = preprocess(marked_df.marked_texts)

100%|██████████| 164/164 [00:00<00:00, 750.44it/s]


In [208]:
texts_df = marked_df.join(preprocessed_texts) ;

In [97]:
texts_df.preprocessed_texts[2] ;

### Get all contexts from marked texts with window 5

In [209]:
contexts_df = get_all_contexts(texts_df.preprocessed_texts, texts_df.marked_texts, 5)

In [210]:
len(contexts_df)

1903

In [220]:
remove_duplicates = contexts_df.drop_duplicates()

In [221]:
len(remove_duplicates)

1255

In [219]:
remove_duplicates = remove_duplicates.reset_index()
remove_duplicates

AttributeError: 'NoneType' object has no attribute 'reset_index'

In [205]:
for root, dirs, files in os.walk(marked_path):
    for file_name in files:
        input_path = marked_path + file_name
        marked_text = slurp(input_path)
        if 'Субдискретизация' in marked_text:
            print(file_name)

company_audiomania_blog_251144.txt


In [160]:
'Когда &Гарри Уиттингтон!& и его ученики &Дерек Бриггс!& и &Саймон Конвей Моррис!&'

'Когда &Гарри Уиттингтон!& и его ученики &Дерек Бриггс!& и &Саймон Конвей Моррис!&'