In [6]:
import re
import os
from tqdm import tqdm_notebook

In [28]:
def slurp(path):
    try:
        with open(path, 'r') as fo:
            text = fo.read()
    except UnicodeDecodeError:
        print(path)
        with open(path, 'r', encoding='cp1252') as fo:
            text = fo.read()
    return text

def spit(texts, file_names):
    for text, file_name in tqdm_notebook(zip(texts, file_names)):
        with open(file_name, 'w') as fo:
            fo.write(text)

def read_dir(input_path):
    texts = []
    files = []
    print('Reading files...')
    for root, dirs, filenames in os.walk(input_path):
        files.extend(filenames)
        for filename in tqdm_notebook(filenames):
            file_path = os.path.join(root, filename)
            if '.ipynb' not in file_path:
                text = slurp(file_path)
                texts.append(text)
    print('Number of texts: ', len(texts))
    return texts, files

def preprocess(input_path, output_path):
    texts, filenames = read_dir(input_path)
    pattern = re.compile(r'[А-ЯЁа-яё\.\-\d]+')
    preprocessed = []
    print('Preprocessing files...')
    for text in tqdm(texts):
        preproc_text = ' '.join(re.findall(pattern, text))
        preprocessed.append(preproc_text)
    paths = [output_path + name for name in filenames if 'ipynb' not in name]
    print('Number of texts: ', len(texts))
    print('Number of paths:', len(paths))
    print('Writing to files...')
    spit(preprocessed, paths)
    print('All done, Buddy!')

## Preprocess texts before parsing

In [81]:
spit_dir = '/home/nst/mount/data/linguistics_hse/popular-science-research/Tomita_Parser/tomita-parser/build/bin/sci_corpus/'
slurp_dir = '/home/nst/mount/data/share/yd/popular_science_texts_store_copy'

In [92]:
chrdk = preprocess(slurp_dir, spit_dir)


0it [00:00, ?it/s][A
[A
0it [00:00, ?it/s][A
[A
  0%|          | 0/707 [00:00<?, ?it/s][A

Reading files...



  1%|          | 6/707 [00:00<00:12, 54.75it/s][A
  2%|▏         | 14/707 [00:00<00:11, 61.65it/s][A
  3%|▎         | 20/707 [00:00<00:11, 59.24it/s][A
  4%|▍         | 29/707 [00:00<00:10, 65.30it/s][A
  5%|▌         | 38/707 [00:00<00:10, 64.70it/s][A
  7%|▋         | 47/707 [00:00<00:09, 68.41it/s][A
  8%|▊         | 57/707 [00:00<00:09, 71.64it/s][A
  9%|▉         | 65/707 [00:00<00:08, 72.38it/s][A
 10%|█         | 73/707 [00:01<00:08, 72.22it/s][A
 11%|█▏        | 81/707 [00:01<00:08, 71.68it/s][A
 13%|█▎        | 90/707 [00:01<00:08, 73.06it/s][A
 14%|█▍        | 98/707 [00:01<00:08, 72.23it/s][A
 15%|█▌        | 108/707 [00:01<00:08, 74.06it/s][A
 17%|█▋        | 117/707 [00:01<00:07, 74.23it/s]Exception in thread Thread-131:
Traceback (most recent call last):
  File "/home/nst/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/nst/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance

Number of texts:  31052
Preprocessing files...


100%|██████████| 31052/31052 [00:13<00:00, 2247.23it/s]
0it [00:00, ?it/s]

Number of texts:  31052
Number of paths: 31052
Writing to files...


31052it [06:37, 78.14it/s] 


All done, Buddy!


## Launch on test sample 

In [18]:
test_input = '/home/nst/data/api_nis/ner_data/final_markup/'

In [19]:
marked = []
for root, dirs, filenames in os.walk(test_input):
    for filename in tqdm_notebook(filenames):
        path = test_input + filename
        text = slurp(path)
        marked.append(text)
marked = list(set(marked))

HBox(children=(IntProgress(value=0, max=175), HTML(value='')))




### Find names in marked text

In [21]:
pattern = re.compile(r'\&[А-ЯЁа-яё\-\s]+!\&')
names_list = []
for text in marked:
    names = re.findall(pattern, text)
    clean_names = [name.replace('&', '').replace('!', '') for name in names]
    names_list.extend(clean_names)

In [22]:
to_pop = [51, 52, 54, 123 , 131, 151, 157, 185, 196, 381, 398, 461, 511, 600, 655, 664, 929,
          982, 1210, 1236, 1261, 1334, 1340, 1401, 1408, 1503, 1511, 1527, 1582, 1638, 1656,
         1670, 1704, 1706]
for i in to_pop:
    names_list.pop(i)

In [61]:
names_list[0]

'Леонардо да Винчи'

In [97]:
with open('evaluation_names.txt', 'w') as fo:
    for name in names_list:
        fo.write(name+'\n')

## Test sample texts

In [25]:
marked_texts, file_names = read_dir(test_input)
texts = [text.replace('&', '').replace('!', '') for text in marked_texts]

Reading files...


HBox(children=(IntProgress(value=0, max=175), HTML(value='')))


Number of texts:  175


In [26]:
test_output = '/home/nst/data/api_nis/ner_data/texts_eval/'

In [29]:
pattern = re.compile(r'[А-ЯЁа-яё\.\-\d]+')
preprocessed = []
print('Preprocessing files...')
for text in tqdm_notebook(texts):
    preproc_text = ' '.join(re.findall(pattern, text))
    preprocessed.append(preproc_text)
paths = [test_output + name for name in file_names]
print('Number of texts: ', len(texts))
print('Number of paths:', len(paths))
print('Writing to files...')
spit(preprocessed, paths)
print('All done, Buddy!')

Preprocessing files...


HBox(children=(IntProgress(value=0, max=175), HTML(value='')))


Number of texts:  175
Number of paths: 175
Writing to files...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


All done, Buddy!


In [30]:
import pandas as pd

In [431]:
meta_table = pd.read_csv('/home/alapidus/NIS/articles_with_meta.tsv', sep='\t')

In [432]:
len(meta_table)

30628

In [433]:
rem_dup_meta = meta_table.drop_duplicates(subset='path', keep='first')
len(rem_dup_meta)

30628

In [226]:
for row in meta_table.iterrows():
    if 'Гоффман' in row[1].text:
        text = row[1].text.split()
        try:
            name_index = text.index('Гоффман')
            left = name_index - 3
            right = name_index +4 
            context = ' '.join(text[left:right])
            print('Title:', row[1].title)
            print('Context:', context)
            #print(row[1].path)
        except ValueError:
            continue
        

Title: Множественные «я»
Context: чем вытеснение. Но Гоффман находит способ показать,
Title: Поведение в публичных местах
Context: с большим трудом. Гоффман — автор, которого
Title: Социальное &laquo;лицо&raquo;
Context: еврейско-русского происхождения Эрвинг Гоффман был первым из
Title: Социальные роли
Context: то, что Эрвинг Гоффман называл ролевой дистанцией,
Title: Представление себя другим в теории Гоффмана
Context: по крайней мере. Гоффман был первым, кто
Title: Стратегические интеракции
Context: это тот, который Гоффман называет наивным. Это
Title: Понятие фрейма у Гоффмана
Context: имеет весьма косвенное. Гоффман пишет о фреймах,
Title: Социология эмоций
Context: некоторые не одобряются. Гоффман написал несколько статей
Title: Понятие стигмы у Гоффмана
Context: 
Title: Теория фреймов
Context: что позднее Ирвинг Гоффман назовет "транспонированием" -
Title: Классовый статус: сигналы и символы
Context: этими заданиями. Но Гоффман пошел дальше, чем


# NER-PARSER

In [31]:
import bs4 as bs

In [32]:
output = slurp('/home/nst/data/api_nis/ner_module/tomita-parser/build/bin/names.xml')

In [57]:
def parse_by_document(output):
    xml = bs.BeautifulSoup(output, 'lxml')
    documents = xml.find_all('document')
    print('Number of documents:', len(documents))
    filenames = []
    num_names = []
    names_final = []
    for doc in tqdm_notebook(documents):
        names_found = doc.find_all('name')
        names = [name.get('val').title() for name in names_found]
        names = delete_common_words(names)
        names = delete_geo_terms(names)
        if names:
            #print(names)
            num_names.append(len(names))
            names_final.append('|'.join(names))
            filename = doc.attrs['url'][1:]
            filenames.append(filename)
    df = pd.DataFrame({'file':filenames,
                      'NEnum': num_names,
                      'NE': names_final})
    print('Finished!')
    return df

In [58]:
ner_df = parse_by_document(output)

Number of documents: 155


HBox(children=(IntProgress(value=0, max=155), HTML(value='')))


Finished!


In [60]:
with open('/home/nst/data/api_nis/ner_module/other_dir/full_names_list.txt', 'r') as fo:
    corpora_names = fo.readlines()

In [452]:
def count_words(text):
    text = str(text.values)
    return len(text.split())
def sub_win_slash(strings):
    return [s.replace('\\', '/') if 'postnauka' in s else s for s in strings]

In [483]:
def extract_meta(ner_df, meta_df):
    filenames = ner_df.file
    names_mentioned = ner_df.NE
    ne_nums = ner_df.NEnum
    paths = sub_win_slash(meta_df.path)
    genres = []
    rubrics = []
    num_words = []
    number_nes = []
    fnames = []
    final_mentions = []
    for fname, num, men in tqdm(zip(filenames, ne_nums, names_mentioned)):
        for path in paths:
            if '/'+fname in path:
                if 'postnauka' in path:
                    path = path.replace('/', '\\')
                    row = meta_df.loc[meta_df['path'] == path]
                    genres.append(''.join(row.genre.values))
                    rubrics.append(''.join(row.final_rubrics.values))
                    num_words.append(count_words(row.text))
                    number_nes.append(num)
                    fnames.append(fname)
                    final_mentions.append(men)
                else:
                    row = meta_df.loc[meta_df['path'] == path]
                    genres.append(''.join(row.genre.values))
                    rubrics.append(''.join(row.final_rubrics.values))
                    num_words.append(count_words(row.text))
                    number_nes.append(num)
                    fnames.append(fname)
                    final_mentions.append(men)
                    
    df = pd.DataFrame({'genre':genres,
                      'rubric': rubrics,
                      'num_words': num_words,
                      'NE_num': number_nes,
                      'file': fnames,
                      'NE': final_mentions})
    print('Finished!')
    return df

In [484]:
ner_meta = extract_meta(ner_df, meta_table)

2665it [00:25, 105.44it/s]

Finished!





In [491]:
ner_meta = ner_meta.loc[ner_meta['genre'] != '']

In [493]:
len(ner_meta)

2664

In [498]:
set(ner_meta.genre)

{'FAQ', 'Блоги', 'Лекции'}

In [499]:
ner_meta = ner_meta.replace('FAQ', 'Блоги')

In [522]:
len(ner_meta[ner_meta.rubric == 'Экономика'])

91

In [408]:
ner_meta = ner_meta.drop_duplicates(subset='file', keep='first')
len(ner_meta)

2665

In [500]:
ner_meta.to_csv('/home/alapidus/NIS/ner_data/ner_count.tsv', sep = '\t')

In [501]:
set(ner_meta.rubric)

{'Computer Science',
 'Биология',
 'История',
 'Космос',
 'Культура',
 'Математика',
 'Мусор',
 'Науки о земле',
 'Политология',
 'Психология',
 'Социология',
 'Технологии',
 'Физика',
 'Физиология человека',
 'Философия',
 'Футурология',
 'Химия',
 'Экономика',
 'Язык'}

## Rank per rubric

In [401]:
def rank_per_rubric(ner_meta_df, rubric):
    rubrics_df = ner_meta_df[ner_meta_df.rubric == rubric]
    full_names_list = []
    for row in rubrics_df.iterrows():
        names = row[1].NE
        full_names_list.extend(names.split('|'))
    names_dict = dict(Counter(full_names_list))
    names_sorted = sorted(names_dict.items(), key = lambda x:x[1], reverse=True)
    names = [name[0] for name in names_sorted]
    mentions = [name[1] for name in names_sorted]
    names_sorted_df = pd.DataFrame({'name':names,
                                'mentions': mentions})
    print('Rubric:', rubric)
    return names_sorted_df

In [502]:
cs_rank = rank_per_rubric(ner_meta, 'Computer Science')
cs_rank.to_csv('/home/alapidus/NIS/ner_data/cs_ranking.tsv', sep='\t')
cs_rank

Rubric: Computer Science


Unnamed: 0,mentions,name
0,5,Шеннон
1,5,Чистяков
2,4,Ник Бостром
3,4,Курцвейл
4,3,Тьюринг
5,3,Цермело
6,2,Хэмминг
7,2,Сергей Марков
8,2,Миронов
9,2,Сергей


In [503]:
bio_rank = rank_per_rubric(ner_meta, 'Биология')
bio_rank 

Rubric: Биология


Unnamed: 0,mentions,name
0,9,Дарвин
1,4,Дарвина
2,4,Холдейн
3,4,Тимонова
4,4,Синъя Яманака
5,4,Патрик Хаус
6,4,Теркер
7,3,Чарльз Дарвин
8,3,Часто
9,3,Флеминг


In [504]:
hist_rank = rank_per_rubric(ner_meta, 'История')
hist_rank 

Rubric: История


Unnamed: 0,mentions,name
0,16,Сталин
1,12,Петр
2,9,Галилей
3,8,Сталина
4,7,Кто-То
5,7,Фукидид
6,6,Аристотель
7,6,Артем Ефимов
8,6,Ленин
9,6,Данилевского


In [505]:
cos_rank = rank_per_rubric(ner_meta, 'Космос')
cos_rank

Rubric: Космос


Unnamed: 0,mentions,name
0,8,Эйнштейн
1,6,Пифагор
2,5,Эдгар
3,3,Стивен Хокинг
4,3,Галилей
5,3,Артем Елмуратов
6,3,Эйнштейна
7,3,Эддингтон
8,2,Артур Эддингтон
9,2,Вальтер Бааде


In [506]:
math_rank = rank_per_rubric(ner_meta, 'Математика')
math_rank

Rubric: Математика


Unnamed: 0,mentions,name
0,9,Колмогорова
1,8,Колмогоров
2,4,Апу
3,3,Перельман
4,3,Мариам Мирзахани
5,3,Семереди
6,3,Сколтеха Бурнаев
7,3,Римана
8,3,Левин
9,3,Блэк


In [507]:
cult_rank = rank_per_rubric(ner_meta, 'Культура')
cult_rank

Rubric: Культура


Unnamed: 0,mentions,name
0,7,Толстой
1,7,Саида
2,7,Бельский
3,6,Витрувий
4,6,Ницше
5,5,Тынянов
6,5,Дженкинс
7,5,Хайям
8,5,Бахтин
9,5,Хадид


In [508]:
geo_rank = rank_per_rubric(ner_meta, 'Науки о земле')
geo_rank 

Rubric: Науки о земле


Unnamed: 0,mentions,name
0,2,Киевского
1,2,Святский
2,2,Хадсон
3,2,Майкл
4,2,Сергей Тархов
5,1,Клименко
6,1,Палеоклиматология
7,1,Ин-Т
8,1,Каганского
9,1,Татьяны Нефедовой


In [509]:
polit_rank = rank_per_rubric(ner_meta, 'Политология')
polit_rank

Rubric: Политология


Unnamed: 0,mentions,name
0,5,Шмитт
1,4,Путин
2,4,Ленин
3,4,Портнова
4,3,Фуко
5,3,Руссо
6,2,Ходорковский
7,2,Ясина
8,2,Зиновьев
9,2,Мичиганского


In [510]:
psy_rank = rank_per_rubric(ner_meta, 'Психология')
psy_rank 

Rubric: Психология


Unnamed: 0,mentions,name
0,8,Винникотт
1,4,Фрейд
2,3,Маслоу
3,3,Татьяна Карягина
4,3,Бартлетт
5,3,Татьяна Котова
6,3,Эббингауз
7,2,Карл
8,2,Солоу
9,2,Падун


In [511]:
soc_rank = rank_per_rubric(ner_meta, 'Социология')
soc_rank

Rubric: Социология


Unnamed: 0,mentions,name
0,15,Гоффман
1,10,Кто-То
2,10,Латур
3,8,Вулгар
4,7,Шюц
5,7,Колхас
6,6,Федотов
7,6,Аузана
8,6,Аузан
9,6,Путин


In [512]:
tech_rank = rank_per_rubric(ner_meta, 'Технологии')
tech_rank 

Rubric: Технологии


Unnamed: 0,mentions,name
0,11,Видлар
1,5,Морита
2,4,Кто-То
3,4,Куренниеми
4,4,Патрик
5,4,Шушурин
6,4,Мацусита
7,3,Роберт Видлар
8,3,Бошлу
9,3,Меуччи


In [513]:
ph_rank = rank_per_rubric(ner_meta, 'Физика')
ph_rank 

Rubric: Физика


Unnamed: 0,mentions,name
0,22,Декарт
1,12,Эйнштейн
2,10,Максвелл
3,6,Дирак
4,5,Паули
5,4,Хиггса
6,4,Резерфорд
7,4,Эйнштейна
8,4,Фейнман
9,4,Мурзин


In [75]:
phis_rank = rank_per_rubric(ner_meta, 'Физиология человека')

NameError: name 'rank_per_rubric' is not defined

In [515]:
phil_rank = rank_per_rubric(ner_meta, 'Философия')
phil_rank

Rubric: Философия


Unnamed: 0,mentions,name
0,10,Ницше
1,9,Аристотель
2,7,Платон
3,5,Мэдисон
4,5,Лосев
5,4,Сократ
6,4,Макиавелли
7,4,Булгаков
8,3,Гегеля
9,3,Арендт


In [516]:
fut_rank = rank_per_rubric(ner_meta, 'Футурология')
fut_rank 

Rubric: Футурология


Unnamed: 0,mentions,name
0,2,Макклюен
1,2,Павел Клушанцев
2,2,Бен
3,2,Тони
4,2,Иниго
5,2,Кларк
6,2,Джобс
7,2,Мур
8,1,Маршалл Макклюен
9,1,Дурова


In [517]:
chem_rank = rank_per_rubric(ner_meta, 'Химия')
chem_rank 

Rubric: Химия


Unnamed: 0,mentions,name
0,4,Цуи
1,2,Паули
2,2,Леман
3,2,Иоганн Кеплер
4,2,Кеплер
5,2,Сколтеха Фардад Азарми
6,2,Эдисон
7,2,Фокин
8,1,Завойского
9,1,Исаак Раби


In [518]:
econ_rank = rank_per_rubric(ner_meta, 'Экономика')
econ_rank

Rubric: Экономика


Unnamed: 0,mentions,name
0,5,Смит
1,5,Кейнс
2,4,Сергей Гуриев
3,4,Кейнса
4,3,Рональд Коуз
5,3,Бен Бернанке
6,3,Харт
7,3,Белянин
8,3,Гэри Беккер
9,3,Кругман


In [519]:
ling_rank = rank_per_rubric(ner_meta, 'Язык')
ling_rank

Rubric: Язык


Unnamed: 0,mentions,name
0,8,Витгенштейн
1,4,Пешковский
2,4,Плунгян
3,3,Стивен Пинкер
4,3,Кто-То
5,3,Сыма Цянь
6,3,Хомский
7,3,Бейкер
8,3,Платон
9,3,Бикертон


## Parse full corpora

In [55]:
def parse_xml(xml_output):
    xml = bs.BeautifulSoup(xml_output, 'lxml')
    names = xml.find_all('name')
    pattern = re.compile(r'val="([А-ЯЁ]+)">')
    names_ext = []
    for name in names:
        name = name.get('val')
        names_ext.append(name)
    names_ext = [name.title() for name in names_ext]
    #names_sep = divide_names(names_ext)
    #return names_sep
    return names_ext

In [62]:
names = parse_xml(output)
len(names)

729

## Delete common words and geoterms

In [66]:
import pymorphy2 
morph = pymorphy2.MorphAnalyzer()
from flashtext import KeywordProcessor
from tqdm import tqdm_notebook
import pandas as pd

### Comile a list of common words in the beginning of the sentence

In [50]:
meta_table = pd.read_csv('/home/alapidus/NIS/articles_with_meta.tsv', sep='\t')

In [63]:
from nltk.tokenize.punkt import PunktSentenceTokenizer

In [66]:
tokenizer = PunktSentenceTokenizer()

In [69]:
def preprocess(text):
    text = ' '.join(re.findall(r'[А-ЯЁа-яё\-\.\d]+', text))
    return text

In [75]:
def find_first_words(texts):
    words = []
    for text in tqdm(texts):
        text = preprocess(text)
        sentences = tokenizer.tokenize(text)
        first_words = [sentence.split()[0] for sentence in sentences]
        first_words = [w.lower() for w in first_words]
        words.extend(first_words)
    return list(set(words))

In [79]:
first_words = find_first_words(meta_table.text)

100%|██████████| 30628/30628 [01:48<00:00, 283.35it/s]


In [39]:
def compile_lemmas_list(input_path):
    lemmas_list = []
    for root, dirs, files in os.walk(input_path):
        for file_name in files:
            file_path = input_path + file_name
            lemmas = slurp(file_path)
            lemmas = re.findall(r'[а-яё]+', lemmas)
            lemmas_list.extend(lemmas)
    lemmas_list = list(set(lemmas_list))
    return lemmas_list

In [40]:
common = '/home/nst/data/api_nis/ner_module/other_dir/slovnik/'
common_words = compile_lemmas_list(common)

In [41]:
lemmas_processor = KeywordProcessor()
lemmas_processor.add_keywords_from_list(common_words)

In [43]:
cities = pd.read_csv('/home/nst/data/api_nis/ner_module/other_dir/cities.csv', sep='\t', 
                     names=['geoloc', 'mentions'])
countries = pd.read_csv('/home/nst/data/api_nis/ner_module/other_dir/countries.csv', sep='\t', 
                    names = ['geoloc', 'mentions'])
geo_general = pd.read_csv('/home/nst/data/api_nis/ner_module/other_dir/geo_names.tsv', sep='\t',
                         names= ['geoloc'])

In [44]:
geo = pd.concat([cities, countries, geo_general])
geo = geo.drop_duplicates(keep='first')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [45]:
geo = geo.geoloc

In [46]:
geo = list(geo)

In [47]:
geo = [loc.lower() for loc in geo]

In [48]:
lemmas_geo = KeywordProcessor()
lemmas_geo.add_keywords_from_list(geo)

In [49]:
def delete_common_words(names:list):
    potential_names = []
    
    for name in names:
        true_name = []
        name = name.split()
        for n in name:
            n = n.lower()
            lemma = morph.parse(n)[0].normal_form
            if not lemmas_processor.extract_keywords(lemma) == [lemma]:
                #print('not in common:', lemma)
                true_name.append(n.title())
        final_name = ' '.join(true_name)
        if final_name != '':
            potential_names.append(final_name)
    #print('Deleted common words, current size:', len(potential_names))
    return potential_names

In [50]:
len(names)

9

In [67]:
commons_deleted = delete_common_words(names)

In [69]:
len(commons_deleted)

470

In [52]:
def delete_geo_terms(names:list):
    potential_names = []
    
    for name in names:
        true_name = []
        name = name.split()
        for n in name:
            n = n.lower()
            lemma = morph.parse(n)[0].normal_form
            if not lemmas_geo.extract_keywords(lemma) == [lemma]:
                #print('not in common:', lemma)
                true_name.append(n.title())
        final_name = ' '.join(true_name)
        if final_name != '':
            potential_names.append(final_name)
    #print('Deleted geo terms, current size:', len(potential_names))
    return potential_names

In [70]:
geo_delete = delete_geo_terms(commons_deleted)

In [71]:
len(geo_delete)

456

In [77]:
clean_names=geo_delete

In [81]:
true = names_list

In [83]:
pred = []
for name in clean_names:
    if name in true:
        pred.append(name)
accuracy = len(pred)/len(true)
print('Accuracy:', accuracy)

Accuracy: 0.4798061389337641


## Пройдемся по текстам и списку из корпуса

In [73]:
corpora_names = [name.strip() for name in corpora_names]

In [84]:
%%time
for text in tqdm_notebook(preprocessed):
    for n in corpora_names:
        if n in text:
            clean_names.append(n)    

HBox(children=(IntProgress(value=0, max=175), HTML(value='')))


CPU times: user 23.9 s, sys: 73.7 ms, total: 24 s
Wall time: 24.4 s


In [244]:
from collections import Counter

In [245]:
names_dict = dict(Counter(geo_delete))
names_sorted = sorted(names_dict.items(), key = lambda x:x[1], reverse=True)

In [263]:
names_sorted.remove(('Московского', 7))

In [268]:
names = [name[0] for name in names_sorted]
mentions = [name[1] for name in names_sorted]
names_sorted_df = pd.DataFrame({'name':names,
                                'mentions': mentions})

In [270]:
names_sorted_df.to_csv('/home/alapidus/NIS/ner_data/general_ranking.tsv', sep='\t')