In [1]:
import sys

import pandas as pd

from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

sys.path.append('../src')
from encode import *
from phrase_extract import PhraseExtracter

**Извлечение фраз**

Сформируем таблицу разбиения новостных статей на фразы, которая будет иметь следующие поля:

* **article_id** - идентификатор новостной статьи
* **part** - часть статьи ('title', 'snippet', 'text')
* **sent_id** - идентификатор предложения
* **begin_boundary** - индекс, с которого начинается фраза в предложении 
* **end_boundary** - индекс, которым заканчивается фраза в предложении
* **phrase_id** - идентификатор фразы
* **phrase_lemma_id** - идентификатор лемматизированной фразы
* **sig** - значимость фразы (sig) из TopMine

In [10]:
# Load the preprocessed articles and run every *_preproc column through
# conllu_encoder (imported from the project's `encode` module).
articles = pd.read_csv('../data/interim/articles_preproc.csv')

for preproc_column in ('title_preproc', 'snippet_preproc', 'text_preproc'):
    articles[preproc_column] = articles[preproc_column].apply(conllu_encoder)


def _fit_and_dump(sentences, model_path):
    """Fit a fresh PhraseExtracter on `sentences`, dump it to `model_path`, return it."""
    fitted = PhraseExtracter().fit(sentences)
    fitted.dump(model_path)
    return fitted


# One independent model per article part; each is persisted right after fitting.
title_phrase_extracter = _fit_and_dump(articles.title_preproc,
                                       '../models/title_phrase_extract_model')
snippet_phrase_extracter = _fit_and_dump(articles.snippet_preproc,
                                         '../models/snippet_phrase_extract_model')
text_phrase_extracter = _fit_and_dump(articles.text_preproc,
                                      '../models/text_phrase_extract_model')

11375it [00:00, 13606.16it/s]
22366it [00:19, 1171.78it/s]
104855it [02:14, 780.07it/s] 


In [None]:
# Reload the preprocessed articles. Each *_preproc cell is passed through
# conllu_encoder; the result is iterated below as sentences exposing `.id`
# and `.tokens`.
articles = pd.read_csv('../data/interim/articles_preproc.csv')

articles.title_preproc = articles.title_preproc.apply(conllu_encoder)
articles.snippet_preproc = articles.snippet_preproc.apply(conllu_encoder)
articles.text_preproc = articles.text_preproc.apply(conllu_encoder)

# Restore the phrase-extraction models dumped by the fitting cell.
title_phrase_extracter = PhraseExtracter(path='../models/title_phrase_extract_model')
snippet_phrase_extracter = PhraseExtracter(path='../models/snippet_phrase_extract_model')
text_phrase_extracter = PhraseExtracter(path='../models/text_phrase_extract_model')

# (part label, source column, extractor) -- processed in this order for every article.
part_extracters = (
    ('title', 'title_preproc', title_phrase_extracter),
    ('snippet', 'snippet_preproc', snippet_phrase_extracter),
    ('text', 'text_preproc', text_phrase_extracter),
)

# Collect plain tuples first and build the DataFrame once at the end.
phrase_records = []
for _, article in tqdm(articles.iterrows(), total=len(articles)):
    for part, preproc_column, extracter in part_extracters:
        for sent in article[preproc_column]:
            for begin_boundary, end_boundary, phrase, phrase_lemma, sig in extracter.transform(sent.tokens):
                phrase_records.append((article['id'], part, sent.id, begin_boundary, end_boundary,
                                       phrase, phrase_lemma, sig))

# At this point 'phrase_id' / 'phrase_lemma_id' still hold the raw phrase values;
# they are replaced by integer ids just below.
articles_phrase = pd.DataFrame(phrase_records, columns=['article_id', 'part', 'sent_id', 'begin_boundary', 'end_boundary',
                                                        'phrase_id', 'phrase_lemma_id', 'sig'])

# Encode surface phrases. The original code addressed a non-existent `phrase`
# column here, which raised AttributeError; the column is named 'phrase_id'.
label_encoder = LabelEncoder().fit(articles_phrase['phrase_id'])
articles_phrase['phrase_id'] = label_encoder.transform(articles_phrase['phrase_id'])

# Lookup table: row position in classes_ is the id written into articles_phrase.
phrases = label_encoder.classes_
phrases = pd.DataFrame({'id': range(len(phrases)), 'phrase': phrases})

# Same encoding for the lemmatized phrases (column 'phrase_lemma_id').
label_encoder = LabelEncoder().fit(articles_phrase['phrase_lemma_id'])
articles_phrase['phrase_lemma_id'] = label_encoder.transform(articles_phrase['phrase_lemma_id'])

phrases_lemma = label_encoder.classes_
phrases_lemma = pd.DataFrame({'id': range(len(phrases_lemma)), 'phrase': phrases_lemma})

phrases.to_csv('../data/interim/phrases.csv', index=False)
phrases_lemma.to_csv('../data/interim/phrases_lemma.csv', index=False)
articles_phrase.to_csv('../data/interim/articles_phrase.csv', index=False)

10817it [02:14, 95.19it/s]

In [None]:
# Write one phrase per line. The encoding is fixed to UTF-8 so the output does
# not depend on the platform's locale default (non-ASCII phrases would otherwise
# fail or be written in a different codepage on some systems).
with open('../data/interim/phrases.txt', 'w', encoding='utf-8') as fl:
    fl.write('\n'.join(phrases.phrase))