In [None]:

def _flatten_sigs(phrases_per_article):
    """Return the sig value of every phrase as one flat tuple.

    ``phrases_per_article`` is iterated as articles -> sentences ->
    ``(begin_index, end_index, sig)`` triples. Unlike
    ``zip(*chain(*chain(*x)))`` this yields an empty tuple (instead of raising
    ``ValueError`` on unpacking) when no phrase was extracted, and it does not
    expand every phrase into a single call's argument list.
    """
    per_sentence = chain.from_iterable(phrases_per_article)
    return tuple(sig for _, _, sig in chain.from_iterable(per_sentence))


# Load the preprocessed articles and run both preprocessed columns through the decoder.
articles = pd.read_csv('../data/interim/articles_preproc.csv')
decoder = CoNLLUFormatDecoder()

# Item assignment instead of attribute assignment: it can never silently
# create a plain Python attribute instead of updating the column.
articles['preproc_title'] = decoder.transform(articles['preproc_title'])
articles['preproc_text'] = decoder.transform(articles['preproc_text'])

# The extracter is fitted on titles and texts together, then applied to each separately.
phr_extracter = PhraseExtracter(min_count=3).fit(
    articles['preproc_title'].tolist() + articles['preproc_text'].tolist()
)

phrases_title = phr_extracter.transform(articles['preproc_title'])
phrases_text = phr_extracter.transform(articles['preproc_text'])

title_sigs = _flatten_sigs(phrases_title)
text_sigs = _flatten_sigs(phrases_text)

# One histogram of sig values per article part, stacked vertically.
fig, axs = plt.subplots(2, 1, figsize=(15, 10))

for ax, sigs, ylabel in zip(
        axs,
        (title_sigs, text_sigs),
        ('Number of phrases in titles', 'Number of phrases in texts')):
    ax.hist(sigs, bins=500)
    ax.set_ylabel(ylabel)

# Only the bottom panel carries the x-axis label.
axs[1].set_xlabel('Sig')

plt.show()

In [None]:
def _join_with_spacing(strings, spaces):
    """Concatenate ``strings``, adding a space after each item whose flag is truthy.

    ``strings`` and ``spaces`` are paired positionally. Leading and trailing
    whitespace of the joined result is stripped.
    """
    return ''.join(
        string + (' ' if space else '') for string, space in zip(strings, spaces)
    ).strip()


def _phrase_rows(article_id, article_part, phrase_sents, article_sents):
    """Build one row per extracted phrase for one part of one article.

    Parameters
    ----------
    article_id : value of the article's ``id`` column, copied into every row.
    article_part : label stored in the ``article_part`` column ('title' or 'text').
    phrase_sents : per-sentence iterables of ``(begin_index, end_index, sig)``.
    article_sents : sentence objects paired positionally with ``phrase_sents``;
        each must expose ``tokens`` and ``sent_id``, and each token ``space``,
        ``form`` and ``lemma``.

    Returns
    -------
    list of tuples ``(article_id, article_part, sent_id, begin_index,
    end_index, phrase_form, phrase_lemma, sig)``.
    """
    rows = []
    for sent_phrases, sent in zip(phrase_sents, article_sents):
        tokens = sent.tokens
        for begin_index, end_index, sig in sent_phrases:
            phrase = tokens[begin_index:end_index]

            # NOTE(review): token.space presumably flags "followed by a space" — confirm.
            # The same flags drive the spacing of both the form and the lemma string.
            spaces = [token.space for token in phrase]
            phrase_form = _join_with_spacing([token.form for token in phrase], spaces)
            phrase_lemma = _join_with_spacing([token.lemma for token in phrase], spaces)

            rows.append((article_id, article_part, sent.sent_id, begin_index,
                         end_index, phrase_form, phrase_lemma, sig))
    return rows


# Collect the rows article by article: title phrases first, then text phrases.
phrase_rows = []

for phrase_title, phrase_text, (article_id, article_title, article_text) in zip(
        phrases_title, phrases_text,
        articles[['id', 'preproc_title', 'preproc_text']].values):
    phrase_rows.extend(_phrase_rows(article_id, 'title', phrase_title, article_title))
    phrase_rows.extend(_phrase_rows(article_id, 'text', phrase_text, article_text))

phrases = pd.DataFrame(phrase_rows, columns=['article_id', 'article_part', 'sent_id', 'begin_index', 'end_index', 'phrase_form', 'phrase_lemma', 'sig'])

# phrases.csv is written with the raw form/lemma strings; the integer encoding
# below only changes the in-memory frame.
phrases.to_csv('../data/interim/phrases.csv', index=False)

# Replace the form strings with integer ids and save the id -> form lookup table.
form_encoder = LabelEncoder().fit(phrases['phrase_form'])
phrases['phrase_form'] = form_encoder.transform(phrases['phrase_form'])

phrases_form = pd.DataFrame(list(enumerate(form_encoder.classes_)), columns=['form_id', 'form'])
phrases_form.to_csv('../data/interim/phrases_form.csv', index=False)

# Same for lemmas; the lemma lookup table additionally carries the mean sig per lemma.
lemma_encoder = LabelEncoder().fit(phrases['phrase_lemma'])
phrases['phrase_lemma'] = lemma_encoder.transform(phrases['phrase_lemma'])

phrases_lemma = pd.DataFrame(list(enumerate(lemma_encoder.classes_)), columns=['lemma_id', 'lemma'])

# Vectorised group mean instead of a Python loop over the groups.
phrases_lemma_sig = (
    phrases.groupby('phrase_lemma')['sig']
    .mean()
    .reset_index()
    .rename(columns={'phrase_lemma': 'lemma_id'})
)
phrases_lemma = pd.merge(phrases_lemma, phrases_lemma_sig, on='lemma_id')

phrases_lemma.to_csv('../data/interim/phrases_lemma.csv', index=False)