In [1]:
import sys
import warnings
import pandas as pd

sys.path.append('../src')
warnings.simplefilter('ignore')

from preprocessing import *
from sklearn.pipeline import Pipeline as sklearn_pipeline

Using TensorFlow backend.


In [None]:
# Stage 1: clean the raw article titles/texts and run them through the
# linguistic pipeline, then persist the result for the next cell.
articles = pd.read_csv('../data/interim/articles_new.csv')

# Literal substring replacements handed to ReplaceChars. Insertion order is
# kept exactly as written in case the transformer applies them sequentially.
chars_map = {
    '\xad': ' ',        # soft hyphen (U+00AD) becomes a plain space
    '…': '...',         # ellipsis character becomes three dots
    '«': '',
    '»': '',
    '"': '',
    '\'': '',
    '’': '',
    '‘': '',
    '”': '',
    '“': '',
    '„': '',
    '`': '',
    '*': '',
    '_': '',
    'http://': '',      # URL scheme prefixes are removed, the rest of the URL stays
    'https://': '',
}

# Ordered (name, transformer) pairs. The transformers come from the
# project's `preprocessing` module; notes on their behaviour below are
# inferred from the step names only -- TODO confirm against that module.
preprocessing_steps = [
    ('replace_chars', ReplaceChars(chars_map)),
    # greedy: spans from the first '{' to the last '}' on a line
    ('sub_code', RegExprSub(r'\{.*\}', ' ')),
    # a colon with optional digits on either side
    ('sub_colon', RegExprSub(r'\d*\:\d*', ' ')),
    # collapse runs of spaces left behind by the substitutions above
    ('sub_spaces', RegExprSub(r' +', ' ')),
    ('strip', Strip()),
    ('sent_tokenize', RusSentTokenizer()),
    ('spell', Yandex_Speller()),
    ('word_tokenize', Spacy_RusWordTokenizer()),
    ('space_detect', SpaceDetecter()),
    ('morph_predict', MorphPredictor()),
    ('conllu_encode', CoNLLUFormatEncoder()),
    ('syntax_parse', SyntaxParser('../models/parser_model.udpipe')),
]
pipeline = sklearn_pipeline(preprocessing_steps)

# Titles first, then texts; each result lands in its own 'preproc_*' column.
for progress_message, source_column, target_column in (
        ('Transform titles', 'title', 'preproc_title'),
        ('Transform texts', 'text', 'preproc_text'),
):
    print(progress_message)
    articles[target_column] = pipeline.fit_transform(articles[source_column])

articles.to_csv('../data/interim/articles_preproc.csv', index=False)

Transform titles
Transform texts

In [None]:
# Stage 2: turn the preprocessed (CoNLL-U encoded) columns into
# Vowpal Wabbit formatted lines and write one line per article.
articles = pd.read_csv('../data/interim/articles_preproc.csv')

# UPOS tags passed to MorphFilter (presumably the tags that are kept;
# confirm against the `preprocessing` module).
upos_set = {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}
pipeline = sklearn_pipeline([('conllu_decode', CoNLLUFormatDecoder()),
                             ('morph_filtration', MorphFilter(upos_set=upos_set)),
                             ('vowpal_wabbit_encode', VowpalWabbitFormatEncoder())])

articles['vw_title'] = pipeline.fit_transform(articles.preproc_title)
articles['vw_text'] = pipeline.fit_transform(articles.preproc_text)

# Explicit UTF-8: without it `open` falls back to the platform locale
# encoding, which can fail on (or mis-encode) the non-ASCII article text
# and would disagree with the UTF-8 CSV files read/written by pandas.
with open('../data/interim/articles_vw.txt', 'w', encoding='utf-8') as fl:
    # One line per article: "<id> |title <tokens> |text <tokens>".
    fl.write('\n'.join(f'{article_id} |title {title} |text {text}'
                       for article_id, title, text in
                       articles[['id', 'vw_title', 'vw_text']].values))

In [4]:
# Completion marker: prints 'Finish' so the end of the run is visible in the cell output.
print('Finish')

Finish
