In [1]:
import sys
import artm
import warnings
import pandas as pd

# Extend the module search path before the `preprocessing` import below
# (presumably that module lives in ../src — confirm).
sys.path.append('../src')
# Install an 'ignore' filter so warnings raised from here on are suppressed.
warnings.simplefilter('ignore')

# NOTE(review): the wildcard import hides where the transformer classes used
# in the following cells come from; consider importing them by name.
from preprocessing import *
from sklearn.pipeline import Pipeline as sklearn_pipeline

Using TensorFlow backend.


In [2]:
# Literal substrings handed to ReplaceChars together with their replacements.
# Insertion order matches the original literal in case the transformer applies
# the replacements sequentially.
chars_map = {'\xad': ' ', '…': '...'}
# Everything else (quotes, markup characters, URL schemes) maps to ''.
chars_map.update(dict.fromkeys(
    ('«', '»',
     '"', '\'',
     '’', '‘',
     '”', '“', '„',
     '`', '*', '_', '©',
     'http://', 'https://'),
    ''))

# Steps that run before sentence splitting.
cleanup_steps = [
    ('replace_chars', ReplaceChars(chars_map)),
    # lowercase Cyrillic + '.' + capitalised Cyrillic word: put a space after the dot
    ('replace_part', ReplacePart(r'[а-яйё]+\.[А-ЯЙЁ]+[а-яйё]+', lambda x: x.replace('.', '. '))),
    # brace-delimited spans -> single space
    ('sub_code', RegExprSub(r'\{.*\}', ' ')),
    # a colon with optional digits on either side -> single space
    ('sub_colon', RegExprSub(r'\d*\:\d*', ' ')),
    # parenthesised groups that contain no Latin/Cyrillic letter -> single space
    ('sub_round_brackets_without_words', RegExprSub(r'\([^a-zA-Zа-яйёА-ЯЙЁ]+\)', ' ')),
    # collapse runs of spaces
    ('sub_spaces', RegExprSub(r' +', ' ')),
    ('strip', Strip()),
]

# Steps from sentence splitting onwards; what each transformer does is defined
# in the `preprocessing` module, the step names are the only hint available here.
linguistic_steps = [
    ('sent_tokenize', RusSentTokenizer()),
    # drop a leading hyphen (with surrounding spaces); `sent=True` is forwarded as-is
    ('sub_begin_hyphen', RegExprSub(r'^ *\- *', '', sent=True)),
    ('ner_word_tokenize', NER_RusWordTokenizer()),
    ('space_detect', SpaceDetecter()),
    ('correct_no_ne', NONECorrecter()),
    ('morph_predict', MorphPredictor()),
    ('correct_ne', NECorrecter()),
    ('conllu_encode', CoNLLUFormatEncoder()),
]

pipeline = sklearn_pipeline(cleanup_steps + linguistic_steps)

[nltk_data] Downloading package punkt to /home/arina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/arina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/arina/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/arina/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!
W0831 16:49:29.357393 140209560168256 deprecation_wrapper.py:119] From /home/arina/anaconda3/envs/py36/lib/python3.6/site-packages/bert_dp/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

2019-08-31 16:49:29.565 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 112: [loading vocabulary from /home/arina/.deeppavlov/models/ner_rus_bert/tag.dict]
I0831 16:49:29.565898 140209560168

In [3]:
articles = pd.read_csv('../data/interim/articles_new.csv')

# (progress message, input column, output column) — titles first, then texts.
for message, source_col, target_col in (
        ('Transform titles', 'title', 'preproc_title'),
        ('Transform texts', 'text', 'preproc_text')):
    print(message)
    articles[target_col] = pipeline.fit_transform(articles[source_col])

# Persist the pipeline output so later cells can start from this file.
articles.to_csv('../data/interim/articles_preproc.csv', index=False)

Transform titles
ReplaceChars


100%|██████████| 11127/11127 [00:00<00:00, 153576.39it/s]


ReplacePart


100%|██████████| 11127/11127 [00:00<00:00, 163401.28it/s]


RegExprSub


100%|██████████| 11127/11127 [00:00<00:00, 310031.82it/s]


RegExprSub


100%|██████████| 11127/11127 [00:00<00:00, 181111.98it/s]


RegExprSub


100%|██████████| 11127/11127 [00:00<00:00, 302700.24it/s]


RegExprSub


100%|██████████| 11127/11127 [00:00<00:00, 349619.60it/s]


Strip


100%|██████████| 11127/11127 [00:00<00:00, 410968.74it/s]


RusSentTokenizer


100%|██████████| 11127/11127 [00:00<00:00, 136137.63it/s]


RegExprSub


100%|██████████| 11127/11127 [00:00<00:00, 200054.96it/s]


NER_RusWordTokenizer


100%|██████████| 11127/11127 [07:05<00:00, 26.16it/s]


SpaceDetecter


100%|██████████| 11127/11127 [00:00<00:00, 112396.94it/s]


NONECorrecter


100%|██████████| 11127/11127 [00:15<00:00, 712.36it/s] 


MorphPredictor


100%|██████████| 11127/11127 [02:36<00:00, 70.94it/s]


NECorrecter


100%|██████████| 11127/11127 [00:00<00:00, 81951.13it/s]


CoNLLUFormatEncoder


100%|██████████| 11127/11127 [00:00<00:00, 46304.68it/s]


Transform texts
ReplaceChars


100%|██████████| 11127/11127 [00:00<00:00, 84422.95it/s]


ReplacePart


100%|██████████| 11127/11127 [00:00<00:00, 27613.27it/s]


RegExprSub


100%|██████████| 11127/11127 [00:00<00:00, 225631.26it/s]


RegExprSub


100%|██████████| 11127/11127 [00:00<00:00, 38469.20it/s]


RegExprSub


100%|██████████| 11127/11127 [00:00<00:00, 229287.14it/s]


RegExprSub


100%|██████████| 11127/11127 [00:00<00:00, 26481.82it/s]


Strip


100%|██████████| 11127/11127 [00:00<00:00, 296487.01it/s]


RusSentTokenizer


100%|██████████| 11127/11127 [00:04<00:00, 2581.34it/s]


RegExprSub


100%|██████████| 11127/11127 [00:00<00:00, 60804.28it/s]


NER_RusWordTokenizer


100%|██████████| 11127/11127 [09:57<00:00, 22.08it/s]


SpaceDetecter


100%|██████████| 11127/11127 [00:01<00:00, 11122.95it/s]


NONECorrecter


100%|██████████| 11127/11127 [04:11<00:00, 44.26it/s]


MorphPredictor


100%|██████████| 11127/11127 [44:41<00:00,  6.24it/s] 


NECorrecter


100%|██████████| 11127/11127 [00:01<00:00, 8152.73it/s]


CoNLLUFormatEncoder


100%|██████████| 11127/11127 [00:03<00:00, 2790.40it/s]


In [4]:
# Start from the file written by the preprocessing cell.
articles = pd.read_csv('../data/interim/articles_preproc.csv')

# Part-of-speech tags handed to MorphFilter
# (presumably the tags that are kept — confirm in `preprocessing`).
pos_set = {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}

decode_filter_steps = [
    ('conllu_decode', CoNLLUFormatDecoder()),
    ('morph_filtration', MorphFilter(pos_set=pos_set)),
]
pipeline = sklearn_pipeline(decode_filter_steps)

# Titles are processed before texts, as in the progress output.
titles, texts = (pipeline.fit_transform(articles[column])
                 for column in ('preproc_title', 'preproc_text'))

CoNLLUFormatDecoder


100%|██████████| 11127/11127 [00:00<00:00, 38758.29it/s]


MorphFilter


100%|██████████| 11127/11127 [00:00<00:00, 187953.66it/s]


CoNLLUFormatDecoder


100%|██████████| 11127/11127 [00:06<00:00, 1689.11it/s]


MorphFilter


100%|██████████| 11127/11127 [00:00<00:00, 32623.42it/s]


In [5]:
from collections import Counter

# Entity types that get their own namespace in the Vowpal Wabbit line.
_ENTITY_TYPES = ('PER', 'LOC', 'ORG')


def _collect_lemmas(sents, plain_lemmas, entity_lemmas):
    """Distribute the lemmas of `sents` between plain words and named entities.

    Parameters
    ----------
    sents : iterable of sentence objects exposing `.tokens`; every token
        exposes `.ne` and `.lemma`.
    plain_lemmas : list that receives the lemma of every token tagged 'O'.
    entity_lemmas : dict mapping an entity type to the list that receives its
        lemmas.

    For a tag other than 'O' the first two characters are dropped (presumably
    a BIO-style prefix such as 'B-' — confirm) and the remainder selects the
    target list. Spaces inside the lemma become underscores so a multi-word
    entity stays one token. Tags with an unknown remainder are printed and the
    token is skipped.
    """
    for sent in sents:
        for token in sent.tokens:
            if token.ne == 'O':
                plain_lemmas.append(token.lemma)
                continue
            bucket = entity_lemmas.get(token.ne[2:])
            if bucket is None:
                print(token.ne)
            else:
                bucket.append(token.lemma.replace(' ', '_'))


def _encode_bag(lemmas):
    """Render lemmas as space-separated `token` / `token:count` items.

    The count suffix is omitted when it equals 1; items appear in order of
    first occurrence.
    NOTE(review): a lemma containing ':' or '|' would collide with the
    Vowpal Wabbit syntax — confirm the upstream cleaning rules them out.
    """
    return ' '.join(f'{token}' + (f':{count}' if count > 1 else '')
                    for token, count in Counter(lemmas).items())


# Corpus-wide totals of entity mentions (with repetitions).
n_per = 0
n_loc = 0
n_org = 0

articles_wv = []
for article_id, title, text in zip(articles.id, titles, texts):
    title_lemmas = []
    text_lemmas = []
    # Entities found in the title and in the text share the same lists.
    entities = {ne_type: [] for ne_type in _ENTITY_TYPES}

    _collect_lemmas(title, title_lemmas, entities)
    _collect_lemmas(text, text_lemmas, entities)

    n_per += len(entities['PER'])
    n_loc += len(entities['LOC'])
    n_org += len(entities['ORG'])

    pers = _encode_bag(entities['PER'])
    locs = _encode_bag(entities['LOC'])
    orgs = _encode_bag(entities['ORG'])
    title_tokens = _encode_bag(title_lemmas)
    text_tokens = _encode_bag(text_lemmas)

    articles_wv.append(f'{article_id} |per {pers} |loc {locs} |org {orgs} |title {title_tokens} |text {text_tokens}')

# The corpus is Cyrillic: write UTF-8 explicitly instead of relying on the
# platform's locale encoding.
with open('../data/interim/articles_vw.txt', 'w', encoding='utf-8') as fl:
    fl.write('\n'.join(articles_wv))

# Convert the Vowpal Wabbit file into batches stored under target_folder; as
# the last expression of the cell its repr is displayed.
artm.BatchVectorizer(data_path='../data/interim/articles_vw.txt', data_format='vowpal_wabbit', target_folder='../data/interim/batches')

artm.BatchVectorizer(data_path="../data/interim/batches", num_batches=12)

In [6]:
# Totals of PER / LOC / ORG mentions accumulated while building the corpus.
entity_totals = (n_per, n_loc, n_org)
print(*entity_totals)
# First generated line, as a quick check of the output format.
print(articles_wv[0])

48944 68217 52243
0 |per медведев дмитрий_медведев |loc россия дальневосточный_федеральный_округ |org  |title год льготный автокредит лизинг быть выделенный миллиард рубль |text год:5 льготный:4 автокредит:2 лизинг:4 быть:5 выделенный:3 миллиард:8 рубль:9 премьер заявить сегодня:2 ход заседание правительство:2 резерв выделять деньга поддержка автомобильный лёгкий промышленность миллион:2 более половина сумма район пойти программа:2 кредитование транспортный:2 средство:3 помочь сохранить кредитный лизинговый ставка нормальный уровень поддержать спрос автомобиль:3 делать последний рассчитывать рамка проданный менее лишний тысяча вид техника:3 сказать глава слово также:2 субсидия распределяться производитель газомоторный:2 проект распоряжение предусматриваться выделение ряд мера число стимулирование:4 колёсный размер:4 продажа:2 физический лицо производство территория
