In [1]:
import sys
import artm
import warnings
import pandas as pd

sys.path.append('../src')
warnings.simplefilter('ignore')

from preprocessing import *
from sklearn.pipeline import Pipeline as sklearn_pipeline

Using TensorFlow backend.


In [None]:
# Substring -> replacement table handed to ReplaceChars as the first cleaning step.
# NOTE(review): presumably every occurrence of a key is replaced by its value and
# entries are applied in insertion order — confirm against ReplaceChars.
chars_map = {'\xad': ' ',                       # soft hyphen
             '…': '...',                        # single-character ellipsis -> three dots
             # Quotation marks and decorative symbols are removed outright.
             '«': '', '»': '',
             '"': '', '\'': '',
             '’': '', '‘': '',
             '”': '', '“': '', '„': '',
             '`': '', '*': '', '_': '', '©': '',
             # URL scheme prefixes are dropped; the rest of the address is kept.
             'http://':'', 'https://':'',
             # Abbreviations get a trailing space so the following word is not glued
             # to the period; the later 'sub_spaces' step collapses any double space.
             'тыс.': 'тыс. ', 'кв.': 'кв. ', 'куб.': 'куб. ',
             # Fix: the capitalised 'Прим.' used to map to itself (a no-op), unlike
             # its lowercase twin and every other abbreviation in this group.
             'прим.': 'прим. ', 'Прим.': 'Прим. ', 'зам.': 'зам. '}

# Raw-text -> CoNLL-U preprocessing stages, listed in execution order.
preprocessing_steps = [
    # Character-level cleanup driven by chars_map.
    ('replace_chars', ReplaceChars(chars_map)),
    # lowercase-word + '.' + Capitalised-word with no space in between: add a space after the period.
    ('replace_part', ReplacePart(r'[а-яйё]+\.[А-ЯЙЁ]+[а-яйё]+', lambda x: x.replace('.', '. '))),
    # Brace-delimited spans become a single space (greedy: first '{' to last '}').
    ('sub_code', RegExprSub(r'\{.*\}', ' ')),
    # A colon together with any digits touching it becomes a space.
    ('sub_colon', RegExprSub(r'\d*\:\d*', ' ')),
    # Parenthesised groups that contain no Latin or Cyrillic letter become a space.
    ('sub_round_brackets_without_words', RegExprSub(r'\([^a-zA-Zа-яйёА-ЯЙЁ]+\)', ' ')),
    # Collapse runs of spaces left behind by the substitutions above.
    ('sub_spaces', RegExprSub(r' +', ' ')),
    ('strip', Strip()),
    ('sent_tokenize', RusSentTokenizer()),
    # Leading hyphen with surrounding spaces is removed.
    # NOTE(review): sent=True presumably applies the pattern per sentence — confirm.
    ('sub_begin_hyphen', RegExprSub(r'^ *\- *', '', sent=True)),
    ('ner_word_tokenize', NER_RusWordTokenizer()),
    ('morph_predict', MorphPredictor()),
    ('space_detect', SpaceDetecter()),
    ('ner_corrector', NER_Correcter()),
    ('conllu_encode', CoNLLUFormatEncoder()),
]

pipeline = sklearn_pipeline(preprocessing_steps)

[nltk_data] Downloading package punkt to /home/arina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/arina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/arina/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/arina/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!
W0901 16:35:48.626969 139835170060096 deprecation_wrapper.py:119] From /home/arina/anaconda3/envs/py36/lib/python3.6/site-packages/bert_dp/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

2019-09-01 16:35:48.837 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 112: [loading vocabulary from /home/arina/.deeppavlov/models/ner_rus_bert/tag.dict]
I0901 16:35:48.837283 139835170060

In [None]:
# Run the preprocessing pipeline over both text columns of the raw articles
# and persist the result next to the input.
articles = pd.read_csv('../data/interim/articles_new.csv')

# (progress message, source column, destination column) — titles first, then texts.
column_jobs = (
    ('Transform titles', 'title', 'preproc_title'),
    ('Transform texts', 'text', 'preproc_text'),
)
for message, source_column, target_column in column_jobs:
    print(message)
    articles[target_column] = pipeline.fit_transform(articles[source_column])

articles.to_csv('../data/interim/articles_preproc.csv', index=False)

In [None]:
# Reload the preprocessed articles and decode their CoNLL-U columns,
# keeping only tokens whose part of speech is in pos_set.
articles = pd.read_csv('../data/interim/articles_preproc.csv')

pos_set = {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}
decoding_steps = [
    ('conllu_decode', CoNLLUFormatDecoder()),
    ('morph_filtration', MorphFilter(pos_set=pos_set)),
]
pipeline = sklearn_pipeline(decoding_steps)

# Titles are transformed first, then texts.
titles, texts = [pipeline.fit_transform(articles[column])
                 for column in ('preproc_title', 'preproc_text')]

In [None]:
from collections import Counter


def encode(x):
    """Serialise a token -> count mapping as space-separated Vowpal Wabbit features.

    A token seen once is written bare; a token seen more than once is written
    as ``token:count``. Iteration order of ``x`` is preserved.

    NOTE(review): a token that itself contains ':' or '|' would collide with the
    weight / namespace markers of the format — confirm upstream cleaning removes them.
    """
    return ' '.join(f'{token}:{count}' if count > 1 else f'{token}'
                    for token, count in x.items())


def _collect_tokens(sents, plain_tokens, entities):
    """Distribute the tokens of ``sents`` between plain lemmas and entity buckets.

    sents        -- iterable of sentences, each exposing ``.tokens``; every token
                    exposes ``.ne`` (entity tag) and ``.lemma``.
    plain_tokens -- list extended in place with lemmas of tokens tagged 'O'.
    entities     -- dict of entity type -> list, extended in place; spaces in an
                    entity lemma become underscores so it stays a single feature.

    Tags whose type is not a key of ``entities`` are printed and skipped.
    """
    for sent in sents:
        for token in sent.tokens:
            if token.ne == 'O':
                plain_tokens.append(token.lemma)
                continue
            # Drop the two-character tag prefix (presumably 'B-' / 'I-') to get the type.
            ne_type = token.ne[2:]
            if ne_type in entities:
                entities[ne_type].append(token.lemma.replace(' ', '_'))
            else:
                print(token.ne)


# Corpus-wide entity mention totals (reported in a later cell).
n_per = 0
n_loc = 0
n_org = 0

articles_wv = []
for article_id, title, text in zip(articles.id, titles, texts):

    title_tokens = []
    text_tokens = []
    # Entities found in the title and in the body share the same buckets.
    entities = {'PER': [], 'LOC': [], 'ORG': []}

    _collect_tokens(title, title_tokens, entities)
    _collect_tokens(text, text_tokens, entities)

    n_per += len(entities['PER'])
    n_loc += len(entities['LOC'])
    n_org += len(entities['ORG'])

    title_tokens = encode(Counter(title_tokens))
    text_tokens = encode(Counter(text_tokens))

    pers = encode(Counter(entities['PER']))
    locs = encode(Counter(entities['LOC']))
    orgs = encode(Counter(entities['ORG']))

    # One line per article; each '|name' section is a separate namespace.
    articles_wv.append(f'{article_id} |per {pers} |loc {locs} |org {orgs} |title {title_tokens} |text {text_tokens}')

# Write UTF-8 explicitly: the lemmas are Cyrillic and the platform default
# encoding is not guaranteed to be UTF-8.
with open('../data/interim/articles_vw.txt', 'w', encoding='utf-8') as fl:
    fl.write('\n'.join(articles_wv))

# Convert the Vowpal Wabbit file into batches stored in target_folder.
artm.BatchVectorizer(data_path='../data/interim/articles_vw.txt', data_format='vowpal_wabbit', target_folder='../data/interim/batches')

In [None]:
# Entity mention totals accumulated while building the Vowpal Wabbit lines,
# in PER / LOC / ORG order.
entity_totals = (n_per, n_loc, n_org)
print(*entity_totals)

# First generated line, shown as a spot check of the output format.
first_vw_line = articles_wv[0]
print(first_vw_line)