# Загружаем текст из файла

In [2]:
# Read the whole plain-text corpus into memory; the file is decoded as Windows-1251.
with open('rueval_2010_goldstandard_text.txt', encoding='windows-1251') as text_file:
    brusov_text = text_file.read()

In [12]:
# Read the tagged gold standard, decoded as Windows-1251.
with open('rueval_2010_goldstandard_tagged.txt', encoding='windows-1251') as tagged_file:
    # The first line is consumed and thrown away (presumably a column header - confirm).
    tagged_file.readline()
    # Everything after the first line is kept as one string.
    goldstandard = tagged_file.read()

In [13]:
# One list of tab-separated fields per line of the gold-standard text.
DATA = [record.split('\t') for record in goldstandard.split('\n')]

In [10]:
import pandas as pd

In [21]:
# Load the parsed rows under the six gold-standard column names,
# then keep only the first four (word form, lemma, POS, grammemes).
gs_columns = ['Wordform_GS2', 'Lemma_GS2', 'POS_GS2', 'Gram_GS2', 'Status', 'NB']
df = pd.DataFrame(data=DATA, columns=gs_columns)
df = df[gs_columns[:4]]

In [22]:
# Bare expression: shows the gold-standard DataFrame as the cell output.
df

Unnamed: 0,Wordform_GS2,Lemma_GS2,POS_GS2,Gram_GS2
0,Цветки,цветок,S,"m,nom,pl"
1,дикорастущих,дикорастущий,A,"gen,pl"
2,форм,форма,S,"f,gen,pl"
3,обыкновенной,обыкновенный,A,"f,gen,sg"
4,сирени,сирень,S,"f,gen,sg"
...,...,...,...,...
2903,ведется,вестись,V,"3p,pres,sg"
2904,по,по,PR,
2905,плану,план,S,"dat,m,sg"
2906,.,,,


# Pymorphy3

In [17]:
from pymorphy3 import MorphAnalyzer
from pymorphy3.tokenizers import simple_word_tokenize

In [19]:
# Instantiate the pymorphy3 analyzer with its default arguments.
morph = MorphAnalyzer()

In [None]:
# brusov_tokenized_pymorphy = simple_word_tokenize(brusov_text)
# for token in brusov_tokenized_pymorphy:
#     parsed_token = morph.parse(token)
#     print('{:3}\t{:10}\t{:50}'.format(len(parsed_token), parsed_token[0].word, parsed_token[0].tag._str))

In [20]:
# brusov_tokenized_pymorphy = simple_word_tokenize(brusov_text)
# brusov_pymorphy = []
# for token in brusov_tokenized_pymorphy:
#     parsed_token = morph.parse(token)
#     brusov_pymorphy.append((parsed_token[0].word, parsed_token[0].tag._str))

In [23]:
# Tokenize the text and keep, for every token, the first parse pymorphy3 returns
# as a (normalized word, tag string) pair.
# The list is built in this cell because no executable cell visible above
# assigns brusov_pymorphy (the only assignment is commented out), which made
# this cell fail with NameError on a fresh top-to-bottom run.
brusov_pymorphy = [
    (first_parse.word, str(first_parse.tag))  # str(tag) instead of the private tag._str
    for first_parse in (
        morph.parse(word_token)[0]
        for word_token in simple_word_tokenize(brusov_text)
    )
]

# Export the pymorphy3 annotation; NOTE: this rebinds the notebook-wide name `df`.
df = pd.DataFrame(brusov_pymorphy, columns=['token', 'tag'])
df.to_excel('brusov_pymorphy.xlsx', index=False)

# spaCy

## Скачиваем модель

In [None]:
# Shell escape: download the ru_core_news_sm spaCy model package.
! python -m spacy download ru_core_news_sm

## Размечаем

In [26]:
import spacy

In [27]:
# Load the ru_core_news_sm spaCy pipeline by its package name.
nlp = spacy.load('ru_core_news_sm')

In [32]:
# Run the spaCy pipeline over the full text and collect, per token,
# its surface form, coarse POS tag and morphological analysis object.
doc = nlp(brusov_text)
brusov_spacy = [(tok.text, tok.pos_, tok.morph) for tok in doc]

In [None]:
# Tabulate the spaCy annotation and write it to an Excel sheet without the index column.
spacy_columns = ['token', 'pos', 'tag']
df = pd.DataFrame(data=brusov_spacy, columns=spacy_columns)
df.to_excel('brusov_spacy.xlsx', index=False)

# Pymystem3

In [None]:
# Shell escape: install the pymystem3 package used by the following cells.
! pip install pymystem3

In [2]:
from pymystem3 import Mystem
# Two Mystem wrappers that differ only in contextual disambiguation.
# pymystem3 turns disambiguation ON by default, so it has to be switched off
# explicitly here; a bare Mystem() would behave exactly like the second analyzer.
analyzer_no_disamb = Mystem(disambiguation=False)
analyzer_with_disamb = Mystem(disambiguation=True)

In [5]:
# NOTE(review): brusov_mystem is not read anywhere in the visible cells (results are
# collected into brusov_pymystem further down) - confirm whether it is still needed.
brusov_mystem = []
# Analyze the same text with both Mystem instances; the two result lists are
# paired up by position in the next cell.
doc_no_disamb = analyzer_no_disamb.analyze(brusov_text)
doc_with_disamb = analyzer_with_disamb.analyze(brusov_text)

In [10]:
# Pair the two Mystem outputs item by item and keep, for every item that has a
# non-empty analysis in the non-disambiguated run, a tuple of
# (text, first grammar string without disambiguation, first grammar string with it).
brusov_pymystem = []
for item_no_disamb, item_with_disamb in zip(doc_no_disamb, doc_with_disamb):
    analyses_no_disamb = item_no_disamb.get('analysis')
    if not analyses_no_disamb:
        # Items with a missing or empty analysis list carry no grammar info; skip them.
        continue

    # Guard the disambiguated side too: indexing [0] on a missing or empty
    # analysis list would raise, so record None for that column instead.
    analyses_with_disamb = item_with_disamb.get('analysis')
    gr_with_disamb = analyses_with_disamb[0].get('gr') if analyses_with_disamb else None

    brusov_pymystem.append(
        (item_no_disamb.get('text'), analyses_no_disamb[0].get('gr'), gr_with_disamb)
    )

In [None]:
# Tabulate both Mystem grammar strings side by side and save them to Excel without the index.
mystem_columns = ['token', 'gr_no_disamb', 'gr_with_disamb']
df = pd.DataFrame(data=brusov_pymystem, columns=mystem_columns)
df.to_excel('brusov_pymystem.xlsx', index=False)

# Stanza

In [None]:
import stanza
# Build a Stanza pipeline for Russian with the tokenize and pos processors, verbose output off.
# NOTE: this rebinds `nlp`, which an earlier cell bound to the spaCy pipeline.
nlp = stanza.Pipeline(lang='ru', processors='tokenize,pos', verbose=False)

In [40]:
# Run the Stanza pipeline and flatten its sentences into one list of
# (word text, universal POS, morphological features) tuples.
doc = nlp(brusov_text)
brusov_stanza = [
    (stanza_word.text, stanza_word.upos, stanza_word.feats)
    for stanza_sentence in doc.sentences
    for stanza_word in stanza_sentence.words
]

In [None]:
# Tabulate the Stanza annotation and write it to Excel without the index column.
stanza_columns = ['text', 'upos', 'feats']
df = pd.DataFrame(data=brusov_stanza, columns=stanza_columns)
df.to_excel('brusov_stanza.xlsx', index=False)

# UDPipe

In [46]:
from ufal.udpipe import Model, Pipeline
# Load the UDPipe model from the working directory.
model = Model.load('russian-ud-2.0-170801.udpipe')
# Model.load signals failure by returning a falsy value instead of raising,
# so fail here with a clear message rather than later inside Pipeline.
if not model:
    raise RuntimeError("Cannot load UDPipe model from 'russian-ud-2.0-170801.udpipe'")
# Tokenize raw text, use the default tagger and parser settings, emit CoNLL-U.
pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

In [51]:
# Run UDPipe and parse its CoNLL-U output: sentences are separated by blank
# lines, '#' lines are sentence-level comments, and every remaining non-empty
# line is a tab-separated token row from which columns 2-6 are kept.
analyzed = pipeline.process(brusov_text)
brusov_ud = [
    conllu_row.split('\t')[1:6]
    for sentence_block in analyzed.split('\n\n')
    for conllu_row in sentence_block.split('\n')
    if conllu_row and not conllu_row.startswith('#')
]

In [None]:
# Bare expression: shows the row at index 12 as a quick look at the parsed output.
brusov_ud[12]

In [57]:
# Tabulate the five CoNLL-U fields kept per token and save them to Excel without the index.
ud_columns = ['text', 'lemma', 'pos1', 'pos2', 'morph']
df = pd.DataFrame(data=brusov_ud, columns=ud_columns)
df.to_excel('brusov_ud.xlsx', index=False)