## Задание 1
**Написать теггер на данных с русским языком**
1. проверить UnigramTagger, BigramTagger, TrigramTagger и их комбинации

*******************

In [1]:
import numpy as np
import pandas as pd
import warnings
# Silence every warning for the rest of the session. NOTE(review): this also
# hides library warnings (e.g. deprecation or convergence notices) -- confirm
# that is intended.
warnings.filterwarnings('ignore')
# Render floats in pandas output with four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

In [2]:
import pyconll
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

In [3]:
# !wget -O ./datasets/ru_syntagrus-ud-train.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-train.conllu
# !wget -O ./datasets/ru_syntagrus-ud-dev.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu

In [4]:
# Locations of the SynTagRus train/dev splits in CoNLL-U format.
train_path = 'datasets/ru_syntagrus-ud-train.conllu'
dev_path = 'datasets/ru_syntagrus-ud-dev.conllu'

# Parse both files; the dev split is used as the held-out test set below.
full_train = pyconll.load_from_file(train_path)
full_test = pyconll.load_from_file(dev_path)

In [5]:
# Turn every parsed sentence into a list of (word form, UPOS tag) pairs.
train_data = [
    [(token.form, token.upos) for token in sent]
    for sent in full_train[:]
]
test_data = [
    [(token.form, token.upos) for token in sent]
    for sent in full_test[:]
]
# Second, independently built copy of the test sentences (same content as
# test_data, separate list objects).
test_sent = [
    [(token.form, token.upos) for token in sent]
    for sent in full_test[:]
]

In [6]:
# Show one test sentence (index 3) as a list of (form, UPOS) pairs.
test_data[3:4]

[[('Таким', 'DET'),
  ('образом', 'NOUN'),
  (',', 'PUNCT'),
  ('некоторые', 'DET'),
  ('инструкции', 'NOUN'),
  ('должны', 'ADJ'),
  ('выполняться', 'VERB'),
  ('строго', 'ADV'),
  ('после', 'ADP'),
  ('завершения', 'NOUN'),
  ('работы', 'NOUN'),
  ('инструкций', 'NOUN'),
  (',', 'PUNCT'),
  ('от', 'ADP'),
  ('которых', 'PRON'),
  ('они', 'PRON'),
  ('зависят', 'VERB'),
  ('.', 'PUNCT')]]

In [7]:
def test_tagger(train_data, test_data, tagger_cl, backoff=None):
    tagger = tagger_cl(train_data, backoff=backoff)
    accuracy = tagger.evaluate(test_data)
    return accuracy

In [8]:
tagger_list = [UnigramTagger, BigramTagger, TrigramTagger]

# Baseline: score every n-gram tagger on its own (no backoff), keyed by the
# tagger class name.
results = {
    single_cl.__name__: {
        'accuracy': test_tagger(train_data, test_data, single_cl)}
    for single_cl in tagger_list
}

In [9]:
# Train each candidate backoff tagger exactly once. A backoff model depends
# only on its class and the training data, so rebuilding it inside the inner
# loop (as before) repeated the same expensive training for every primary
# tagger it was paired with.
trained_backoffs = {cls: cls(train_data) for cls in tagger_list}

# Score every ordered (primary, backoff) pair of distinct tagger classes.
for tagger_cl in tagger_list:
    for backoff in tagger_list:
        if tagger_cl is backoff:
            # A tagger backing off to its own class adds nothing; skip it.
            continue
        back_name = f'{tagger_cl.__name__}/{backoff.__name__}'
        backoff_cl = trained_backoffs[backoff]
        results[back_name] = {'accuracy': test_tagger(
            train_data, test_data, tagger_cl, backoff=backoff_cl)}

In [10]:
def backoff_tagger(train_data, tagger_cl, backoff=None):
    """Build a chain of taggers where each one backs off to the previous.

    Args:
        train_data: Training sentences passed to every tagger constructor.
        tagger_cl: Iterable of tagger classes, ordered from the innermost
            (built first) to the outermost (built last and returned).
        backoff: Optional initial tagger used as the backoff of the first
            class in ``tagger_cl``.

    Returns:
        The last tagger built, or ``backoff`` unchanged when ``tagger_cl``
        is empty.
    """
    chain_head = backoff
    for tagger_class in tagger_cl:
        # Each new tagger wraps the chain built so far.
        chain_head = tagger_class(train_data, backoff=chain_head)
    return chain_head


# Full chain: Trigram -> Bigram -> Unigram. The unigram tagger is the
# innermost fallback, so it is built first and passed in as the seed.
backoff = UnigramTagger(train_data)
chain_classes = [BigramTagger, TrigramTagger]
tag = backoff_tagger(train_data, chain_classes, backoff=backoff)

chain_accuracy = tag.evaluate(test_data)
results['TrigramTagger/BigramTagger/UnigramTagger'] = {
    'accuracy': chain_accuracy}

_________

2. написать свой теггер, попробовать разные векторайзеры, добавить знание не только букв, но и слов

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [12]:
# Flatten the tagged sentences into parallel token / label lists.
# Tokens whose tag is None get the placeholder label 'NO_TAG'.
train_tok = [pair[0] for sentence in train_data for pair in sentence]
train_label = [
    'NO_TAG' if pair[1] is None else pair[1]
    for sentence in train_data
    for pair in sentence
]

test_tok = [pair[0] for sentence in test_data for pair in sentence]
test_label = [
    'NO_TAG' if pair[1] is None else pair[1]
    for sentence in test_data
    for pair in sentence
]

In [13]:
# Map string tags to integer ids. The encoder is fitted on the training
# labels only and then applied to both splits.
le = LabelEncoder()
le.fit(train_label)
train_enc_labels = le.transform(train_label)
test_enc_labels = le.transform(test_label)
# Display the learned tag vocabulary.
le.classes_

array(['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
       'NO_TAG', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
       'VERB', 'X'], dtype='<U6')

In [14]:
# Settings shared by the character-level vectorizers: character n-grams of
# length 1 to 5.
char_ngram_params = {'ngram_range': (1, 5), 'analyzer': 'char'}

# Each vectorizer family comes in two flavours: "1" uses character n-grams,
# "2" uses the constructor defaults.
# NOTE(review): hvectorizer2 is the only one that caps n_features (at 100) --
# confirm this is intentional.
hvectorizer1 = HashingVectorizer(**char_ngram_params)
hvectorizer2 = HashingVectorizer(n_features=100)

cvectorizer1 = CountVectorizer(**char_ngram_params)
cvectorizer2 = CountVectorizer()

tvectorizer1 = TfidfVectorizer(**char_ngram_params)
tvectorizer2 = TfidfVectorizer()

In [15]:
vect_list = [hvectorizer1, hvectorizer2, cvectorizer1,
             cvectorizer2, tvectorizer1, tvectorizer2]

# For every vectorizer: featurize single tokens, fit a logistic regression on
# the training tokens and record its accuracy on the test tokens.
for vectorizer in vect_list:
    X_train = vectorizer.fit_transform(train_tok)
    X_test = vectorizer.transform(test_tok)
    model = LogisticRegression(random_state=13)
    model.fit(X_train, train_enc_labels)
    pred = model.predict(X_test)
    # Take the analyzer name from the vectorizer itself rather than from its
    # position in vect_list, so the result label stays correct if the list is
    # reordered or extended.
    analyzer = vectorizer.analyzer
    results[type(vectorizer).__name__ + '_' +
            analyzer] = {'accuracy': accuracy_score(test_enc_labels, pred)}

In [16]:
# One row per experiment, best accuracy first.
result_df = pd.DataFrame(results).T
result_df = result_df.sort_values(by='accuracy', ascending=False)
result_df

Unnamed: 0,accuracy
CountVectorizer_char,0.95
TfidfVectorizer_char,0.9382
HashingVectorizer_char,0.9303
BigramTagger/UnigramTagger,0.883
TrigramTagger/BigramTagger/UnigramTagger,0.8821
TrigramTagger/UnigramTagger,0.8818
UnigramTagger,0.8773
UnigramTagger/BigramTagger,0.6997
BigramTagger,0.6963
TrigramTagger/BigramTagger,0.6892


**Вывод:**
* Среди одиночных теггеров лучший результат показал UnigramTagger. Использование backoff позволило (незначительно) повысить точность.
* Применение предсказательных моделей на основе n-gram (char) vectorizer-ов значительно повысило точность. Использование параметра analyzer='word' снижает точность предсказаний.