In [None]:
# 1.1
!pip install pyconll -q

In [None]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import pyconll
import nltk
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

In [None]:
%%capture
!wget -q -O ru_syntagrus-ud-train.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-train-a.conllu
!wget -q -O ru_syntagrus-ud-dev.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu

In [None]:
full_train = pyconll.load_from_file('ru_syntagrus-ud-train.conllu')
full_test = pyconll.load_from_file('ru_syntagrus-ud-dev.conllu')

In [None]:
for sent in full_train[:1]:
    for token in sent:
        print(token.form, token.upos)
    print()

Анкета NOUN
. PUNCT



In [None]:
MAX_SENT_LEN = max(len(sent) for sent in full_train)
MAX_ORIG_TOKEN_LEN = max(len(token.form) for sent in full_train for token in sent)
print('Наибольшая длина предложения', MAX_SENT_LEN)
print('Наибольшая длина токена', MAX_ORIG_TOKEN_LEN)

Наибольшая длина предложения 194
Наибольшая длина токена 31


In [None]:
# Заполним данными
fdata_train = []
for sent in full_train[:]:
    fdata_train.append([(token.form, token.upos) for token in sent])

fdata_test = []
for sent in full_test[:]:
    fdata_test.append([(token.form, token.upos) for token in sent])

fdata_sent_test = []
for sent in full_test[:]:
    fdata_sent_test.append([token.form for token in sent])

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
unigram_tagger = UnigramTagger(fdata_train)
unigram_acc = unigram_tagger.evaluate(fdata_test)

bigram_tagger = BigramTagger(fdata_train)
bigram_acc = bigram_tagger.evaluate(fdata_test)

trigram_tagger = TrigramTagger(fdata_train)
trigram_acc = trigram_tagger.evaluate(fdata_test)

bigram_tagger = BigramTagger(fdata_train, backoff=unigram_tagger)
bigram_unigram_acc = bigram_tagger.evaluate(fdata_test)

trigram_tagger = TrigramTagger(fdata_train, backoff=bigram_tagger)
trigram_bigram_unigram_acc = trigram_tagger.evaluate(fdata_test)

print(f'Accuracy:\nUnigram Tagger: {round(unigram_acc, 3)},\nBigram Tagger: {round(bigram_acc, 3)},\n'
      f'Trigram Tagger: {round(trigram_acc, 3)},\nBigram and Unigram Tagger: {round(bigram_unigram_acc, 3)},\n'
      f'Trigram, Bigram and Unigram Tagger: {round(trigram_bigram_unigram_acc, 3)},\n')

Accuracy:
Unigram Tagger: 0.824,
Bigram Tagger: 0.609,
Trigram Tagger: 0.178,
Bigram and Unigram Tagger: 0.829,
Trigram, Bigram and Unigram Tagger: 0.829,



In [None]:
# 1.2
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Переведём тренировочный датасет в списки слов и списки POS-разметки
train_tok = []
train_label = []
for sent in fdata_train[:]:
    for tok in sent:
        train_tok.append(tok[0])
        train_label.append('NO_TAG' if tok[1] is None else tok[1])

test_tok = []
test_label = []
for sent in fdata_test[:]:
    for tok in sent:
        test_tok.append(tok[0])
        test_label.append('NO_TAG' if tok[1] is None else tok[1])

In [None]:
train_tok[:4], train_label[:4]

(['Анкета', '.', 'Начальник', 'областного'], ['NOUN', 'PUNCT', 'NOUN', 'ADJ'])

In [None]:
test_tok = ['' if item is None else item for item in test_tok]

In [None]:
le = LabelEncoder()
train_enc_labels = le.fit_transform(train_label)
train_enc_labels

array([ 7, 13,  7, ...,  1, 11, 13])

In [None]:
test_enc_labels = le.transform(test_label)
test_enc_labels

array([ 7, 13,  1, ...,  0,  7, 13])

In [None]:
le.classes_

array(['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
       'NO_TAG', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
       'VERB', 'X'], dtype='<U6')

In [None]:
for vectorizer in [CountVectorizer, HashingVectorizer, TfidfVectorizer]:

    scaler = StandardScaler(with_mean=False)
    coder = vectorizer(ngram_range=(1, 5), analyzer='char')


    X_train = coder.fit_transform(train_tok)
    X_test = coder.transform(test_tok)

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)

    lr = LogisticRegression(random_state=0, max_iter = 100, n_jobs=7)
    lr.fit(X_train, train_enc_labels)

    pred = lr.predict(X_test)

    print(vectorizer, accuracy_score(test_enc_labels, pred))

<class 'sklearn.feature_extraction.text.CountVectorizer'> 0.9294159776027086
<class 'sklearn.feature_extraction.text.HashingVectorizer'> 0.9352106256917768
<class 'sklearn.feature_extraction.text.TfidfVectorizer'> 0.9360895891659613


In [None]:
# 1.3
# 1е место Tfidf
# 2е - Hashing
# 3е - Count

In [None]:
# 2.1
!pip install corus -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/83.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.5/83.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import corus
from corus import load_ne5
import nltk

In [None]:
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
%%capture
!wget http://www.labinform.ru/pub/named_entities/collection5.zip

In [None]:
!unzip -q collection5.zip

In [None]:
records = load_ne5('Collection5/')
records

<generator object load_ne5 at 0x7fbb7f66c740>

In [None]:
document = next(records).text
document

'Замминистра энергетики РФ В.Синюгин покинул свой пост.\r\n\r\nВячеслав Синюгин, занимавший пост заместителя министра энергетики РФ с июня 2008г., попросил об отставке в связи с переходом на другую работу. Как сообщила пресс-служба ведомства, его решение уже согласовано с руководством. "Скорее всего, Вячеслав Юрьевич займется новым интересным отраслевым проектом", - приводятся в сообщении слова министра энергетики Сергея Шматко, который уточнил, что переход В.Синюгина на другую работу обговаривалась изначально, но подробности не сообщил.\r\n\r\nНапомним, С.Шматко назначил 23 июля 2008г. В.Синюгина куратором электроэнергетики, инвестиционной политики ведомства в ТЭК, а также поручил ему контролировать вопросы мобилизационной подготовки, оперативного контроля, гражданской обороны и чрезвычайных ситуаций в ТЭК. При его непосредственном участии, говорится в сообщении, была организована система управления отраслью на этапе после прекращения деятельности "РАО ЕЭС России", подготовлено прохож

In [None]:
{(' '.join(c[0] for c in chunk), chunk.label() ) for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(document))) if hasattr(chunk, 'label') }

{('Вячеслав Синюгин', 'PERSON'),
 ('Вячеслав Юрьевич', 'PERSON'),
 ('Напомним', 'PERSON'),
 ('РАО', 'ORGANIZATION'),
 ('Сергея Шматко', 'PERSON')}

In [None]:
# часть определено верно, часть нет

In [None]:
# установка deeppavlov

!pip uninstall -y tensorflow tensorflow-gpu
!pip install -q numpy scipy librosa unidecode inflect librosa transformers
!pip install -q deeppavlov

Found existing installation: tensorflow 2.12.0
Uninstalling tensorflow-2.12.0:
  Successfully uninstalled tensorflow-2.12.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.3/468.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
%%capture
!python -m deeppavlov install squad_bert
!python -m deeppavlov install ner_ontonotes

In [None]:
import deeppavlov
from deeppavlov import configs, build_model

In [None]:
document

'Замминистра энергетики РФ В.Синюгин покинул свой пост.\r\n\r\nВячеслав Синюгин, занимавший пост заместителя министра энергетики РФ с июня 2008г., попросил об отставке в связи с переходом на другую работу. Как сообщила пресс-служба ведомства, его решение уже согласовано с руководством. "Скорее всего, Вячеслав Юрьевич займется новым интересным отраслевым проектом", - приводятся в сообщении слова министра энергетики Сергея Шматко, который уточнил, что переход В.Синюгина на другую работу обговаривалась изначально, но подробности не сообщил.\r\n\r\nНапомним, С.Шматко назначил 23 июля 2008г. В.Синюгина куратором электроэнергетики, инвестиционной политики ведомства в ТЭК, а также поручил ему контролировать вопросы мобилизационной подготовки, оперативного контроля, гражданской обороны и чрезвычайных ситуаций в ТЭК. При его непосредственном участии, говорится в сообщении, была организована система управления отраслью на этапе после прекращения деятельности "РАО ЕЭС России", подготовлено прохож

In [None]:
deeppavlov_ner = build_model(configs.ner.ner_rus_bert, download=True) # ner_bert_ent_and_type_rus

2023-07-10 05:23:52.845 INFO in 'deeppavlov.core.data.utils'['utils'] at line 95: Downloading from http://files.deeppavlov.ai/v1/ner/ner_rus_bert_torch_new.tar.gz to /root/.deeppavlov/models/ner_rus_bert_torch_new.tar.gz
INFO:deeppavlov.core.data.utils:Downloading from http://files.deeppavlov.ai/v1/ner/ner_rus_bert_torch_new.tar.gz to /root/.deeppavlov/models/ner_rus_bert_torch_new.tar.gz
100%|██████████| 1.44G/1.44G [00:55<00:00, 25.9MB/s]
2023-07-10 05:24:49.236 INFO in 'deeppavlov.core.data.utils'['utils'] at line 276: Extracting /root/.deeppavlov/models/ner_rus_bert_torch_new.tar.gz archive into /root/.deeppavlov/models/ner_rus_bert_torch
INFO:deeppavlov.core.data.utils:Extracting /root/.deeppavlov/models/ner_rus_bert_torch_new.tar.gz archive into /root/.deeppavlov/models/ner_rus_bert_torch


Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initializ

In [None]:
deeppavlov_ner(document[:6])

[[['З'], ['а'], ['м'], ['м'], ['и'], ['н']],
 [['O'], ['O'], ['O'], ['O'], ['O'], ['O']]]

In [None]:
# 2.2
!pip install razdel -q

In [None]:
from razdel import tokenize

In [None]:
words_docs = []
for ix, rec in enumerate(records):
    words = []
    for token in tokenize(rec.text):
        # if "http://" in input:
        #   token='None'
        type_ent = 'OUT'
        for ent in rec.spans:
            if (token.start >= ent.start) and (token.stop <= ent.stop):
                type_ent = ent.type
                break
        words.append([token.text, type_ent])
    words_docs.extend(words)

In [None]:
df_words = pd.DataFrame(words_docs, columns=['word', 'tag'])

In [None]:
df_words['tag'].value_counts()

OUT         219041
PER          21179
ORG          13647
LOC           4568
GEOPOLIT      4354
MEDIA         2482
Name: tag, dtype: int64

In [None]:
df_words.shape

(265271, 2)

In [None]:
!pip install tensorflow==2.8.0 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m497.6/497.6 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.5/462.5 kB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m115.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m781.3/781.3 kB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D, GRU, LSTM, Dropout, Input, Bidirectional,Reshape
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn import model_selection, preprocessing, linear_model

In [None]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df_words['word'], df_words['tag'])

# labelEncode целевую переменную
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [None]:
train_x.apply(len).max(axis=0)

55

In [None]:
train_data = tf.data.Dataset.from_tensor_slices((train_x, train_y))
valid_data = tf.data.Dataset.from_tensor_slices((valid_x, valid_y))

train_data = train_data.batch(2048)
valid_data = valid_data.batch(2048)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

train_data = train_data.cache().prefetch(buffer_size=AUTOTUNE)
valid_data = valid_data.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
def custom_standardization(input_data):
    return input_data

vocab_size = 30000
seq_len = 10


vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    # ngrams=(1, 3),
    output_sequence_length=seq_len
    )

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_data = train_data.map(lambda x, y: x)
vectorize_layer.adapt(text_data)

In [None]:
t=np.unique(encoder.inverse_transform(valid_y),return_counts=True)[1]
t=t/t.sum()
t  # распределение таргета

array([0.01589312, 0.01794385, 0.00951476, 0.05009198, 0.82638198,
       0.08017431])

In [None]:
model = tf.keras.Sequential([
  vectorize_layer,
  tf.keras.layers.Embedding(len(vectorize_layer.get_vocabulary()), 64, mask_zero=True),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(300, activation='relu'),
  tf.keras.layers.Dropout(0.5),
  tf.keras.layers.Dense(50, activation='relu'),
  # tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(6, activation='softmax')
  ])

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
model.fit(train_data, validation_data=valid_data, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fbb7cd02b30>

In [None]:
pred=model.predict(valid_data)

In [None]:
list(zip(encoder.classes_,pred.mean(axis=0)/t))

[('GEOPOLIT', 0.9472928787572787),
 ('LOC', 1.0307716787080554),
 ('MEDIA', 0.9058603517144399),
 ('ORG', 1.0235878538176606),
 ('OUT', 1.0169916656206806),
 ('PER', 0.82620887586401)]

In [None]:
model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(len(vectorize_layer.get_vocabulary()), 64, mask_zero=True),
    # tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(6,activation='softmax')
])

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

In [None]:
model.fit(train_data, validation_data=valid_data, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fbb7f910ac0>

In [None]:
pred=model.predict(valid_data)

In [None]:
list(zip(encoder.classes_,pred.mean(axis=0)/t))

[('GEOPOLIT', 1.0417171231624751),
 ('LOC', 0.9441550309386324),
 ('MEDIA', 0.9997117255685967),
 ('ORG', 1.0691460027936386),
 ('OUT', 1.0078848091588126),
 ('PER', 0.8794690662613485)]

In [None]:
# X_train = list(map(lambda x: x[0], ds))
y_val = list(map(lambda x: x[1], valid_data))
# pred

In [None]:
def get_labels_from_tfdataset(tfdataset, batched=False):

    labels = list(map(lambda x: x[1], tfdataset)) # Get labels

    if not batched:
        return tf.concat(labels, axis=0) # concat the list of batched labels

    return labels

y_val = get_labels_from_tfdataset(valid_data, True)

In [None]:
pred_list = pd.DataFrame(pred)
pred_list

Unnamed: 0,0,1,2,3,4,5
0,5.990709e-03,1.449319e-02,1.378779e-02,0.146598,0.731866,0.087265
1,7.097272e-09,1.013318e-07,4.506289e-08,0.000225,0.999773,0.000002
2,1.048531e-08,1.529681e-07,7.132645e-08,0.000287,0.999711,0.000002
3,1.048531e-08,1.529681e-07,7.132645e-08,0.000287,0.999711,0.000002
4,9.945817e-04,4.269337e-03,4.795839e-03,0.075256,0.904230,0.010454
...,...,...,...,...,...,...
66313,8.139661e-04,2.563603e-03,1.200074e-03,0.012201,0.927677,0.055545
66314,2.350003e-03,1.124144e-02,3.957906e-03,0.039678,0.642031,0.300741
66315,8.357525e-07,6.579159e-06,2.650123e-06,0.001106,0.998787,0.000096
66316,1.160004e-07,1.058469e-06,5.797949e-07,0.000885,0.999101,0.000013


In [None]:
# )
# pred_list = list(pred)
# pred_list[1]
pred_list_2 = pred_list.apply(lambda x: list(x).index(max(x)))
pred_list_2

0     368
1    2405
2    2388
3    1597
4     202
5     701
dtype: int64

In [None]:
loss, accuracy, f1_score, precision, recall = model.evaluate(Xtest, ytest, verbose=0)

In [None]:
# https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
from sklearn.metrics import classification_report
print(classification_report(y_val, pred))