In [261]:
import os
import time
from typing import List
import numpy as np
import pandas as pd
import pymorphy2
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier, Pool

In [129]:
morph = pymorphy2.MorphAnalyzer()

In [83]:
def tokenize_to_words(text: str):
    start = 0
    end = 0
    sent = []
    for word in text.split(' '):
        end += len(word) + 1
        i, j = 0, 1
        while i < len(word) and not word[i].isalpha():
            i += 1
        while j < len(word) and not word[-j].isalpha():
            j += 1
        wword = word[i:]
        j -= 1
        if j > 0:
            wword = wword[:-j]
        if wword:
            sent.append((wword, start + i, len(wword)))
        start = end
    return sent

def read_data(str_data) -> List[List[str]]:
    return list(map(lambda s: tokenize_to_words(s), str_data.split('\n')))

In [336]:
ner_dataset = []
words_ners = {}

In [337]:
def update_by_annoteted(file_name, words, ann_words):
    ann_words[file_name] = []
    for ann in open("collection5/" + file_name + ".ann"):
        anns = ann.split('\t')
        word = anns[2].strip()
        w = morph.parse(word)[0].normal_form
        ner = anns[1].split(' ')[0]
        ner_map = {'PER': 'PERSON', 'MEDIA': 'ORG'}
        if ner in ner_map:
            ner = ner_map[ner]
        if ner in ['PERSON', 'ORG']:
#                     words[w] = ner
            if w not in words:
                words[w] = {}
            if word not in words:
                words[word] = {}
            ann_words[file_name].append(word)
            words[w][ner] = words[w].get(ner, 0) + 1
            words[word][ner] = words[word].get(ner, 0) + 1
            ner_dataset.append((word, ner))
            
def update_by_text(file_name, words, ann_words):
    for line in open("collection5/" + file_name + ".txt"):
        for word, i, j in tokenize_to_words(line):
            if word not in ann_words[file_name]:
                w = morph.parse(word)[0].normal_form
                if w not in words:
                    words[w] = {}
                if word not in words:
                    words[word] = {}
                words[w]['NONE'] = words[w].get('NONE', 0) + 1
                words[word]['NONE'] = words[word].get('NONE', 0) + 1
                ner_dataset.append((word, 'NONE'))

def collection5(words):
    ann_words = {}
    txt_words = {}
    for file in os.listdir("collection5"):
        file_name = file[:-4]
        if file[-3:] == "ann":
            update_by_annoteted(file_name, words, ann_words)
        elif file[-3:] == "txt":
            if file_name not in ann_words:
                update_by_annoteted(file_name, words, ann_words)
            txt_words[file_name] = []
            update_by_text(file_name, words, ann_words)  
    return words
            
words_ners = collection5(words_ners)

In [338]:
len(words_ners)

57685

In [339]:
def all_names(words):
    for file in os.listdir("names"):
        if file[-1] != 'v':
            continue
        for line in list(open("names/" + file))[1:]:
            name = line.split(';')[1]
            if name not in words:
                words[name] = {}
            words[name]['PERSON'] = words[name].get('PERSON', 0) + 1
    return words
            
words_ners = all_names(words_ners)

In [340]:
len(words_ners)

440513

In [341]:
def get_data1(words):
    for text, ann in zip(open("train_sentences.txt"), open("train_nes.txt")):
        all_words = set(text.strip().split(' '))
        seen_words = set()
        anns = ann.split(' ')[:-1]
        sent = tokenize_to_words(text)
        for i in range(0, len(anns), 3):
            start = int(anns[i])
            finish = start + int(anns[i + 1])
            word = text[start:finish]
            seen_words.add(word)
            word = morph.parse(word.strip())[0].normal_form
            ner = anns[2]
            if word not in words:
                words[word] = {}
            words[word][ner] = words[word].get(ner, 0) + 1
            ner_dataset.append((word, ner))
        unseen_words = all_words - seen_words
        for word in unseen_words:
            if word not in words:
                words[word] = {}
            words[word]['NONE'] = words[word].get('NONE', 0) + 1
            ner_dataset.append((word, 'NONE'))
    return words
            
words_ners = get_data1(words_ners)

In [342]:
len(words_ners)

450384

In [344]:
def get_data2(words):
    for text in open("train_sentences_enhanced.txt"):
        for word in text.split(' '):
            final_ner = 'NONE'
            if word and word[-1] == '}' and '{' in word:
                start = word.index('{')
                ner = word[start + 1 : -1]
                w = word[:start]
                i = 0
                while i < len(w) and not w[i].isalpha():
                    i += 1
                w = w[i:]
                w = morph.parse(w.strip())[0].normal_form
                if ner in ['PERSON', 'ORG']:
                    final_ner = ner
            if word not in words:
                words[word] = {}
            words[word][final_ner] = words[word].get(final_ner, 0) + 1
            ner_dataset.append((word, final_ner))
    return words
            
words_ners = get_data2(words_ners)

In [345]:
len(words_ners)

457543

In [346]:
for k in list(words_ners.keys())[:10]:
    print(k, words_ners[k])

игнатьев {'PERSON': 14, 'NONE': 9}
Игнатьев {'PERSON': 15, 'NONE': 5}
минфин {'ORG': 69, 'NONE': 4}
Минфином {'ORG': 4}
цб {'ORG': 99, 'NONE': 2}
ЦБ {'ORG': 98, 'NONE': 2}
сергей игнатьев {'PERSON': 16}
Сергей Игнатьев {'PERSON': 16}
госдума {'ORG': 288, 'NONE': 3, 'PERSON': 1}
Госдуме {'ORG': 44, 'NONE': 1}


In [377]:
def arg_max_dict(d):
    max_v = 0
    max_a = None
    for k in d:
        if max_v < d[k]:
            max_v = d[k]
            max_a = k
    return max_a

In [422]:
str_data = open("dataset_40163_1.txt").read().strip()
# str_data = """Барак Обама принимает в Белом доме своего французского коллегу Николя Саркози.
# О возможном включении благотворительного фонда в список "иностранных агентов" 7 мая написала газета «Ведомости»."""
data = read_data(str_data)
data[:2]

[[('Список', 0, 6),
  ('пяти', 7, 4),
  ('ключевых', 12, 8),
  ('тем', 21, 3),
  ('саммита', 25, 7),
  ('был', 33, 3),
  ('обнародован', 37, 11),
  ('на', 49, 2),
  ('минувшей', 52, 8),
  ('неделе', 61, 6)],
 [('После', 0, 5),
  ('первой', 6, 6),
  ('инспекции', 13, 9),
  ('предложенного', 23, 13),
  ('для', 37, 3),
  ('Конституционного', 41, 16),
  ('суда', 58, 4),
  ('здания', 63, 6),
  ('из', 71, 2),
  ('которого', 74, 8),
  ('в', 83, 1),
  ('настоящий', 85, 9),
  ('момент', 95, 6),
  ('переезжает', 102, 10),
  ('Исторический', 113, 12),
  ('государственный', 126, 15),
  ('архив', 142, 5),
  ('в', 149, 1),
  ('конце', 151, 5),
  ('года', 162, 4),
  ('Зорькин', 167, 7),
  ('высказывался', 175, 12),
  ('о', 188, 1),
  ('проекте', 190, 7),
  ('положительно', 198, 12)]]

In [404]:
str_data.split('\n')[1]

'По словам Н.Прянишникова, прежде всего должна быть представлена инновационная идея, которая имеет потенциал развития.'

In [405]:
all_words = set()
for sent in data:
    for x, i, j in sent:
        all_words.add(x)
all_words = list(all_words)
words_features, selected_words = features(all_words, all_words)
words_ind = {w: i for i, w in enumerate(selected_words)}
print(len(words_features), len(words_ind))
words_features = change_nones(words_features)
words_pred_ner = full_model.predict(words_features).reshape(-1)

Errors: 789 / 4178
3389 3389


In [423]:
def write_result(data):
    res = []
    for words in data:
        sent = []
        for (word, start, l) in words:
            normal = morph.parse(word)[0].normal_form
            ner = None
            if word in words_ners:
                ner = arg_max_dict(words_ners[word])
            if (ner is None or ner == 'NONE') and normal in words_ners:
                ner = arg_max_dict(words_ners[normal])
#                 print("col5")
#             if ner is None and word in names:
#                 ner = names[word]
#                 print("names")
#             if ner is None and normal in train_data1:
#                 ner = train_data1[normal]
#                 print(normal, ner)
#                 print(1)
#             if ner is None and normal in train_data2:
#                 ner = train_data2[normal]
#                 print(2)
#             if (ner is None or ner == 'NONE') and word in words_ind:
#                 ner = int_to_ner[words_pred_ner[words_ind[word]]]
            if ner is not None and ner != 'NONE':
                sent.append(f"{start} {l} {ner}")
        sent.append("EOL")
        res.append(" ".join(sent))
    print("\n".join(res[:5]))
    with open("out.txt", "w") as f:
        f.write("\n".join(res))
        
write_result(data)

EOL
167 7 PERSON EOL
47 10 ORG EOL
16 11 ORG EOL
121 6 PERSON 154 3 ORG EOL


In [347]:
ner_to_int = {'PERSON': 0, 'ORG': 1, 'NONE': 2}
int_to_ner = {0: 'PERSON', 1: 'ORG', 2: None}
ner_words = [x[0] for x in ner_dataset]
ner_target = np.array([ner_to_int[x[1]] for x in ner_dataset])
target = {}
for yi in ner_target:
    if yi not in target:
        target[yi] = 0
    target[yi] += 1
print(target)

{0: 18707, 1: 17518, 2: 270597}


In [408]:
def first_is_capital(word):
    return [word[0] != word[0].lower()]

def words_ner_dict(word):
    wn = words_ners.get(word, {})
    summa = sum(wn.values())
    return [wn.get(k, 0.) / summa for k in ner_to_int]

def norm_words_ner_dict(word):
    w = morph.parse(word)[0].normal_form
    wn = words_ners.get(w, {})
    summa = sum(wn.values())
    return [wn.get(k, 0.) / summa for k in ner_to_int]

def pymorphy(word):
    p = morph.parse(word)[0]
    return [p.tag.POS,
            p.tag.animacy,
            p.tag.aspect,
            p.tag.case,
            p.tag.gender,
            p.tag.involvement,
            p.tag.mood,
            p.tag.number,
            p.tag.person,
            p.tag.tense,
            p.tag.transitivity,
            p.tag.voice]

def features(X, y):
    data = []
    ys = []
    errs = 0
    for x, yi in zip(X, y):
        try:
            datum = []
            for f in [first_is_capital, pymorphy, words_ner_dict, norm_words_ner_dict]:
                datum += f(x)
            data.append(datum)
            ys.append(yi)
        except Exception:
            errs += 1
    print(f"Errors: {errs} / {len(y)}")
    return data, ys

# print(len(ner_words))
orig_X, y = features(ner_words, ner_target)
# print(len(X))

Errors: 9768 / 306822


In [417]:
def to_float(X):
    coder = {}
    X_n = []
    for xs in X:
        xs_n = []
        for i, x in enumerate(xs):
            if isinstance(x, float):
                xs_n.append(x)
            else:
                if i not in coder:
                    coder[i] = {}
                coder[i][x] = len(coder[i])
                xs_n.append(coder[i][x])
        X_n.append(xs_n)
    return X_n

def balance_target(X, y):
    X_n, y_n = [], []
    n = 0
    for xs, yi in zip(X, y):
        if yi == 2:
            n = n + 1
            if n % 2 == 0:
                X_n.append(xs)
                y_n.append(yi)
        else:
            X_n.append(xs)
            y_n.append(yi)
    return X_n, y_n

# coded_X = to_float(orig_X)
balanced_X, balanced_y = balance_target(coded_X, y)

In [419]:
target = {}
for yi in balanced_y:
    if yi not in target:
        target[yi] = 0
    target[yi] += 1
print(target)

{0: 17444, 1: 15660, 2: 131975}


In [420]:
X_tr, X_te, y_tr, y_te = train_test_split(balanced_X, balanced_y)
print(X_tr[:2])
print(y_tr[:2])

[[2, 18, 3, 3, 10, 4, 2, 3, 3, 4, 4, 3, 3, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0], [2, 18, 3, 3, 10, 4, 2, 3, 3, 4, 4, 3, 3, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0]]
[1, 2]


In [421]:
model = CatBoostClassifier(loss_function="MultiClass", custom_metric='F1', iterations=300, learning_rate=None, metric_period=25, cat_features=list(range(12)))
model.fit(X_tr, y_tr, use_best_model=True, eval_set=(X_te, y_te))
y_pred = model.predict(X_te).reshape(-1,)
print(f1_score(np.ones_like(y_te), y_te == y_pred))
print(np.mean(y_te == y_pred))

0:	learn: 1.0419455	test: 1.0419675	best: 1.0419675 (0)	total: 187ms	remaining: 55.9s
25:	learn: 0.3871019	test: 0.3875390	best: 0.3875390 (25)	total: 3.73s	remaining: 39.3s
50:	learn: 0.1881264	test: 0.1888965	best: 0.1888965 (50)	total: 7.61s	remaining: 37.2s
75:	learn: 0.1069587	test: 0.1080567	best: 0.1080567 (75)	total: 11.8s	remaining: 34.9s
100:	learn: 0.0705625	test: 0.0719096	best: 0.0719096 (100)	total: 16s	remaining: 31.6s
125:	learn: 0.0537518	test: 0.0553283	best: 0.0553283 (125)	total: 20.5s	remaining: 28.2s
150:	learn: 0.0460188	test: 0.0478177	best: 0.0478177 (150)	total: 24.8s	remaining: 24.4s
175:	learn: 0.0421656	test: 0.0441710	best: 0.0441710 (175)	total: 29.2s	remaining: 20.5s
200:	learn: 0.0401189	test: 0.0422748	best: 0.0422748 (200)	total: 33.6s	remaining: 16.5s
225:	learn: 0.0388543	test: 0.0411849	best: 0.0411849 (225)	total: 37.9s	remaining: 12.4s
250:	learn: 0.0381384	test: 0.0405662	best: 0.0405662 (250)	total: 42.7s	remaining: 8.34s
275:	learn: 0.0376070	

In [402]:
full_model = CatBoostClassifier(loss_function="MultiClass", custom_metric='F1', iterations=300, learning_rate=None, metric_period=25, cat_features=list(range(12)))
full_model.fit(coded_X, y)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 1.0413101	total: 711ms	remaining: 3m 32s
25:	learn: 0.3804145	total: 10.7s	remaining: 1m 53s
50:	learn: 0.1798110	total: 20.7s	remaining: 1m 41s
75:	learn: 0.0982081	total: 30.7s	remaining: 1m 30s
100:	learn: 0.0621114	total: 40.5s	remaining: 1m 19s
125:	learn: 0.0450945	total: 50.5s	remaining: 1m 9s
150:	learn: 0.0373740	total: 1m	remaining: 59.4s
175:	learn: 0.0335899	total: 1m 10s	remaining: 49.6s
200:	learn: 0.0316011	total: 1m 21s	remaining: 40s
225:	learn: 0.0305871	total: 1m 31s	remaining: 30.1s
250:	learn: 0.0300819	total: 1m 42s	remaining: 20s
275:	learn: 0.0297303	total: 1m 52s	remaining: 9.81s
299:	learn: 0.0295393	total: 2m 1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1a64de7c18>

In [424]:
model.get_feature_importance()
# 4, 13, 14

array([2.88294031e-02, 0.00000000e+00, 3.41596841e-04, 1.61856035e-02,
       6.53826534e+00, 2.03638618e-02, 1.22631495e-01, 1.22622637e-01,
       1.32238144e-02, 5.17217542e-02, 4.24678952e-08, 2.00711216e-02,
       4.84186616e-03, 1.27338514e+01, 2.67178954e+01, 2.76523120e+01,
       1.00522346e+01, 4.54821081e+00, 1.13563973e+01])

In [3]:
bpemb_en = BPEmb(lang="ru", dim=50, vs=10000)

downloading https://nlp.h-its.org/bpemb/ru/ru.wiki.bpe.vs10000.d50.w2v.bin.tar.gz


100%|██████████| 1935516/1935516 [00:00<00:00, 6058636.32B/s]


In [139]:
import os.path

import pandas as pd
import numpy as np
import random

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop

RANDOM_SEED = 997
random.seed(RANDOM_SEED)

Using TensorFlow backend.


In [None]:
class Pipeline:
    def __init__(self):
        self.pos_dict = {}
        self.pos_num = {}
        self.num_pos = {}
        self.max_token_len = 20
        self.chars = set()
        self.poses = set()
        self.model = Sequential()

    def prepare_data(self, df):
        token_pos = df['token_pos'].values
        token_text = df['token_text'].values
        k = 0
        for i in range(len(token_pos)):
            if token_pos[i] not in self.pos_num.keys():
                self.pos_num[token_pos[i]] = k
                self.num_pos[k] = token_pos[i]
                k += 1
            self.chars = self.chars.union(set(token_text[i]))
            token_text[i] = self.word_norm(token_text[i])
            if token_text[i] not in self.pos_dict:
                self.pos_dict[token_text[i]] = {}
            self.pos_dict[token_text[i]] = token_pos[i]
            token_pos[i] = self.pos_num[token_pos[i]]
        self.chars.add(' ')
        self.chars = sorted(list(self.chars))
        #print('total chars:', len(self.chars))
        for i in range(len(token_text)):
            if len(token_text[i]) <= self.max_token_len:
                token_text[i] = (' ' * (self.max_token_len - len(token_text[i]))) + token_text[i]
            else:
                token_text[i] = token_text[i][:self.max_token_len]
        return token_text, token_pos

    def vect_tokens(self, tokens):
        char_indices = dict((c, i) for i, c in enumerate(self.chars))
        x = np.zeros((len(tokens), self.max_token_len, len(self.chars)), dtype=np.bool)
        for i, token in enumerate(tokens):
            for t, char in enumerate(token):
                if char in char_indices.keys():
                    x[i, t, char_indices[char]] = 1
        return x

    def vect_poss(self, poss):
        y = np.zeros((len(poss), len(self.pos_num)), dtype=np.bool)
        for i, pos in enumerate(poss):
            y[i, pos] = 1
        return y

    def word_norm(self, word):
        word = word.lower().strip()
        return word

    def build_model(self, tokens, poss):
        self.model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=False, 
            input_shape=(self.max_token_len, len(self.chars))))
            #input_shape=(None, len(self.chars))))
        #self.model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
        self.model.add(Dense(100, activation='sigmoid'))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(len(self.pos_num)))
        self.model.add(Activation('softmax'))

        optimizer = RMSprop(lr=0.01) #0.01
        self.model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    def fit(self, df):
        tokens, poss = self.prepare_data(df)
        X = self.vect_tokens(tokens)
        y = self.vect_poss(poss)
        self.build_model(tokens, poss)
        self.model.fit(X, y, batch_size=128, epochs=50)
        return self

    def predict(self, df):
        token_text = df['token_text'].values
        for i in range(len(token_text)):
            if len(token_text[i]) <= self.max_token_len:
                token_text[i] = (' ' * (self.max_token_len - len(token_text[i]))) + token_text[i]
            else:
                token_text[i] = token_text[i][:self.max_token_len]
        X = self.vect_tokens(token_text)
        y_pred = np.argmax(self.model.predict(X, verbose=0), axis=1)
        y_p = []
        k = 0
        base = True
        for i in range(len(y_pred)):
            if base:
                word = self.word_norm(token_text[i])
                if word in self.pos_dict:
                    y_p.append(self.pos_dict[word])
                    k += 1
                else:
                    y_p.append(self.num_pos[y_pred[i]])
            else:
                y_p.append(self.num_pos[y_pred[i]])
        y_p = np.array(y_p)
        return y_p

In [None]:
def eval(X_test, pred):
    y_true = X_test['token_pos']
    return np.sum(pred == y_true) / len(pred)

def test():
    df_learn = pd.read_csv("./train.csv", encoding='utf-8')
    #print(df_learn.loc[[27262], 'token_text'].values[0] == '?')
    X_train, X_test = train_test_split(df_learn, pr=0.7, random_state=RANDOM_SEED)
    #p = Pipeline().fit(X_train)
    p = Pipeline().fit(X_train)
    pred = p.predict(X_test)
    print(eval(X_test, pred))

def main():
    df_learn = pd.read_csv("./train.csv")
    p = Pipeline().fit(df_learn)

    df_test = pd.read_csv("./test.csv")
    y_pred = p.predict(df_test)
    solution_name, _ext = os.path.splitext(os.path.basename(__file__))
    pd.DataFrame({"token_pos": y_pred, "index": df_test["index"]}) \
        .to_csv(solution_name + ".csv", index=False, header=True)

main()