In [1]:
import pandas as pd
import numpy as np
import pickle
import random
from collections import Counter, defaultdict
from sklearn.neighbors import KDTree

import json
import gensim
from tqdm.auto import tqdm, trange

%matplotlib inline
import matplotlib.pyplot as plt

import sys
import os

def add_sys_path(p):
    """Append the absolute form of ``p`` to ``sys.path`` unless it is already listed."""
    absolute = os.path.abspath(p)
    if absolute in sys.path:
        return
    sys.path.append(absolute)

# Put the parent directory on sys.path (presumably so the project modules imported in later cells resolve).
add_sys_path('..')

In [2]:
import evaluate
import data_split

In [3]:
# Read the noun training data; each raw value is decoded with json.loads.
n_ds = evaluate.read_dataset('../data/training_data/training_nouns.tsv',  lambda x: json.loads(x))
# Split the dataset into train/dev/test/hidden parts; forbidden_words is later passed to my_knn.prepare_storages.
train, dev, test1, test2, hid, forbidden_words = data_split.split_dict(n_ds)

In [4]:
# Number of forbidden words vs. number of entries in the full dataset.
len(forbidden_words), len(n_ds.keys())

(5083, 25376)

In [5]:
#forbidden_words = set(n_ds.keys())

In [6]:
import my_knn
from importlib import reload
reload(my_knn)
from my_knn import SynsetStorage, RelationStorage

In [7]:
# Location of the Taiga skip-gram word2vec binary. Set the TAIGA_W2V_PATH
# environment variable to override the machine-specific default below.
taiga_path = os.environ.get(
    'TAIGA_W2V_PATH',
    'C:/Users/ddale/Downloads/NLP/rusvectores/taiga_skipgram/model.bin',
)
taiga = gensim.models.KeyedVectors.load_word2vec_format(taiga_path, binary=True)

In [8]:
# word2vec text embedder configured with per-POS weights (NOUN 1.0, PREP 0.1, default 0.5);
# how the weights are applied is defined in my_knn.W2VWrapper.
w2v_embedder_pos = my_knn.W2VWrapper(taiga,  pos_weights={'NOUN': 1.0, 'PREP': 0.1}, default_weight=0.5)

In [9]:
# word2vec text embedder with the wrapper's default settings; knn_candidates reads it as a global.
w2v_embedder = my_knn.W2VWrapper(taiga)

In [85]:
# Location of the fastText model. Set the FASTTEXT_MODEL_PATH environment
# variable to override the machine-specific default below.
ft_path = os.environ.get(
    'FASTTEXT_MODEL_PATH',
    'C:/Users/ddale/Downloads/NLP/rusvectores/model.model',
)
ft = gensim.models.fasttext.FastTextKeyedVectors.load(ft_path)

In [86]:
# fastText-based sentence embedder; knn_candidates reads it as a global.
ft_embedder = my_knn.SentenceEmbedder(ft=ft, n=300, normalize_word=True, pos_weights={'PREP': 0.1})
# Sanity check: a short phrase should embed to a vector of length n (300).
print(ft_embedder('привет как дела').shape)

(300,)


In [10]:
# Build the synset and relation storages from the RuWordNet noun XML files.
# forbidden_words is handed to prepare_storages (its log output reports "forbidden senses").
syns_storage, rel_storage, rel_df = my_knn.prepare_storages(
    synsets_filename='../data/ruwordnet/synsets.N.xml',
    relations_filename='../data/ruwordnet/synset_relations.N.xml',
    forbidden_words=forbidden_words,
)

number of texts: 86549
forbidden senses are 4126
numer of ids 29296 long list is 95119


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


23827
466
5
30877


In [11]:
# Embed every text in syns_storage.texts_long with the POS-weighted embedder ...
w2v_vecs_pos = np.stack([w2v_embedder_pos(t) for t in tqdm(syns_storage.texts_long) ])
# ... and index the vectors in a KD-tree for nearest-neighbour queries.
w2v_tree_pos = KDTree(w2v_vecs_pos)

HBox(children=(FloatProgress(value=0.0, max=95119.0), HTML(value='')))




In [12]:
from textdistance import damerau_levenshtein
# Quick check: Damerau-Levenshtein distance divided by the length of the longer string.
damerau_levenshtein('лев', 'левобережье') / len('левобережье')

0.7272727272727273

In [32]:
import ranking
reload(ranking);
import download_wiki
reload(download_wiki);

In [15]:
# Wrap download_wiki.get_definition in a caching downloader ...
wiki_storage = download_wiki.CachedDownloader(downloader=download_wiki.get_definition)
# ... and preload the definitions stored in the files of the 'cache' directory.
wiki_storage.collect_from_files('cache')

Got 1000 definitions from file wiki_private_nouns_0_1000.pkl
Got 525 definitions from file wiki_private_nouns_1000_2000.pkl
Got 762 definitions from file wiki_public_nouns_0_1000.pkl
Got 810 definitions from file wiki_training_nouns_0_1000.pkl
Got 787 definitions from file wiki_training_nouns_10000_11000.pkl
Got 803 definitions from file wiki_training_nouns_1000_2000.pkl
Got 770 definitions from file wiki_training_nouns_11000_12000.pkl
Got 794 definitions from file wiki_training_nouns_12000_13000.pkl
Got 793 definitions from file wiki_training_nouns_13000_14000.pkl
Got 798 definitions from file wiki_training_nouns_14000_15000.pkl
Got 783 definitions from file wiki_training_nouns_15000_16000.pkl
Got 772 definitions from file wiki_training_nouns_16000_17000.pkl
Got 803 definitions from file wiki_training_nouns_17000_18000.pkl
Got 732 definitions from file wiki_training_nouns_18000_19000.pkl
Got 740 definitions from file wiki_training_nouns_19000_20000.pkl
Got 763 definitions from file wi

In [259]:
def knn_candidates(
        text,
        synset_storage: SynsetStorage,
        rel_storage: RelationStorage,
        index=None,
        text2vec=None,
        k=10,
        verbose=False,
        extended_features=False,
):
    """Build hypernym hypotheses for ``text`` from its ``k`` nearest synset texts.

    The query is embedded with ``text2vec`` and looked up in ``index``. For every
    neighbour that has hypernyms, each direct hypernym of the neighbour's synset
    (level 1) and each hypernym of that hypernym (level 2) becomes, or updates, a
    hypothesis. Level-2 contributions to the ``sum_*`` features are divided by 2.

    The plain word2vec and fastText similarities are computed with the
    notebook-level globals ``w2v_embedder`` and ``ft_embedder``.

    Args:
        text: query word or phrase.
        synset_storage: object exposing ``ids_long`` and ``texts_long``, both
            indexed by the positions that ``index.query`` returns.
        rel_storage: object exposing ``id2hypernym``, a mapping from a synset id
            to a collection of hypernym ids.
        index: object with ``query(vectors, k=...)`` returning
            ``(distances, indices)``; a KDTree in this notebook. Required.
        text2vec: callable turning a text into a 1-D vector. Required.
        k: number of nearest neighbours to retrieve.
        verbose: accepted for backward compatibility; currently unused.
        extended_features: when True, additionally accumulate the per-level
            features (``n_1``, ``min_dist_2``, ``sum_sim5_1`` ...). The default
            False matches the previous behaviour, where that code was
            disabled by an unconditional ``continue``.

    Returns:
        defaultdict mapping a hypernym synset id to a defaultdict of features
        (numeric features that were never set read as 0).

    Raises:
        ValueError: if ``index`` or ``text2vec`` is not provided.
    """
    if index is None or text2vec is None:
        raise ValueError('knn_candidates requires both `index` and `text2vec`')

    ids_list = synset_storage.ids_long
    texts_list = synset_storage.texts_long

    vec = text2vec(text)
    distances, indices = index.query(vec.reshape(1, -1), k=k)

    hypotheses = defaultdict(lambda: defaultdict(lambda: 0))

    query_lower = text.lower()
    vec_plain = w2v_embedder.get_text_vec(query_lower)
    vec_ft = ft_embedder(query_lower)
    for i, d in zip(indices.ravel(), distances.ravel()):
        hypers = rel_storage.id2hypernym.get(ids_list[i], set())
        if not hypers:
            # A neighbour without hypernyms yields no hypotheses, so skip the embedding calls.
            continue
        neighbor_lower = texts_list[i].lower()

        splain = np.dot(vec_plain, w2v_embedder.get_text_vec(neighbor_lower))
        sim_ft = np.dot(vec_ft, ft_embedder(neighbor_lower))
        # These depend only on the neighbour, so compute them once per neighbour.
        dist_decay = np.exp(-d ** 3)
        splain_5 = splain ** 5
        sim_ft_5 = sim_ft ** 5
        magic = dist_decay * splain_5
        # Equals the cosine similarity if the indexed vectors are unit-normalised - TODO confirm.
        s = (1 - d ** 2 / 2)

        for parent in hypers:
            # The direct hypernym first, then its own hypernyms in sorted (deterministic) order.
            ids = [parent] + sorted(rel_storage.id2hypernym.get(parent, set()))
            for j, hyper_id in enumerate(ids):
                int_o = 1 + (j > 0)  # 1 for the direct hypernym, 2 for second-order ones
                h = hypotheses[hyper_id]
                h['query'] = text
                h['document'] = hyper_id
                # Keep the first neighbour that produced this hypothesis.
                h['neighbor'] = h.get('neighbor', texts_list[i])
                h['sum_magic'] += magic / int_o
                h['sum_magic_exp'] += dist_decay / int_o
                h['sum_magic_s5'] += splain_5 / int_o
                h['sum_ft_sim'] += sim_ft / int_o
                h['sum_ft_sim_5'] += sim_ft_5 / int_o
                if not extended_features:
                    continue

                # Per-level features (suffix '1' or '2'); the Levenshtein-based ones stay disabled.
                o = str(int_o)
                # NOTE(review): this stores the smallest storage index `i`, not the neighbour's rank.
                # The key spelling is kept as-is because it is a feature name.
                h['first_heighbor_' + o] = min(h.get('first_heighbor_' + o, 100500), i)
                h['n_' + o] += 1
                h['max_dist_' + o] = max(h['max_dist_' + o], d)
                h['min_dist_' + o] = min(h.get('min_dist_' + o, 100), d)
                h['sum_magic_' + o] += magic
                h['sum_dist_' + o] += d
                h['sum_dist2_' + o] += d**2
                h['sum_dist3_' + o] += d**3
                h['sum_sim_'+o] += s
                h['sum_sim2_'+o] += s**2
                h['sum_sim5_'+o] += s**5
    return hypotheses

In [260]:
# Sort the keys first so that the seeded sample below is reproducible.
train_keys = sorted(train.keys())
random.seed(42)
# Subsample 1000 training words.
train_small_keys = random.sample(train_keys, k=1000)

In [261]:
# Words used to build the ranker's training rows in the loop below.
keys = train_small_keys

Весь тренировочный сет заваривается полчаса; ужас!

In [262]:
# Build the ranker's training data: one row per (query word, candidate hypernym).
pre_x = []
pre_y = []
# Index of the query word each row belongs to (not used in the cells visible here).
query_ids = []
for i, w in enumerate(tqdm(keys)):
    # Candidates derived from the 100 nearest synset texts.
    cands = knn_candidates(
        w, 
        index=w2v_tree_pos, text2vec=w2v_embedder_pos, 
        synset_storage=syns_storage, rel_storage=rel_storage,
        k=100,
    )
    # Extend the candidates using the cached wiki definitions (see ranking.add_defin_hypotheses).
    cands = ranking.add_defin_hypotheses(
        w, cands, 
        definition_extractor=wiki_storage,
        synset_storage=syns_storage,
    )
    # Flatten the groups of gold synset ids for this word into one set.
    gt_labels = {s for senses in train[w] for s in senses}
    for cand_id, cand_dict in cands.items():
        query_ids.append(i)
        pre_y.append(cand_id in gt_labels)
        pre_x.append(dict(cand_dict))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [263]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [264]:
# Binary targets: True when the candidate id is among the query's gold synset ids.
y = np.array(pre_y)

def extract_features(pre_x, remove_meta=True, columns=None):
    """Turn per-candidate feature dicts into a feature DataFrame.

    Args:
        pre_x: sequence of dicts, one per candidate. Keys become columns and
            a feature missing from a dict is filled with 0.
        remove_meta: when True, drop the descriptive 'query', 'document' and
            'neighbor' columns if they are present.
        columns: optional sequence of column names. When given, the result is
            reindexed to exactly these columns in this order; columns absent
            from the data are filled with 0. Useful to align a per-query
            frame with the columns a model was fitted on.

    Returns:
        pandas.DataFrame with one row per candidate. Derived columns
        (``mean_dist_*``, ``has_*``, ``wiki_has_matches``) are added when
        their source columns exist.
    """
    x = pd.DataFrame(pre_x).fillna(0)

    if remove_meta:
        x = x.drop(columns=['query', 'document', 'neighbor'], errors='ignore')

    eps = 1e-12  # keeps the means finite when a level has no neighbours
    if 'sum_dist_1' in x.columns and 'n_1' in x.columns:
        # Level-2 columns are absent when no candidate has a second-order
        # hypernym; treat them as zeros, consistent with fillna(0) above.
        zeros = pd.Series(0.0, index=x.index)
        sum_dist_2 = x['sum_dist_2'] if 'sum_dist_2' in x.columns else zeros
        n_2 = x['n_2'] if 'n_2' in x.columns else zeros
        x['mean_dist_1'] = x['sum_dist_1'] / (x['n_1'] + eps)
        x['mean_dist_2'] = sum_dist_2 / (n_2 + eps)
        x['mean_dist_all'] = (x['sum_dist_1'] + sum_dist_2) / (x['n_1'] + n_2 + eps)
        x['has_1'] = (x['n_1'] > 0).astype(int)
        x['has_2'] = (n_2 > 0).astype(int)
    if 'wiki_n_matches' in x.columns:
        x['wiki_has_matches'] = (x['wiki_n_matches'] > 0).astype(int)
    if columns is not None:
        x = x.reindex(columns=list(columns), fill_value=0)
    return x

# Feature matrix for the ranker (meta columns removed).
x = extract_features(pre_x)

Только около трети запросов (≈36%) вообще хоть как-то нашлись в вики. Грустно.

In [265]:
# Keep the meta columns so that rows can be grouped by query.
xx = extract_features(pre_x, remove_meta=False)
# Share of queries for which at least one candidate has a wiki match.
xx.groupby('query').wiki_has_matches.max().mean()

0.363

In [266]:
# Peek at three random feature rows (no random_state is passed, so the rows differ between runs).
x.sample(3)

Unnamed: 0,sum_magic,sum_magic_exp,sum_magic_s5,sum_ft_sim,sum_ft_sim_5,wiki_min_place,wiki_match_len,wiki_n_senses,wiki_n_matches,wiki_has_matches
99073,0.007764,0.336388,0.023081,0.290825,0.00208,0,0,0,0,0
47993,0.013458,0.381565,0.035271,0.362385,0.00625,0,0,0,0,0
34460,0.021929,0.77339,0.05335,0.657257,0.067686,0,0,0,0,0


In [267]:
# Summary statistics of every feature column.
x.describe()

Unnamed: 0,sum_magic,sum_magic_exp,sum_magic_s5,sum_ft_sim,sum_ft_sim_5,wiki_min_place,wiki_match_len,wiki_n_senses,wiki_n_matches,wiki_has_matches
count,99862.0,99862.0,99862.0,99862.0,99862.0,99862.0,99862.0,99862.0,99862.0,99862.0
mean,0.147855,1.099073,0.210207,1.152739,0.2289,11.008752,0.759158,0.368398,0.141806,0.138782
std,0.62152,2.626276,0.742241,2.655267,0.84702,39.406292,2.333717,1.077094,0.358172,0.34572
min,0.0,0.0,0.0,-0.292165,-0.002129,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.203055,0.0,0.176272,0.000693,0.0,0.0,0.0,0.0,0.0
50%,0.014535,0.409329,0.034689,0.455378,0.029244,0.0,0.0,0.0,0.0,0.0
75%,0.076041,1.001826,0.138971,1.143666,0.157405,0.0,0.0,0.0,0.0,0.0
max,36.411295,86.606781,37.283963,77.94547,39.291219,625.0,47.0,9.0,4.0,1.0


# Debug aid: for sampled training words that are present in syns_storage.word2sense,
# print the gold answer next to the word's senses and the stored hypernyms of each sense.
for w in train_small_keys:
    if w in syns_storage.word2sense:
        print(w)
        print(train[w])
        print(syns_storage.word2sense[w])
        # Use .get, as knn_candidates does, so that a sense without hypernyms does not raise KeyError.
        print([rel_storage.id2hypernym.get(s, set()) for s in syns_storage.word2sense[w]])
        print('')

In [268]:
# Feature matrix shape and the number of positive (gold) candidates.
print(x.shape, sum(pre_y))

(99862, 10) 2696


In [269]:
from catboost import CatBoostClassifier

In [271]:
# L1-regularised logistic regression as the candidate scorer; CatBoost is kept as a commented-out alternative.
model = LogisticRegression(max_iter=1000, C=0.01, penalty='l1', solver='liblinear')
#model = CatBoostClassifier(iterations=100, depth=3, verbose=0)
# NOTE(review): cv=3 does not group rows by query (query_ids is collected but not used here),
# so candidates of one query may land in different folds - consider GroupKFold. TODO confirm.
print(cross_val_score(model, x, y, cv=3, scoring='roc_auc'))

[0.94755809 0.93446388 0.94661316]


```
[0.88538203 0.8336317  0.85868983]  100 words
[0.85867767 0.87508228 0.85045668]  300 words
[0.86614533 0.85163482 0.85033878] 1000 words
[0.89062933 0.83815102 0.87119477]  100 words, 13 features
[0.95877762 0.93675879 0.92065066]  100 words, 15 features (with levenshtein)
[0.74918301 0.86308356 0.81610435]  100 words, 13 features, drop all train words
[0.95552977 0.93510626 0.92946495]  100 words, 23 features (with similarity)
[0.94820527 0.94484599 0.95262975] 3000 words, 23 features
[0.94824586 0.95184857 0.95281711]  all words, 23 features
[0.95035338 0.9292177  0.90908593]  100 words, magic-only
[0.95491309 0.95448801 0.93524542]  100 words, magic-only with uniform-POS scorer
[0.95754426 0.95235692 0.92833908]  100 words, magic-only with uniform-POS scorer and fixed power
```

In [274]:
# Refit on all training rows and show the learned coefficient of each feature.
model.fit(x, y);
pd.Series(model.coef_[0], index=x.columns)
#pd.Series(model.feature_importances_, index=x.columns)

sum_magic           2.669586
sum_magic_exp      -1.057455
sum_magic_s5        0.000000
sum_ft_sim          0.873499
sum_ft_sim_5        0.000000
wiki_min_place     -0.011819
wiki_match_len      0.071185
wiki_n_senses      -0.016117
wiki_n_matches      0.000000
wiki_has_matches    0.000000
dtype: float64

In [275]:
# For every dev word, collect candidate ids and their raw feature dicts,
# using the same candidate pipeline as for the training rows.
pre_features_dict = dict()
labels_dict = dict()
for i, w in enumerate(tqdm(dev)):
    cands = knn_candidates(
        w, 
        index=w2v_tree_pos, text2vec=w2v_embedder_pos, 
        synset_storage=syns_storage, rel_storage=rel_storage,
        k=100,
    )
    cands = ranking.add_defin_hypotheses(
        w, cands, 
        definition_extractor=wiki_storage,
        synset_storage=syns_storage,
    )
    # labels[j] is the candidate id described by pre_features[j].
    labels = []
    pre_features = []
    for cand_id, cand_dict in cands.items():
        labels.append(cand_id)
        pre_features.append(dict(cand_dict))
    pre_features_dict[w] = pre_features
    labels_dict[w] = labels

HBox(children=(FloatProgress(value=0.0, max=508.0), HTML(value='')))




In [276]:
# Score every dev word's candidates with the fitted model and keep the 10 best ids.
predictions = dict()

for i, w in enumerate(tqdm(dev)):
    if not pre_features_dict[w]:
        # No candidates at all: predict_proba cannot score an empty frame.
        predictions[w] = []
        continue
    # A per-word frame may miss columns or order them differently from the training
    # frame `x`; align it explicitly so features are never matched by position alone.
    features = extract_features(pre_features_dict[w]).reindex(columns=x.columns, fill_value=0)
    scores = model.predict_proba(features)[:, 1]
    predictions[w] = pd.Series(scores, index=labels_dict[w]).sort_values(ascending=False).head(10).index.tolist()

HBox(children=(FloatProgress(value=0.0, max=508.0), HTML(value='')))




In [277]:
# Score the dev predictions with evaluate.get_score at k=10 (the names suggest mean AP and mean RR).
mean_ap, mean_rr = evaluate.get_score(dev, predictions, k=10)
print(mean_ap, mean_rr)

0.4522257634462358 0.4832161604799399


```
0.4695603804732742 0.507339707536557  * previous baseline
0.3971516581260675 0.42608580177477784  c=1    , три магические фичи
0.4195125348914717 0.44960629921259826  c=0.1
0.4663870141232345 0.50311914135733     c=0.01
0.4657120984876891 0.5006608548931382   with wikipedia data ='(
0.4442544942298880 0.4749015748031495  with wiki data and catboost
```

```
0.4695603804732742 0.507339707536557 * - previous baseline
0.440481918926801  0.4710614298212723  - logreg scorer (100 training samples, 9 features)
0.4466926660209143 0.4790729283839521  - 300 samples
0.4468210353914096 0.4791447944007000  - 1000 samples
0.4582034667541558 0.4897161292338458+ - 100 samples, 13 features
0.4394665510561180 0.4742040057492813  - 100 samples, 15 features, looks like overfit
0.3241014144065326 0.3557024121984751  - 100 samples, 13 features, with more forbidden words
0.4306767643627878 0.4671080177477813  - 100 samples, 23 features, with similarity
0.4626613079615048 0.4990266841644794  - 3k samples, 23 features; almost there!
0.4670162323459567 0.5041869766279214  - all samples, 23 features - still not there, hmmm...
0.4584119693371660 0.4955021247344079  - 100 samples + magic + fixed similarities 
0.4584119693371660 0.4955021247344079  - magic as a single feature; why are we still not here???
0.4395115975086445 0.4733978565179351  - magic (uni-pos w2v) as a single feature; WTF???
0.4691010628879724 0.5070444319460065  - fixed a typo in magic and finally reproduced the baseline
```

#### смотрим, что же не так с ответами

не найдены
* named entities (ФСТР)
* абстракции (гладкость, въедливость, состязательное судопроизводство)

неверные соседи
* фразы (диагноз болезни, восточная европа, смещение на низшую должность)

много значений
* гео (казанка, босанцы, туниска)
* честно многозначное (рубка)

абстрактные гиперонимы
* пенек

плохо отличаю верхнеуровневые сущности (процессы от предметов от свойств и т.п.)
* крошение

In [406]:
# Pick a random dev word to inspect (no NumPy seed is set in the visible cells, so it varies between runs).
w = np.random.choice(list(dev.keys()))
print(w)

СОСТЯЗАТЕЛЬНОЕ СУДОПРОИЗВОДСТВО


In [407]:
# All gold synset ids for the word, flattened across its groups.
gt_set = {t for tt in dev[w] for t in tt}
# Ids of every candidate generated for the word.
cand_set = {c['document'] for c in pre_features_dict[w]}
# Despite the name, this is the ranked list of predicted ids, not a set.
res_set = predictions[w]

In [408]:
# For each group of gold synsets print: id, whether it was predicted (0/1),
# whether it was among the candidates (0/1), and the synset name.
for i, concomp in enumerate(dev[w]):
    print(i)
    for synset_id in concomp:
        print(
            synset_id, 
            int(synset_id in predictions[w]), 
            int(synset_id in cand_set), 
            syns_storage.get_synset_name(synset_id),
        )

0
106454-N 0 0 ПОЛОЖЕНИЕ, ТЕЗИС
106825-N 0 0 ПРИНЦИП, ОСНОВНОЕ ПОЛОЖЕНИЕ


In [409]:
# For each predicted synset print: id, whether it is a gold answer (0/1), and the synset name.
for synset_id in predictions[w]:
    print(
        synset_id,
        int(synset_id in gt_set),
        syns_storage.get_synset_name(synset_id),
    )

106562-N 0 СФЕРА ДЕЯТЕЛЬНОСТИ
106898-N 0 РАССМОТРЕТЬ, РАЗОБРАТЬ
149658-N 0 СЛУШАНИЕ (ОБСУЖДЕНИЕ)
106501-N 0 ЗАНЯТИЕ, ДЕЯТЕЛЬНОСТЬ
134994-N 0 СУДЕБНЫЙ ДОКУМЕНТ
3199-N 0 СУДЕБНОЕ РЕШЕНИЕ
1853-N 0 СУДОПРОИЗВОДСТВО
115790-N 0 СИСТЕМА (ОРГАНИЗОВАННОЕ ЦЕЛОЕ)
143739-N 0 РАЗБИРАТЕЛЬСТВО СПОРА
661-N 0 СУД
