In [None]:
import pandas as pd
import nltk
from scipy.spatial import distance
import pymorphy2

In [None]:
def _shared_morph_analyzer():
    """Return a single shared pymorphy2 analyzer, creating it on first use."""
    analyzer = getattr(_shared_morph_analyzer, 'instance', None)
    if analyzer is None:
        # Building a MorphAnalyzer loads its dictionaries, so do it once and
        # keep the instance on the function object for later calls.
        analyzer = pymorphy2.MorphAnalyzer()
        _shared_morph_analyzer.instance = analyzer
    return analyzer


def compare_sentences(s1, s2):
    """Check whether two sentences are equal word-by-word after lemmatisation.

    Both strings are tokenised with ``nltk.word_tokenize``. The sentences match
    when they have the same number of tokens and every aligned pair of tokens
    has the same pymorphy2 normal form (the first parse of each token is used).

    Args:
        s1: first sentence (str).
        s2: second sentence (str).

    Returns:
        bool: True when the sentences match, False otherwise. Two sentences
        with no tokens at all are considered equal.
    """
    tokens1 = nltk.word_tokenize(s1)
    tokens2 = nltk.word_tokenize(s2)
    if len(tokens1) != len(tokens2):
        return False

    morph = _shared_morph_analyzer()
    for word1, word2 in zip(tokens1, tokens2):
        # Stop at the first differing lemma; the remaining words cannot
        # change the answer.
        if morph.parse(word1)[0].normal_form != morph.parse(word2)[0].normal_form:
            return False
    return True

In [None]:
def is_it_correct(df, correct_pairs_df):
    """Label every candidate pair as correct (1) or incorrect (0).

    A candidate is correct when ``correct_pairs_df`` contains a row with the
    same abbreviation (case-insensitive) whose definition matches the
    candidate's definition according to ``compare_sentences``.

    The reference rows are grouped in a dictionary, so they do not have to be
    sorted, an abbreviation may have several reference definitions, and an
    empty reference frame simply yields all zeros.

    Args:
        df: DataFrame with string columns 'abbreviation' and 'definition'.
        correct_pairs_df: DataFrame with the same two columns holding the
            reference pairs. Rows whose abbreviation or definition is not a
            string (e.g. an empty spreadsheet cell) are ignored.

    Returns:
        list[int]: one 0/1 flag per row of ``df``, in row order.
    """
    # Progress/size output kept from the original notebook cell.
    print(len(df))

    # lower-cased abbreviation -> all lower-cased reference definitions
    reference_definitions = {}
    for abbreviation, definition in zip(correct_pairs_df['abbreviation'],
                                        correct_pairs_df['definition']):
        if not (isinstance(abbreviation, str) and isinstance(definition, str)):
            continue
        reference_definitions.setdefault(abbreviation.lower(), []).append(definition.lower())

    is_correct = []
    for abbreviation, definition in zip(df['abbreviation'], df['definition']):
        A = abbreviation.lower()
        D = definition.lower()
        # any() stops at the first matching reference definition.
        matched = any(compare_sentences(reference, D)
                      for reference in reference_definitions.get(A, ()))
        is_correct.append(1 if matched else 0)
    return is_correct

In [None]:
def common_first_letters_cnt(df):
    """Share of definition words whose first letter occurs in the abbreviation.

    For every row the definition is tokenised with ``nltk.word_tokenize``; a
    token counts as a hit when its lower-cased first character appears
    anywhere in the lower-cased abbreviation. The hit count is divided by the
    abbreviation length.

    Args:
        df: DataFrame with 'abbreviation' and 'definition' columns, addressed
            by the labels 0..len(df)-1.

    Returns:
        list[float]: one ratio per row.
    """
    ratios = []
    for row in range(len(df)):
        abbreviation = df['abbreviation'][row]
        words = nltk.word_tokenize(df['definition'][row])
        hits = sum(
            1 for word in words
            if word.lower()[0] in abbreviation.lower()
        )
        ratios.append(hits / len(abbreviation))
    return ratios

In [None]:
def _has_paren_pair(tokens):
    """Return True when `tokens` contains a '(' that is later followed by ')'."""
    opened = False
    for token in tokens:
        if token == '(':
            opened = True
        elif token == ')' and opened:
            return True
    return False


def parenthesis_feature(df, pairs, window=5):
    """Flag candidate pairs that occur next to a parenthesised span.

    A pair gets 1 when either of these holds, otherwise 0:

    * within `window` tokens on both sides of the abbreviation there is a '('
      followed later by a ')';
    * there is a '(' among the `window` tokens before the definition start and
      a ')' among the tokens from the definition end up to `window` tokens
      past it.

    Args:
        df: DataFrame with a 'token' column holding the tokenised text in order.
        pairs: DataFrame with integer columns 'abbreviation_place', 'begin'
            and 'end' that index into the token sequence.
        window: how many tokens to inspect on each side (default 5, the value
            that used to be hard-coded).

    Returns:
        list[int]: one 0/1 flag per row of `pairs`.
    """
    # One positional list avoids a pandas label lookup for every token visited.
    tokens = df['token'].tolist()

    feature = []
    for place, begin, end in zip(pairs['abbreviation_place'], pairs['begin'], pairs['end']):
        place, begin, end = int(place), int(begin), int(end)

        # Slices clamp at the end of the list, so only the lower bound needs max().
        around_abbreviation = tokens[max(0, place - window):place + window + 1]
        abbreviation_in_parenthesis = _has_paren_pair(around_abbreviation)

        opens_before = '(' in tokens[max(0, begin - window):begin]
        # The scan starts at `end` itself. The sample rows shown in this notebook
        # suggest `end` is exclusive (two-word definition -> end - begin == 2), so
        # a ')' placed right after the definition sits at index `end` and was
        # skipped when the scan started at end + 1. TODO confirm against the code
        # that produces potential_pairs.csv.
        closes_after = ')' in tokens[max(0, end):end + window + 1]
        definition_in_parenthesis = opens_before and closes_after

        feature.append(int(abbreviation_in_parenthesis or definition_in_parenthesis))
    return feature

In [None]:
def add_vectors(v, w):
    """Return the element-wise sum of `v` and `w` as a list.

    Pairs are taken position by position; extra elements of the longer
    input are ignored.
    """
    return list(map(lambda left, right: left + right, v, w))

In [None]:
def divide_vector(vec, div):
    """Return a list with every element of `vec` divided by `div`."""
    quotients = []
    for component in vec:
        quotients.append(component / div)
    return quotients

In [None]:
from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs
import torch

# Start from DeepPavlov's stock BERT-embedder config and point its BERT_PATH
# variable at the RuBERT checkpoint stored on the mounted Drive.
bert_config = read_json(configs.embedder.bert_embedder)
bert_config['metadata']['variables'].update(
    BERT_PATH=r'/content/drive/MyDrive/ColabNotebooks/rubert_cased_L-12_H-768_A-12_pt'
)

# `model` is the module-level embedder used by almost_synonyms_with_bert.
model = build_model(bert_config)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Unzipping misc/perluniprops.zip.
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping corpora/nonbreaking_prefixes.zip.
Some weights of the model checkpoint at /content/drive/MyDrive/ColabNotebooks/rubert_cased_L-12_H-768_A-12_pt were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of 

In [None]:
def almost_synonyms_with_bert(pairs):
    """Cosine distance between an abbreviation and the mean of its definition words.

    For every row the lower-cased abbreviation is embedded with the
    module-level `model`; each definition token is lemmatised with pymorphy2
    and embedded the same way. The feature is the cosine distance between the
    abbreviation vector and the average of the word vectors.

    Embeddings are memoised per call, so a string that occurs on many rows
    (typically the abbreviation itself) is sent through the model only once.

    Args:
        pairs: DataFrame with string columns 'abbreviation' and 'definition'.

    Returns:
        list: one entry per row, either the cosine distance or None when a
        KeyError is raised while embedding the abbreviation or when no
        definition word could be embedded.
    """
    morph = pymorphy2.MorphAnalyzer()
    embedding_cache = {}

    def embed(text):
        """Embed `text` with the global model, reusing earlier results."""
        if text not in embedding_cache:
            # The model returns seven outputs; the second holds the per-token
            # embeddings. [0][0] is presumably the first token of the only
            # batch item, so multi-token inputs are represented by their
            # first token only. TODO confirm against the DeepPavlov docs.
            _, token_embs, _, _, _, _, _ = model([text])
            embedding_cache[text] = token_embs[0][0]
        return embedding_cache[text]

    feature = []
    for abbreviation, definition in zip(pairs['abbreviation'], pairs['definition']):
        A = abbreviation.lower()
        words = nltk.word_tokenize(definition.lower())
        try:
            omega = embed(A)
            eta = [0] * len(omega)
            embedded_words = 0
            for word in words:
                try:
                    norm = morph.parse(word)[0].normal_form
                    eta = add_vectors(eta, embed(norm))
                    embedded_words += 1
                except KeyError:
                    # Skip words that cannot be embedded.
                    continue
            # With zero embedded words this raises ZeroDivisionError, which is
            # turned into a None feature below.
            eta = divide_vector(eta, embedded_words)
            feature.append(distance.cosine(omega, eta))
        except (KeyError, ZeroDivisionError):
            feature.append(None)
    return feature

In [None]:
def longest_common_subsequence(A, D):
    """Length of the longest common subsequence of A and D, relative to len(A).

    `D` is tokenised with ``nltk.word_tokenize`` and the tokens are glued
    together without separators before the comparison, so whitespace between
    words never takes part in the subsequence.

    Args:
        A: abbreviation string.
        D: definition string.

    Returns:
        float: LCS length divided by len(A); 0.0 when either the abbreviation
        or the joined definition is empty.
    """
    D = ''.join(nltk.word_tokenize(D))
    n = len(A)
    m = len(D)
    if n == 0 or m == 0:
        # Nothing can be shared, and this also avoids dividing by zero.
        return 0.0

    # Only the length is needed, so keep integer counts and two rows of the
    # DP table. Column 0 is an explicit all-zero border, so no index ever
    # wraps around to the other side of the table.
    previous = [0] * (m + 1)
    for i in range(n):
        current = [0] * (m + 1)
        for j in range(m):
            if A[i] == D[j]:
                current[j + 1] = previous[j] + 1
            else:
                current[j + 1] = max(previous[j + 1], current[j])
        previous = current
    return previous[m] / n

In [None]:
def lcs_feature(df):
    """Compute the relative LCS score for every abbreviation/definition row.

    Both strings are lower-cased before being handed to
    ``longest_common_subsequence``.

    Args:
        df: DataFrame with 'abbreviation' and 'definition' columns, addressed
            by the labels 0..len(df)-1.

    Returns:
        list: one score per row, in row order.
    """
    scores = []
    for row in range(len(df)):
        abbreviation = df['abbreviation'][row]
        definition = df['definition'][row]
        scores.append(
            longest_common_subsequence(abbreviation.lower(), definition.lower())
        )
    return scores

In [None]:
# Candidate abbreviation/definition pairs (one row per candidate).
pairs = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/ColabNotebooks/data/potential_pairs.csv"
)
# Token table; parenthesis_feature reads its 'token' column.
df = pd.read_excel(
    io="/content/drive/MyDrive/ColabNotebooks/data/merges_data.xlsx"
)
# Reference pairs used by is_it_correct to label the candidates.
correct_pairs = pd.read_excel(
    io="/content/drive/MyDrive/ColabNotebooks/data/merged_data_correct_pairs.xlsx"
)

In [None]:
# Preview the first five candidate pairs.
pairs.head(n=5)

Unnamed: 0.1,Unnamed: 0,abbreviation,definition,distance,begin,end,abbreviation_place
0,0,ВРТ,витамина D регуляции,8,33,37,44
1,1,ВРТ,влиянии результаты,6,37,39,44
2,2,ВРТ,влиянии результаты использования,5,37,40,44
3,3,ВРТ,влиянии результаты использования вспомогательных,4,37,41,44
4,4,ВРТ,влиянии результаты использования вспомогательн...,3,37,42,44


In [None]:
# Target label: does the candidate match a reference pair?
pairs['is_it_correct'] = is_it_correct(df=pairs, correct_pairs_df=correct_pairs)

# Features, each computed row by row over the candidate pairs.
pairs['first_letters'] = common_first_letters_cnt(df=pairs)
pairs['parenthesis'] = parenthesis_feature(df=df, pairs=pairs)
pairs['almost_synonyms'] = almost_synonyms_with_bert(pairs=pairs)
pairs['lcs_feature'] = lcs_feature(df=pairs)

# NOTE(review): the frame is written with the default index=True, so every
# save/load round trip adds another 'Unnamed: 0' column (two are already in
# the input). Consider index=False once downstream readers are checked.
pairs.to_csv(
    path_or_buf="/content/drive/MyDrive/ColabNotebooks/data/potential_pairs_with_features.csv"
)

1878
