<a href="https://colab.research.google.com/github/alinaalborova/russian_idioms_processing/blob/main/MICE_Token_Level_multiBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Idiom Type and Token Classification

Based on [MICE: Mining Idioms with Contextual Embeddings](https://arxiv.org/pdf/2008.05759.pdf) by  Škvorc et al.


## Libraries

In [None]:
!pip install transformers
!pip install tensor2tensor

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 3.9MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 43.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |█

In [None]:
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Activation, Concatenate, Masking, GRU
from tensorflow.keras import Sequential
from tensorflow.keras import Input, Model
from tensorflow.keras.utils import to_categorical
from ast import literal_eval
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from keras.utils import plot_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import tensorflow as tf
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

## Dataset

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
dataset_vnc_dir = '/content/drive/MyDrive/ВКР/Sense Disambiguation Corpus/token_level_vnc_multiBERT.csv'
data_vnc = pd.read_csv(dataset_vnc_dir )
data_vnc.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Idiom Normal,Idiom Inflected,Token,Label,Context,Context ID,Context Embedding,Idiom Embedding,Token Embedding
0,0,0,бить карту,бил карту,Он,2,Он бил карту за картой и загребал золото и кре...,0,tf.Tensor(\n[[-0.02800624 -0.18069455 0.26622...,tf.Tensor(\n[[ 0.1000824 -0.18658537 0.44111...,tf.Tensor(\n[[ 0.31914854 -0.5246461 0.69526...
1,1,1,бить карту,бил карту,бил,0,Он бил карту за картой и загребал золото и кре...,0,tf.Tensor(\n[[-0.02800624 -0.18069455 0.26622...,tf.Tensor(\n[[ 0.1000824 -0.18658537 0.44111...,tf.Tensor(\n[[ 0.28880197 -0.0534335 0.75770...


In [None]:
data_vnc.drop(data_vnc.iloc[:, :2], axis=1, inplace=True)
data_vnc.drop('Context ID', axis=1, inplace=True)
data_vnc.head()

Unnamed: 0,Idiom Normal,Idiom Inflected,Token,Label,Context,Context Embedding,Idiom Embedding,Token Embedding
0,бить карту,бил карту,Он,2,Он бил карту за картой и загребал золото и кре...,tf.Tensor(\n[[-0.02800624 -0.18069455 0.26622...,tf.Tensor(\n[[ 0.1000824 -0.18658537 0.44111...,tf.Tensor(\n[[ 0.31914854 -0.5246461 0.69526...
1,бить карту,бил карту,бил,0,Он бил карту за картой и загребал золото и кре...,tf.Tensor(\n[[-0.02800624 -0.18069455 0.26622...,tf.Tensor(\n[[ 0.1000824 -0.18658537 0.44111...,tf.Tensor(\n[[ 0.28880197 -0.0534335 0.75770...
2,бить карту,бил карту,карту,0,Он бил карту за картой и загребал золото и кре...,tf.Tensor(\n[[-0.02800624 -0.18069455 0.26622...,tf.Tensor(\n[[ 0.1000824 -0.18658537 0.44111...,tf.Tensor(\n[[ 1.08091556e-01 -1.02390237e-01 ...
3,бить карту,бил карту,за,2,Он бил карту за картой и загребал золото и кре...,tf.Tensor(\n[[-0.02800624 -0.18069455 0.26622...,tf.Tensor(\n[[ 0.1000824 -0.18658537 0.44111...,tf.Tensor(\n[[-0.01611326 -0.16018972 0.31805...
4,бить карту,бил карту,картой,2,Он бил карту за картой и загребал золото и кре...,tf.Tensor(\n[[-0.02800624 -0.18069455 0.26622...,tf.Tensor(\n[[ 0.1000824 -0.18658537 0.44111...,tf.Tensor(\n[[-0.05653494 0.06898555 0.74265...


In [None]:
data_vnc.Label.value_counts()

2    16711
1     1000
0      988
Name: Label, dtype: int64

In [None]:
data_vnc.shape

(18699, 8)

In [None]:
len(data_vnc['Idiom Normal'].value_counts())

51

In [None]:
dataset_anc_dir = '/content/drive/MyDrive/ВКР/Sense Disambiguation Corpus/token_level_anc_multiBERT.csv'
data_anc = pd.read_csv(dataset_anc_dir )
data_anc.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Idiom Normal,Idiom Inflected,Token,Label,Context,Context Embedding,Token Embedding
0,0,0,избитая дорога,избитой дороге,С,2,"С бурной быстротой, возможной только в сновид...",tf.Tensor(\n[[ 0.04656217 -0.15822835 -0.39671...,tf.Tensor(\n[[ 0.33673236 0.01402963 0.46577...
1,1,1,избитая дорога,избитой дороге,бурной,2,"С бурной быстротой, возможной только в сновид...",tf.Tensor(\n[[ 0.04656217 -0.15822835 -0.39671...,tf.Tensor(\n[[ 0.13988337 -0.36430976 0.67115...


In [None]:
data_anc.drop(data_anc.iloc[:, :2], axis=1, inplace=True)
data_anc.head()

Unnamed: 0,Idiom Normal,Idiom Inflected,Token,Label,Context,Context Embedding,Token Embedding
0,избитая дорога,избитой дороге,С,2,"С бурной быстротой, возможной только в сновид...",tf.Tensor(\n[[ 0.04656217 -0.15822835 -0.39671...,tf.Tensor(\n[[ 0.33673236 0.01402963 0.46577...
1,избитая дорога,избитой дороге,бурной,2,"С бурной быстротой, возможной только в сновид...",tf.Tensor(\n[[ 0.04656217 -0.15822835 -0.39671...,tf.Tensor(\n[[ 0.13988337 -0.36430976 0.67115...
2,избитая дорога,избитой дороге,быстротой,2,"С бурной быстротой, возможной только в сновид...",tf.Tensor(\n[[ 0.04656217 -0.15822835 -0.39671...,tf.Tensor(\n[[ 0.07310043 -0.16848688 0.36059...
3,избитая дорога,избитой дороге,возможной,2,"С бурной быстротой, возможной только в сновид...",tf.Tensor(\n[[ 0.04656217 -0.15822835 -0.39671...,tf.Tensor(\n[[ 0.11816832 -0.15271866 0.21041...
4,избитая дорога,избитой дороге,только,2,"С бурной быстротой, возможной только в сновид...",tf.Tensor(\n[[ 0.04656217 -0.15822835 -0.39671...,tf.Tensor(\n[[ 0.18717647 -0.294104 0.31264...


In [None]:
data_anc['Idiom Normal'].value_counts()

больное место                 1170
болевая точка                 1141
правая рука                   1114
путеводная звезда              998
нож острый                     971
лавровый венок                 939
бедный родственник             904
зелёная улица                  786
вавилонское столпотворение     774
тяжёлая рука                   656
наша сестра                    638
ваш брат                       587
дальний прицел                 573
старый воробей                 556
пороховая бочка                554
чёрная кость                   505
синяя птица                    462
заблудшая овца                 438
красная бумажка                412
вторая ступень                 400
девичья кожа                   373
старый гриб                    304
другой разговор                279
долгая песня                   257
музейная редкость              219
избитая дорога                 212
маковое зерно                  193
ободранная кошка               187
куриная голова      

In [None]:
len(data_anc['Idiom Normal'].value_counts())

30

In [None]:
data = pd.concat([data_vnc, data_anc], ignore_index=True)
data.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Token,Label,Context,Context Embedding,Idiom Embedding,Token Embedding
0,бить карту,бил карту,Он,2,Он бил карту за картой и загребал золото и кре...,tf.Tensor(\n[[-0.02800624 -0.18069455 0.26622...,tf.Tensor(\n[[ 0.1000824 -0.18658537 0.44111...,tf.Tensor(\n[[ 0.31914854 -0.5246461 0.69526...
1,бить карту,бил карту,бил,0,Он бил карту за картой и загребал золото и кре...,tf.Tensor(\n[[-0.02800624 -0.18069455 0.26622...,tf.Tensor(\n[[ 0.1000824 -0.18658537 0.44111...,tf.Tensor(\n[[ 0.28880197 -0.0534335 0.75770...


In [None]:
data.tail(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Token,Label,Context,Context Embedding,Idiom Embedding,Token Embedding
35500,бедный родственник,бедных родственниках,бедных,0,"[Егор Дмитрич Глумов, муж] У молодой женщин...",tf.Tensor(\n[[-0.2779434 0.15737975 -0.09745...,,tf.Tensor(\n[[ 0.00859448 -0.20205446 0.63864...
35501,бедный родственник,бедных родственниках,родственниках,0,"[Егор Дмитрич Глумов, муж] У молодой женщин...",tf.Tensor(\n[[-0.2779434 0.15737975 -0.09745...,,tf.Tensor(\n[[ 0.0848169 -0.22677557 0.73319...


## Embed

### Load BERT

In [None]:
model_class, tokenizer_class, pretrained_weights = (ppb.TFBertModel, ppb.BertTokenizer, 'bert-base-multilingual-cased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
BERT_model = model_class.from_pretrained(pretrained_weights)

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
from keras.preprocessing.sequence import pad_sequences

def pad_sentence(tokenized, max_len):
  print('\nPadding/truncating all sentences to %d values...' % max_len)
  print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

  # Pad our input tokens with value 0.
  # "post" indicates that we want to pad and truncate at the end of the sequence,
  # as opposed to the beginning.
  input_ids = pad_sequences(tokenized, maxlen=max_len, dtype="long", 
                            value=0, truncating="post", padding="post")
  print('\nDone.')
  return input_ids

def create_att_masks(input_ids):
  # Create attention masks
  attention_masks = []

  # For each sentence...
  for sent in input_ids:
      
      # Create the attention mask.
      #   - If a token ID is 0, then it's padding, set the mask to 0.
      #   - If a token ID is > 0, then it's a real token, set the mask to 1.
      att_mask = [int(token_id > 0) for token_id in sent]
      
      # Store the attention mask for this sentence.
      attention_masks.append(att_mask)
  attention_masks = np.array(attention_masks)    
  return attention_masks

def extract_full_embeddings(output):
  last_hidden_states = output[0] # lhs for all sentences
  extracted = []
  for i, el in enumerate(last_hidden_states): #for each sentence...
    extracted.append(last_hidden_states[i]) 
  return extracted

def use_batches(padded, masked):
  full_embeddings = []

  NUM_OF_IDIOMS_initial = len(masked)
  BATCH_SIZE = 200  # Using larger batch might kill the session when embedding contexts
  NUM_OF_IDIOMS = NUM_OF_IDIOMS_initial
  i = 0

  while NUM_OF_IDIOMS > 0:
    print(i)
    NUM_OF_IDIOMS -= BATCH_SIZE
    print('NUM_OF_IDIOMS -= BATCH_SIZE', NUM_OF_IDIOMS)
    if i < NUM_OF_IDIOMS_initial - BATCH_SIZE:
      output_batch = BERT_model(padded[i:i+BATCH_SIZE], attention_mask = masked[i:i+BATCH_SIZE])
    else:
      output_batch = BERT_model(padded[i:NUM_OF_IDIOMS_initial], attention_mask = masked[i:NUM_OF_IDIOMS_initial])
    i += BATCH_SIZE
    embeddings_batch = extract_full_embeddings(output_batch)
    full_embeddings.append(embeddings_batch)

  full_embeddings_all = []
  for batch in full_embeddings:
    for sentence in batch:
      full_embeddings_all.append(sentence)

  return full_embeddings_all

## Classifier

In [None]:
MAX_CONTEXT_LEN = 220
MAX_TOKEN_LEN = 15
VECTOR_DIM = 768
NUM_CLASSES = 3

In [None]:
def build_model():
    input_token = Input(shape=(MAX_TOKEN_LEN, VECTOR_DIM), name='input1')
    input_context = Input(shape=(MAX_CONTEXT_LEN,VECTOR_DIM), name='input2')

    #token
    forward_layer = GRU(10, return_sequences=False, dropout=0.5)
    backward_layer = GRU(10, return_sequences=False, dropout=0.5,
                    go_backwards=True)
    bidirectional1 = Bidirectional(forward_layer, backward_layer=backward_layer,
                      input_shape=(MAX_TOKEN_LEN,VECTOR_DIM))(input_token)

    #context
    bidirectional2 = Bidirectional(forward_layer, backward_layer=backward_layer,
                      input_shape=(MAX_CONTEXT_LEN,VECTOR_DIM))(input_context)

    concat = Concatenate(axis=1)([bidirectional1, bidirectional2])

    dense = Dense(NUM_CLASSES)(concat)
    softmax = Activation('softmax', name='output')(dense)

    model = Model(inputs=[input_token, input_context], outputs=softmax)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    plot_model(model, to_file='multiple_inputs.png')
    return model
model_all = build_model()

model_all.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input1 (InputLayer)             [(None, 5, 768)]     0                                            
__________________________________________________________________________________________________
input2 (InputLayer)             [(None, 200, 768)]   0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 20)           46800       input1[0][0]                     
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 20)           46800       input2[0][0]                     
______________________________________________________________________________________________

## VNC

In [None]:
tokens_vnc = data_vnc['Token'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_vnc = data_vnc['Context'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_token_length_vnc = max([len(sen) for sen in tokens_vnc])
max_context_length_vnc = max([len(sen) for sen in tokenized_contexts_vnc])
print('Max token length: ', max_token_length_vnc)
print('Max context length: ', max_context_length_vnc)

padded_tokens_vnc = pad_sentence(tokens_vnc, max_token_length_vnc+5)
padded_contexts_vnc = pad_sentence(tokenized_contexts_vnc, max_context_length_vnc+20)
print(len(padded_tokens_vnc))
print(len(padded_contexts_vnc))

masked_tokens_vnc = create_att_masks(padded_tokens_vnc)
masked_contexts_vnc = create_att_masks(padded_contexts_vnc)
print(len(masked_tokens_vnc))
print(len(masked_contexts_vnc))

In [None]:
padded_tokens_vnc_train, padded_tokens_vnc_test = train_test_split(padded_tokens_vnc, test_size=0.3, random_state=34)
masked_tokens_vnc_train, masked_tokens_vnc_test = train_test_split(masked_tokens_vnc, test_size=0.3, random_state=34)

In [None]:
embedded_tokens_vnc = use_batches(padded_tokens_vnc_train, padded_tokens_vnc_train)
len(embedded_tokens_vnc)

In [None]:
embedded_tokens_vnc_test = use_batches(padded_tokens_vnc_test, padded_tokens_vnc_test)
len(embedded_tokens_vnc_test)

In [None]:
padded_contexts_vnc_train, padded_contexts_vnc_test = train_test_split(padded_contexts_vnc, test_size=0.3, random_state=34)
masked_contexts_vnc_train, masked_contexts_vnc_test = train_test_split(masked_contexts_vnc, test_size=0.3, random_state=34)

In [None]:
embedded_contexts_vnc_train = use_batches(padded_contexts_vnc_train, masked_contexts_vnc_train)
len(embedded_contexts_vnc_train)

In [None]:
contexts_unique_vnc = list(set(data_vnc.Context.values))
len(contexts_unique_vnc)

In [None]:
#tokenized_contexts = [tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=200) for x in contexts_unique]
tokenized_contexts_vnc = [tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=200) for x in contexts_unique_vnc]
max_context_length = max([len(sen) for sen in tokenized_contexts_vnc])
print('Max context length: ', max_context_length)

padded_contexts_vnc = pad_sentence(tokenized_contexts_vnc, max_context_length+20)
print(len(padded_contexts_vnc))

masked_contexts_vnc = create_att_masks(padded_contexts_vnc)
print(len(masked_contexts_vnc))

embedded_contexts_vnc = use_batches(padded_contexts_vnc, masked_contexts_vnc)
len(embedded_contexts_vnc)

Max context length:  200

Padding/truncating all sentences to 220 values...

Padding token: "[PAD]", ID: 0

Done.
892
892
0
NUM_OF_IDIOMS -= BATCH_SIZE 692
200
NUM_OF_IDIOMS -= BATCH_SIZE 492
400
NUM_OF_IDIOMS -= BATCH_SIZE 292
600
NUM_OF_IDIOMS -= BATCH_SIZE 92
800
NUM_OF_IDIOMS -= BATCH_SIZE -108


In [None]:
len(embedded_contexts_vnc[0])

220

In [None]:
data_vnc['Token Embedding'] = embedded_tokens_vnc

In [None]:
embeddings_per_token_vnc.shape

In [None]:
examples_per_tokens_vnc = data_vnc.Context.values
# RAM CRASHES if using array instead of list
# embeddings_per_token_vnc = np.empty(shape=(len(embedded_tokens_vnc), MAX_CONTEXT_LEN+20, 768), dtype=object)
embeddings_per_token_vnc = []
for i, context in enumerate(examples_per_tokens_vnc):
  if context in contexts_unique_vnc:
    embeddings_per_token_vnc.append(embedded_contexts_vnc[contexts_unique_vnc.index(context)])
    #embeddings_per_token_vnc[i] = embedded_contexts_vnc[contexts_unique_vnc.index(context)]

len(embeddings_per_token_vnc)

data_vnc['Context Embedding'] = embeddings_per_token_vnc
data_vnc.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Token,Label,Context,Context Embedding,Idiom Embedding,Token Embedding
0,бить карту,бил карту,Он,2,Он бил карту за картой и загребал золото и кре...,"((tf.Tensor(-0.028006509, shape=(), dtype=floa...",tf.Tensor(\n[[ 0.1000824 -0.18658537 0.44111...,"((tf.Tensor(0.31914937, shape=(), dtype=float3..."
1,бить карту,бил карту,бил,0,Он бил карту за картой и загребал золото и кре...,"((tf.Tensor(-0.028006509, shape=(), dtype=floa...",tf.Tensor(\n[[ 0.1000824 -0.18658537 0.44111...,"((tf.Tensor(0.2888023, shape=(), dtype=float32..."


In [None]:
X = [embedded_tokens_vnc, embeddings_per_token_vnc]
labels_vnc = to_categorical(data_vnc.Label)
X_train_vnc, X_test_vnc = train_test_split(X, test_size=0.3, random_state=34)
Y_train_vnc, Y_test_vnc = train_test_split(labels_vnc, test_size=0.3, random_state=34)

In [None]:
data_vnc['Token Embedding'].shape
len(list(data_vnc['Token Embedding'])[0])

15

In [None]:
tokens = data_vnc['Token Embedding'].values
contexts = data_vnc['Context Embedding'].values

### Classifier

In [None]:
model_vnc = build_model()

model_vnc.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input1 (InputLayer)             [(None, 15, 768)]    0                                            
__________________________________________________________________________________________________
input2 (InputLayer)             [(None, 220, 768)]   0                                            
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 20)           46800       input1[0][0]                     
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, 20)           46800       input2[0][0]                     
____________________________________________________________________________________________

In [None]:
len(embedded_tokens_vnc[0])

15

In [None]:
len(embedded_tokens_vnc[1])

15

In [None]:
labels_vnc.shape

(18699, 3)

In [None]:
for i, el in enumerate(embedded_contexts_vnc):
  if len(el) != 220:
    print(len(el), i) 

In [None]:
contexts_array = np.asarray(embedded_contexts_vnc)

In [None]:
contexts_array.shape

(892, 220, 768)

In [None]:
len(embedded_contexts_vnc[0])

220

In [None]:
len(tokens_array)

892

In [None]:
 model_vnc.fit({'input1': tokens_array, 'input2': contexts_array}, 
               {'output': labels_vnc[:892]}, batch_size=8, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f19e57a3a10>

In [None]:
model_vnc.fit(X_train_vnc, Y_train_vnc, batch_size=8, epochs=10)#, validation_split=0.1)
print('fit model')

In [None]:
X_train_vnc.shape

(13089, 2)

In [None]:
new = np.asarray(X_train_vnc['Context Embedding'])
type(list(new)[0])

tensorflow.python.framework.ops.EagerTensor

In [None]:
model_vnc.evaluate(np.asarray(X_test_vnc), Y_test_vnc)



[3.1564769744873047, 0.4962686598300934]

In [None]:
MAX_SEQUENCE_LEN = 100
model_all = Sequential()
model_all.add(Masking(mask_value=0., input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
forward_layer = GRU(10, return_sequences=False, dropout=0.5)
backward_layer = GRU(10, return_sequences=False, dropout=0.5,
                    go_backwards=True)
model_all.add(Bidirectional(forward_layer, backward_layer=backward_layer,
                      input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
model_all.add(Dense(NUM_CLASSES))
model_all.add(Activation('softmax'))

model_all.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print('compiled model')

compiled model


In [None]:
model_vnc.fit(np.asarray(X_train_vnc['Context Embedding']), Y_train_vnc, batch_size=8, epochs=10)#, validation_split=0.1)

## ANC

In [None]:
tokens_anc = data_anc['Token'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_anc = data_anc['Context'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_token_length_anc = max([len(sen) for sen in tokens_anc])
max_context_length_anc = max([len(sen) for sen in tokenized_contexts_anc])
print('Max token length: ', max_token_length_anc)
print('Max context length: ', max_context_length_anc)

padded_tokens_anc = pad_sentence(tokens_anc, max_token_length_anc+5)
padded_contexts_anc = pad_sentence(tokenized_contexts_anc, max_context_length_anc+20)
print(len(padded_tokens_anc))
print(len(padded_contexts_anc))

masked_tokens_anc = create_att_masks(padded_tokens_anc)
masked_contexts_anc = create_att_masks(padded_contexts_anc)
print(len(masked_tokens_anc))
print(len(masked_contexts_anc))

In [None]:
labels_anc = to_categorical(data_anc.Label)
X_train_anc, X_test_anc = train_test_split(embedded_contexts_anc, test_size=0.3, random_state=34)

In [None]:
Y_train_anc, Y_test_anc = train_test_split(labels_anc, test_size=0.3, random_state=34)

### Classifier

In [None]:
model_anc = Sequential()
model_anc.add(Masking(mask_value=0., input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
forward_layer = GRU(10, return_sequences=False, dropout=0.5)
backward_layer = GRU(10, return_sequences=False, dropout=0.5,
                    go_backwards=True)
model_anc.add(Bidirectional(forward_layer, backward_layer=backward_layer,
                      input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
model_anc.add(Dense(NUM_CLASSES))
model_anc.add(Activation('softmax'))

model_anc.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print('compiled model')

compiled model


In [None]:
model_anc.fit(np.asarray(X_train_anc), Y_train_anc, batch_size=8, epochs=10)#, validation_split=0.1)
print('fit model')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fit model


In [None]:
model_anc.evaluate(np.asarray(X_test_anc), Y_test_anc)



[0.6596354246139526, 0.7644628286361694]

In [None]:
preds = model_all.predict(np.array(X_test))
f1_score(np.argmax(preds, axis=1), np.argmax(Y_test, axis=1))

0.7734375000000001

In [None]:
preds_anc = model_anc.predict(np.array(X_test_anc))



In [None]:
f1_score(np.argmax(preds_anc, axis=1), np.argmax(Y_test_anc, axis=1))

0.7673469387755103

In [None]:
preds_vnc = model_vnc.predict(np.array(X_test_vnc))
f1_score(np.argmax(preds_vnc, axis=1), np.argmax(Y_test_vnc, axis=1))



0.5752508361204014

# Not Present in the Training Set

## Split

In [None]:
data_anc['Idiom Normal'].value_counts()

больное место                 57
правая рука                   56
болевая точка                 52
нож острый                    49
путеводная звезда             48
лавровый венок                44
бедный родственник            41
зелёная улица                 38
тяжёлая рука                  38
ваш брат                      34
вавилонское столпотворение    30
наша сестра                   28
пороховая бочка               27
дальний прицел                25
заблудшая овца                23
вторая ступень                23
старый воробей                22
красная бумажка               21
синяя птица                   20
долгая песня                  18
другой разговор               18
старый гриб                   16
чёрная кость                  15
девичья кожа                  12
маковое зерно                 10
избитая дорога                10
музейная редкость             10
куриная голова                 9
ободранная кошка               9
чернильная строка              3
Name: Idio

In [None]:
test_ancs = ['вавилонское столпотворение', 'ободранная кошка', 'наша сестра', 'пороховая бочка', 'дальний прицел', 'заблудшая овца', 'красная бумажка']

In [None]:
data_anc_test = data_anc.loc[data_anc['Idiom Normal'].isin(test_ancs)]
data_anc_test.shape

(163, 4)

In [None]:
data_anc_train = data_anc.loc[~data_anc['Idiom Normal'].isin(test_ancs)]
data_anc_train.shape

(643, 4)

In [None]:
163/806

0.2022332506203474

In [None]:
data_vnc['Idiom Normal'].value_counts()

бросать тень               53
сесть на мель              47
пускать корни              41
окунуться с головой        31
отвести глаза              29
давит грудь                29
поставить на колени        29
выступить на сцену         27
приложить руку             27
снимать шляпу              27
поднять на ноги            27
положить голову            25
пахнет порохом             23
вырвать с корнем           23
точить нож                 22
преградить дорогу          22
вильнуть хвостом           21
сидеть на печи             20
открыть глаза              20
умывать руки               19
открывать Америку          19
плести кружева             18
взваливать на плечи        17
имей глаза                 17
поддать жару               16
разбить лед                16
чесать затылок             16
прокладывать дорогу        14
бросать перо               14
прижать хвост              13
давать сдачу               13
катить бочку               12
прищемить хвост            12
поймать на

In [None]:
test_vncs = ['точить нож', 'преградить дорогу', 'вильнуть хвостом', 'сидеть на печи', 
             'открыть глаза', 'умывать руки', 'открывать Америку', 'плести кружева', 
             'положить голову']

In [None]:
data_vnc_test = data_vnc.loc[data_vnc['Idiom Normal'].isin(test_vncs)]
data_vnc_test.shape

(186, 4)

In [None]:
data_vnc_train = data_vnc.loc[~data_vnc['Idiom Normal'].isin(test_ancs)]
data_vnc_train.shape

(893, 4)

In [None]:
186/893

0.20828667413213886

In [None]:
data_all_train = pd.concat([data_vnc_train, data_anc_train], ignore_index=True)
data_all_train.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
0,бить карту,бил карту,0,Он бил карту за картой и загребал золото и кре...
1,бить карту,бил карту,0,"Ермолов держал карты, сощуря правый глаз; ког..."


In [None]:
data_all_test = pd.concat([data_vnc_test, data_anc_test], ignore_index=True)
data_all_test.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
0,открывать Америку,открывать Америку,1,С тех пор от него можно услышать: «Хватит кич...
1,открывать Америку,открывать Америку,0,"Впечатление было такое, что мы на судне Колум..."


## Embed

### VNC

In [None]:
tokenized_idioms_vnc_train_not_present = data_vnc_train['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_vnc_train_not_present = data_vnc_train['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length_vnc_train_not_present = max([len(sen) for sen in tokenized_idioms_vnc_train_not_present])
max_context_length_vnc_train_not_present = max([len(sen) for sen in tokenized_contexts_vnc_train_not_present])
print('Max idiom length: ', max_idiom_length_vnc_train_not_present)
print('Max context length: ', max_context_length_vnc_train_not_present)

padded_idioms_vnc_train_not_present = pad_sentence(tokenized_idioms_vnc_train_not_present, max_idiom_length_vnc_train_not_present+5)
padded_contexts_vnc_train_not_present = pad_sentence(tokenized_contexts_vnc_train_not_present, max_context_length_vnc_train_not_present+20)
print(len(padded_idioms_vnc_train_not_present))
print(len(padded_contexts_vnc_train_not_present))

Max idiom length:  11
Max context length:  234

Padding/truncating all sentences to 16 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 254 values...

Padding token: "[PAD]", ID: 0

Done.
893
893


In [None]:
masked_idioms_vnc_train_not_present = create_att_masks(padded_idioms_vnc_train_not_present)
masked_contexts_vnc_train_not_present = create_att_masks(padded_contexts_vnc_train_not_present)
print(len(masked_idioms_vnc_train_not_present))
print(len(masked_contexts_vnc_train_not_present))

893
893


In [None]:
embedded_idioms_vnc_train_not_present = use_batches(padded_idioms_vnc_train_not_present, masked_idioms_vnc_train_not_present)
len(embedded_idioms_vnc_train_not_present)

0
NUM_OF_IDIOMS -= BATCH_SIZE 693
200
NUM_OF_IDIOMS -= BATCH_SIZE 493
400
NUM_OF_IDIOMS -= BATCH_SIZE 293
600
NUM_OF_IDIOMS -= BATCH_SIZE 93
800
NUM_OF_IDIOMS -= BATCH_SIZE -107


893

In [None]:
embedded_contexts_vnc_train_not_present = use_batches(padded_contexts_vnc_train_not_present, masked_contexts_vnc_train_not_present)
len(embedded_contexts_vnc_train_not_present)

0
NUM_OF_IDIOMS -= BATCH_SIZE 693
200
NUM_OF_IDIOMS -= BATCH_SIZE 493
400
NUM_OF_IDIOMS -= BATCH_SIZE 293
600
NUM_OF_IDIOMS -= BATCH_SIZE 93
800
NUM_OF_IDIOMS -= BATCH_SIZE -107


893

In [None]:
labels_vnc_train_not_present = to_categorical(data_vnc_train.Label)

In [None]:
tokenized_idioms_vnc_test_not_present = data_vnc_test['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_vnc_test_not_present = data_vnc_test['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length_vnc_test_not_present = max([len(sen) for sen in tokenized_idioms_vnc_test_not_present])
max_context_length_vnc_test_not_present = max([len(sen) for sen in tokenized_contexts_vnc_test_not_present])
print('Max idiom length: ', max_idiom_length_vnc_test_not_present)
print('Max context length: ', max_context_length_vnc_test_not_present)

padded_idioms_vnc_test_not_present = pad_sentence(tokenized_idioms_vnc_test_not_present, max_idiom_length_vnc_test_not_present+5)
padded_contexts_vnc_test_not_present = pad_sentence(tokenized_contexts_vnc_test_not_present, max_context_length_vnc_test_not_present+20)
print(len(padded_idioms_vnc_test_not_present))
print(len(padded_contexts_vnc_test_not_present))

masked_idioms_vnc_test_not_present = create_att_masks(padded_idioms_vnc_test_not_present)
masked_contexts_vnc_test_not_present = create_att_masks(padded_contexts_vnc_test_not_present)
print(len(masked_idioms_vnc_test_not_present))
print(len(masked_contexts_vnc_test_not_present))

embedded_idioms_vnc_test_not_present = use_batches(padded_idioms_vnc_test_not_present, masked_idioms_vnc_test_not_present)
len(embedded_idioms_vnc_test_not_present)

embedded_contexts_vnc_test_not_present = use_batches(padded_contexts_vnc_test_not_present, masked_contexts_vnc_test_not_present)
len(embedded_contexts_vnc_test_not_present)

Max idiom length:  10
Max context length:  170

Padding/truncating all sentences to 15 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 190 values...

Padding token: "[PAD]", ID: 0

Done.
186
186
186
186
0
NUM_OF_IDIOMS -= BATCH_SIZE -14
0
NUM_OF_IDIOMS -= BATCH_SIZE -14


186

In [None]:
labels_vnc_test_not_present = to_categorical(data_vnc_test.Label)

### ANC

In [None]:
tokenized_idioms_anc_train_not_present = data_anc_train['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_anc_train_not_present = data_anc_train['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length_anc_train_not_present = max([len(sen) for sen in tokenized_idioms_anc_train_not_present])
max_context_length_anc_train_not_present = max([len(sen) for sen in tokenized_contexts_anc_train_not_present])
print('Max idiom length: ', max_idiom_length_anc_train_not_present)
print('Max context length: ', max_context_length_anc_train_not_present)

padded_idioms_anc_train_not_present = pad_sentence(tokenized_idioms_anc_train_not_present, max_idiom_length_anc_train_not_present+5)
padded_contexts_anc_train_not_present = pad_sentence(tokenized_contexts_anc_train_not_present, max_context_length_anc_train_not_present+20)
print(len(padded_idioms_anc_train_not_present))
print(len(padded_contexts_anc_train_not_present))

masked_idioms_anc_train_not_present = create_att_masks(padded_idioms_anc_train_not_present)
masked_contexts_anc_train_not_present = create_att_masks(padded_contexts_anc_train_not_present)
print(len(masked_idioms_anc_train_not_present))
print(len(masked_contexts_anc_train_not_present))

embedded_idioms_anc_train_not_present = use_batches(padded_idioms_anc_train_not_present, masked_idioms_anc_train_not_present)
len(embedded_idioms_anc_train_not_present)

embedded_contexts_anc_train_not_present = use_batches(padded_contexts_anc_train_not_present, masked_contexts_anc_train_not_present)
len(embedded_contexts_anc_train_not_present)

Max idiom length:  10
Max context length:  221

Padding/truncating all sentences to 15 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 241 values...

Padding token: "[PAD]", ID: 0

Done.
643
643
643
643
0
NUM_OF_IDIOMS -= BATCH_SIZE 443
200
NUM_OF_IDIOMS -= BATCH_SIZE 243
400
NUM_OF_IDIOMS -= BATCH_SIZE 43
600
NUM_OF_IDIOMS -= BATCH_SIZE -157
0
NUM_OF_IDIOMS -= BATCH_SIZE 443
200
NUM_OF_IDIOMS -= BATCH_SIZE 243
400
NUM_OF_IDIOMS -= BATCH_SIZE 43
600
NUM_OF_IDIOMS -= BATCH_SIZE -157


643

In [None]:
labels_anc_train_not_present = to_categorical(data_anc_train.Label)

In [None]:
tokenized_idioms_anc_test_not_present = data_anc_test['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_anc_test_not_present = data_anc_test['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length_anc_test_not_present = max([len(sen) for sen in tokenized_idioms_anc_test_not_present])
max_context_length_anc_test_not_present = max([len(sen) for sen in tokenized_contexts_anc_test_not_present])
print('Max idiom length: ', max_idiom_length_anc_test_not_present)
print('Max context length: ', max_context_length_anc_test_not_present)

padded_idioms_anc_test_not_present = pad_sentence(tokenized_idioms_anc_test_not_present, max_idiom_length_anc_test_not_present+5)
padded_contexts_anc_test_not_present = pad_sentence(tokenized_contexts_anc_test_not_present, max_context_length_anc_test_not_present+20)
print(len(padded_idioms_anc_test_not_present))
print(len(padded_contexts_anc_test_not_present))

masked_idioms_anc_test_not_present = create_att_masks(padded_idioms_anc_test_not_present)
masked_contexts_anc_test_not_present = create_att_masks(padded_contexts_anc_test_not_present)
print(len(masked_idioms_anc_test_not_present))
print(len(masked_contexts_anc_test_not_present))

embedded_idioms_anc_test_not_present = use_batches(padded_idioms_anc_test_not_present, masked_idioms_anc_test_not_present)
len(embedded_idioms_anc_test_not_present)

embedded_contexts_anc_test_not_present = use_batches(padded_contexts_anc_test_not_present, masked_contexts_anc_test_not_present)
len(embedded_contexts_anc_test_not_present)

Max idiom length:  11
Max context length:  192

Padding/truncating all sentences to 16 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 212 values...

Padding token: "[PAD]", ID: 0

Done.
163
163
163
163
0
NUM_OF_IDIOMS -= BATCH_SIZE -37
0
NUM_OF_IDIOMS -= BATCH_SIZE -37


163

In [None]:
labels_anc_test_not_present = to_categorical(data_anc_test.Label)

## Train

### VNC

In [None]:
model_vnc_not_present = Sequential()
model_vnc_not_present.add(Masking(mask_value=0., input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
forward_layer = GRU(10, return_sequences=False, dropout=0.5)
backward_layer = GRU(10, return_sequences=False, dropout=0.5,
                    go_backwards=True)
model_vnc_not_present.add(Bidirectional(forward_layer, backward_layer=backward_layer,
                      input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
model_vnc_not_present.add(Dense(NUM_CLASSES))
model_vnc_not_present.add(Activation('softmax'))

model_vnc_not_present.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print('compiled model')

compiled model


In [None]:
model_vnc_not_present.fit(np.asarray(embedded_contexts_vnc_train_not_present), 
              labels_vnc_train_not_present, batch_size=8, epochs=10)
print('fit model')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fit model


In [None]:
model_vnc_not_present.evaluate(np.asarray(embedded_contexts_vnc_test_not_present), labels_vnc_test_not_present)



[0.12515968084335327, 0.9569892287254333]

In [None]:
preds_vnc_not_present = model_vnc_not_present.predict(np.array(embedded_contexts_vnc_test_not_present))
f1_score(np.argmax(preds_vnc_not_present, axis=1), np.argmax(labels_vnc_test_not_present, axis=1))

0.943661971830986

### ANC

In [None]:
model_anc_not_present = Sequential()
model_anc_not_present.add(Masking(mask_value=0., input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
forward_layer = GRU(10, return_sequences=False, dropout=0.5)
backward_layer = GRU(10, return_sequences=False, dropout=0.5,
                    go_backwards=True)
model_anc_not_present.add(Bidirectional(forward_layer, backward_layer=backward_layer,
                      input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
model_anc_not_present.add(Dense(NUM_CLASSES))
model_anc_not_present.add(Activation('softmax'))

model_anc_not_present.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print('compiled model')

compiled model


In [None]:
model_anc_not_present.fit(np.asarray(embedded_contexts_anc_train_not_present), 
              labels_anc_train_not_present, batch_size=8, epochs=10)
print('fit model')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fit model


In [None]:
model_anc_not_present.evaluate(np.asarray(embedded_contexts_anc_test_not_present), labels_anc_test_not_present)



[1.2682161331176758, 0.5644171833992004]

In [None]:
preds_anc_not_present = model_anc_not_present.predict(np.array(embedded_contexts_anc_test_not_present))
f1_score(np.argmax(preds_anc_not_present, axis=1), np.argmax(labels_anc_test_not_present, axis=1))



0.6077348066298343