<a href="https://colab.research.google.com/github/alinaalborova/russian_idioms_processing/blob/main/MICE_multiBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Idiom Type and Token Classification

Based on [MICE: Mining Idioms with Contextual Embeddings](https://arxiv.org/pdf/2008.05759.pdf) by  Škvorc et al.


In [None]:
!pip install transformers
!pip install tensor2tensor

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 3.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 39.4MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |█

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import tensorflow as tf
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [None]:
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Activation, TimeDistributed, Masking, GRU
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
from ast import literal_eval
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

## Dataset

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
dataset_vnc_dir = '/content/drive/MyDrive/ВКР/Sense Disambiguation Corpus/VNCs_Annotated.csv'
data_vnc = pd.read_csv(dataset_vnc_dir )
data_vnc.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
0,бить карту,бил карту,0,Он бил карту за картой и загребал золото и кре...
1,бить карту,бил карту,0,"Ермолов держал карты, сощуря правый глаз; ког..."


In [None]:
data_vnc.Label.value_counts()

1    455
0    438
Name: Label, dtype: int64

In [None]:
data_vnc.shape

(893, 4)

In [None]:
len(data_vnc['Idiom Normal'].value_counts())

51

In [None]:
len(data_anc['Idiom Inflected'].value_counts())

180

In [None]:
dataset_anc_dir = '/content/drive/MyDrive/ВКР/Sense Disambiguation Corpus/ANCs_Annotated.csv'
data_anc = pd.read_csv(dataset_anc_dir )
data_anc.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
0,избитая дорога,избитой дороге,0,"С бурной быстротой, возможной только в сновид..."
1,избитая дорога,избитой дороге,0,"Как почтовый возок на избитой дороге, прыгает..."


In [None]:
data_anc['Idiom Normal'].value_counts()

больное место                 57
правая рука                   56
болевая точка                 52
нож острый                    49
путеводная звезда             48
лавровый венок                44
бедный родственник            41
зелёная улица                 38
тяжёлая рука                  38
ваш брат                      34
вавилонское столпотворение    30
наша сестра                   28
пороховая бочка               27
дальний прицел                25
вторая ступень                23
заблудшая овца                23
старый воробей                22
красная бумажка               21
синяя птица                   20
другой разговор               18
долгая песня                  18
старый гриб                   16
чёрная кость                  15
девичья кожа                  12
маковое зерно                 10
избитая дорога                10
музейная редкость             10
куриная голова                 9
ободранная кошка               9
чернильная строка              3
Name: Idio

In [None]:
len(data_anc['Idiom Normal'].value_counts())

81

In [None]:
data['Label'].value_counts()

0    856
1    843
Name: Label, dtype: int64

In [None]:
data.shape

(1699, 4)

In [None]:
data = pd.concat([data_vnc, data_anc], ignore_index=True)
data.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
0,бить карту,бил карту,0,Он бил карту за картой и загребал золото и кре...
1,бить карту,бил карту,0,"Ермолов держал карты, сощуря правый глаз; ког..."


In [None]:
data.tail(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
1697,бедный родственник,бедными родственниками,0,"Проходя мимо церквей, я вижу иногда человека,..."
1698,бедный родственник,бедных родственниках,0,"[Егор Дмитрич Глумов, муж] У молодой женщин..."


In [None]:
data.shape

(1699, 4)

## Load BERT

In [None]:
model_class, tokenizer_class, pretrained_weights = (ppb.TFBertModel, ppb.BertTokenizer, 'bert-base-multilingual-cased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
BERT_model = model_class.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961828.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1083389348.0, style=ProgressStyle(descr…




Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
def pad_sentence(tokenized, max_len):
  print('\nPadding/truncating all sentences to %d values...' % max_len)
  print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

  # Pad our input tokens with value 0.
  # "post" indicates that we want to pad and truncate at the end of the sequence,
  # as opposed to the beginning.
  input_ids = pad_sequences(tokenized, maxlen=max_len, dtype="long", 
                            value=0, truncating="post", padding="post")
  print('\nDone.')
  return input_ids


def create_att_masks(input_ids):
  # Create attention masks
  attention_masks = []

  # For each sentence...
  for sent in input_ids:
      
      # Create the attention mask.
      #   - If a token ID is 0, then it's padding, set the mask to 0.
      #   - If a token ID is > 0, then it's a real token, set the mask to 1.
      att_mask = [int(token_id > 0) for token_id in sent]
      
      # Store the attention mask for this sentence.
      attention_masks.append(att_mask)
  attention_masks = np.array(attention_masks)    
  return attention_masks

def extract_full_embeddings(output):
  last_hidden_states = output[0] # lhs for all sentences
  extracted = []
  for i, el in enumerate(last_hidden_states): #for each sentence...
    extracted.append(last_hidden_states[i]) 
  return extracted

def use_batches(padded, masked):
  full_embeddings = []

  NUM_OF_IDIOMS_initial = len(masked)
  BATCH_SIZE = 200  # Using larger batch might kill the session when embedding contexts
  NUM_OF_IDIOMS = NUM_OF_IDIOMS_initial
  i = 0

  while NUM_OF_IDIOMS > 0:
    print(i)
    NUM_OF_IDIOMS -= BATCH_SIZE
    print('NUM_OF_IDIOMS -= BATCH_SIZE', NUM_OF_IDIOMS)
    if i < NUM_OF_IDIOMS_initial - BATCH_SIZE:
      output_batch = BERT_model(padded[i:i+BATCH_SIZE], attention_mask = masked[i:i+BATCH_SIZE])
    else:
      output_batch = BERT_model(padded[i:NUM_OF_IDIOMS_initial], attention_mask = masked[i:NUM_OF_IDIOMS_initial])
    i += BATCH_SIZE
    embeddings_batch = extract_full_embeddings(output_batch)
    full_embeddings.append(embeddings_batch)

  full_embeddings_all = []
  for batch in full_embeddings:
    for sentence in batch:
      full_embeddings_all.append(sentence)

  return full_embeddings_all


## Define RNN

In [None]:
MAX_SEQUENCE_LEN = 150
VECTOR_DIM = 768
NUM_CLASSES = 2

def build_model():
  model = Sequential()
  model.add(Masking(mask_value=0., input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
  forward_layer = GRU(10, return_sequences=False, dropout=0.5)
  backward_layer = GRU(10, return_sequences=False, dropout=0.5,
                      go_backwards=True)
  model.add(Bidirectional(forward_layer, backward_layer=backward_layer,
                        input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
  model.add(Dense(NUM_CLASSES))
  model.add(Activation('softmax'))

  model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
  print('compiled model')
  return model

## All Idioms

In [None]:
tokenized_idioms = data['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts = data['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length = max([len(sen) for sen in tokenized_idioms])
max_context_length = max([len(sen) for sen in tokenized_contexts])
print('Max idiom length: ', max_idiom_length)
print('Max context length: ', max_context_length)

padded_idioms = pad_sentence(tokenized_idioms, max_idiom_length+5)
padded_contexts = pad_sentence(tokenized_contexts, max_context_length+20)
print(len(padded_idioms))
print(len(padded_contexts))

masked_idioms = create_att_masks(padded_idioms)
masked_contexts = create_att_masks(padded_contexts)
print(len(masked_idioms))
print(len(masked_contexts))

Max idiom length:  11
Max context length:  234

Padding/truncating all sentences to 16 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 254 values...

Padding token: "[PAD]", ID: 0

Done.
1699
1699
1699
1699


In [None]:
embedded_contexts = use_batches(padded_contexts, masked_contexts)
len(embedded_contexts)

0
NUM_OF_IDIOMS -= BATCH_SIZE 1499
200
NUM_OF_IDIOMS -= BATCH_SIZE 1299
400
NUM_OF_IDIOMS -= BATCH_SIZE 1099
600
NUM_OF_IDIOMS -= BATCH_SIZE 899
800
NUM_OF_IDIOMS -= BATCH_SIZE 699
1000
NUM_OF_IDIOMS -= BATCH_SIZE 499
1200
NUM_OF_IDIOMS -= BATCH_SIZE 299
1400
NUM_OF_IDIOMS -= BATCH_SIZE 99
1600
NUM_OF_IDIOMS -= BATCH_SIZE -101


1699

In [None]:
data

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
0,бить карту,бил карту,0,Он бил карту за картой и загребал золото и кре...
1,бить карту,бил карту,0,"Ермолов держал карты, сощуря правый глаз; ког..."
2,бить карту,бил карту,0,"Он сгреб кучку золота, прибавил к ней на глаз..."
3,бить карту,бил карту,0,Передо мною все мелькала бледная улыбка банко...
4,бить карту,бейте карту,1,"Бейте карту: или я, или вы комендантом в замк..."
...,...,...,...,...
1694,бедный родственник,бедными родственниками,1,"И мы, россиянцы, будем чувствовать себя на эт..."
1695,бедный родственник,бедными родственниками,1,"Скорее, бедными родственниками являются именн..."
1696,бедный родственник,бедными родственниками,0,Александр II с детских лет привык считать бед...
1697,бедный родственник,бедными родственниками,0,"Проходя мимо церквей, я вижу иногда человека,..."


In [None]:
labels = to_categorical(data.Label)

In [None]:
X_train, X_test = train_test_split(embedded_contexts, test_size=0.2, random_state=34)
Y_train, Y_test = train_test_split(labels, test_size=0.2, random_state=34)

### Classifier

In [None]:
model_all = build_model()

compiled model


In [None]:
model_all.fit(np.asarray(X_train), Y_train, batch_size=8, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fab6aba8410>

In [None]:
model_all.evaluate(np.asarray(X_test), Y_test)



[0.6363776922225952, 0.7911764979362488]

In [None]:
preds_all = model_all.predict(np.array(X_test))
f1_score(np.argmax(preds_all, axis=1), np.argmax(Y_test, axis=1))



0.7942028985507246

In [None]:
 n_classes = 3

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(Y_test_anc[:, i], preds_anc[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

## VNC

In [None]:
tokenized_idioms_vnc = data_vnc['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_vnc = data_vnc['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length_vnc = max([len(sen) for sen in tokenized_idioms_vnc])
max_context_length_vnc = max([len(sen) for sen in tokenized_contexts_vnc])
print('Max idiom length: ', max_idiom_length_vnc)
print('Max context length: ', max_context_length_vnc)

padded_idioms_vnc = pad_sentence(tokenized_idioms_vnc, max_idiom_length_vnc+5)
padded_contexts_vnc = pad_sentence(tokenized_contexts_vnc, max_context_length_vnc+20)
print(len(padded_idioms_vnc))
print(len(padded_contexts_vnc))

Max idiom length:  11
Max context length:  234

Padding/truncating all sentences to 16 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 254 values...

Padding token: "[PAD]", ID: 0

Done.
893
893


In [None]:
masked_idioms_vnc = create_att_masks(padded_idioms_vnc)
masked_contexts_vnc = create_att_masks(padded_contexts_vnc)
print(len(masked_idioms_vnc))
print(len(masked_contexts_vnc))

893
893


In [None]:
embedded_contexts_vnc = use_batches(padded_contexts_vnc, masked_contexts_vnc)
len(embedded_contexts_vnc)

0
NUM_OF_IDIOMS -= BATCH_SIZE 693
200
NUM_OF_IDIOMS -= BATCH_SIZE 493
400
NUM_OF_IDIOMS -= BATCH_SIZE 293
600
NUM_OF_IDIOMS -= BATCH_SIZE 93
800
NUM_OF_IDIOMS -= BATCH_SIZE -107


893

In [None]:
labels_vnc = to_categorical(data_vnc.Label)
X_train_vnc, X_test_vnc = train_test_split(embedded_contexts_vnc, test_size=0.2, random_state=34)

In [None]:
Y_train_vnc, Y_test_vnc = train_test_split(labels_vnc, test_size=0.2, random_state=34)

### Classifier

In [None]:
model_vnc = build_model()

compiled model


In [None]:
model_vnc.fit(np.asarray(X_train_vnc), Y_train_vnc, batch_size=8, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa656a3ca90>

In [None]:
model_vnc.evaluate(np.asarray(X_test_vnc), Y_test_vnc)



[0.49890008568763733, 0.832402229309082]

In [None]:
preds_vnc = model_vnc.predict(np.array(X_test_vnc))
f1_score(np.argmax(preds_vnc, axis=1), np.argmax(Y_test_vnc, axis=1))



0.8514851485148516

## ANC

In [None]:
tokenized_contexts_anc = data_anc['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_context_length_anc = max([len(sen) for sen in tokenized_contexts_anc])
print('Max context length: ', max_context_length_anc)

padded_contexts_anc = pad_sentence(tokenized_contexts_anc, max_context_length_anc+20)
print(len(padded_contexts_anc))

Max context length:  221

Padding/truncating all sentences to 241 values...

Padding token: "[PAD]", ID: 0

Done.
806


In [None]:
masked_contexts_anc = create_att_masks(padded_contexts_anc)
print(len(masked_contexts_anc))

806


In [None]:
embedded_contexts_anc = use_batches(padded_contexts_anc, masked_contexts_anc)
len(embedded_contexts_anc)

0
NUM_OF_IDIOMS -= BATCH_SIZE 606
200
NUM_OF_IDIOMS -= BATCH_SIZE 406
400
NUM_OF_IDIOMS -= BATCH_SIZE 206
600
NUM_OF_IDIOMS -= BATCH_SIZE 6
800
NUM_OF_IDIOMS -= BATCH_SIZE -194


806

In [None]:
labels_anc = to_categorical(data_anc.Label)
X_train_anc, X_test_anc = train_test_split(embedded_contexts_anc, test_size=0.2, random_state=34)
Y_train_anc, Y_test_anc = train_test_split(labels_anc, test_size=0.2, random_state=34)

### Classifier

In [None]:
model_anc = build_model()

compiled model


In [None]:
model_anc.fit(np.asarray(X_train_anc), Y_train_anc, batch_size=8, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa644b61510>

In [None]:
model_anc.evaluate(np.asarray(X_test_anc), Y_test_anc)



[0.6991873383522034, 0.7839506268501282]

In [None]:
preds_anc = model_anc.predict(np.array(X_test_anc))



In [None]:
f1_score(np.argmax(preds_anc, axis=1), np.argmax(Y_test_anc, axis=1))

0.7798742138364779

# Not Present in the Training Set

## Split

In [None]:
data_anc['Idiom Normal'].value_counts()

больное место                 57
правая рука                   56
болевая точка                 52
нож острый                    49
путеводная звезда             48
лавровый венок                44
бедный родственник            41
зелёная улица                 38
тяжёлая рука                  38
ваш брат                      34
вавилонское столпотворение    30
наша сестра                   28
пороховая бочка               27
дальний прицел                25
заблудшая овца                23
вторая ступень                23
старый воробей                22
красная бумажка               21
синяя птица                   20
долгая песня                  18
другой разговор               18
старый гриб                   16
чёрная кость                  15
девичья кожа                  12
маковое зерно                 10
избитая дорога                10
музейная редкость             10
куриная голова                 9
ободранная кошка               9
чернильная строка              3
Name: Idio

In [None]:
test_ancs = ['вавилонское столпотворение', 'ободранная кошка', 'наша сестра', 'пороховая бочка', 'дальний прицел', 'заблудшая овца', 'красная бумажка']

In [None]:
data_anc_test = data_anc.loc[data_anc['Idiom Normal'].isin(test_ancs)]
data_anc_test.shape

(163, 4)

In [None]:
data_anc_train = data_anc.loc[~data_anc['Idiom Normal'].isin(test_ancs)]
data_anc_train.shape

(643, 4)

In [None]:
163/806

0.2022332506203474

In [None]:
data_vnc['Idiom Normal'].value_counts()

бросать тень               53
сесть на мель              47
пускать корни              41
окунуться с головой        31
отвести глаза              29
давит грудь                29
поставить на колени        29
выступить на сцену         27
приложить руку             27
снимать шляпу              27
поднять на ноги            27
положить голову            25
пахнет порохом             23
вырвать с корнем           23
точить нож                 22
преградить дорогу          22
вильнуть хвостом           21
сидеть на печи             20
открыть глаза              20
умывать руки               19
открывать Америку          19
плести кружева             18
взваливать на плечи        17
имей глаза                 17
поддать жару               16
разбить лед                16
чесать затылок             16
прокладывать дорогу        14
бросать перо               14
прижать хвост              13
давать сдачу               13
катить бочку               12
прищемить хвост            12
поймать на

In [None]:
test_vncs = ['точить нож', 'преградить дорогу', 'вильнуть хвостом', 'сидеть на печи', 
             'открыть глаза', 'умывать руки', 'открывать Америку', 'плести кружева', 
             'положить голову']

In [None]:
data_vnc_test = data_vnc.loc[data_vnc['Idiom Normal'].isin(test_vncs)]
data_vnc_test.shape

(186, 4)

In [None]:
data_vnc_train = data_vnc.loc[~data_vnc['Idiom Normal'].isin(test_ancs)]
data_vnc_train.shape

(893, 4)

In [None]:
186/893

0.20828667413213886

In [None]:
data_all_train = pd.concat([data_vnc_train, data_anc_train], ignore_index=True)
data_all_train.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
0,бить карту,бил карту,0,Он бил карту за картой и загребал золото и кре...
1,бить карту,бил карту,0,"Ермолов держал карты, сощуря правый глаз; ког..."


In [None]:
data_all_test = pd.concat([data_vnc_test, data_anc_test], ignore_index=True)
data_all_test.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
0,открывать Америку,открывать Америку,1,С тех пор от него можно услышать: «Хватит кич...
1,открывать Америку,открывать Америку,0,"Впечатление было такое, что мы на судне Колум..."


## Embed

### VNC

In [None]:
tokenized_idioms_vnc_train_not_present = data_vnc_train['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_vnc_train_not_present = data_vnc_train['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length_vnc_train_not_present = max([len(sen) for sen in tokenized_idioms_vnc_train_not_present])
max_context_length_vnc_train_not_present = max([len(sen) for sen in tokenized_contexts_vnc_train_not_present])
print('Max idiom length: ', max_idiom_length_vnc_train_not_present)
print('Max context length: ', max_context_length_vnc_train_not_present)

padded_idioms_vnc_train_not_present = pad_sentence(tokenized_idioms_vnc_train_not_present, max_idiom_length_vnc_train_not_present+5)
padded_contexts_vnc_train_not_present = pad_sentence(tokenized_contexts_vnc_train_not_present, max_context_length_vnc_train_not_present+20)
print(len(padded_idioms_vnc_train_not_present))
print(len(padded_contexts_vnc_train_not_present))

Max idiom length:  11
Max context length:  234

Padding/truncating all sentences to 16 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 254 values...

Padding token: "[PAD]", ID: 0

Done.
893
893


In [None]:
masked_idioms_vnc_train_not_present = create_att_masks(padded_idioms_vnc_train_not_present)
masked_contexts_vnc_train_not_present = create_att_masks(padded_contexts_vnc_train_not_present)
print(len(masked_idioms_vnc_train_not_present))
print(len(masked_contexts_vnc_train_not_present))

893
893


In [None]:
embedded_idioms_vnc_train_not_present = use_batches(padded_idioms_vnc_train_not_present, masked_idioms_vnc_train_not_present)
len(embedded_idioms_vnc_train_not_present)

0
NUM_OF_IDIOMS -= BATCH_SIZE 693
200
NUM_OF_IDIOMS -= BATCH_SIZE 493
400
NUM_OF_IDIOMS -= BATCH_SIZE 293
600
NUM_OF_IDIOMS -= BATCH_SIZE 93
800
NUM_OF_IDIOMS -= BATCH_SIZE -107


893

In [None]:
embedded_contexts_vnc_train_not_present = use_batches(padded_contexts_vnc_train_not_present, masked_contexts_vnc_train_not_present)
len(embedded_contexts_vnc_train_not_present)

0
NUM_OF_IDIOMS -= BATCH_SIZE 693
200
NUM_OF_IDIOMS -= BATCH_SIZE 493
400
NUM_OF_IDIOMS -= BATCH_SIZE 293
600
NUM_OF_IDIOMS -= BATCH_SIZE 93
800
NUM_OF_IDIOMS -= BATCH_SIZE -107


893

In [None]:
labels_vnc_train_not_present = to_categorical(data_vnc_train.Label)

In [None]:
tokenized_idioms_vnc_test_not_present = data_vnc_test['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_vnc_test_not_present = data_vnc_test['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length_vnc_test_not_present = max([len(sen) for sen in tokenized_idioms_vnc_test_not_present])
max_context_length_vnc_test_not_present = max([len(sen) for sen in tokenized_contexts_vnc_test_not_present])
print('Max idiom length: ', max_idiom_length_vnc_test_not_present)
print('Max context length: ', max_context_length_vnc_test_not_present)

padded_idioms_vnc_test_not_present = pad_sentence(tokenized_idioms_vnc_test_not_present, max_idiom_length_vnc_test_not_present+5)
padded_contexts_vnc_test_not_present = pad_sentence(tokenized_contexts_vnc_test_not_present, max_context_length_vnc_test_not_present+20)
print(len(padded_idioms_vnc_test_not_present))
print(len(padded_contexts_vnc_test_not_present))

masked_idioms_vnc_test_not_present = create_att_masks(padded_idioms_vnc_test_not_present)
masked_contexts_vnc_test_not_present = create_att_masks(padded_contexts_vnc_test_not_present)
print(len(masked_idioms_vnc_test_not_present))
print(len(masked_contexts_vnc_test_not_present))

embedded_idioms_vnc_test_not_present = use_batches(padded_idioms_vnc_test_not_present, masked_idioms_vnc_test_not_present)
len(embedded_idioms_vnc_test_not_present)

embedded_contexts_vnc_test_not_present = use_batches(padded_contexts_vnc_test_not_present, masked_contexts_vnc_test_not_present)
len(embedded_contexts_vnc_test_not_present)

Max idiom length:  10
Max context length:  170

Padding/truncating all sentences to 15 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 190 values...

Padding token: "[PAD]", ID: 0

Done.
186
186
186
186
0
NUM_OF_IDIOMS -= BATCH_SIZE -14
0
NUM_OF_IDIOMS -= BATCH_SIZE -14


186

In [None]:
labels_vnc_test_not_present = to_categorical(data_vnc_test.Label)

### ANC

In [None]:
tokenized_idioms_anc_train_not_present = data_anc_train['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_anc_train_not_present = data_anc_train['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length_anc_train_not_present = max([len(sen) for sen in tokenized_idioms_anc_train_not_present])
max_context_length_anc_train_not_present = max([len(sen) for sen in tokenized_contexts_anc_train_not_present])
print('Max idiom length: ', max_idiom_length_anc_train_not_present)
print('Max context length: ', max_context_length_anc_train_not_present)

padded_idioms_anc_train_not_present = pad_sentence(tokenized_idioms_anc_train_not_present, max_idiom_length_anc_train_not_present+5)
padded_contexts_anc_train_not_present = pad_sentence(tokenized_contexts_anc_train_not_present, max_context_length_anc_train_not_present+20)
print(len(padded_idioms_anc_train_not_present))
print(len(padded_contexts_anc_train_not_present))

masked_idioms_anc_train_not_present = create_att_masks(padded_idioms_anc_train_not_present)
masked_contexts_anc_train_not_present = create_att_masks(padded_contexts_anc_train_not_present)
print(len(masked_idioms_anc_train_not_present))
print(len(masked_contexts_anc_train_not_present))

embedded_idioms_anc_train_not_present = use_batches(padded_idioms_anc_train_not_present, masked_idioms_anc_train_not_present)
len(embedded_idioms_anc_train_not_present)

embedded_contexts_anc_train_not_present = use_batches(padded_contexts_anc_train_not_present, masked_contexts_anc_train_not_present)
len(embedded_contexts_anc_train_not_present)

Max idiom length:  10
Max context length:  221

Padding/truncating all sentences to 15 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 241 values...

Padding token: "[PAD]", ID: 0

Done.
643
643
643
643
0
NUM_OF_IDIOMS -= BATCH_SIZE 443
200
NUM_OF_IDIOMS -= BATCH_SIZE 243
400
NUM_OF_IDIOMS -= BATCH_SIZE 43
600
NUM_OF_IDIOMS -= BATCH_SIZE -157
0
NUM_OF_IDIOMS -= BATCH_SIZE 443
200
NUM_OF_IDIOMS -= BATCH_SIZE 243
400
NUM_OF_IDIOMS -= BATCH_SIZE 43
600
NUM_OF_IDIOMS -= BATCH_SIZE -157


643

In [None]:
labels_anc_train_not_present = to_categorical(data_anc_train.Label)

In [None]:
tokenized_idioms_anc_test_not_present = data_anc_test['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_anc_test_not_present = data_anc_test['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length_anc_test_not_present = max([len(sen) for sen in tokenized_idioms_anc_test_not_present])
max_context_length_anc_test_not_present = max([len(sen) for sen in tokenized_contexts_anc_test_not_present])
print('Max idiom length: ', max_idiom_length_anc_test_not_present)
print('Max context length: ', max_context_length_anc_test_not_present)

padded_idioms_anc_test_not_present = pad_sentence(tokenized_idioms_anc_test_not_present, max_idiom_length_anc_test_not_present+5)
padded_contexts_anc_test_not_present = pad_sentence(tokenized_contexts_anc_test_not_present, max_context_length_anc_test_not_present+20)
print(len(padded_idioms_anc_test_not_present))
print(len(padded_contexts_anc_test_not_present))

masked_idioms_anc_test_not_present = create_att_masks(padded_idioms_anc_test_not_present)
masked_contexts_anc_test_not_present = create_att_masks(padded_contexts_anc_test_not_present)
print(len(masked_idioms_anc_test_not_present))
print(len(masked_contexts_anc_test_not_present))

embedded_idioms_anc_test_not_present = use_batches(padded_idioms_anc_test_not_present, masked_idioms_anc_test_not_present)
len(embedded_idioms_anc_test_not_present)

embedded_contexts_anc_test_not_present = use_batches(padded_contexts_anc_test_not_present, masked_contexts_anc_test_not_present)
len(embedded_contexts_anc_test_not_present)

Max idiom length:  11
Max context length:  192

Padding/truncating all sentences to 16 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 212 values...

Padding token: "[PAD]", ID: 0

Done.
163
163
163
163
0
NUM_OF_IDIOMS -= BATCH_SIZE -37
0
NUM_OF_IDIOMS -= BATCH_SIZE -37


163

In [None]:
labels_anc_test_not_present = to_categorical(data_anc_test.Label)

### All

In [None]:
tokenized_idioms_all_train_not_present = data_all_train['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_all_train_not_present = data_all_train['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length_all_train_not_present = max([len(sen) for sen in tokenized_idioms_all_train_not_present])
max_context_length_all_train_not_present = max([len(sen) for sen in tokenized_contexts_all_train_not_present])
print('Max idiom length: ', max_idiom_length_all_train_not_present)
print('Max context length: ', max_context_length_all_train_not_present)

padded_idioms_all_train_not_present = pad_sentence(tokenized_idioms_all_train_not_present, max_idiom_length_all_train_not_present+5)
padded_contexts_all_train_not_present = pad_sentence(tokenized_contexts_all_train_not_present, max_context_length_all_train_not_present+20)
print(len(padded_idioms_all_train_not_present))
print(len(padded_contexts_all_train_not_present))

masked_idioms_all_train_not_present = create_att_masks(padded_idioms_all_train_not_present)
masked_contexts_all_train_not_present = create_att_masks(padded_contexts_all_train_not_present)
print(len(masked_idioms_all_train_not_present))
print(len(masked_contexts_all_train_not_present))

embedded_idioms_all_train_not_present = use_batches(padded_idioms_all_train_not_present, masked_idioms_all_train_not_present)
len(embedded_idioms_all_train_not_present)

embedded_contexts_all_train_not_present = use_batches(padded_contexts_all_train_not_present, masked_contexts_all_train_not_present)
len(embedded_contexts_all_train_not_present)

Max idiom length:  11
Max context length:  234

Padding/truncating all sentences to 16 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 254 values...

Padding token: "[PAD]", ID: 0

Done.
1536
1536
1536
1536
0
NUM_OF_IDIOMS -= BATCH_SIZE 1336
200
NUM_OF_IDIOMS -= BATCH_SIZE 1136
400
NUM_OF_IDIOMS -= BATCH_SIZE 936
600
NUM_OF_IDIOMS -= BATCH_SIZE 736
800
NUM_OF_IDIOMS -= BATCH_SIZE 536
1000
NUM_OF_IDIOMS -= BATCH_SIZE 336
1200
NUM_OF_IDIOMS -= BATCH_SIZE 136
1400
NUM_OF_IDIOMS -= BATCH_SIZE -64
0
NUM_OF_IDIOMS -= BATCH_SIZE 1336
200
NUM_OF_IDIOMS -= BATCH_SIZE 1136
400
NUM_OF_IDIOMS -= BATCH_SIZE 936
600
NUM_OF_IDIOMS -= BATCH_SIZE 736
800
NUM_OF_IDIOMS -= BATCH_SIZE 536
1000
NUM_OF_IDIOMS -= BATCH_SIZE 336
1200
NUM_OF_IDIOMS -= BATCH_SIZE 136
1400
NUM_OF_IDIOMS -= BATCH_SIZE -64


1536

In [None]:
labels_all_train_not_present = to_categorical(data_all_train.Label)

In [None]:
tokenized_idioms_all_test_not_present = data_all_test['Idiom Inflected'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=50)))
tokenized_contexts_all_test_not_present = data_all_test['Example'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500)))
max_idiom_length_all_test_not_present = max([len(sen) for sen in tokenized_idioms_all_test_not_present])
max_context_length_all_test_not_present = max([len(sen) for sen in tokenized_contexts_all_test_not_present])
print('Max idiom length: ', max_idiom_length_all_test_not_present)
print('Max context length: ', max_context_length_all_test_not_present)

padded_idioms_all_test_not_present = pad_sentence(tokenized_idioms_all_test_not_present, max_idiom_length_all_test_not_present+5)
padded_contexts_all_test_not_present = pad_sentence(tokenized_contexts_all_test_not_present, max_context_length_all_test_not_present+20)
print(len(padded_idioms_all_test_not_present))
print(len(padded_contexts_all_test_not_present))

masked_idioms_all_test_not_present = create_att_masks(padded_idioms_all_test_not_present)
masked_contexts_all_test_not_present = create_att_masks(padded_contexts_all_test_not_present)
print(len(masked_idioms_all_test_not_present))
print(len(masked_contexts_all_test_not_present))

embedded_idioms_all_test_not_present = use_batches(padded_idioms_all_test_not_present, masked_idioms_all_test_not_present)
len(embedded_idioms_all_test_not_present)

embedded_contexts_all_test_not_present = use_batches(padded_contexts_all_test_not_present, masked_contexts_all_test_not_present)
len(embedded_contexts_all_test_not_present)

Max idiom length:  11
Max context length:  192

Padding/truncating all sentences to 16 values...

Padding token: "[PAD]", ID: 0

Done.

Padding/truncating all sentences to 212 values...

Padding token: "[PAD]", ID: 0

Done.
349
349
349
349
0
NUM_OF_IDIOMS -= BATCH_SIZE 149
200
NUM_OF_IDIOMS -= BATCH_SIZE -51
0
NUM_OF_IDIOMS -= BATCH_SIZE 149
200
NUM_OF_IDIOMS -= BATCH_SIZE -51


349

In [None]:
labels_all_test_not_present = to_categorical(data_all_test.Label)

## Train

### VNC

In [None]:
model_vnc_not_present = Sequential()
model_vnc_not_present.add(Masking(mask_value=0., input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
forward_layer = GRU(10, return_sequences=False, dropout=0.5)
backward_layer = GRU(10, return_sequences=False, dropout=0.5,
                    go_backwards=True)
model_vnc_not_present.add(Bidirectional(forward_layer, backward_layer=backward_layer,
                      input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
model_vnc_not_present.add(Dense(NUM_CLASSES))
model_vnc_not_present.add(Activation('softmax'))

model_vnc_not_present.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print('compiled model')

compiled model


In [None]:
model_vnc_not_present.fit(np.asarray(embedded_contexts_vnc_train_not_present), 
              labels_vnc_train_not_present, batch_size=8, epochs=10)
print('fit model')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fit model


In [None]:
model_vnc_not_present.evaluate(np.asarray(embedded_contexts_vnc_test_not_present), labels_vnc_test_not_present)



[0.12515968084335327, 0.9569892287254333]

In [None]:
preds_vnc_not_present = model_vnc_not_present.predict(np.array(embedded_contexts_vnc_test_not_present))
f1_score(np.argmax(preds_vnc_not_present, axis=1), np.argmax(labels_vnc_test_not_present, axis=1))

0.943661971830986

### ANC

In [None]:
model_anc_not_present = Sequential()
model_anc_not_present.add(Masking(mask_value=0., input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
forward_layer = GRU(10, return_sequences=False, dropout=0.5)
backward_layer = GRU(10, return_sequences=False, dropout=0.5,
                    go_backwards=True)
model_anc_not_present.add(Bidirectional(forward_layer, backward_layer=backward_layer,
                      input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
model_anc_not_present.add(Dense(NUM_CLASSES))
model_anc_not_present.add(Activation('softmax'))

model_anc_not_present.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print('compiled model')

compiled model


In [None]:
model_anc_not_present.fit(np.asarray(embedded_contexts_anc_train_not_present), 
              labels_anc_train_not_present, batch_size=8, epochs=10)
print('fit model')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fit model


In [None]:
model_anc_not_present.evaluate(np.asarray(embedded_contexts_anc_test_not_present), labels_anc_test_not_present)



[1.2682161331176758, 0.5644171833992004]

In [None]:
preds_anc_not_present = model_anc_not_present.predict(np.array(embedded_contexts_anc_test_not_present))
f1_score(np.argmax(preds_anc_not_present, axis=1), np.argmax(labels_anc_test_not_present, axis=1))



0.6077348066298343

### All

In [None]:
model_all_not_present = Sequential()
model_all_not_present.add(Masking(mask_value=0., input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
forward_layer = GRU(10, return_sequences=False, dropout=0.5)
backward_layer = GRU(10, return_sequences=False, dropout=0.5,
                    go_backwards=True)
model_all_not_present.add(Bidirectional(forward_layer, backward_layer=backward_layer,
                      input_shape=(MAX_SEQUENCE_LEN,VECTOR_DIM)))
model_all_not_present.add(Dense(NUM_CLASSES))
model_all_not_present.add(Activation('softmax'))

model_all_not_present.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print('compiled model')

compiled model


In [None]:
model_all_not_present.fit(np.asarray(embedded_contexts_all_train_not_present), 
              labels_all_train_not_present, batch_size=8, epochs=10)
print('fit model')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fit model


In [None]:
model_all_not_present.evaluate(np.asarray(embedded_contexts_all_test_not_present), labels_all_test_not_present)



[0.6098086833953857, 0.7765042781829834]

In [None]:
preds_all_not_present = model_all_not_present.predict(np.array(embedded_contexts_all_test_not_present))
f1_score(np.argmax(preds_all_not_present, axis=1), np.argmax(labels_all_test_not_present, axis=1))



0.7845303867403315