<a href="https://colab.research.google.com/github/alinaalborova/russian_idioms_processing/blob/main/MICE_Token_Level_ruBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Idiom Type and Token Classification

Based on [MICE: Mining Idioms with Contextual Embeddings](https://arxiv.org/pdf/2008.05759.pdf) by Škvorc et al.


## Libraries

In [None]:
# Colab setup: install Hugging Face transformers (imported below as `ppb` and for the Auto* classes).
!pip install transformers
# NOTE(review): nothing visible in this notebook imports tensor2tensor — confirm it is still needed.
!pip install tensor2tensor



In [None]:
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Activation, Concatenate, Masking, GRU
from tensorflow.keras import Sequential
from tensorflow.keras import Input, Model
from tensorflow.keras.utils import to_categorical
from ast import literal_eval
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
#from keras.utils import plot_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import tensorflow as tf
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook

## Dataset

In [None]:
# Mount Google Drive so the corpus CSVs under /content/drive can be read (Colab-only step).
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the VNC token-level corpus and discard its first column (the index
# column written out when the CSV was saved), then preview two rows.
dataset_vnc_dir = '/content/drive/MyDrive/ВКР/Sense Disambiguation Corpus/token_level_vnc.csv'
data_vnc = pd.read_csv(dataset_vnc_dir)
data_vnc = data_vnc.drop(columns=data_vnc.columns[:1])
data_vnc.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Token,Label,Context
0,взваливать на плечи,взваливают на плечи,бесчеловечный,2,"Там несвобода, и тут несвобода, там сажают и ..."
1,выступить на сцену,выступить на сцену,выступить,0,"Monsieur Орлов, наш учитель пения, говорил мн..."


In [None]:
# Read the ANC token-level corpus and discard its first column (the index
# column written out when the CSV was saved), then preview two rows.
dataset_anc_dir = '/content/drive/MyDrive/ВКР/Sense Disambiguation Corpus/token_level_anc.csv'
data_anc = pd.read_csv(dataset_anc_dir)
data_anc = data_anc.drop(columns=data_anc.columns[:1])
data_anc.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Token,Label,Context
0,старый воробей,старые воробьи,старые,1,"― Ведь мы с вами, Марья Алексевна, старые вор..."
1,заблудшая овца,заблудшим овцам,заблудшим,1,"― Хирлемс не должен быть слишком строг, если..."


## Define Classifier

In [None]:
# Input geometry read by build_model below: padded lengths of the two input
# sequences, the per-position embedding width, and the number of output classes.
# NOTE(review): get_bert_embeddings is later called with padded_size=15 / 150 and
# these constants are redefined as 15 / 150 further down — confirm 13 / 149 is intended here.
MAX_CONTEXT_LEN = 149
MAX_TOKEN_LEN = 13
VECTOR_DIM = 768
NUM_CLASSES = 3

def build_model(max_token_len=None, max_context_len=None, vector_dim=None,
                num_classes=None):
    """Build and compile the two-input (token + context) BiGRU classifier.

    Each input is a padded sequence of embedding vectors. Both sequences are
    summarised by their own bidirectional GRU (10 units per direction,
    input dropout 0.5); the two summaries are concatenated and mapped to
    `num_classes` softmax probabilities.

    Args:
        max_token_len: padded length of the token input; defaults to the
            notebook-level MAX_TOKEN_LEN at call time.
        max_context_len: padded length of the context input; defaults to
            MAX_CONTEXT_LEN.
        vector_dim: width of each embedding vector; defaults to VECTOR_DIM.
        num_classes: number of output classes; defaults to NUM_CLASSES.

    Returns:
        A compiled Keras Model with inputs named 'input1' (token) and
        'input2' (context) and a softmax output named 'output'.
    """
    # Resolve defaults lazily so a plain `build_model()` keeps using whatever
    # the notebook constants are when the call is made.
    if max_token_len is None:
        max_token_len = MAX_TOKEN_LEN
    if max_context_len is None:
        max_context_len = MAX_CONTEXT_LEN
    if vector_dim is None:
        vector_dim = VECTOR_DIM
    if num_classes is None:
        num_classes = NUM_CLASSES

    input_token = Input(shape=(max_token_len, vector_dim), name='input1')
    input_context = Input(shape=(max_context_len, vector_dim), name='input2')

    def _encode(sequence):
        # Each branch gets its own GRU instances: handing the same layer
        # objects to both Bidirectional wrappers risks sharing the backward
        # GRU's weights between the token and context encoders.
        forward_layer = GRU(10, return_sequences=False, dropout=0.5)
        backward_layer = GRU(10, return_sequences=False, dropout=0.5,
                             go_backwards=True)
        return Bidirectional(forward_layer, backward_layer=backward_layer)(sequence)

    token_encoding = _encode(input_token)
    context_encoding = _encode(input_context)

    concat = Concatenate(axis=1)([token_encoding, context_encoding])

    dense = Dense(num_classes)(concat)
    softmax = Activation('softmax', name='output')(dense)

    model = Model(inputs=[input_token, input_context], outputs=softmax)
    # The targets are one-hot vectors over mutually exclusive classes and the
    # output is a softmax, so the matching loss is categorical cross-entropy
    # (binary cross-entropy would treat every class as an independent label).
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model
# Build one classifier with the constants above and print its layer summary.
# NOTE(review): `model_all` is not referenced again in the visible notebook.
model_all = build_model()

model_all.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input1 (InputLayer)             [(None, 13, 768)]    0                                            
__________________________________________________________________________________________________
input2 (InputLayer)             [(None, 149, 768)]   0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 20)           46800       input1[0][0]                     
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 20)           46800       input2[0][0]                     
______________________________________________________________________________________________

## Embed

### Load BERT

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, TFAutoModel
  
# Pretrained RuBERT (DeepPavlov/rubert-base-cased) tokenizer and PyTorch encoder;
# both are read as globals by the embedding helpers defined below.
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

BERT_model = AutoModel.from_pretrained('DeepPavlov/rubert-base-cased')

In [None]:
def get_bert_embeddings_single_sentence(sentence):
  """Run one text through the global tokenizer and BERT_model.

  Args:
    sentence: the text to embed (passed straight to the tokenizer).

  Returns:
    The model's `last_hidden_state` torch tensor; callers index `[0]` to get
    the per-wordpiece vectors of the single input.
  """
  inputs = tokenizer(sentence, return_tensors="pt")
  # Pure inference: disable autograd so no computation graph is kept alive for
  # every embedded sentence (saves memory and time; values are unchanged).
  with torch.no_grad():
    outputs = BERT_model(**inputs)
  return outputs.last_hidden_state
  
def get_bert_embeddings(sentences, padded_size=150, vec_size=768):
  """Embed each text with BERT and pad/truncate it to a fixed length.

  Args:
    sentences: iterable of texts (e.g. a pandas Series of strings).
    padded_size: number of positions kept per text; shorter sequences are
      zero-padded at the end, longer ones are cut off.
    vec_size: width of each embedding vector; must match the model's hidden
      size, otherwise the copy below raises instead of silently producing
      ragged rows.

  Returns:
    A list with one `(padded_size, vec_size)` numpy array per input text, so
    results of several calls can still be joined with `+`.
  """
  embeddings_all = []
  for sentence in tqdm_notebook(sentences):
    # [0] drops the batch axis of the single-sentence forward pass.
    vecs = get_bert_embeddings_single_sentence(sentence)[0].detach().cpu().numpy()
    # Start from an all-zero matrix and copy in as many real vectors as fit;
    # this replaces the former per-row try/bare-except used for padding, which
    # also hid any unrelated error.
    sentence_padded = np.zeros((padded_size, vec_size), dtype=vecs.dtype)
    n_keep = min(padded_size, vecs.shape[0])
    sentence_padded[:n_keep] = vecs[:n_keep]
    embeddings_all.append(sentence_padded)
  return embeddings_all

## VNC

In [None]:
# Embed the first 2000 entries of the VNC `Token` column, each padded to 15 positions.
tokens_vnc = get_bert_embeddings(data_vnc['Token'][:2000], padded_size=15)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [None]:
# Embed the remaining VNC `Token` entries (from row 2000 on), each padded to 15 positions.
tokens_vnc2 = get_bert_embeddings(data_vnc['Token'][2000:], padded_size=15)

HBox(children=(FloatProgress(value=0.0, max=1846.0), HTML(value='')))




In [None]:
# Join the two chunks (list concatenation) back into one list in row order.
embedded_tokens_vnc = tokens_vnc + tokens_vnc2

In [None]:
# Embed the first 2000 VNC contexts with the default padding of 150 positions.
contexts_vnc1 = get_bert_embeddings(data_vnc['Context'][:2000])

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [None]:
# Embed the remaining VNC contexts (from row 2000 on) with the default padding of 150 positions.
contexts_vnc2 = get_bert_embeddings(data_vnc['Context'][2000:])

HBox(children=(FloatProgress(value=0.0, max=1846.0), HTML(value='')))




In [None]:
# Join the two context chunks (list concatenation) back into one list in row order.
embedded_contexts_vnc = contexts_vnc1 + contexts_vnc2

In [None]:
# One-hot encode the labels, then split tokens, contexts and labels 70/30.
# A single call applies the same shuffled indices to all three sequences.
labels_vnc = to_categorical(data_vnc.Label)
(X_train_tokens_vnc, X_test_tokens_vnc,
 X_train_contexts_vnc, X_test_contexts_vnc,
 Y_train_vnc, Y_test_vnc) = train_test_split(
    embedded_tokens_vnc, embedded_contexts_vnc, labels_vnc,
    test_size=0.3, random_state=34)

### Classifier

#### Define

In [None]:
# Input geometry matching the embeddings computed above: tokens were padded to 15
# positions and contexts to 150 (get_bert_embeddings' default), each position a
# 768-dimensional vector; labels are one of 3 classes.
MAX_CONTEXT_LEN = 150
MAX_TOKEN_LEN = 15
VECTOR_DIM = 768
NUM_CLASSES = 3

def build_model(max_token_len=None, max_context_len=None, vector_dim=None,
                num_classes=None):
    """Build and compile the two-input (token + context) BiGRU classifier.

    Each input is a padded sequence of embedding vectors. Both sequences are
    summarised by their own bidirectional GRU (10 units per direction,
    input dropout 0.5); the two summaries are concatenated and mapped to
    `num_classes` softmax probabilities.

    Args:
        max_token_len: padded length of the token input; defaults to the
            notebook-level MAX_TOKEN_LEN at call time.
        max_context_len: padded length of the context input; defaults to
            MAX_CONTEXT_LEN.
        vector_dim: width of each embedding vector; defaults to VECTOR_DIM.
        num_classes: number of output classes; defaults to NUM_CLASSES.

    Returns:
        A compiled Keras Model with inputs named 'input1' (token) and
        'input2' (context) and a softmax output named 'output'.
    """
    # Resolve defaults lazily so a plain `build_model()` keeps using whatever
    # the notebook constants are when the call is made.
    if max_token_len is None:
        max_token_len = MAX_TOKEN_LEN
    if max_context_len is None:
        max_context_len = MAX_CONTEXT_LEN
    if vector_dim is None:
        vector_dim = VECTOR_DIM
    if num_classes is None:
        num_classes = NUM_CLASSES

    input_token = Input(shape=(max_token_len, vector_dim), name='input1')
    input_context = Input(shape=(max_context_len, vector_dim), name='input2')

    def _encode(sequence):
        # Each branch gets its own GRU instances: handing the same layer
        # objects to both Bidirectional wrappers risks sharing the backward
        # GRU's weights between the token and context encoders.
        forward_layer = GRU(10, return_sequences=False, dropout=0.5)
        backward_layer = GRU(10, return_sequences=False, dropout=0.5,
                             go_backwards=True)
        return Bidirectional(forward_layer, backward_layer=backward_layer)(sequence)

    token_encoding = _encode(input_token)
    context_encoding = _encode(input_context)

    concat = Concatenate(axis=1)([token_encoding, context_encoding])

    dense = Dense(num_classes)(concat)
    softmax = Activation('softmax', name='output')(dense)

    model = Model(inputs=[input_token, input_context], outputs=softmax)
    # The targets are one-hot vectors over mutually exclusive classes and the
    # output is a softmax, so the matching loss is categorical cross-entropy
    # (binary cross-entropy would treat every class as an independent label).
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

#### Train & Test

In [None]:
# Fresh, untrained classifier for the VNC data.
model_vnc = build_model()

In [None]:
# Stack the per-sample embedding arrays into dense tensors for Keras:
# training split first, then the held-out split.
tokens_tensor_train_vnc, contexts_tensor_train_vnc = map(
    tf.convert_to_tensor, (X_train_tokens_vnc, X_train_contexts_vnc))
tokens_tensor_test_vnc, contexts_tensor_test_vnc = map(
    tf.convert_to_tensor, (X_test_tokens_vnc, X_test_contexts_vnc))

In [None]:
# Train on the VNC training split for 10 epochs in mini-batches of 8;
# the dict keys match the model's input/output layer names.
model_vnc.fit(
    x={'input1': tokens_tensor_train_vnc, 'input2': contexts_tensor_train_vnc},
    y={'output': Y_train_vnc},
    batch_size=8,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbf9a6ff750>

In [None]:
# Loss and accuracy on the held-out VNC split.
model_vnc.evaluate(
    x={'input1': tokens_tensor_test_vnc, 'input2': contexts_tensor_test_vnc},
    y={'output': Y_test_vnc},
)



[0.21632592380046844, 0.8804159164428711]

In [None]:
# Predict class probabilities for the VNC test split and report micro-averaged F1.
# scikit-learn expects (y_true, y_pred), so the true labels go first.
preds_vnc = model_vnc.predict({'input1': tokens_tensor_test_vnc, 'input2': contexts_tensor_test_vnc})
f1_score(np.argmax(Y_test_vnc, axis=1), np.argmax(preds_vnc, axis=1), average='micro')

0.8804159445407279

In [None]:
# Macro-averaged F1 on the VNC test split; arguments are (y_true, y_pred).
f1_score(np.argmax(Y_test_vnc, axis=1), np.argmax(preds_vnc, axis=1), average='macro')

0.8742530924506257

In [None]:
# Weighted F1 on the VNC test split. The true labels must come first: the
# per-class weights are the class supports in y_true, not in the predictions.
f1_score(np.argmax(Y_test_vnc, axis=1), np.argmax(preds_vnc, axis=1), average='weighted')

0.878672066300463

## ANC

In [None]:
# (rows, columns) of the ANC frame; the row count guides the chunking below.
data_anc.shape

(2540, 5)

In [None]:
# Embed the ANC `Token` column in two chunks (first 1000 rows, then the rest),
# each entry padded to 15 positions.
tokens_anc = get_bert_embeddings(data_anc['Token'][:1000], padded_size=15)
tokens_anc2 = get_bert_embeddings(data_anc['Token'][1000:], padded_size=15)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1540.0), HTML(value='')))




In [None]:
# Join the two chunks (list concatenation) back into one list in row order.
embedded_tokens_anc = tokens_anc + tokens_anc2

In [None]:
# Drop the chunk names; the arrays stay reachable through embedded_tokens_anc.
del tokens_anc
del tokens_anc2

In [None]:
# Embed the ANC contexts in two chunks (first 1000 rows, then the rest) with the
# default padding of 150 positions.
contexts_anc1 = get_bert_embeddings(data_anc['Context'][:1000])
contexts_anc2 = get_bert_embeddings(data_anc['Context'][1000:])

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1540.0), HTML(value='')))




In [None]:
# Join the two context chunks (list concatenation) back into one list in row order.
embedded_contexts_anc = contexts_anc1 + contexts_anc2

In [None]:
# Drop the chunk names; the arrays stay reachable through embedded_contexts_anc.
del contexts_anc1
del contexts_anc2

In [None]:
# One-hot encode the labels, then split tokens, contexts and labels 70/30.
# A single call applies the same shuffled indices to all three sequences.
labels_anc = to_categorical(data_anc.Label)
(X_train_tokens_anc, X_test_tokens_anc,
 X_train_contexts_anc, X_test_contexts_anc,
 Y_train_anc, Y_test_anc) = train_test_split(
    embedded_tokens_anc, embedded_contexts_anc, labels_anc,
    test_size=0.3, random_state=34)

### Classifier

#### Train & Test

In [None]:
# Fresh, untrained classifier for the ANC data.
model_anc = build_model()

In [None]:
# Stack the per-sample embedding arrays into dense tensors for Keras:
# training split first, then the held-out split.
tokens_tensor_train_anc, contexts_tensor_train_anc = map(
    tf.convert_to_tensor, (X_train_tokens_anc, X_train_contexts_anc))
tokens_tensor_test_anc, contexts_tensor_test_anc = map(
    tf.convert_to_tensor, (X_test_tokens_anc, X_test_contexts_anc))

In [None]:
# Train on the ANC training split for 10 epochs in mini-batches of 8;
# the dict keys match the model's input/output layer names.
model_anc.fit(
    x={'input1': tokens_tensor_train_anc, 'input2': contexts_tensor_train_anc},
    y={'output': Y_train_anc},
    batch_size=8,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fcc32dd4ed0>

In [None]:
# Loss and accuracy on the held-out ANC split.
model_anc.evaluate(
    x={'input1': tokens_tensor_test_anc, 'input2': contexts_tensor_test_anc},
    y={'output': Y_test_anc},
)



[0.2543363571166992, 0.8464567065238953]

In [None]:
# Predict class probabilities for the ANC test split and report micro-averaged F1.
# scikit-learn expects (y_true, y_pred), so the true labels go first.
preds_anc = model_anc.predict({'input1': tokens_tensor_test_anc, 'input2': contexts_tensor_test_anc})
f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='micro')

0.846456692913386

In [None]:
# Macro-averaged F1 on the ANC test split; arguments are (y_true, y_pred).
f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='macro')

0.8282772401349909

In [None]:
# Weighted F1 on the ANC test split. The true labels must come first: the
# per-class weights are the class supports in y_true, not in the predictions.
f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='weighted')

0.8440899316515293

### Smaller Dataset

#### 70%

In [None]:
# Fresh, untrained classifier for the 70% ANC subset experiment.
model_anc_70 = build_model()

In [None]:
# Size of a 70% subset of the ANC training split (used to pick the slice bound below).
len(X_train_tokens_anc) * 0.7

1244.6

In [None]:
# Size of a 70% subset of the ANC test split (used to pick the slice bound below).
len(X_test_tokens_anc) * 0.7

533.4

In [None]:
# Keep the first 1245 training and 533 test samples (about 70% of each split)
# and stack them into dense tensors.
tokens_tensor_train_anc_70, contexts_tensor_train_anc_70 = map(
    tf.convert_to_tensor, (X_train_tokens_anc[:1245], X_train_contexts_anc[:1245]))
tokens_tensor_test_anc_70, contexts_tensor_test_anc_70 = map(
    tf.convert_to_tensor, (X_test_tokens_anc[:533], X_test_contexts_anc[:533]))

In [None]:
# Train on the first 1245 ANC training samples for 10 epochs in mini-batches of 8.
model_anc_70.fit(
    x={'input1': tokens_tensor_train_anc_70, 'input2': contexts_tensor_train_anc_70},
    y={'output': Y_train_anc[:1245]},
    batch_size=8,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fcc2eb96d90>

In [None]:
# Loss and accuracy on the first 533 ANC test samples.
model_anc_70.evaluate(
    x={'input1': tokens_tensor_test_anc_70, 'input2': contexts_tensor_test_anc_70},
    y={'output': Y_test_anc[:533]},
)



[0.3474300503730774, 0.7692307829856873]

In [None]:
# Loss and accuracy of the 70% model on the full ANC test split.
model_anc_70.evaluate(
    x={'input1': tokens_tensor_test_anc, 'input2': contexts_tensor_test_anc},
    y={'output': Y_test_anc},
)



[0.33695778250694275, 0.7742782235145569]

In [None]:
# Micro-averaged F1 of the 70% model on the first 533 ANC test samples.
# scikit-learn expects (y_true, y_pred), so the true labels go first.
preds_anc = model_anc_70.predict({'input1': tokens_tensor_test_anc_70, 'input2': contexts_tensor_test_anc_70})
f1_score(np.argmax(Y_test_anc[:533], axis=1), np.argmax(preds_anc, axis=1), average='micro')

0.7692307692307693

In [None]:
# Macro-averaged F1 for the same 533-sample predictions; arguments are (y_true, y_pred).
f1_score(np.argmax(Y_test_anc[:533], axis=1), np.argmax(preds_anc, axis=1), average='macro')

0.7551143207017397

In [None]:
# Weighted F1 for the same 533-sample predictions. The true labels must come
# first: the per-class weights are the class supports in y_true.
f1_score(np.argmax(Y_test_anc[:533], axis=1), np.argmax(preds_anc, axis=1), average='weighted')

0.7608885368190298

In [None]:
# Micro, macro and weighted F1 of the 70% model on the full ANC test split.
# f1_score takes (y_true, y_pred); the order matters for the weighted average.
preds_anc = model_anc_70.predict({'input1': tokens_tensor_test_anc, 'input2': contexts_tensor_test_anc})
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='micro'))
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='macro'))
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='weighted'))

0.7742782152230971
0.7575736606935134
0.7657849693883884


#### 50% 

In [None]:
# Fresh, untrained classifier for the 50% ANC subset experiment.
model_anc_50 = build_model()

In [None]:
# Size of a 50% subset of the ANC training split (used to pick the slice bound below).
len(X_train_tokens_anc) * 0.5

889.0

In [None]:
# Size of a 50% subset of the ANC test split (used to pick the slice bound below).
len(X_test_tokens_anc) * 0.5

381.0

In [None]:
# Keep the first 889 training and 381 test samples (50% of each split)
# and stack them into dense tensors.
tokens_tensor_train_anc_50, contexts_tensor_train_anc_50 = map(
    tf.convert_to_tensor, (X_train_tokens_anc[:889], X_train_contexts_anc[:889]))
tokens_tensor_test_anc_50, contexts_tensor_test_anc_50 = map(
    tf.convert_to_tensor, (X_test_tokens_anc[:381], X_test_contexts_anc[:381]))

In [None]:
# Train on the first 889 ANC training samples for 10 epochs in mini-batches of 8.
model_anc_50.fit(
    x={'input1': tokens_tensor_train_anc_50, 'input2': contexts_tensor_train_anc_50},
    y={'output': Y_train_anc[:889]},
    batch_size=8,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fcc2b959fd0>

In [None]:
# Loss and accuracy on the first 381 ANC test samples.
model_anc_50.evaluate(
    x={'input1': tokens_tensor_test_anc_50, 'input2': contexts_tensor_test_anc_50},
    y={'output': Y_test_anc[:381]},
)



[0.3153291344642639, 0.7952755689620972]

In [None]:
# Loss and accuracy of the 50% model on the full ANC test split.
model_anc_50.evaluate(
    x={'input1': tokens_tensor_test_anc, 'input2': contexts_tensor_test_anc},
    y={'output': Y_test_anc},
)



[0.3237176835536957, 0.7860892415046692]

In [None]:
# Micro-averaged F1 of the 50% model on the first 381 ANC test samples.
# scikit-learn expects (y_true, y_pred), so the true labels go first.
preds_anc = model_anc_50.predict({'input1': tokens_tensor_test_anc_50, 'input2': contexts_tensor_test_anc_50})
f1_score(np.argmax(Y_test_anc[:381], axis=1), np.argmax(preds_anc, axis=1), average='micro')

0.7952755905511811

In [None]:
# Macro-averaged F1 for the same 381-sample predictions; arguments are (y_true, y_pred).
f1_score(np.argmax(Y_test_anc[:381], axis=1), np.argmax(preds_anc, axis=1), average='macro')

0.7598666577325114

In [None]:
# Weighted F1 for the same 381-sample predictions. The true labels must come
# first: the per-class weights are the class supports in y_true.
f1_score(np.argmax(Y_test_anc[:381], axis=1), np.argmax(preds_anc, axis=1), average='weighted')

0.7974725046095319

In [None]:
# Micro, macro and weighted F1 of the 50% model on the full ANC test split.
# f1_score takes (y_true, y_pred); the order matters for the weighted average.
preds_anc = model_anc_50.predict({'input1': tokens_tensor_test_anc, 'input2': contexts_tensor_test_anc})
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='micro'))
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='macro'))
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='weighted'))

0.7860892388451444
0.753947509573183
0.7885989264285002


#### 30%

In [None]:
# Fresh, untrained classifier for the 30% ANC subset experiment.
model_anc_30 = build_model()

In [None]:
# Size of a 30% subset of the ANC training split (used to pick the slice bound below).
len(X_train_tokens_anc) * 0.3

533.4

In [None]:
# Size of a 30% subset of the ANC test split (used to pick the slice bound below).
len(X_test_tokens_anc) * 0.3

228.6

In [None]:
# Keep the first 533 training and 228 test samples (about 30% of each split)
# and stack them into dense tensors.
tokens_tensor_train_anc_30, contexts_tensor_train_anc_30 = map(
    tf.convert_to_tensor, (X_train_tokens_anc[:533], X_train_contexts_anc[:533]))
tokens_tensor_test_anc_30, contexts_tensor_test_anc_30 = map(
    tf.convert_to_tensor, (X_test_tokens_anc[:228], X_test_contexts_anc[:228]))

In [None]:
# Train on the first 533 ANC training samples for 10 epochs in mini-batches of 8.
model_anc_30.fit(
    x={'input1': tokens_tensor_train_anc_30, 'input2': contexts_tensor_train_anc_30},
    y={'output': Y_train_anc[:533]},
    batch_size=8,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fcc28bc7e90>

In [None]:
# Loss and accuracy on the first 228 ANC test samples.
model_anc_30.evaluate(
    x={'input1': tokens_tensor_test_anc_30, 'input2': contexts_tensor_test_anc_30},
    y={'output': Y_test_anc[:228]},
)



[0.37392914295196533, 0.7149122953414917]

In [None]:
# Loss and accuracy of the 30% model on the full ANC test split.
model_anc_30.evaluate(
    x={'input1': tokens_tensor_test_anc, 'input2': contexts_tensor_test_anc},
    y={'output': Y_test_anc},
)



[0.3807950019836426, 0.7230970859527588]

In [None]:
# Micro-averaged F1 of the 30% model on the first 228 ANC test samples.
# scikit-learn expects (y_true, y_pred), so the true labels go first.
preds_anc = model_anc_30.predict({'input1': tokens_tensor_test_anc_30, 'input2': contexts_tensor_test_anc_30})
f1_score(np.argmax(Y_test_anc[:228], axis=1), np.argmax(preds_anc, axis=1), average='micro')

0.7149122807017545

In [None]:
# Macro-averaged F1 for the same 228-sample predictions; arguments are (y_true, y_pred).
f1_score(np.argmax(Y_test_anc[:228], axis=1), np.argmax(preds_anc, axis=1), average='macro')

0.6542285484916788

In [None]:
# Weighted F1 for the same 228-sample predictions. The true labels must come
# first: the per-class weights are the class supports in y_true.
f1_score(np.argmax(Y_test_anc[:228], axis=1), np.argmax(preds_anc, axis=1), average='weighted')

0.7115881047138798

In [None]:
# Micro, macro and weighted F1 of the 30% model on the full ANC test split.
# f1_score takes (y_true, y_pred); the order matters for the weighted average.
preds_anc = model_anc_30.predict({'input1': tokens_tensor_test_anc, 'input2': contexts_tensor_test_anc})
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='micro'))
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='macro'))
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='weighted'))

0.7230971128608924
0.6771530270266309
0.7213731338246442


#### 10%

In [None]:
# Fresh, untrained classifier for the 10% ANC subset experiment.
model_anc_10 = build_model()

In [None]:
# Size of a 10% subset of the ANC training split (used to pick the slice bound below).
len(X_train_tokens_anc) * 0.1

177.8

In [None]:
# Size of a 10% subset of the ANC test split (used to pick the slice bound below).
len(X_test_tokens_anc) * 0.1

76.2

In [None]:
# Keep the first 177 training and 76 test samples (about 10% of each split)
# and stack them into dense tensors.
tokens_tensor_train_anc_10, contexts_tensor_train_anc_10 = map(
    tf.convert_to_tensor, (X_train_tokens_anc[:177], X_train_contexts_anc[:177]))
tokens_tensor_test_anc_10, contexts_tensor_test_anc_10 = map(
    tf.convert_to_tensor, (X_test_tokens_anc[:76], X_test_contexts_anc[:76]))

In [None]:
# Train on the first 177 ANC training samples for 10 epochs in mini-batches of 8.
model_anc_10.fit(
    x={'input1': tokens_tensor_train_anc_10, 'input2': contexts_tensor_train_anc_10},
    y={'output': Y_train_anc[:177]},
    batch_size=8,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fcc25b6ab90>

In [None]:
# Loss and accuracy on the first 76 ANC test samples.
model_anc_10.evaluate(
    x={'input1': tokens_tensor_test_anc_10, 'input2': contexts_tensor_test_anc_10},
    y={'output': Y_test_anc[:76]},
)



[0.3938930332660675, 0.7236841917037964]

In [None]:
# Loss and accuracy of the 10% model on the full ANC test split.
model_anc_10.evaluate(
    x={'input1': tokens_tensor_test_anc, 'input2': contexts_tensor_test_anc},
    y={'output': Y_test_anc},
)



[0.44073590636253357, 0.6811023354530334]

In [None]:
# Micro, macro and weighted F1 of the 10% model on the first 76 ANC test samples.
# f1_score takes (y_true, y_pred); the order matters for the weighted average.
preds_anc = model_anc_10.predict({'input1': tokens_tensor_test_anc_10, 'input2': contexts_tensor_test_anc_10})
print(f1_score(np.argmax(Y_test_anc[:76], axis=1), np.argmax(preds_anc, axis=1), average='micro'))
print(f1_score(np.argmax(Y_test_anc[:76], axis=1), np.argmax(preds_anc, axis=1), average='macro'))
print(f1_score(np.argmax(Y_test_anc[:76], axis=1), np.argmax(preds_anc, axis=1), average='weighted'))

0.7236842105263158
0.6627346300548914
0.7175054311277221


In [None]:
# Micro, macro and weighted F1 of the 10% model on the full ANC test split.
# f1_score takes (y_true, y_pred); the order matters for the weighted average.
preds_anc = model_anc_10.predict({'input1': tokens_tensor_test_anc, 'input2': contexts_tensor_test_anc})
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='micro'))
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='macro'))
print(f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1), average='weighted'))

0.6811023622047244
0.6395481678106575
0.6805693329009728
