<a href="https://colab.research.google.com/github/alinaalborova/russian_idioms_processing/blob/main/MICE_fastText.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Idiom Type and Token Classification

Based on [MICE: Mining Idioms with Contextual Embeddings](https://arxiv.org/pdf/2008.05759.pdf) by Škvorc et al.


In [None]:
!pip install transformers
!pip install tensor2tensor

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 4.1MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 41.2MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |███████

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import tensorflow as tf
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [None]:
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Activation, TimeDistributed, Masking, GRU
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
from ast import literal_eval
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

## Install & Import fastText

In [None]:
pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 11.5MB/s eta 0:00:01[K     |█████████▌                      | 20kB 10.1MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 6.0MB/s eta 0:00:01[K     |███████████████████             | 40kB 5.5MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 2.9MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 3.2MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 2.6MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3098404 sha256=8f73f118b52ba2039ff7f23613112ddddb049c9ca0bcb4a91f4a3bdc8111abff
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c15

In [None]:
import fasttext.util
# Fetch the pre-trained Russian fastText model. `if_exists='ignore'` presumably
# skips the download when the file is already on disk — TODO confirm in the fastText docs.
fasttext.util.download_model('ru', if_exists='ignore')  # Russian
# `ft` is the module-level model that get_embeddings() queries for word vectors.
ft = fasttext.load_model('cc.ru.300.bin')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz





## Dataset

In [None]:
from google.colab import drive
drive.mount("/content/drive")

In [None]:
dataset_vnc_dir = '/content/drive/MyDrive/ВКР/Sense Disambiguation Corpus/VNCs_Annotated.csv'
data_vnc = pd.read_csv(dataset_vnc_dir )
data_vnc.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
0,бить карту,бил карту,0,Он бил карту за картой и загребал золото и кре...
1,бить карту,бил карту,0,"Ермолов держал карты, сощуря правый глаз; ког..."


In [None]:
data_vnc.Label.value_counts()

1    455
0    438
Name: Label, dtype: int64

In [None]:
data_vnc.shape

(893, 4)

In [None]:
len(data_vnc['Idiom Normal'].value_counts())

51

In [None]:
dataset_anc_dir = '/content/drive/MyDrive/ВКР/Sense Disambiguation Corpus/ANCs_Annotated.csv'
data_anc = pd.read_csv(dataset_anc_dir )
data_anc.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
0,избитая дорога,избитой дороге,0,"С бурной быстротой, возможной только в сновид..."
1,избитая дорога,избитой дороге,0,"Как почтовый возок на избитой дороге, прыгает..."


In [None]:
len(data_anc['Idiom Inflected'].value_counts())

180

In [None]:
data_anc['Idiom Normal'].value_counts()

больное место                 57
правая рука                   56
болевая точка                 52
нож острый                    49
путеводная звезда             48
лавровый венок                44
бедный родственник            41
тяжёлая рука                  38
зелёная улица                 38
ваш брат                      34
вавилонское столпотворение    30
наша сестра                   28
пороховая бочка               27
дальний прицел                25
вторая ступень                23
заблудшая овца                23
старый воробей                22
красная бумажка               21
синяя птица                   20
долгая песня                  18
другой разговор               18
старый гриб                   16
чёрная кость                  15
девичья кожа                  12
маковое зерно                 10
музейная редкость             10
избитая дорога                10
ободранная кошка               9
куриная голова                 9
чернильная строка              3
Name: Idio

In [None]:
len(data_anc['Idiom Normal'].value_counts())

30

In [None]:
data = pd.concat([data_vnc, data_anc], ignore_index=True)
data.head(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
0,бить карту,бил карту,0,Он бил карту за картой и загребал золото и кре...
1,бить карту,бил карту,0,"Ермолов держал карты, сощуря правый глаз; ког..."


In [None]:
data['Label'].value_counts()

0    856
1    843
Name: Label, dtype: int64

In [None]:
data.shape

(1699, 4)

In [None]:
data.tail(2)

Unnamed: 0,Idiom Normal,Idiom Inflected,Label,Example
1697,бедный родственник,бедными родственниками,0,"Проходя мимо церквей, я вижу иногда человека,..."
1698,бедный родственник,бедных родственниках,0,"[Егор Дмитрич Глумов, муж] У молодой женщин..."


In [None]:
data.shape

(1699, 4)

## Tokenize & Get fastText Embeddings

In [None]:
import re
# Raw string so that `\w` reaches the regex engine instead of being parsed as an
# (invalid) Python string escape. The capturing group makes re.split() keep each
# separator character, so punctuation survives as its own token.
GROUPING_SPACE_REGEX = re.compile(r'([^\w_-]|[+])', re.U)

def tokenize(text):
  """
  Split text into tokens. Don't split by hyphen.

  Every character that is not a word character, underscore or hyphen is
  treated as a separator and returned as a one-character token of its own,
  except whitespace, which is dropped together with empty strings.

  Args:
    text: the string to tokenize.

  Returns:
    A list of non-empty, non-whitespace token strings in original order.
  """
  return [t for t in GROUPING_SPACE_REGEX.split(text)
          if t and not t.isspace()]

from keras.preprocessing.sequence import pad_sequences

def pad_sentence(tokenized, max_len):
  """
  Pad or truncate every sequence in `tokenized` to exactly `max_len` entries.

  Padding and truncation both happen at the end of each sequence ("post"),
  and the padding value is 0.

  Args:
    tokenized: sequence of integer sequences to be padded.
    max_len: target length of every returned sequence.

  Returns:
    Whatever `pad_sequences` returns for the given input.
  """
  pad_value = 0
  print('\nPadding/truncating all sentences to %d values...' % max_len)
  # The original message read `tokenizer.pad_token`, but no `tokenizer` exists
  # in this notebook, so the function raised NameError on every call.
  print('\nPadding value: {:}'.format(pad_value))

  # "post" indicates that we want to pad and truncate at the end of the sequence,
  # as opposed to the beginning.
  input_ids = pad_sequences(tokenized, maxlen=max_len, dtype="long",
                            value=pad_value, truncating="post", padding="post")
  print('\nDone.')
  return input_ids

In [None]:
def get_embeddings(text, length, padded_size=100):
  """
  Turn tokenized sentences into fixed-length sequences of fastText vectors.

  Each sentence is truncated, or zero-padded at the end, to exactly
  `padded_size` token positions. Word vectors come from the module-level
  fastText model `ft`.

  Args:
    text: iterable of tokenized sentences (each a sequence of str tokens).
    length: unused; kept only so existing calls keep working.
    padded_size: number of token positions kept per sentence.

  Returns:
    A list with one float32 array of shape (padded_size, 300) per sentence.
    Positions past the end of a sentence are all-zero rows.
  """
  vec_size = 300  # dimensionality of the cc.ru.300 vectors loaded into `ft`
  embeddings_list = []
  for sentence in text:
    # Start from zeros so padding is explicit rather than relying on a bare
    # `except:` around an out-of-range index, which also hid real errors.
    sentence_matrix = np.zeros((padded_size, vec_size), dtype=np.float32)
    for position, token in enumerate(sentence[:padded_size]):
      sentence_matrix[position] = ft.get_word_vector(token)
    embeddings_list.append(sentence_matrix)
  return embeddings_list

### All

In [None]:
# Tokenize every example sentence of the combined corpus, then embed the tokens.
contexts_tokenized_all = data['Example'].map(tokenize)
embedded_contexts_all = get_embeddings(
    contexts_tokenized_all,
    length=contexts_tokenized_all.shape[0],
)

In [None]:
labels = to_categorical(data.Label)
# Split features and labels in ONE call so each context stays paired with its
# own label. Two separate unseeded calls shuffle X and Y independently, which
# destroys the correspondence and leaves the model at chance level.
X_train, X_test, Y_train, Y_test = train_test_split(
    embedded_contexts_all, labels, test_size=0.2, random_state=42)
X_train = tf.convert_to_tensor(X_train)
X_test = tf.convert_to_tensor(X_test)

### VNC

In [None]:
# Tokenize every example sentence of the VNC corpus, then embed the tokens.
contexts_tokenized_vnc = data_vnc['Example'].map(tokenize)
embedded_contexts_vnc = get_embeddings(
    contexts_tokenized_vnc,
    length=contexts_tokenized_vnc.shape[0],
)

In [None]:
labels_vnc = to_categorical(data_vnc.Label)
# Split features and labels in ONE call so each context stays paired with its
# own label. Two separate unseeded calls shuffle X and Y independently.
X_train_vnc, X_test_vnc, Y_train_vnc, Y_test_vnc = train_test_split(
    embedded_contexts_vnc, labels_vnc, test_size=0.2, random_state=42)
X_train_vnc = tf.convert_to_tensor(X_train_vnc)
X_test_vnc = tf.convert_to_tensor(X_test_vnc)

### ANC

In [None]:
# Tokenize every example sentence of the ANC corpus, then embed the tokens.
contexts_tokenized_anc = data_anc['Example'].map(tokenize)
embedded_contexts_anc = get_embeddings(
    contexts_tokenized_anc,
    length=contexts_tokenized_anc.shape[0],
)

In [None]:
labels_anc = to_categorical(data_anc.Label)
# Split features and labels in ONE call so each context stays paired with its
# own label. Two separate unseeded calls shuffle X and Y independently.
X_train_anc, X_test_anc, Y_train_anc, Y_test_anc = train_test_split(
    embedded_contexts_anc, labels_anc, test_size=0.2, random_state=42)
X_train_anc = tf.convert_to_tensor(X_train_anc)
X_test_anc = tf.convert_to_tensor(X_test_anc)

## RNN

In [None]:
# Must equal the number of token positions per sentence produced by
# get_embeddings(); every call in this notebook uses its default padded_size=100.
# The previous value of 500 declared a model input shape the data never had.
MAX_SEQUENCE_LEN = 100
# Size of one fastText word vector (cc.ru.300).
VECTOR_DIM = 300
# Two labels in the `Label` column (0 and 1), one-hot encoded via to_categorical.
NUM_CLASSES = 2

In [None]:
def build_model(units=10, dropout=0.5):
  """
  Build and compile a bidirectional GRU classifier over embedded sentences.

  The network expects input of shape (MAX_SEQUENCE_LEN, VECTOR_DIM) per
  example, masks all-zero time steps, runs one forward and one backward GRU
  that each return only their final state, and ends in a NUM_CLASSES-way
  softmax.

  Args:
    units: number of units in each GRU direction.
    dropout: dropout rate passed to each GRU.

  Returns:
    The compiled Sequential model.
  """
  model = Sequential()
  # Time steps whose features are all 0. are treated as padding and skipped.
  model.add(Masking(mask_value=0., input_shape=(MAX_SEQUENCE_LEN, VECTOR_DIM)))
  forward_layer = GRU(units, return_sequences=False, dropout=dropout)
  backward_layer = GRU(units, return_sequences=False, dropout=dropout,
                      go_backwards=True)
  # The input shape is already declared on the Masking layer above, so it is
  # not repeated here.
  model.add(Bidirectional(forward_layer, backward_layer=backward_layer))
  model.add(Dense(NUM_CLASSES))
  model.add(Activation('softmax'))

  # Softmax over NUM_CLASSES with one-hot targets pairs with categorical
  # cross-entropy; binary cross-entropy is meant for independent sigmoid outputs.
  model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
  print('compiled model')
  return model

### All

In [None]:
model_all = build_model()

compiled model


In [None]:
model_all = build_model()
model_all.fit(X_train, Y_train, batch_size=8, epochs=10)

compiled model
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f870d945d90>

In [None]:
model_all.evaluate(X_test, Y_test)



[0.7096795439720154, 0.47058823704719543]

In [None]:
preds_all = model_all.predict(np.asarray(X_test))
# f1_score expects (y_true, y_pred); the arguments were previously swapped.
f1_score(np.argmax(Y_test, axis=1), np.argmax(preds_all, axis=1))



0.5027624309392265

### VNC

In [None]:
model_vnc = build_model()
model_vnc.fit(X_train_vnc, Y_train_vnc, batch_size=8, epochs=10)

compiled model
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f87089e7310>

In [None]:
model_vnc.evaluate(X_test_vnc, Y_test_vnc)



[0.7380333542823792, 0.48603352904319763]

In [None]:
preds_vnc = model_vnc.predict(np.array(X_test_vnc))
# f1_score expects (y_true, y_pred); the arguments were previously swapped.
f1_score(np.argmax(Y_test_vnc, axis=1), np.argmax(preds_vnc, axis=1))



0.5533980582524272

### ANC

In [None]:
model_anc = build_model()
model_anc.fit(X_train_anc, Y_train_anc, batch_size=8, epochs=10)

compiled model
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f870540ef90>

In [None]:
model_anc.evaluate(X_test_anc, Y_test_anc)



[0.6913085579872131, 0.5308641791343689]

In [None]:
preds_anc = model_anc.predict(X_test_anc)
# f1_score expects (y_true, y_pred); the arguments were previously swapped.
f1_score(np.argmax(Y_test_anc, axis=1), np.argmax(preds_anc, axis=1))



0.5189873417721519