In [1]:
!pip install tensorflow_text
!pip install transformers

Collecting tensorflow_text
  Downloading tensorflow_text-2.8.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 26.3 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 71.5 MB/s 
Installing collected packages: tf-estimator-nightly, tensorflow-text
Successfully installed tensorflow-text-2.8.1 tf-estimator-nightly-2.8.0.dev2021122109
Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 27.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 60.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K  

In [2]:
!unzip data.zip

Archive:  data.zip
   creating: data/
   creating: data/Development/
  inflating: data/Development/dev_definitions.txt  
  inflating: data/Development/dev_examples.txt  
  inflating: data/Development/dev_hypernyms.txt  
  inflating: data/Development/dev_labels.txt  
  inflating: data/README.txt         
   creating: data/Test/
  inflating: data/Test/test_definitions.txt  
  inflating: data/Test/test_examples.txt  
  inflating: data/Test/test_hypernyms.txt  
   creating: data/Training/
  inflating: data/Training/train_definitions.txt  
  inflating: data/Training/train_examples.txt  
  inflating: data/Training/train_hypernyms.txt  
  inflating: data/Training/train_labels.txt  


In [3]:
import re
import os
import errno
import pandas as pd

!rm -rf processed_data

try:
    os.makedirs("processed_data")
except OSError as e:
    if e.errno != errno.EEXIST:
        raise

def decontracted(phrase):
    """Expand common English contractions and strip ``phrase`` down to letters.

    The rewrite rules run in a fixed order:

    1. whole-word forms (won't, can't, gonna, wanna);
    2. generic apostrophe suffixes (n't, 're, 's, 'd, 'll, 't, 've, 'm);
       every apostrophe-s becomes " is", including possessives;
    3. runs of dots become one space, every character that is not an ASCII
       letter or ``$`` becomes a space, and runs of spaces collapse to one.

    Leading or trailing single spaces are not trimmed.

    Args:
        phrase: Text to normalise.

    Returns:
        The normalised string.
    """
    # Order matters: the irregular whole words must be handled before the
    # generic "n't" rule, and "n't" before the bare "'t" rule.
    rewrite_rules = (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"gonna", "going to"),
        (r"wanna", "want to"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        (r"\.+", " "),
        (r"[^A-Za-z$]", " "),
        (r" +", " "),
    )

    for pattern, replacement in rewrite_rules:
        phrase = re.sub(pattern, replacement, phrase)

    return phrase


def _clean_keeping_target(text, target):
    """Apply ``decontracted`` to ``text`` while leaving ``target`` occurrences intact.

    When cleaning would alter the target itself (it contains characters other
    than letters), the text is split on the literal target, each piece is
    cleaned on its own, and the pieces are re-joined with the original target.
    The target is treated as a plain string, never as a regular expression.
    """
    if target == decontracted(target):
        # Cleaning cannot damage this target, so clean the text in one go.
        return decontracted(text)
    return target.join(decontracted(piece) for piece in text.split(target))


def get_df(pref, testing=False):
    """Assemble one data split into a DataFrame.

    Reads ``<pref>_examples.txt``, ``<pref>_hypernyms.txt`` and
    ``<pref>_definitions.txt`` line by line; line *i* of each file describes
    row *i*. Unless ``testing`` is true, ``<pref>_labels.txt`` is read as well.

    Processing per file:

    * examples: lower-cased, split on tabs into target / position / sentence;
      the sentence is cleaned with ``decontracted`` but occurrences of the
      target word are kept verbatim.
    * hypernyms: lower-cased, tab-separated entries joined with " ; ",
      underscores replaced by spaces.
    * definitions: lower-cased, split on ";", each part stripped and cleaned
      like the sentence, then re-joined with " ; ".
    * labels: taken as-is (minus the line terminator).

    Args:
        pref: Path prefix of the split, e.g. ``"data/Training/train"``.
        testing: When true, no labels file is read and the frame has no
            ``label`` column.

    Returns:
        A ``pandas.DataFrame`` with columns ``target``, ``position``,
        ``sentence``, ``hypernym``, ``definition`` and, unless ``testing``,
        ``label``.
    """
    dataset = []
    with open(pref + "_" + "examples.txt") as examples:
        for raw in examples:
            # rstrip("\n") rather than [:-1]: the final line of a file may
            # lack a newline, and slicing would then drop a real character.
            example = raw.rstrip("\n").lower().split("\t")
            example[2] = _clean_keeping_target(example[2], example[0])
            dataset.append(example)

    with open(pref + "_" + "hypernyms.txt") as hypernyms:
        for index, raw in enumerate(hypernyms):
            joined = " ; ".join(raw.rstrip("\n").lower().split("\t"))
            dataset[index].append(joined.replace("_", " "))

    with open(pref + "_" + "definitions.txt") as definitions:
        for index, raw in enumerate(definitions):
            target = dataset[index][0]
            parts = [
                _clean_keeping_target(part.strip(), target)
                for part in raw.rstrip("\n").lower().split(";")
            ]
            dataset[index].append(" ; ".join(parts))

    cols = ["target", "position", "sentence", "hypernym", "definition"]

    if not testing:
        cols.append("label")
        with open(pref + "_" + "labels.txt") as labels:
            for index, raw in enumerate(labels):
                dataset[index].append(raw.rstrip("\n"))

    return pd.DataFrame(dataset, columns=cols)


# Parse every split; the test archive ships no labels file, hence testing=True.
dev_df = get_df("data/Development/dev")
train_df = get_df("data/Training/train")
test_df = get_df("data/Test/test", testing=True)

# Write each cleaned split to CSV without the DataFrame index.
for split_df, csv_path in (
    (dev_df, "processed_data/dev.csv"),
    (train_df, "processed_data/train.csv"),
    (test_df, "processed_data/test.csv"),
):
    split_df.to_csv(csv_path, index=None)

In [4]:
import tensorflow_text
import tensorflow as tf
import tensorflow_hub as hub

import numpy as np
import pandas as pd

from keras import backend as K

import transformers
from transformers import BertTokenizer
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [5]:
# TF Hub handle of the BERT encoder that is wrapped as a Keras layer below.
BERT_ENCODER_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"

bert_encoder = hub.KerasLayer(BERT_ENCODER_HANDLE)

In [85]:
# Name of the pretrained tokenizer checkpoint to load.
TOKENIZER_CHECKPOINT = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# From here on, have the transformers library log errors only.
transformers.logging.set_verbosity_error()

def extract_data(filename, max_length=128):
    """Turn a processed CSV into model inputs, span masks and labels.

    For every row the sentence is tokenised as the first segment and the
    description (definition, followed by the hypernyms when present, joined
    with " ; ") as the second segment, padded/truncated to ``max_length``.

    Two float masks are built per row from the positions of the separator
    token (``tokenizer.sep_token_id``):

    * ``mask1`` is 1.0 on positions ``[0, first_sep)``;
    * ``mask2`` is 1.0 on positions ``[first_sep, second_sep)``.

    NOTE(review): ``mask1`` therefore includes position 0 (presumably the
    [CLS] token) and ``mask2`` includes the first separator itself. Kept as in
    the original code; confirm this is intended.

    Args:
        filename: Path of a CSV with ``sentence``, ``hypernym``,
            ``definition`` and ``label`` columns.
        max_length: Padded sequence length. Must match the sequence length the
            model inputs are declared with (128 by default).

    Returns:
        ``(X, Y)`` where ``X`` maps ``input_type_ids``, ``input_word_ids`` and
        ``input_mask`` to int32 arrays and ``mask1`` / ``mask2`` to float32
        arrays, and ``Y`` is an array with 1.0 for label ``'T'`` and 0.0
        otherwise.
    """
    df = pd.read_csv(filename)

    # Resolve the separator id from the tokenizer instead of hard-coding 102.
    sep_id = tokenizer.sep_token_id

    type_ids, word_ids, attention = [], [], []
    mask1_rows, mask2_rows = [], []
    labels = []

    for _, row in df.iterrows():
        # Empty CSV cells are read back as NaN (a float); only real strings
        # may take part in the description.
        desc = ' ; '.join(
            part for part in (row['definition'], row['hypernym'])
            if isinstance(part, str)
        )

        encoded = tokenizer(row['sentence'], desc, max_length=max_length,
                            padding='max_length', truncation=True)
        ids = encoded['input_ids']

        sep1 = ids.index(sep_id)
        sep2 = ids.index(sep_id, sep1 + 1)

        mask1 = np.zeros(max_length, dtype=np.float32)
        mask1[:sep1] = 1.0
        mask2 = np.zeros(max_length, dtype=np.float32)
        mask2[sep1:sep2] = 1.0

        type_ids.append(encoded['token_type_ids'])
        word_ids.append(ids)
        attention.append(encoded['attention_mask'])
        mask1_rows.append(mask1)
        mask2_rows.append(mask2)

        labels.append(1.0 if row['label'] == 'T' else 0.0)

    # Ids and attention masks are integers; store them as int32 to match the
    # dtype of the model's Input layers rather than as floats.
    X = {
        'input_type_ids': np.array(type_ids, dtype=np.int32),
        'input_word_ids': np.array(word_ids, dtype=np.int32),
        'input_mask': np.array(attention, dtype=np.int32),
        'mask1': np.array(mask1_rows, dtype=np.float32),
        'mask2': np.array(mask2_rows, dtype=np.float32),
    }
    Y = np.array(labels)

    return X, Y

# Tokenise the training split first, then the development split.
(X, Y), (Xval, Yval) = [
    extract_data(csv_path)
    for csv_path in ('processed_data/train.csv', 'processed_data/dev.csv')
]

In [91]:
# Sequence length of every model input; it has to equal the padded length that
# extract_data produces (max_length=128 with padding='max_length').
MAX_SEQ_LEN = 128

# Integer inputs consumed by the BERT encoder.
input_type_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="input_type_ids")
input_word_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="input_mask")

# Float span masks used for pooling. The shape is written as a one-element
# tuple: the previous `(128)` was a bare int, not a tuple.
input_mask1 = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.float32, name="input_mask1")
input_mask2 = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.float32, name="input_mask2")

# The encoder returns a per-example vector and a per-token sequence of vectors.
pooled_output, sequence_output = bert_encoder([input_word_ids, input_mask, input_type_ids])

def Custom(tensors):
    """Mean-pool a sequence of vectors over two masked spans.

    Args:
        tensors: Sequence ``[tensor, mask1, mask2]``. ``tensor`` is indexed as
            (batch, position, feature); each mask is (batch, position) with
            1.0 on the positions to average over and 0.0 elsewhere.

    Returns:
        A pair ``(pooled1, pooled2)`` of (batch, feature) tensors: the average
        of ``tensor`` over the positions selected by ``mask1`` and by
        ``mask2`` respectively. A row whose mask is entirely zero yields a
        zero vector instead of NaN.
    """
    tensor, mask1, mask2 = tensors[0], tensors[1], tensors[2]

    def masked_mean(mask):
        # Zero out unselected positions, then sum over the position axis.
        selected_sum = tf.reduce_sum(tensor * tf.expand_dims(mask, axis=2), axis=1)
        # Number of selected positions per row, kept as (batch, 1) to broadcast.
        selected_count = tf.reduce_sum(mask, axis=1, keepdims=True)
        # divide_no_nan returns 0 where the count is 0, so an empty mask
        # cannot inject NaNs into the loss.
        return tf.math.divide_no_nan(selected_sum, selected_count)

    return masked_mean(mask1), masked_mean(mask2)

# Average the encoder's per-token outputs over each of the two masked spans.
seq1, seq2 = tf.keras.layers.Lambda(Custom)([sequence_output, input_mask1, input_mask2])

# Classification head: both span averages plus the pooled output feed a single
# sigmoid unit.
x = tf.keras.layers.Concatenate()([seq1, seq2, pooled_output])
x = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)

model_inputs = [
    input_word_ids,
    input_mask,
    input_type_ids,
    input_mask1,
    input_mask2,
]
model = tf.keras.models.Model(inputs=model_inputs, outputs=x)

model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(),
    metrics=["accuracy"],
)
model.summary()

Model: "model_25"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 128, 768)]                'input_mask[0][0]',      

In [92]:
# Feature arrays listed in the same order as the model's inputs.
feature_keys = ['input_word_ids', 'input_mask', 'input_type_ids', 'mask1', 'mask2']
train_inputs = [X[key] for key in feature_keys]
val_inputs = [Xval[key] for key in feature_keys]

model.fit(
    train_inputs, Y,
    validation_data=(val_inputs, Yval),
    epochs=20,
    batch_size=16,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f75dbc52dd0>