Import libraies

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import time



Read train and test data files

In [2]:
def read_file(file):
	File = open(file, "r")
	data = {"id": [], "corpus": [],  "sentence": [],"token" :[], "complexity": []}
	
	File.readline()
	while True:
		line = File.readline()
		if len(line) == 0:
			break

		line = line.split()
		data["id"].append(line[0])
		data["corpus"].append(line[1])

		data["sentence"].append(' '.join(line[2:-2]))
		data["token"].append(line[-2])
		data["complexity"].append(float(line[-1]))
	return pd.DataFrame(data)


In [3]:
train = read_file('train.txt')
# test = read_file('test.txt' )

In [4]:
train.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,"Behold, there came up out of the river seven c...",river,0.0
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,I am a fellow bondservant with you and with yo...,brothers,0.0
2,3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,bible,"The man, the lord of the land, said to us, 'By...",brothers,0.05
3,3BFNCI9LYKQN09BHXHH9CLSX5KP738,bible,Shimei had sixteen sons and six daughters; but...,brothers,0.15
4,3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,bible,"""He has put my brothers far from me.",brothers,0.263889


Data cleaning and Preprocessing Data

In [5]:
import nltk
import string
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [6]:
lemmatizer = WordNetLemmatizer() 
def lemmatize(text):
	lemmas = [lemmatizer.lemmatize(word) for word in word_tokenize(text)] 
	return " ".join(lemmas)

# remove stopwords
def remove_stopwords(text, token):
	words = [w for w in word_tokenize(text) if (w not in stopwords.words('english')) or w in list(word_tokenize(token))]
	return " ".join(words)

In [7]:
def preprocess(data):
	
	data["token"] = data['token'].astype(str)

	# lower case
	data['sentence'] = data['sentence'].apply(lambda x: x.lower())
	data['token'] = data['token'].apply(lambda x: x.lower())

	# remove punctuation
	translator = str.maketrans('', '', string.punctuation)
	data['sentence'] = data['sentence'].apply(lambda x: x.translate(translator))
	data['token'] = data['token'].apply(lambda x: x.translate(translator))

	# remove stopwords only if token != a stopword
	try:
		nltk.data.find('corpora/stopwords')
	except LookupError:
		nltk.download('stopwords')

	data['sentence'] = data.apply(lambda x: remove_stopwords(x['sentence'], x['token']) , axis=1)
	
	# lemmatize
	data['sentence'] = data['sentence'].apply(lambda x: lemmatize(x))
	data['token'] = data['token'].apply(lambda x: lemmatize(x))

	return data

In [8]:
data = preprocess(train)
# test_data = preprocess(test)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
data.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,behold came river seven cattle sleek fat fed m...,river,0.0
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,fellow bondservant brother prophet keep word book,brother,0.0
2,3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,bible,man lord land said u know honest men leave one...,brother,0.05
3,3BFNCI9LYKQN09BHXHH9CLSX5KP738,bible,shimei sixteen son six daughter brother didnt ...,brother,0.15
4,3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,bible,put brother far,brother,0.263889


Creating new Features for data

In [10]:
data['token_length'] = data['token'].str.len()
# test_data['token_length'] = test_data['token'].str.len()
data.head()

Unnamed: 0,id,corpus,sentence,token,complexity,token_length
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,behold came river seven cattle sleek fat fed m...,river,0.0,5
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,fellow bondservant brother prophet keep word book,brother,0.0,7
2,3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,bible,man lord land said u know honest men leave one...,brother,0.05,7
3,3BFNCI9LYKQN09BHXHH9CLSX5KP738,bible,shimei sixteen son six daughter brother didnt ...,brother,0.15,7
4,3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,bible,put brother far,brother,0.263889,7


In [11]:
def syllable_count(word):
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

In [12]:
data['syllables'] = data['token'].apply(lambda x: syllable_count(x) )
# test_data['syllables'] = test_data['token'].apply(lambda x: syllable_count(x) )

In [13]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d embeddings
!rm glove.6B.zip
!rm embeddings/glove.6B.50d.txt
!rm embeddings/glove.6B.100d.txt
!rm embeddings/glove.6B.200d.txt

--2022-12-21 17:45:13--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-12-21 17:45:13--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-12-21 17:45:14--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [14]:
data['token'] = data['token'].astype(str)
data['sentence'] = data['sentence'].astype(str)

# test_data['token'] = test_data['token'].astype(str)
# test_data['sentence'] = test_data['sentence'].astype(str)


Split training set into validation and train dataset.

In [15]:
from sklearn.model_selection import train_test_split

train, validate_set = train_test_split(train, test_size=0.2, random_state=12)
print(len(train))
print(len(validate_set))

6129
1533


In [16]:
train_sentence_list = list(train['sentence'])
train_complexity_list = list(train['complexity'])
train_token_list = list(train['token'])

# test_sentence_list = list(test['sentence'])
# test_complexity_list = list(test['complexity'])
# test_token_list = list(test['token'])

validate_set_sentence_list = list(validate_set['sentence'])
validate_set_complexity_list = list(validate_set['complexity'])
validate_set_token_list = list(validate_set['token'])

print(len(train_token_list))
# print(len(test_sentence_list))
print(len(validate_set_sentence_list))

6129
1533


Create Embedding from Glove Embeddings

In [17]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)

  return word_to_vec_map

word_to_vec_map = read_glove_vector('embeddings/glove.6B.300d.txt')

# get embeddings, and pad till max_len
def get_embeddings(sentences, max_len=0):
    sentence_emb = []
    for s in sentences:

        temp_sent_emb = [ word_to_vec_map[x] if x in word_to_vec_map else np.full((300,), np.nan) for x in s.split() ]
        mean_emb = np.nanmean(np.array(temp_sent_emb), axis=0)

        temp_sent_emb = np.array([ mean_emb if np.isnan(x[0]) else x for x in temp_sent_emb ])
        temp_sent_emb = np.concatenate((temp_sent_emb, np.zeros((max_len-temp_sent_emb.shape[0],300))))

        sentence_emb.append(temp_sent_emb)

    return np.array(sentence_emb)

Create Padding for sentences

In [18]:

# max len for the sentence used for padding
max_len_sent =max(max([len(s.split()) for s in train_sentence_list]),
    max([len(s.split()) for s in validate_set_sentence_list]))
print(max_len_sent)

max_len_token = max(max([len(s.split()) for s in train_token_list]) ,
                    max([len(s.split()) for s in validate_set_token_list])) 
print(max_len_token)


123
1


In [19]:
train_sent_emb = get_embeddings(train_sentence_list, max_len_sent)
# test_sent_emb = get_embeddings(test_sentence_list, max_len_sent)
val_sentence_emb = get_embeddings(validate_set_sentence_list, max_len_sent )

print("Sentence embedding shape train : {}".format(train_sent_emb.shape))
# print("Sentence embedding shape test : {}".format(test_sent_emb.shape))
print("Sentence embedding shape val : {}".format(val_sentence_emb.shape))

Sentence embedding shape train : (6129, 123, 300)
Sentence embedding shape val : (1533, 123, 300)


Create Positional data values from token : Location of token in senetence

In [20]:
train_token_data =   [ 
                        [ 
                            len(s.split(t)[0].split()), 
                            len(t.split())
                        ]
                        for s,t in zip(train_sentence_list, train_token_list) 
                    ]

# test_token_data =   [ 
#                         [ 
#                             len(s.split(t)[0].split()), 
#                             len(t.split())
#                         ]
#                         for s,t in zip(test_sentence_list, test_token_list) 
#                     ]

validate_token_data =   [ 
                        [ 
                            len(s.split(t)[0].split()), 
                            len(t.split())
                        ]
                        for s,t in zip(validate_set_sentence_list, validate_set_token_list) 
                    ]



In [21]:
features_train = train[['token_length', 'syllables'
                        ]].values

# features_test = test[['token_length', 'syllables',
#                         ]].values
features_validate_set = validate_set[['token_length', 'syllables',
                        ]].values

print(len(features_train))

6129


In [22]:
class FFN(tf.keras.layers.Layer):
    def __init__(
        self, 
        hidden_size,
        output_size,
        rate,
        softmax=False,
    ):
        super(FFN, self).__init__()

        self.layer1 = tf.keras.layers.Dense(hidden_size, activation="relu")  # (batch_size, hidden_size)
        self.dropout = tf.keras.layers.Dropout(rate)
        self.layer2 = tf.keras.layers.Dense(output_size, activation= "softmax" if softmax is True else None)  # (batch_size, output_size)

    def call(self, x, training):
        return self.layer2(self.dropout(self.layer1(x), training=training) ) 


In [23]:
class OurModelBiLSTM(tf.keras.Model):
    def __init__(
        self,
        lstm_units,
        hidden_size,
        random_seed,
        seq_len,
        embedding_size,
        rate=0.25
        ):
        """
        hidden_size - for FFN
        """

        super(OurModelBiLSTM, self).__init__()

        tf.random.set_seed(random_seed)
        self.random_seed = random_seed
        self.lstm_units = lstm_units

        self.bilstm = tf.keras.layers.Bidirectional(
                tf.keras.layers.LSTM(self.lstm_units, return_sequences=True), input_shape=(seq_len,embedding_size)
            )
        self.dropout = tf.keras.layers.Dropout(rate=rate)

        self.layernorm = tf.keras.layers.LayerNormalization()
        self.dense = tf.keras.layers.Dense(2*self.lstm_units)
        self.getWeights = FFN(hidden_size, 3, rate, softmax=True)
        self.final_layer = FFN(hidden_size, 1, rate)

    
    # one fwd pass on the model
    def call(self, input_seq, token_position, feature_emb, training=False):
        """
        input_seq : [batch_size, seq_len, embedding_size]
        token_position = [batch_size, 2] - [start,length] for tokens for which complexity is to be predicted
        """

        # bi-lstm pass
        bilstm_output = self.bilstm(input_seq)

        # bilstm_output = self.bilstm(input_seq)      #(batch_size, seq_len, 2*lstm_units)
        bilstm_output = self.dropout(bilstm_output, training=training)

        # extract token embeddings    
        # for each example we have start token and length
        # we take mean of these embeddings position

        token_emb = tf.stack(
                [tf.reduce_mean(tf.gather(i, tf.range(j[0],j[0]+j[1]), axis=0), axis=0) for i,j in zip(bilstm_output,token_position) ]
            )   # (batch_size, 2*lstm_units)

        # combine all embeddings - take mean
        added_emb = tf.reduce_mean(bilstm_output, axis=1)    # (batch_size, 2*lstm_units)
        
        # feature_emb - (batch_shape, features)
        feature_emb = self.dense(feature_emb)           # (batch_size, 2*lstm_units)
        # feature_emb = tf.cast(feature_emb, tf.float32)

        # get weights
        # weights = self.getWeights(self.layernorm(tf.add_n([token_emb,added_emb, feature_emb])), training)    # (batch_size, 3)
        weights = self.getWeights(self.layernorm(tf.concat([token_emb,added_emb, feature_emb], axis=1)), training)    # (batch_size, 3)

        # generate attenton-based final embeddings
        # final_emb = weights[0]*token_emb + weights[1]*added_emb + weights[3]*features_emb

        final_emb = tf.zeros(shape=(), dtype=tf.dtypes.float32)   # (bacth_size, 2*lstm_units)
        final_emb += tf.expand_dims(weights[:, 0], axis=1) * token_emb
        final_emb += tf.expand_dims(weights[:, 1], axis=1) * added_emb
        final_emb += tf.expand_dims(weights[:, 2], axis=1) * feature_emb

        # output complexity
        final_output = self.final_layer(final_emb, training)

        return final_output

    # loss function
    def loss_function(self, real, pred):
        loss_ = tf.keras.losses.MSE(real, pred)
        l = tf.reduce_mean(loss_)
        return l

    # set optimizer
    def compile(self, optimizer):
        super(OurModelBiLSTM, self).compile()
        self.optimizer = optimizer


    # train step - fwd pass + back prop + update model weights
    # @tf.function() - may not work beacuse zip is used
    def train_step(self, input_seq, token_position, y, feature_emb):
        training=True
        with tf.GradientTape() as tape:
            # Forward pass
            y_pred = self(input_seq, token_position, feature_emb, training)  
            loss = self.loss_function(tf.reshape(y,(-1,1)), y_pred)

        # calculate gradients
        gradients = tape.gradient(loss, self.trainable_variables)

        # update model weights using gradients
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        return loss

    # fit funtion of model
    def _fit(
        self, 
        input_seq_data, 
        token_position_data, 
        feature_emb_data,
        complexity, 
        epochs, 
        batch_size,
        val_input_seq_data=None,
        val_token_position_data=None,
        val_feature_emb_data=None,
        val_complexity=None
        ):

        # create tensorflow dataset
        tf_data = tf.data.Dataset.from_tensor_slices((input_seq_data, token_position_data, feature_emb_data, complexity))

        # shuffle and batch
        tf_data = tf_data.shuffle(100000, seed=self.random_seed).batch(batch_size)

        train_loss = tf.keras.metrics.Mean(name="train_loss")
        loss_train = {"MSE": []}
        loss_val = {"MSE": []}

        #training starts
        for epoch in range(epochs):
            start = time.time()
            
            # reset state for mean loss
            train_loss.reset_states()

            # loop over batches
            for step, x in tf_data.enumerate():

                # get inputs for model
                input_seq = x[0]
                token_position = x[1]
                feature_emb = x[2]
                y = x[3]

                # call trainstep
                loss = self.train_step(input_seq, token_position, y, feature_emb)
                train_loss(loss)

            loss_train["MSE"].append(train_loss.result().numpy())
            print("Epoch {} loss  MSE: {}, time taken: {:.2f}s".format(epoch + 1, loss_train["MSE"][-1], time.time() - start))

            # validation if provided
            if (val_input_seq_data is not None):
                val_pred = self._predict(val_input_seq_data, val_token_position_data, val_feature_emb_data)
                loss_val["MSE"].append(self.loss_function(tf.reshape(val_complexity, (-1,1)), val_pred).numpy())
                print("Validation loss MSE : {}".format(loss_val["MSE"][-1]))


        if (val_input_seq_data is not None):
            return loss_train, loss_val
        return loss_train

    
    # predict function
    def _predict(self, input_seq_data, token_position_data, feature_emb_data):
        
        # create tensorflow dataset
        tf_data = tf.data.Dataset.from_tensor_slices((input_seq_data, token_position_data, feature_emb_data))

        # batch for memory constraints
        tf_data = tf_data.batch(512)

        pred_list = []
        for step, x in tf_data.enumerate():
            # get inputs for model
            input_seq = x[0]
            token_position = x[1]
            feature_emb = x[2]

            pred_list.append(self(input_seq, token_position, feature_emb))
        return tf.concat(pred_list, axis=0)
     

In [24]:
lstm_units=20
hidden_size=10
random_seed=12
seq_len=max_len_sent
embedding_size=300
rate=0.4
epochs=20
batch_size=32
model = OurModelBiLSTM(lstm_units=lstm_units, hidden_size=hidden_size, random_seed=random_seed, seq_len=seq_len, embedding_size=embedding_size, rate=rate)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005))

loss = model._fit ( train_sent_emb, 
                    train_token_data,
                    feature_emb_data = features_train,
                    complexity = train_complexity_list,
                    epochs=epochs, 
                    batch_size=batch_size,
                    val_input_seq_data=val_sentence_emb,
                    val_token_position_data = validate_token_data,
                    val_feature_emb_data = features_validate_set,
                    val_complexity = validate_set_complexity_list
                )
     

Epoch 1 loss  MSE: 0.023003390058875084, time taken: 38.14s
Validation loss MSE : 0.01719142682850361
Epoch 2 loss  MSE: 0.013761792331933975, time taken: 31.76s
Validation loss MSE : 0.009958258830010891
Epoch 3 loss  MSE: 0.011445410549640656, time taken: 31.55s
Validation loss MSE : 0.008967380039393902
Epoch 4 loss  MSE: 0.01027032732963562, time taken: 31.32s
Validation loss MSE : 0.009060914628207684
Epoch 5 loss  MSE: 0.009315293282270432, time taken: 31.85s
Validation loss MSE : 0.009455887600779533
Epoch 6 loss  MSE: 0.009084396064281464, time taken: 31.37s
Validation loss MSE : 0.007851263508200645
Epoch 7 loss  MSE: 0.008754400536417961, time taken: 35.38s
Validation loss MSE : 0.00789842288941145
Epoch 8 loss  MSE: 0.008407429791986942, time taken: 31.22s
Validation loss MSE : 0.008765201084315777
Epoch 9 loss  MSE: 0.008304108865559101, time taken: 31.48s
Validation loss MSE : 0.008296521380543709
Epoch 10 loss  MSE: 0.007832781411707401, time taken: 30.37s
Validation loss

In [25]:
# pred = model._predict(test_sent_emb, test_token_data, features_test)


In [26]:
# from sklearn.metrics import mean_absolute_error

# mae_score = mean_absolute_error(test_complexity_list, pred)
# mae_score

In [27]:
model.summary()

Model: "our_model_bi_lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  multiple                 51360     
 l)                                                              
                                                                 
 dropout (Dropout)           multiple                  0         
                                                                 
 layer_normalization (LayerN  multiple                 240       
 ormalization)                                                   
                                                                 
 dense (Dense)               multiple                  120       
                                                                 
 ffn (FFN)                   multiple                  1243      
                                                                 
 ffn_1 (FFN)                 multiple            

In [28]:
model.save_weights('./my_modal')