In [1]:
import pyconll
import numpy as np

### Functions

In [2]:
def load_word2vecModel(word2vec_file):
    """
    Load a pretrained word2vec model and return it as variable.
    
    Args:
        word2vec_file: text filename (with full path) containing the word2vec model
   
    Returns:
        w2v_model: variable containing the loaded model    
    """
    print("Loading word2vec model...")
    import os

    w2v_model = {}
    with open(word2vec_file) as f:
        for line in f:
            word, wordVector = line.split(maxsplit=1)
            wordVector = np.fromstring(wordVector, 'f', sep=' ')
            w2v_model[word] = wordVector
    
    print("Done.")        
    return w2v_model

In [3]:
def generate_extra_embedding_vecs(EMBEDDING_DIM,seed_state=42):
    """
    Generate random embedding vectors for the tags 'EOS' (end of sentence),
    'PAD' (for zero-padding) and 'OOV' (out of vocabulary).
    
    Args:
        EMBEDDING_DIM (int)
        (seed_state) (int): for the random generator, to have predictable values [default: 42]
        
    Returns:
        rand_embed_vecs (list of 3 elements, each one a random vector for the tags: [oov, eos, pad])
    """
    
    np.random.seed(seed_state)
    oov_vec = np.random.normal(size=EMBEDDING_DIM)
    #oov_vec.shape

    eos_vec = np.random.normal(size=EMBEDDING_DIM)
    #eos_vec.shape

    pad_vec = np.random.normal(size=EMBEDDING_DIM)
    #pad_vec.shape
    
    return [oov_vec, eos_vec, pad_vec]


In [4]:
def tag_encoding_dictionary(conllu_file):
    """
    Create dictionary for encoding all the unique tags found in the conllu-formatted file into integers

    Args:
        conllu_file: filename (with whole path) of conllu-formatted file that we use

    Returns:
        tag_dict: tag dictionary used for encoding
    """  

      #get unique list of tags from file, bash (didn't manage from parser Pyconll),
    import subprocess

    tag_list = subprocess.check_output("awk '{ print $4 }' " +  conllu_file + " | sort | uniq", shell=True)
    tag_list = tag_list.decode().splitlines()
    tag_list[:] = [x for x in tag_list if x] #remove empty strings

    tag_dict = {}
    for int_code, tag in enumerate(tag_list):
        tag_dict[tag] = int_code

        tag_dict['EOS'] = int_code+1
        tag_dict['PAD'] = int_code+2

    return tag_dict

In [5]:
def word2vec_data_encoding(data,word2vec_model,MAX_SEQUENCE_LEN, EMBEDDING_DIM, extra_embeddings):
    """
    Encode input data into: 1) arrays of word2vec embeddings and 2) corresponding labels ('tags')
    
    Args:
        data (PyConll object, with N sentences): the data to be encoded
        (https://pyconll.readthedocs.io/en/stable/index.html)
        word2vec_model (dictionary, keys=words, values: embeddings): pretrained word2vec model used in encoding
        MAX_SEQUENNCE_LEN (int): maximum length of sentence (of training data)
        EMBEDDING_DIM (int): length of the embedding vectors of word2vec_model
        extra_embeddings (list of 3 embedding vectors): this list corresponds to embeddings of 
       [OOV, EOS, PAD], respectively.
        
    Returns:
        sentences_X (numpy array of size (N x MAX_SEQUENCE_LEN x EMBEDDING_DIM): input data for the classifier
        tags_y (numpy array of size (N x MAX_SEQUENCE_LEN): output data (labels) for the classifier
    """
    try:
        len(word2vec_model['the']) == EMBEDDING_DIM
    except:
        raise Exception('Error: mismatch between EMBEDDING_DIM and dimension of word2vec_model vectors')
    print("Encoding data into embeddings...")
    
    oov_vec= extra_embeddings[0]
    eos_vec= extra_embeddings[1]
    pad_vec= extra_embeddings[2]
    
    N = len(data)
    
    sentences_X = np.empty((N, MAX_SEQUENCE_LEN,EMBEDDING_DIM))
    tags_y = np.empty((N, MAX_SEQUENCE_LEN))

    for idx_sentence,sentence in enumerate(data):
        #print(sentence)

        idx_eos = len(sentence)
        for idx_word, token in enumerate(sentence):
            token_fixed = token.form.lower()
        
            if token_fixed in word2vec_model:
                #print('in:')
                #print(token_fixed)
                sentences_X[idx_sentence,idx_word,:] = word2vec_model[token_fixed]
                tags_y[idx_sentence,idx_word] = tag_dict[token.upos]
            else:
                #print('OOV:')
                #print(token_fixed)
                sentences_X[idx_sentence,idx_word,:] = oov_vec
                tags_y[idx_sentence,idx_word] = tag_dict[token.upos]

        #print('EOS')
        sentences_X[idx_sentence,idx_eos] = eos_vec
        tags_y[idx_sentence,idx_eos] = tag_dict['EOS']

        #add zero-padding if necessary
        if idx_eos < MAX_SEQUENCE_LEN:
            sentences_X[(idx_sentence,range(idx_eos+1,MAX_SEQUENCE_LEN))]= pad_vec
            tags_y[(idx_sentence,range(idx_eos+1,MAX_SEQUENCE_LEN))] = tag_dict['PAD']
    print("Done.")
    return [sentences_X, tags_y]

## Load conllu-formatted data and pre-trained word2vec model

Load pre-trained model for the embeddings

In [6]:
word2vec_file = 'data/glove.6B/glove.6B.100d.txt'
word2vec_model = load_word2vecModel(word2vec_file)

Loading word2vec model...
Done.


In [7]:
 EMBEDDING_DIM = len(word2vec_model['the']) # 100-dimensional word embedding vectors

Load data (train, validation and test)

In [8]:
conllu_train_file = 'data/ud-1.2/en/en-ud-train.conllu'
conllu_val_file = 'data/ud-1.2/en/en-ud-dev.conllu'
conllu_test_file = 'data/ud-1.2/en/en-ud-test.conllu'

data_train = pyconll.load_from_file(conllu_train_file)
data_val = pyconll.load_from_file(conllu_val_file)
data_test = pyconll.load_from_file(conllu_test_file)

In [9]:
MAX_SEQUENCE_LEN = len(max(data_train, key=len)) +1 #since we add an EOS 

Create a dictionary to encode tags into integers

In [10]:
tag_dict = tag_encoding_dictionary(conllu_train_file)

Generate random vectors as embeddings for tags 'EOS', 'PAD', and 'OOV'

In [11]:
#list with elements: [oov_vec, eos_vec, pad_vec]
extra_embeddings = generate_extra_embedding_vecs(EMBEDDING_DIM)

Encode conllu-formatted data into embeddings

In [12]:
[X_train, y_train] = word2vec_data_encoding(data_train,word2vec_model,MAX_SEQUENCE_LEN, EMBEDDING_DIM, extra_embeddings)

Encoding data into embeddings...
Done.


In [13]:
[X_val, y_val] = word2vec_data_encoding(data_val,word2vec_model,MAX_SEQUENCE_LEN, EMBEDDING_DIM, extra_embeddings)

Encoding data into embeddings...
Done.


In [14]:
[X_test, y_test] = word2vec_data_encoding(data_test,word2vec_model,MAX_SEQUENCE_LEN, EMBEDDING_DIM, extra_embeddings)

Encoding data into embeddings...
Done.


## POS-tag model (feat extraction (encoder) + classification (decoder))

In [15]:
from keras.models import Sequential
from keras.layers import InputLayer
from keras.layers import LSTM
from keras.layers import Dense
from keras.regularizers import L1L2

Using TensorFlow backend.


Model definition:

In [16]:
model = Sequential()

hidden_units = 50
num_tags = len(tag_dict)
model.add(InputLayer(input_shape=(MAX_SEQUENCE_LEN, EMBEDDING_DIM)))
model.add(LSTM(hidden_units, return_sequences=True))
model.add(Dense(num_tags, activation='softmax')) # Dense can handle 3D input too
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 160, 50)           30200     
_________________________________________________________________
dense (Dense)                (None, 160, 19)           969       
Total params: 31,169
Trainable params: 31,169
Non-trainable params: 0
_________________________________________________________________


Model training:

In [17]:
#training settings
BATCH_SIZE = 128
EPOCHS = 20

In [18]:
from keras.utils import np_utils

#model.fit(X_train, np_utils.to_categorical(y_train,num_tags), batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(X_val, y_val))
model.fit(X_train, np_utils.to_categorical(y_train,num_tags), batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2)
 

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc4904cbdd8>

Evaluation:

In [19]:
scores = model.evaluate(X_test, np_utils.to_categorical(y_test,num_tags))



In [20]:
#print(f"{model.metrics_names[0]}: {scores[0] * 100}") #loss
print("Test set - " f"{model.metrics_names[1]}: {scores[1] * 100}") #accuracy

Test set - accuracy: 93.00944805145264


In [None]:
#TODOs
#1. Do the training using the given validation data (now just training data is split).
#2. Include GPU option, to train the keras model using the GPU