In [1]:
pip install pyconll

Collecting pyconll
  Downloading https://files.pythonhosted.org/packages/2c/6e/c325d0db05ac1b8d45645de903e4ba691d419e861c915c3d4ebfcaf8ac25/pyconll-2.2.1-py3-none-any.whl
Installing collected packages: pyconll
Successfully installed pyconll-2.2.1


In [3]:
import gensim
import pyconll
import numpy as np

In [6]:
ls

en-ud-dev.conllu   en-ud-train.conllu  [0m[01;34msample_data[0m/
en-ud-test.conllu  glove.6B.100d.txt


### Pre-trained embedding model

In [45]:
def load_word2vecModel(word2vec_file):
  """
  Load a pretrained word2vec model and return it as variable.

  Args:
    word2vec_file: text filename (with full path) containing the word2vec model
   
  Returns:
    w2v_model: variable containing the loaded model    
  """
  
  print("Loading word2vec model...")
  import os

  w2v_model = {}
  with open(word2vec_file) as f:
      for line in f:
          word, wordVector = line.split(maxsplit=1)
          wordVector = np.fromstring(wordVector, 'f', sep=' ')
          w2v_model[word] = wordVector

  return w2v_model

In [47]:
w2v_file = 'glove.6B.100d.txt'
w2v_model = load_word2vecModel(w2v_file)

Loading word2vec model (readfile)...




In [48]:
 EMBEDDING_DIM = len(w2v_model['the']) # 100-dimensional word embedding vectors

In [None]:
EMBEDDING_DIM

100

In [13]:
#get embedding keras layer from model (with Gensim)
#embedding_layer = pretrained_model.wv.get_keras_embedding(train_embeddings=False)
#embedding_layer

## Data

In [14]:
#use file copied to current folder
conllu_train_file = 'en-ud-train.conllu'
train = pyconll.load_from_file(conllu_train_file)

#https://pyconll.readthedocs.io/en/stable/starting.html

In [None]:
#create OOV and EOS random embedding vectors
# also for padding
oov_vec = np.random.normal(size=EMBEDDING_DIM)
#oov_vec.shape

eos_vec = np.random.normal(size=EMBEDDING_DIM)
eos_vec.shape

pad_vec = np.random.normal(size=EMBEDDING_DIM)
pad_vec.shape

(100,)

In [15]:
conllu_train_file

'en-ud-train.conllu'

In [2]:
"awk '{ print $4 }' en-ud-train.conllu | sort | uniq"

"awk '{ print $4 }' en-ud-train.conllu | sort | uniq"

In [22]:
"awk '{ print $4 }' " +  conllu_train_file + " | sort | uniq"

"awk '{ print $4 }' en-ud-train.conllu | sort | uniq"

In [23]:
import subprocess
tag_lissubprocess.check_output("awk '{ print $4 }' " +  conllu_train_file + " | sort | uniq", shell=True)

b'\nADJ\nADP\nADV\nAUX\nCONJ\nDET\nINTJ\nNOUN\nNUM\nPART\nPRON\nPROPN\nPUNCT\nSCONJ\nSYM\nVERB\nX\n'

In [26]:
def tag_encoding_dictionary(conllu_file):
  """
  Create dictionary for encoding all the unique tags found in the conllu file into integers

  Args:
    conllu_file: filename (with whole path) at we use

  Returns:
    tag_dict: tag dictionary used for encoding
  """  

  #get unique list of tags from file, bash (didn't manage from parser Pyconll),
  import subprocess

  tag_list = subprocess.check_output("awk '{ print $4 }' " +  conllu_file + " | sort | uniq", shell=True)
  tag_list = tag_list.decode().splitlines()
  tag_list[:] = [x for x in tag_list if x] #remove empty strings

  tag_dict = {}
  for int_code, tag in enumerate(tag_list):
    tag_dict[tag] = int_code
  
  tag_dict['EOS'] = int_code+1
  tag_dict['PAD'] = int_code+2
  
  return tag_dict

In [27]:
try_tag_dict = tag_encoding_dictionary(conllu_train_file)

In [28]:
try_tag_dict

{'ADJ': 0,
 'ADP': 1,
 'ADV': 2,
 'AUX': 3,
 'CONJ': 4,
 'DET': 5,
 'EOS': 17,
 'INTJ': 6,
 'NOUN': 7,
 'NUM': 8,
 'PAD': 18,
 'PART': 9,
 'PRON': 10,
 'PROPN': 11,
 'PUNCT': 12,
 'SCONJ': 13,
 'SYM': 14,
 'VERB': 15,
 'X': 16}

In [None]:
#convert pyconll object to list of sentences, which in turn are list of words (strings)

dataset_train = []
tagset_train = []
for sentence_conll in train:
  sentence_set = []
  sntnc_tag_set = []
  max_idx=len(sentence_conll)-1
  for idx,token_conll in enumerate(sentence_conll):
    sentence_set.append(token_conll.form.lower())
    sntnc_tag_set.append(tag_dict[token.upos])

  dataset_train.append(sentence_set)
  tagset_train.append(sntnc_tag_set)

NameError: ignored

In [None]:
dataset_train

['[',
 'this',
 'killing',
 'of',
 'a',
 'respected',
 'cleric',
 'will',
 'be',
 'causing',
 'us',
 'trouble',
 'for',
 'years',
 'to',
 'come',
 '.',
 ']']

In [None]:
w2v_model['abdullah'].shape

(100,)

In [None]:
#initializing

MAX_SEQUENCE_LEN = len(max(train, key=len)) +1 #since we add an EOS 
N_train = len(train)
BATCH_SIZE = 32 #batch size is a portion of N_train
sentences_train = np.empty((N_train, MAX_SEQUENCE_LEN,EMBEDDING_DIM))
tags_train = np.empty((N_train, MAX_SEQUENCE_LEN))
#input dimension for LSTM: (BATCH_SIZE x MAX_SEQUENCE_LEN x EMBEDDING_DIM)

for idx_sentence,sentence in enumerate(train):
  #print(sentence)

  idx_eos = len(sentence)
  for idx_word, token in enumerate(sentence):
    token_fixed = token.form.lower()
    #if token.form in w2v_model.vocab:
    if token_fixed in w2v_model:
      #print(word)  
      print('in:')
      #print(idx_word)
      print(token_fixed)
      sentences_train[idx_sentence,idx_word,:] = w2v_model[token_fixed] #for Gensim: w2v_model.wv[token_fixed]
      tags_train[idx_sentence,idx_word] = tag_dict[token.upos]
      #print(w2v_model[token_fixed].shape)
    else:
      print('OOV:')
      #print(idx_word)
      print(token_fixed)
      sentences_train[idx_sentence,idx_word,:] = oov_vec
      tags_train[idx_sentence,idx_word] = tag_dict[token.upos]

  print('EOS')
  #print(idx_word)
  sentences_train[idx_sentence,idx_eos] = eos_vec
  tags_train[idx_sentence,idx_eos] = tag_dict['EOS']

  #add zero-padding if necessary
  if idx_eos < MAX_SEQUENCE_LEN:
    sentences_train[(idx_sentence,range(idx_eos+1,MAX_SEQUENCE_LEN))]= pad_vec
    tags_train[(idx_sentence,range(idx_eos+1,MAX_SEQUENCE_LEN))] = tag_dict['PAD']


  #https://pyconll.readthedocs.io/en/stable/pyconll/unit/token.html

#upos -- output
#form -- input

#Pre-processing LATER
#TODO convert to small case (better: truncating) these 2 fields
#TODO divide into sub-words (chunks) --> byte pair encoding --> 



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
the
in:
one
in:
board
in:
that
in:
is
in:
cracked
in:
(
in:
the
in:
crack
in:
is
in:
deep
in:
enough
in:
to
in:
stick
in:
a
in:
penny
in:
in
in:
it
in:
and
in:
it
in:
goes
in:
clear
in:
through
in:
)
in:
yet
in:
they
in:
do
in:
not
in:
want
in:
to
in:
take
in:
the
in:
time
in:
to
in:
bother
in:
with
in:
what
in:
once
in:
was
in:
a
in:
happy
in:
customer
in:
and
in:
has
in:
now
in:
become
in:
a
in:
dissatisfied
in:
customer
in:
.
EOS
in:
so
in:
i
in:
figure
in:
if
in:
they
in:
do
in:
n't
in:
want
in:
to
in:
take
in:
the
in:
time
in:
to
in:
fix
in:
the
in:
fence
in:
that
in:
they
in:
installed
in:
then
in:
i
in:
'll
in:
take
in:
the
in:
time
in:
to
in:
let
in:
everyone
in:
i
in:
can
in:
know
in:
about
in:
how
in:
they
in:
treat
in:
customers
in:
once
in:
they
in:
have
in:
your
in:
money
in:
!!!
EOS
in:
stay
in:
away
in:
!!!
EOS
in:
you
in:
get
in:
what
in:
you
in:
pay
in:
for
in:
!!!
EOS
in:
they
in:
came
in:
in
in:
under
i

In [None]:
MAX_SEQUENCE_LEN

160

In [None]:
#check that sentences have right length after padding
#sentences_train[2,:,:].shape
tags_train[3,:].shape

(160,)

In [None]:
#conllu alternative

#data_file = open("fi-ud-train_edit.conllu", "r", encoding="utf-8")
#lines = data_file.readlines()
#parsed_file = parse(lines)

In [None]:
X_train = sentences_train #[idx_sentence,:,:]
y_train = tags_train #[idx_sentence,:]
X_train.shape

(32, 160, 100)

In [None]:
sentences_train.shape


(32, 17, 100)

In [None]:
X_train.shape

(17, 100)

## POS-tag model (feat extraction (encoder) + classification (decoder))

In [None]:
from keras.models import Sequential
from keras.layers import InputLayer
from keras.layers import LSTM
from keras.layers import Dense
from keras.regularizers import L1L2

Using TensorFlow backend.


In [None]:
num_tags = len(tag_dict)

In [None]:
num_tags

19

In [None]:
model = Sequential()

hidden_units = 50
#NOTE: LSTM doesn't let me input input_shape=(BATCH_SIZE,MAX_SEQUENCE_LEN, EMBEDDING_DIM) (it says this is 4 dimensions, and it expects 3)
#it seems batch size is implied?
# Hwoever, expected shape of input is still: (BATCH_SIZE,MAX_SEQUENCE_LEN, EMBEDDING_DIM))
#https://stackoverflow.com/questions/42335856/keras-valueerror-input-0-is-incompatible-layer-issues
#https://stackoverflow.com/questions/47671732/keras-input-a-3-channel-image-into-lstm
model.add(InputLayer(input_shape=(MAX_SEQUENCE_LEN, EMBEDDING_DIM))) #same as adding to LSTM the input_shape field
#model.add(embedding_layer)
model.add(LSTM(hidden_units, return_sequences=True))
#https://keras.io/api/layers/recurrent_layers/lstm/
model.add(Dense(num_tags, activation='softmax')) # Dense can handle 3D input too
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
#it seems Keras doesn't have logsoftmax?

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 160, 50)           30200     
_________________________________________________________________
dense_5 (Dense)              (None, 160, 19)           969       
Total params: 31,169
Trainable params: 31,169
Non-trainable params: 0
_________________________________________________________________


In [None]:
from keras.utils import np_utils
y_train_enc = np_utils.to_categorical(y_train,num_tags)
y_train_enc.shape
#try do this in loop, per sentence
#tags_train_enc = np_utils.to_categorical(tags_train,num_tags)

(32, 160, 19)

In [None]:
y_train_enc

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.]],

       ...,

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [None]:
#model.fit(X_train, y_train_enc, batch_size=128, epochs=40, validation_data=(X_val, Y_val))
model.fit(X_train, y_train_enc, batch_size=128, epochs=40, validation_split=0.2)
 

Train on 25 samples, validate on 7 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.callbacks.History at 0x7efd78878f60>

In [None]:
#TODOs
#1. Implement a function such that data is processed to be in proper format for input to the DNN
#2. Use it on the test data, and also validation data
#3. Do the training using the given validation data (now just training data is split), and evaluate on the test data.