In [None]:
import numpy as np
import tensorflow as tf
import keras
import theano

In [None]:
word_embeddings_path = '../input/glove-840b-300d/glove.840B.300d.txt'
import io
word2idx = {}
word_embeddings = []
embedding_size = None
# Load the pretrained vectors. Every usable line has the form
# "<token> <v1> <v2> ... <vN>" with fields separated by single spaces.
# Invariant kept by this loop: word2idx[token] is always the position of that
# token's vector in word_embeddings, so both can be used together as a lookup table.
with io.open(word_embeddings_path, 'r', encoding="utf-8") as f_em:
    for line in f_em:
        split = line.strip().split(" ")
        if len(split) <= 2:
            # Blank line or header-like line: there is no vector to read.
            continue
        if embedding_size is None:
            # The first usable line fixes the dimensionality for the whole file.
            embedding_size = len(split) - 1
            # Embeddings initialization for paddings (all zeros) and
            # unknown words (small uniform random vector).
            word2idx["PADDING_TOKEN"] = len(word_embeddings)
            word_embeddings.append(np.zeros(embedding_size))

            word2idx["UNKNOWN_TOKEN"] = len(word_embeddings)
            word_embeddings.append(np.random.uniform(-0.25, 0.25, embedding_size))
        if len(split) - 1 != embedding_size:
            # Wrong number of fields (for example a token that itself contains
            # a space), so the line cannot be parsed reliably.
            continue
        if split[0] in word2idx:
            # Repeated token, or a token that clashes with one of the reserved
            # entries: keep the first occurrence.  Appending the vector without
            # adding a new dictionary key would shift every later index by one.
            continue
        word2idx[split[0]] = len(word_embeddings)
        word_embeddings.append(np.asarray(split[1:], dtype='float32'))

if embedding_size is None:
    # Without this check the failure would only show up later as a KeyError
    # on the reserved tokens or as a shape error when the model is built.
    raise ValueError("No embedding vectors could be read from %s" % word_embeddings_path)

word_embeddings = np.array(word_embeddings, dtype='float32')

In [None]:
# Casing categories; each name is mapped to its position in this list.
case2idx = {
    casing_name: position
    for position, casing_name in enumerate([
        'numeric',
        'all_lower',
        'all_upper',
        'initial_upper',
        'other',
        'mainly_numeric',
        'contains_digit',
        'PADDING_TOKEN',
    ])
}
# Square identity matrix: row i is the one-hot vector of casing category i.
case_embeddings = np.eye(len(case2idx), dtype=theano.config.floatX)

def get_casing(word, case_lookup):
    """Return the index of the casing category that describes ``word``.

    The checks are applied in a fixed order and the first match wins:
    only digits -> 'numeric'; more than half of the characters are digits ->
    'mainly_numeric'; ``str.islower`` -> 'all_lower'; ``str.isupper`` ->
    'all_upper'; first character upper case -> 'initial_upper'; at least one
    digit -> 'contains_digit'; anything else -> 'other'.

    :param word: token text
    :param case_lookup: mapping from casing category name to index
    :return: ``case_lookup`` entry of the selected category
    """
    if not word:
        # An empty token has no characters to inspect; computing the digit
        # fraction for it would divide by zero.
        return case_lookup['other']

    num_digits = sum(1 for char in word if char.isdigit())
    digit_fraction = num_digits / float(len(word))

    if word.isdigit():
        casing = 'numeric'
    elif digit_fraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower():
        casing = 'all_lower'
    elif word.isupper():
        casing = 'all_upper'
    elif word[0].isupper():
        # Only the first character is checked here; the rest may be mixed.
        casing = 'initial_upper'
    elif num_digits > 0:
        casing = 'contains_digit'
    else:
        casing = 'other'

    return case_lookup[casing]

In [None]:
# A line needs at least this many tab-separated columns to be kept as a token.
MAX_COLUMNS = 2
# Column positions inside one token line.
WORD_COL_NUM = 0
LABEL_COL_NUM = 1


def read_file(file_path):
    """
    Parse a tab-separated, one-token-per-line corpus file.

    Empty lines and lines starting with '#' end the current sentence.
    Lines with fewer than MAX_COLUMNS columns are printed and dropped.
    A summary (path and sentence count) is printed before returning.

    :param file_path: path for corpus in CoNLL-format
    :return: corpus_sentences - list of sentences; each sentence is a list of
             tokens and each token is the list of its column values
    """
    corpus_sentences = []
    current_sentence = []
    with open(file_path, 'r', encoding='utf-8') as f_in:
        for raw_line in f_in:
            stripped = raw_line.strip()

            is_boundary = (not stripped) or stripped[0] == '#'
            if is_boundary:
                # Close the sentence collected so far, if there is one.
                if current_sentence:
                    corpus_sentences.append(current_sentence)
                    current_sentence = []
                continue

            columns = stripped.split('\t')
            if len(columns) >= MAX_COLUMNS:
                current_sentence.append(columns)
            else:
                # Malformed token line: show it and move on.
                print(stripped)

    # The file may end without a trailing boundary line.
    if current_sentence:
        corpus_sentences.append(current_sentence)

    print(file_path, len(corpus_sentences), "sentences")
    return corpus_sentences

# Locations of the three parts of the CoNLL-2003 corpus.
train_path = '../input/conll2003/conll.train'
dev_path = '../input/conll2003/conll.dev'
test_path = '../input/conll2003/conll.test'

# Parse the parts in train -> dev -> test order; read_file prints a summary for each.
train_sentences, dev_sentences, test_sentences = [
    read_file(split_path) for split_path in (train_path, dev_path, test_path)
]

In [None]:
# Collect every class label that occurs in any of the three splits and add
# one extra label that is used for padding positions.
label_set = set()
label_set.add('PADDING_LABEL')
for dataset in [train_sentences, dev_sentences, test_sentences]:
    for sentence in dataset:
        for token in sentence:
            label = token[LABEL_COL_NUM]
            label_set.add(label)

# Turning labels into indices.  The labels are sorted first: iteration order
# of a set of strings can differ from one interpreter run to the next, which
# would make the label <-> index assignment (and anything trained with it)
# irreproducible.
label2idx = {}
idx2label = {}
for label in sorted(label_set):
    label2idx[label] = len(label2idx)
    # Inverse mapping, so predicted indices can be turned back into label names.
    idx2label[label2idx[label]] = label

print(label2idx)

In [None]:
def get_token_indices(token, word2idx, case2idx, unknown_idx):
    """Look up the word index and the casing index of one token.

    The token text (column WORD_COL_NUM) is searched in ``word2idx`` as written
    and, if that fails, in lower case.  When both lookups fail the token is
    reported as unknown and ``unknown_idx`` is used.

    :param token: sequence of column values of one token
    :param word2idx: mapping from word to embedding index
    :param case2idx: mapping from casing category name to index
    :param unknown_idx: index to use for words missing from ``word2idx``
    :return: tuple (token_unknown, word_idx, case_idx)
    """
    surface = token[WORD_COL_NUM]

    word_idx = None
    # Exact spelling first, lower-cased spelling as fallback.
    for candidate in (surface, surface.lower()):
        word_idx = word2idx.get(candidate)
        if word_idx is not None:
            break

    token_unknown = word_idx is None
    if token_unknown:
        word_idx = unknown_idx

    return token_unknown, word_idx, get_casing(surface, case2idx)

In [None]:
# Display the second parsed training sentence to check what read_file produced.
train_sentences[1]

In [None]:
def create_matrices(sentences, word2idx, label2idx, case2idx):
    """Encode parsed sentences as padded index arrays.

    Every sentence is turned into three arrays of equal length (word indices,
    casing indices, label indices).  Each array has one padding position
    before the first token and one after the last token.  The number of
    tokens, the number of tokens without an embedding and their percentage
    are printed.

    :param sentences: list of sentences; each sentence is a list of tokens,
                      each token a sequence of column values
    :param word2idx: word -> index, must contain 'PADDING_TOKEN' and 'UNKNOWN_TOKEN'
    :param label2idx: label -> index, must contain 'PADDING_LABEL'
    :param case2idx: casing category -> index, must contain 'PADDING_TOKEN'
    :return: list with one [word_indices, case_indices, label_indices] entry per sentence
    """
    unknown_idx = word2idx['UNKNOWN_TOKEN']
    pad_case = case2idx['PADDING_TOKEN']
    pad_word = word2idx['PADDING_TOKEN']
    pad_label = label2idx['PADDING_LABEL']

    # Position of the first real token inside a padded sentence.
    first_token_pos = 1

    n_tokens = 0
    n_unknown = 0
    dataset = []
    for sentence in sentences:
        # Two extra slots: one padding position on each side.
        padded_len = len(sentence) + 2
        word_indices = np.full(padded_len, pad_word)
        case_indices = np.full(padded_len, pad_case)
        label_indices = np.full(padded_len, pad_label)

        for slot, token in enumerate(sentence, start=first_token_pos):
            token_unknown, word_idx, case_idx = get_token_indices(token, word2idx, case2idx, unknown_idx)
            word_indices[slot] = word_idx
            case_indices[slot] = case_idx
            label_indices[slot] = label2idx[token[LABEL_COL_NUM]]

            # Bookkeeping for the embedding coverage report.
            n_tokens += 1
            if token_unknown:
                n_unknown += 1

        # All data for one sentence is kept together in one list.
        dataset.append([word_indices, case_indices, label_indices])

    percent = float(n_unknown) / n_tokens * 100 if n_tokens != 0 else 0.0
    print("{} tokens, {} unknown, {:.3}%".format(n_tokens, n_unknown, percent ))
    return dataset



# Encode the three splits in train -> dev -> test order; each call prints its coverage report.
train_data, dev_data, test_data = [
    create_matrices(split_sentences, word2idx, label2idx, case2idx)
    for split_sentences in (train_sentences, dev_sentences, test_sentences)
]

# Print the first five encoded training sentences as a sanity check.
for sentence in train_data[0:5]:
    print(sentence)

In [None]:
# Model definition: word embeddings and casing embeddings are concatenated,
# passed through dropout and a bidirectional LSTM, and every time step is
# classified with a softmax layer.
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Dropout, Bidirectional, Input, concatenate
from keras.models import Model
from keras.optimizers import Adam
# Number of LSTM units per direction.
# Practical implementations should have dim of 100 or more
SENTENCE_LSTM_DIM = 10

# One output class per entry of label2idx (this includes the padding label).
n_out = len(label2idx)

# Input of word indices; shape=(None,) leaves the sequence length open.
tokens_input = Input(dtype='int32', shape=(None,), name='tokens_input')
# Initialised from the pretrained vectors and kept fixed (trainable=False).
# NOTE(review): `weights=[...]` is the initialisation style of older Keras
# releases - confirm it is accepted by the installed version.
tokens_embedding_layer = Embedding(input_dim=word_embeddings.shape[0], 
                                   output_dim=word_embeddings.shape[1],
                                   weights=[word_embeddings], trainable=False, 
                                   name='tokens_embeddings')
tokens = tokens_embedding_layer(tokens_input)


# Input of casing-category indices, aligned position by position with tokens_input.
casing_input = Input(dtype='int32', shape=(None,), name='casing_input')
# Starts from the identity matrix in case_embeddings and is updated during training (trainable=True).
casing_embedding_layer = Embedding(input_dim=case_embeddings.shape[0], 
                                   output_dim=case_embeddings.shape[1],
                                   weights=[case_embeddings], trainable=True, 
                                   name='casing_embeddings')
casing = casing_embedding_layer(casing_input)

# Join both feature vectors for every position, then apply dropout with rate 0.2.
merged_embeddings = concatenate([tokens, casing], name='merged_embeddings')
for_lstm = Dropout(0.2)(merged_embeddings)
# return_sequences=True keeps one output vector per position, which the
# per-position classifier below needs.
# If GPU is used  choose implementation=2
blstm = Bidirectional(LSTM(SENTENCE_LSTM_DIM, return_sequences=True, implementation=1), 
                      name='blstm')(for_lstm)

# The same softmax classifier is applied to every position of the sequence.
# Note that name='result' is given to the inner Dense layer, not to the TimeDistributed wrapper.
result = TimeDistributed(Dense(n_out, activation='softmax', name='result'))(blstm)

model = Model(inputs=[tokens_input, casing_input], outputs=result)

# default lr = 0.001, beta_1=0.9
# NOTE(review): the `lr` keyword belongs to older Keras releases - confirm
# against the installed version before changing it.
adam = Adam(lr=0.001, beta_1=0.9)
# Sparse loss: targets are integer label indices rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam)
model.summary()

In [None]:
import random
import time

def iterate_minibatches(dataset):
    """Yield every sentence of ``dataset`` as a batch that holds one sentence.

    :param dataset: iterable of (tokens, casing, labels) triples
    :return: generator of (tokens, casing, labels) array tuples; each array has
             a leading batch axis of size 1 and the labels additionally get a
             trailing axis of size 1
    """
    for word_ids, case_ids, label_ids in dataset:
        # Add a last axis to the labels before the batch axis is added.
        label_column = np.expand_dims(label_ids, axis=-1)
        yield tuple(np.asarray([part]) for part in (word_ids, case_ids, label_column))

# Sentences are tagged one at a time here; the code should be adapted if
# batches of several sentences are to be used.
def tag_dataset(dataset):
    """Predict a label index for every position of every sentence.

    Uses the module-level ``model`` for prediction.

    :param dataset: iterable of (tokens, casing, labels) triples, one per sentence
    :return: tuple (predicted_labels, correct_labels) of parallel lists; each
             entry of predicted_labels is a list with one int per position,
             each entry of correct_labels is the ``labels`` object of that sentence
    """
    predicted_labels = []
    correct_labels = []
    for tokens, casing, labels in dataset:
        # A batch with a single sentence goes in; [0] removes the batch axis again.
        pred = model.predict_on_batch([np.asarray([tokens]), np.asarray([casing])])[0]
        # Index of the highest score per position (the first one on ties).
        pred_labels = np.argmax(np.asarray(pred), axis=-1).tolist()
        predicted_labels.append(pred_labels)
        correct_labels.append(labels)
    return predicted_labels, correct_labels

def compute_accuracy(predictions, correct, padding_label):
    """Token-level accuracy that ignores padding positions.

    :param predictions: list of predicted label sequences, one per sentence
    :param correct: list of gold label sequences, parallel to ``predictions``
    :param padding_label: label value that marks positions to be skipped
    :return: fraction of non-padding positions whose prediction equals the
             gold label; 0.0 when there is no non-padding position
    :raises AssertionError: if a predicted and a gold sentence differ in length
    """
    scored = 0
    hits = 0
    for guessed_sentence, correct_sentence in zip(predictions, correct):
        assert (len(guessed_sentence) == len(correct_sentence)), "Guessed and correct sentences do not match"
        for guess, gold in zip(guessed_sentence, correct_sentence):
            if gold != padding_label:
                scored += 1
                if guess == gold:
                    hits += 1

    # Avoid a division by zero when nothing was scored.
    return float(hits) / scored if scored != 0 else float(0)

        
number_of_epochs = 10
print("%d epochs" % number_of_epochs)

# Report the size of every split before training starts.
for size_template, split_data in (("%d train sentences", train_data),
                                  ("%d dev sentences", dev_data),
                                  ("%d test sentences", test_data)):
    print(size_template % len(split_data))

padding_label = label2idx['PADDING_LABEL']

# Banner printed before each evaluation, paired with the data it announces.
# train_data is shuffled in place below, so this tuple always sees the current order.
evaluation_sets = (
    ("================================== Train Data ==================================", train_data),
    ("================================== Dev Data: ==================================", dev_data),
    ("================================== Test Data: ==================================", test_data),
)

for epoch in range(number_of_epochs):
    print("--------- Epoch %d -----------" % epoch)
    random.shuffle(train_data)

    # One parameter update per sentence.
    start_time = time.time()
    for tokens, casing, labels in iterate_minibatches(train_data):
        model.train_on_batch([tokens, casing], labels)
    print("%.2f sec for training" % (time.time() - start_time))

    # Accuracy on all three splits; the timer covers the three evaluations together.
    start_time = time.time()
    for banner, eval_data in evaluation_sets:
        print(banner)
        predicted_labels, correct_labels = tag_dataset(eval_data)
        accuracy = compute_accuracy(predicted_labels, correct_labels, padding_label)
        print("Accuracy = ", accuracy)

    print("%.2f sec for evaluation" % (time.time() - start_time))