# NERNN: Named Entity Recognition with Word Embeddings and Char RNNs
- Use a combination of word embeddings and character embeddings + RNN to predict whether a word is a named entity

In [1]:
# Computational imports
import numpy as np
import pandas as pd
import tensorflow as tf
tf.reset_default_graph()

# Keras imports
import keras
import keras.backend as K
from keras.preprocessing import sequence

# NLP imports
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize

Using TensorFlow backend.


# I. Random Data Generation
1. Gen random sentences
2. Construct a list of word inputs as well as a list of character inputs
3. Pad the word and character inputs as well as the tags
4. Find end of word indices in order to index into the character inputs (or the output representation)

In [2]:
# Construct random sentences
import random

s_nouns = ["A dude", "My mom", "The king", "Some guy", "A cat with rabies", "A sloth", "Your homie", "This cool guy my gardener met yesterday", "Superman"]
p_nouns = ["These dudes", "Both of my moms", "All the kings of the world", "Some guys", "All of a cattery's cats", "The multitude of sloths living under your bed", "Your homies", "Like, these, like, all these people", "Supermen"]
s_verbs = ["eats", "kicks", "gives", "treats", "meets with", "creates", "hacks", "configures", "spies on", "retards", "meows on", "flees from", "tries to automate", "explodes"]
p_verbs = ["eat", "kick", "give", "treat", "meet with", "create", "hack", "configure", "spy on", "retard", "meow on", "flee from", "try to automate", "explode"]
infinitives = ["to make a pie.", "for no apparent reason.", "because the sky is green.", "for a disease.", "to be able to make toast explode.", "to know more about archeology."]

def sing_sen_maker():
    '''Makes a random senctence from the different parts of speech. Uses a SINGULAR subject'''
    return random.choice(s_nouns), random.choice(s_verbs), random.choice(s_nouns).lower() or random.choice(p_nouns).lower(), random.choice(infinitives)

In [3]:
# Define random data generator
def generate_data(num_examples, word_vocab_size, char_vocab_size):
    # Word sequence 
    sentences = []
    sent_lens = []

    # Character sequences
    word_lens = []  # for indexing after 
    chars_lens = []  # length of example in characters

    # tags
    tags = []

    # Generate sentences
    for _ in range(num_examples):
        # new sentence
        sent = ' '.join(sing_sen_maker())
        words = word_tokenize(sent)
        sentences.append(words)

        # track the length of the sentence
        sent_len = len(words)
        sent_lens.append(sent_len)

        # track the lengths of the words
        words_len = map(len, words)
        word_lens.append(words_len)

        # track the length of the document in characters
        char_len = sum(words_len) + sent_len - 1
        chars_lens.append(char_len)

        tag = (np.random.rand(sent_len) <= 0.2).astype('int')
        tags.append(tag)

    # ==================================== #
    # Vocablulary Construction and Padding #
    # ==================================== #

    from collections import Counter

    # Function for vocab construction
    def construct_map(element_lists, vocab_size):
        '''
        Constructs a vocabulary from 
        '''
        c = Counter()
        for els in element_lists:
            c.update(els)

        most_common = [x[0] for x in c.most_common(vocab_size)]
        hash_map = dict(zip(most_common, range(1, len(most_common)+1)))

        return hash_map

    def encode_strings(element_lists, hash_map):
        '''
        Encode the element_list in terms of integers
        NOTE: 0 is reserved for masking
        '''
        new_element_list = []
        for els in element_lists:
            new_els = map(lambda x: hash_map.get(x, len(hash_map)+1), els)
            new_element_list.append(new_els)
        return new_element_list

    # ====================== #
    # 1. First the sentences #
    # ====================== #

    # Encode the words as integers
    word_map = construct_map(sentences, word_vocab_size)
    sentences_enc = encode_strings(sentences, word_map)

    # Pad the sentences
    max_sent_len = max(sent_lens)
    sentences_enc = sequence.pad_sequences(sentences_enc, maxlen=max_sent_len,
                                           padding='post', value=0, dtype='int64')

    # ========================= #
    # 2. Second, the characters #
    # ========================= #

    char_lists = [list(' '.join(sent)) for sent in sentences]

    # Encode the characters
    char_map = construct_map(char_lists, vocab_size=char_vocab_size)
    char_enc = encode_strings(char_lists, char_map)

    # Pad the characters
    max_char_len = max(chars_lens)
    char_enc = sequence.pad_sequences(char_enc, maxlen=max_char_len,
                                           padding='post', value=0, dtype='int64')

    # ============ #
    # Pad the tags #
    # ============ #
    tags_padded = sequence.pad_sequences(tags, maxlen=max_sent_len, 
                                                       padding='post', value=-1., dtype='int64')

    # ================================= #
    # Construct the end of word indices #
    # ================================= #
    # NOTE: This will be leading spaces through the end of the word

    word_end_inds = []
    for wl, cl in zip(word_lens, chars_lens):
        w_inds = np.cumsum(wl) + np.arange(len(wl)) - 1
        word_end_inds.append(w_inds)

    # Also need to pad the end of word indices
    word_end_inds = sequence.pad_sequences(word_end_inds, maxlen=max_sent_len, padding='post', value=-1, dtype='int64')
    
    return sentences_enc, char_enc, word_end_inds, max_sent_len, max_char_len, tags_padded

# II. Construct Model

In [2]:
from keras.models import Sequential, Model
from keras.layers import Embedding, GRU, Dense, Lambda, InputLayer, TimeDistributed, Layer, Input, merge, Bidirectional

In [3]:
# Going to need a custom layer for selecting the end of words in the character RNN
class GatherLayer(Layer):
    '''
    Scans over the batch to gather specific indices along the time axis
    '''
    def __init__(self, **kwargs):
        super(GatherLayer, self).__init__(**kwargs)
    
    def build(self, input_shape):
        super(GatherLayer, self).build(input_shape)
        
    def compute_mask(self, x, mask=None):
        '''
        Compute the mask
        '''
        return K.cast(K.not_equal(x[1], -1), 'bool')
    
    def call(self, inputs, mask=None):
        '''
        First input is the rnn out (batch_size, max_word_steps, char_lstm_dim)
        Second input is indicies to gather (batch_size, max_word_steps)
        '''
        rnn_inp = inputs[0]
        ind_inp = inputs[1]
        
        ind_inp_zeroed = tf.select(tf.not_equal(ind_inp, -1), ind_inp, tf.zeros_like(ind_inp, dtype='int64'))
        
        def f(inp):
            '''
            Gathers the inds for the input mat of (max_char_len, char_lstm_dim)
            '''
            mat = inp[0]
            inds = inp[1]
            return tf.gather(mat, inds)
        
        map_fn_out = tf.map_fn(f, elems=(rnn_inp, ind_inp_zeroed), dtype='float32')
        
        return map_fn_out
    
    def get_output_shape_for(self, input_shape):
        rnn_shape = input_shape[0]
        ind_shape = input_shape[1]
        return (rnn_shape[0], ind_shape[1], rnn_shape[2])

In [4]:
def construct_model(word_vocab_size, char_vocab_size,
                    w_emb_dim=100, w_lstm_dim=128, 
                    c_emb_dim=100, c_lstm_dim=128):

    # ===================== #
    # 1. Construct word RNN #
    # ===================== #

    word_model = Sequential()
    word_model.add(Embedding(input_dim=word_vocab_size+2, output_dim=w_emb_dim, 
                             input_length=None, mask_zero=True, name='word_embedding'))

#     word_model.add(Bidirectional(GRU(w_lstm_dim, return_sequences=True, dropout_W=0.5)))

    # ============================== #
    # 2. Construct the character RNN #
    # ============================== #

    char_model = Sequential()
    char_model.add(Embedding(input_dim=char_vocab_size+2, output_dim=c_emb_dim, 
                             input_length=None, mask_zero=True, name='char_embedding'))
    char_model.add(Bidirectional(GRU(c_lstm_dim, return_sequences=True)))
    temp_out = char_model.output

    # ====================================== #
    # 3. Merge the Word RNN and the Char RNN #
    # ====================================== #

    # Create an input for the matrix of word end indices
    inds = Input(shape=(None,), dtype='int64')
    
    # Slice the character model out
    char_model_slice = GatherLayer()([temp_out, inds])

    # Concatenate the outputs of the word model and the sliced character model
    merge_out = merge([word_model.output, char_model_slice], mode='concat', concat_axis=2)

    # Add Bidirectional lstm here
    gru_out = Bidirectional(GRU(w_emb_dim, return_sequences=True, dropout_W=0.5))(merge_out)
    
    # =================================== #
    # 4. Compute Output and Compile Model #
    # =================================== #

    # Time distribute a final layer for binary output
    fout = TimeDistributed(Dense(1, activation='sigmoid'))(merge_out)
    
    model = Model([word_model.input, char_model.input, inds], output=[fout])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

# III. Train the Model on Fake Data to Test It

In [12]:
# Generate random data
num_examples = 1000
word_vocab_size = 100
char_vocab_size = 30

sentences_enc, char_enc, word_end_inds, max_sent_len, max_char_len, tags_padded = generate_data(num_examples,
                                                                                                word_vocab_size, 
                                                                                                char_vocab_size)

In [13]:
# Build the model
model = construct_model(word_vocab_size, char_vocab_size)



In [14]:
# Fit the model on fake data
model.fit([sentences_enc, char_enc, word_end_inds], np.expand_dims(tags_padded, 2), nb_epoch=2)
print "Seems to work!"

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/2
Epoch 2/2
Seems to work!


# IV. Bring in Wikipedia Data to Train On
- Downloaded WikiNER data into data directory
- Will read in pre-split train/test/dev data into pandas
- Preprocess the data to only tag PER (peoples names)
- Will batch sentences together into 'Documents'

### Utility Functions

In [5]:
import re

def untokenize(words):
    """
    Untokenizing a text undoes the tokenizing operation, restoring
    punctuation and spaces to the places that people expect them to be.
    Ideally, `untokenize(tokenize(text))` should be identical to `text`,
    except for line breaks.
    """
    text = ' '.join(words)
    step1 = text.replace("`` ", '"').replace(" ''", '"').replace('. . .',  '...')
    step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
    step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
    step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3)
    step5 = step4.replace(" '", "'").replace(" n't", "n't").replace(
         "can not", "cannot")
    step6 = step5.replace(" ` ", " '")
    return step6.strip()

def prepend_zeros(arr, num_zeros=1, dtype='int64'):
    '''
    Takes a 1-D numpy array and prepends the specified number of zeros
    '''
    zs = np.zeros(shape=(num_zeros,), dtype=dtype)
    return np.concatenate([zs, arr], axis=0)

def spans(txt, tokens):
    '''
    Takes the original (read: "untokenized" text) and the tokens and returns a list of word
    end indices.
    
    Parameters
    ==========
    txt : string
        untokenized / raw string we want to index the tokens into
    tokens : list, array
        list of tokens that make up the txt
        
    Returns
    =======
    word_inds : list
        list of word span indices for the tokens into the txt
    '''
    word_inds = []
    offset = 0
    for token in tokens:
        offset = txt.find(token, offset)
        word_inds.append((offset, offset+len(token)))
        offset += len(token)
    return word_inds

## a. Methods for Reading in Data

In [6]:
# Imports
import os
import gzip
import glob

# ==== #
# Vars #
# ==== #

# Paths
DATA_DIR = './data'
WIKI_DATA = os.path.join(DATA_DIR, 'WikiGold')

# Misc.
DOCSTART_TAG = '-DOCSTART-'

# ======================================= #
# Methods for Reading and Processing Data #
# ======================================= #

# Read in the data only (no processing except putting in in pandas)
def read_wiki_datasets(data_dir):
    '''
    Reads in the wikipedia datasets from the data directory
    
    Paramters
    =========
    data_dir : str
        path to the data directory
        
    Returns
    =======
    train_df : pandas.core.frame.DataFrame
        dataframe with training data
    test_df : pandas.core.frame.DataFrame
        dataframe with test data
    dev_df : pandas.core.frame.DataFrame
        dataframe with dev data
    TODO: (areiner) what the hell is dev data?... I didn't do these splits
    '''
    
    dataset_names = ('train', 'test', 'dev')
    datasets = []  # ordered train, test, dev
    for dname in dataset_names: 
        found_datasets = glob.glob(data_dir + '/*' + dname + '*.pkl')
        if len(found_datasets)==0:
            print "No dataset with name {} found".format(dname)
        elif len(found_datasets) > 1:
            print "Multiple dataset with name {} found".format(dname)
        else:
            df = pd.read_pickle(found_datasets[0])
            datasets.append(df)
    return datasets

# Method to filter all tags such that it is only 'O' or 'PER'
def modify_wikigold_tags(df):
    '''
    Takes in a dataframe with columns "sentence" and "tags" and modifies
    the "tags" column to convert everything that is not "B-PER" to "O" and
    converts "B-PER" to "PER"
    
    Parameters
    ==========
    df : pandas.core.frame.DataFrame
        input dataframe of sentences and tags (with columns "sentence" and "tags")
    '''
    def modify_taglist(taglist):
        '''
        Function to apply to each array of tags in each cell of the "tags"
        column in the dataframe
        '''
        m = {'B-PER': 'PER'}
        return map(lambda v: m.get(v, 'O'), taglist)

    df['tags'] = df['tags'].apply(modify_taglist)
    
    return df

# Method to find the index for document separations
def calc_docstart_inds(df):
    '''
    Takes in a dataframe and looks for specific document start tags
    '''
    return df.index[df.sentence.map(lambda x: DOCSTART_TAG in x)].get_values()

# Method to concatenate sentence tags to document tags
def construct_doc_tag(dfs, max_sent_per_doc=None):
    '''
    Takes a DataFrame slice and returns a dataframe of tokens and tags that
    concatenates all sentence tokens and tags for the whole dataframe, potentially
    into groups of size specified by max_sent_per_doc size
    '''
    new_df = pd.DataFrame(columns=['document', 'tags'])
    
    sentences = []
    tags = []
    
    for i, (_, row) in enumerate(dfs.iterrows()):
        
        if (max_sent_per_doc is not None) and i!=0 and (i % max_sent_per_doc == 0):
            new_df = new_df.append({'document': sentences, 'tags': tags}, ignore_index=True)
            sentences = []
            tags = []

        tokens = row.sentence
        tagseq = row.tags
        
        sentences.extend(tokens)
        tags.extend(tagseq)
    
    new_df = new_df.append({'document': sentences, 'tags': tags}, ignore_index=True)
    
    return new_df

# Method to cluster sentence token dataframes into documents
def transform_sent_to_docs(df, max_sent_per_doc=None):
    '''
    Transform a DataFrame of lists of tokens and tags per sentence into
    a DataFrame of "Documents" and the tags for that document.
    At a minimum, we split by document length as per the wikipedia page
    '''
    doc_df = pd.DataFrame(columns=['document', 'tags'])
    
    # 1. Calculate the indices of Document starts
    doc_starts = prepend_zeros(calc_docstart_inds(df))
    
    # 2. Slice the df for the sentences in each document
    for i, (start, end) in enumerate(zip(doc_starts[:-1], doc_starts[1:])):
        if i != 0:
            start += 1
        dfs = df.iloc[start:end]
        inc_doc_df = construct_doc_tag(dfs, max_sent_per_doc=max_sent_per_doc)
        doc_df = doc_df.append(inc_doc_df)
    
    doc_df.reset_index(inplace=True, drop=True)
    
    return doc_df
        
# Method that adds a column which contains the untokenized sentence / document
def untokenize_column(df, col='document', new_col='document_string'):
    '''
    Untokenizes a column of lists of tokens. The intended use in the pipeline is to
    construct untokenized strings after clustering sentences into documents or partial
    documents.
    '''
    df[new_col] = df[col].map(untokenize)
    return df

# Method that adds a column which contains the index of the tokens into the untokenized sentence / document
def word_index_columns(df, doc_col='document_string', tok_col='document'):
    '''
    Takes a column of strings (either sentences or documents as long as it's one continuous string) 
    and creats two new columns a 'word_start_inds' column that contains the start indices of all tokens
    and a 'word_end_inds' that contains the end indices of all tokens
    '''
    # Calculate the word spanning indices
    word_inds = df.apply(lambda r: spans(r[doc_col], r[tok_col]), axis=1)
    word_start_inds = word_inds.apply(lambda v: [e[0] for e in v])
    word_end_inds = word_inds.apply(lambda v: [e[1] - 1 for e in v])
    
    # Insert new columns
    df['word_start_inds'] = word_start_inds
    df['word_end_inds'] = word_end_inds
    return df

## b. Methods for data preprocessing

In [7]:
# Construct word and character maps
def construct_map(element_lists, vocab_size=None):
    '''
    Constructs a vocabulary from 
    '''
    from collections import Counter
    c = Counter()
    for els in element_lists:
        c.update(els)

    if vocab_size is not None:
        most_common = [x[0] for x in c.most_common(vocab_size)]
        hash_map = dict(zip(most_common, range(1, len(most_common)+1)))
    else:
        hash_map = dict(zip(c.keys(), range(1, len(c)+1)))

    return hash_map

def reverse_map(m):
    m_inv = dict(((ind, k) for k, v in m.iteritems()))
    return m_inv
    
def character_column(df):
    '''
    Inserts a column of the individual characters into the dataframe 
    
    Parameters
    ==========
    df : pandas.core.frame.DataFrame
        dataframe containing the string / untokenized documents
        
    Returns
    =======
    new_df : pandas.core.frame.DataFrame
        Updated df
    '''
    df['chars'] = df['document_string'].map(lambda v: list(v))
    return df

def construct_word_char_maps(df, vocab_size=None, return_inv_dicts=False):
    '''
    Construct the word and character maps from the dataframe
    
    Parameters
    ==========
    df : pandas DataFrame
        dataframe with tokens and character lists already constructed
    
    Returns
    =======
    token_map : dict
    char_map : dict
    '''
    token_map = construct_map(df['document'])
    char_map = construct_map(df['chars'])
    
    if return_inv_dicts:
        token_map_inv = reverse_map(token_map)
        char_map_inv = reverse_map(char_map)
        
        return token_map, char_map, token_map_inv, char_map_inv
    else:
        return token_map, char_map

def encode_strings(element_lists, hash_map):
    '''
    Encode the element_list in terms of integers
    NOTE: 0 is reserved for masking
    '''
    new_element_list = []
    for els in element_lists:
        new_els = map(lambda x: hash_map.get(x, len(hash_map)+1), els)
        new_element_list.append(new_els)
    return new_element_list
    
def encode_tokens_chars(df, token_map, char_map):
    '''
    Encode the tokens and characters
    '''
    df['token_enc'] = encode_strings(df['document'], token_map)
    df['char_enc'] = encode_strings(df['chars'], char_map)
    return df

def _encode_tags(tag_list):
    '''
    Encode a taglist
    '''
    def _tag_str_to_int(tag_str):
        if tag_str == 'PER':
            return 1
        else:
            return 0
        
    tags_enc = map(_tag_str_to_int, tag_list)
    return tags_enc
    
def encode_tags(df):
    '''
    Encode the tags as binary outcomes
    
    Parameters
    =========
    df : pandas.core.frame.DataFrame
        dataframe with character tags that are in the set {'O', and 'PER'}
    '''
    
    df['tags_enc'] = df['tags'].apply(_encode_tags)
    return df

## c. Full data processing pipeline function

In [8]:
def process_wikigold_dataset(df, max_sent_per_doc=None):
    
    # Text Processing
    df = modify_wikigold_tags(df)
    df = transform_sent_to_docs(df, max_sent_per_doc=max_sent_per_doc) # Concatenate sentence tokens into topics
    df = untokenize_column(df)  # (approximately) concatenate the tokens into documents
    df = word_index_columns(df)  # index the tokens into the untokenized strings
    df = character_column(df)

    # Encoding work
    tm, cm = construct_word_char_maps(df)  # first construct the hash maps
    df = encode_tokens_chars(df, tm, cm)  # encode the tokens and the characters
    df = encode_tags(df)  # encode the tags as binary outcomes 'O' -> 0 and 'PER' -> 1
    
    return df, tm, cm

def run_wikigold_data_pipeline(data_path, max_sent_per_doc=None):
    '''
    Runs the entire transformation pipeline for WikiGold data.
    
    Pipeline Steps
    --------------
    1. Read in wiki datasets (train, test, dev)
    2. Modify the IOB tags to only 'O' and 'PER'
    For each dataset in the wiki datasets:
        3. Take sentence examples and concatenate them into individual documents
        4. Take the documents which consist of lists of tokens and "untokenize" them into continuous strings
        5. Take the tokens and index them into the untokenized text so we have start and end indices for each token
        6. Split up the untokenized text into a list of characters
        7. Construct element -> monotonically increasing index map for both tokens and characters
        8. Encode the tokens and characters by their index in their respective maps
        9. Encode the tags as binary outcomes 'O' -> 0 and 'PER' -> 1
    '''
    # 1. Reading in data
    wiki_datasets = read_wiki_datasets(WIKI_DATA)

    wiki_datasets_processed = []
    wiki_datasets_maps = []
    # 2. Run processing on each dataframe
    for df in wiki_datasets:
        new_df, token_map, char_map = process_wikigold_dataset(df, max_sent_per_doc=max_sent_per_doc)
        wiki_datasets_processed.append(new_df)
        wiki_datasets_maps.append((token_map, char_map))
    
    return wiki_datasets_processed, wiki_datasets_maps

In [9]:
(train, test, dev), ((tm_train, cm_train), (tm_test, cm_test), (tm_dev, cm_dev)) = \
                                run_wikigold_data_pipeline(WIKI_DATA, max_sent_per_doc=5)

## d. Final steps - convert to numpy and pad sequences

In [10]:
def pad_encoding_column(col, value=0):
    '''
    Takes a pandas series of lists of encodings and returns a single matrix with padded results
    '''    
    col_padded = sequence.pad_sequences(col.tolist(), padding='post', value=value)
    return col_padded

def generate_padded_data(df):
    '''
    Take the dataframe and return numpy matrices with padded encodings
    '''
    cols_to_pad = ['token_enc', 'char_enc', 'tags_enc', 'word_start_inds', 'word_end_inds']
    padded_data = []
    for col in cols_to_pad:
        if ('tag' in col) or ('inds' in col):
            col_pad = pad_encoding_column(df[col], value=-1)
        else:
            col_pad = pad_encoding_column(df[col])
        padded_data.append(col_pad)
    return padded_data


# V. Construct Model and Bring in Pre-trained word emebddings
- For now, we will use GoogleNews embedding vectors with dimensionality 300
- We will not pre-train character emebeddings for now
    - Why? bc I haven't found character embedings yet and I would imagine the best representations can different markedly from use case to use case

## a. Construct a gensim model that we can use to access underlying embeddings from the massive GoogleNews embedding matrix
- 3 Million words in GoogleNews Vocab with 300 dimensional embeddings

In [16]:
import gensim
from gensim.models import Word2Vec

In [17]:
# Load in the gensim word2vec model
w2vmodel = Word2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

## b. Construct the model and set the embeddings

In [11]:
# Model construction
word_vocab_size = len(tm_train)
char_vocab_size = len(cm_train)
# model = construct_model(word_vocab_size=word_vocab_size, char_vocab_size=char_vocab_size,
#                         w_emb_dim=w2vmodel.vector_size)
model = construct_model(word_vocab_size=word_vocab_size, char_vocab_size=char_vocab_size,
                        w_emb_dim=100, w_lstm_dim=64, c_emb_dim=50, c_lstm_dim=32)



In [20]:
# Define a method for setting the models word_embedding layer to have pretrained embeddings
def set_embeddings(w2v_model, keras_model, token_map):
    '''
    Takes in a gensim Word2Vec model and our keras model and then adapts the word_embeddings
    in our model to the pre-trained vectors from the w2v model if they exist. Otherwise, they are
    left to the original initalization
    
    Paramters
    =========
    w2v_model : gensim.models.word2vec.Word2Vec
        Word2Vec model that is already loaded with pre-trained embedings
    keras_model : keras.engine.training.Model
        Keras model that has been constructed such that the word embedding layer has the same
        number of embedding dimensions as the pre-trained embeddings
    token_map : dict
        map from token to index in the vocab
    '''
    word_emb_layer = keras_model.get_layer('word_embedding')
    weights = word_emb_layer.get_weights()[0]
    
    for token, ind in token_map.iteritems():
        try:
            pre_trained_emb = w2v_model[token]
            
        except:
            pre_trained_emb = weights[ind]
        weights[ind] = pre_trained_emb
    word_emb_layer.set_weights([weights])

In [21]:
# Set the embeddings
set_embeddings(w2vmodel, model, tm_train)

# VI. Train the Model!

In [12]:
# Generate the data
token_enc_train, char_enc_train, tags_enc_train, word_start_ind_train, word_end_ind_train = \
                                                                                        generate_padded_data(train)

In [None]:
model.fit([token_enc_train, char_enc_train, word_end_ind_train], np.expand_dims(tags_enc_train, 2), batch_size=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10


In [14]:
np.set_printoptions(precision=4, suppress=True)

In [None]:
i = 12
x_in = [token_enc_train[i:i+1], char_enc_train[i:i+1], word_end_ind_train[i:i+1]]

out = model.predict(x_in)[0, :, 0]

token_ex = np.array(train.document[i])
tags_ex = np.array(train.tags[i])
tags_enc_ex = np.array(train.tags_enc[i])

m = token_ex[(tags_enc_ex.nonzero()[0])]

print m
print zip(m, out[tags_enc_ex.nonzero()[0]])
out_sorted = out.argsort()
out_sorted = out_sorted[out_sorted<len(token_ex)]
print "Lowest scores:", token_ex[out_sorted[:30]]
print "Highest scores:", token_ex[out_sorted[-20:]]
print "Highest scores:", out[out_sorted[-20:]]

In [76]:
# Test data
token_enc_test, char_enc_test, tags_enc_test, word_start_ind_test, word_end_ind_test = \
                                                                                        generate_padded_data(test)

In [84]:
i = 2
x_in = [token_enc_test[i:i+1], char_enc_test[i:i+1], word_end_ind_test[i:i+1]]

out = model.predict(x_in)[0, :, 0]

token_ex = np.array(test.document[i])
tags_ex = np.array(test.tags[i])
tags_enc_ex = np.array(test.tags_enc[i])

m = token_ex[(tags_enc_ex.nonzero()[0])]

print m
print zip(m, out[tags_enc_ex.nonzero()[0]])
out_sorted = out.argsort()
out_sorted = out_sorted[out_sorted<len(token_ex)]
print "Lowest scores:", token_ex[out_sorted[:30]]
print "Highest scores:", token_ex[out_sorted[-20:]]
print "Highest scores:", out[out_sorted[-20:]]

['Dorothea' 'Dorothea' 'von' 'Schlegel' 'Rahel' 'Levin' 'Henriette' 'Herz'
 'Madame' 'de' 'Sta\xc3\xab' 'Friedrich' 'Philipp' 'Moses' 'Mendelssohn'
 'Immanuel' 'Kant' 'John' 'Locke' 'Alexander' 'Pope' 'Dorothea']
[('Dorothea', 0.00082105485), ('Dorothea', 0.0094426926), ('von', 0.0055673928), ('Schlegel', 0.0051288288), ('Rahel', 0.00068829377), ('Levin', 9.7050888e-06), ('Henriette', 0.00013976262), ('Herz', 2.7702123e-05), ('Madame', 0.010900004), ('de', 0.0044732802), ('Sta\xc3\xab', 0.0014611182), ('Friedrich', 0.52245504), ('Philipp', 0.00066411082), ('Moses', 0.00057833287), ('Mendelssohn', 0.0017986018), ('Immanuel', 0.00022611055), ('Kant', 9.4189062e-07), ('John', 7.342115e-05), ('Locke', 7.0163347e-05), ('Alexander', 0.0013022374), ('Pope', 0.051861312), ('Dorothea', 0.01213771)]
Lowest scores: ['Kant' 'and' 'and' 'adopted' 'and' 'greatest' 'and' 'critics' 'of'
 'translator' 'Levin' ',' 'novelists' 'to' 'musicians' ',' 'convert'
 'leading' ',' 'Herz' ',' ',' 'as' ',' 'medieva

In [68]:
out[out_sorted[-30:]]

array([  8.30020115e-04,   8.30020115e-04,   8.30020115e-04,
         8.30020115e-04,   8.30020115e-04,   8.30020115e-04,
         8.30020115e-04,   8.30020115e-04,   8.30020115e-04,
         8.30020115e-04,   8.30020115e-04,   8.30020115e-04,
         8.30020115e-04,   8.30020115e-04,   8.30020115e-04,
         8.30020115e-04,   8.30020115e-04,   8.30020115e-04,
         8.30020115e-04,   8.30020115e-04,   8.30020115e-04,
         8.30020115e-04,   8.30020115e-04,   9.63850296e-04,
         1.43575238e-03,   1.99742848e-03,   9.65667307e-01,
         9.86828208e-01,   9.99197304e-01,   9.99897480e-01], dtype=float32)

In [60]:
out.shape

(297, 1)

In [43]:
(out[0] >= 0.5)[:, 0].nonzero()

(array([17, 18, 19]),)

In [39]:
for i, p in enumerate(out[0]):
    print i, p

0 [ 0.00019067]
1 [ 0.00043793]
2 [  6.12061240e-06]
3 [  1.58033617e-05]
4 [  4.24458989e-07]
5 [ 0.00011363]
6 [ 0.00010162]
7 [  1.08037614e-06]
8 [  1.75458936e-05]
9 [  4.20558354e-05]
10 [  4.43179033e-07]
11 [  6.54117741e-07]
12 [  9.87347266e-06]
13 [  1.81310043e-06]
14 [  1.18962475e-06]
15 [ 0.00196883]
16 [  3.62454812e-05]
17 [ 0.99313885]
18 [ 0.99983346]
19 [ 0.97283983]
20 [ 0.00040712]
21 [  2.31974832e-06]
22 [  2.13084841e-05]
23 [ 0.00026095]
24 [  2.85764304e-06]
25 [  6.44781903e-06]
26 [  7.43022756e-05]
27 [  1.64395004e-07]
28 [  1.74500087e-06]
29 [  1.41274072e-07]
30 [  7.09953483e-08]
31 [  2.35899483e-06]
32 [  2.13374938e-06]
33 [  2.04589123e-05]
34 [  6.47321883e-07]
35 [  3.27852149e-06]
36 [  7.01268291e-05]
37 [  3.58893194e-05]
38 [  4.99607097e-07]
39 [  1.68278433e-07]
40 [  5.87318937e-06]
41 [  4.08215328e-07]
42 [  9.65235245e-07]
43 [  1.19888762e-06]
44 [  4.67477621e-05]
45 [  5.74571686e-07]
46 [  6.05322566e-05]
47 [  4.70991154e-06]
48 [

# Old Stuff

## Q: How can we determine the end of word indices from the tokens?
- we need to use the provided tokens and reconstruct the original string
    - We do this via an "untokenize" function found off the shelf online
    - If we tokenize again (using nltk word_tokenize) do we get the same result?   

In [312]:
print word_tokenize(res.iloc[4].document_string)[257]
print res.iloc[4].document[257]
print untokenize(res.iloc[4].document[254:260])

def untokenize2(tokens):
    import string
    return "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in tokens]).strip()
untokenize2(res.iloc[4].document[254:270])

``
"
bonus tracks: " Battle of


'bonus tracks:" Battle of One"( an original song that was also set'