# ELMo

Retrieves sentence embedding values for movie reviews from a movie review dataset using a pre-trained ELMo model.

In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import time
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

W0606 22:28:29.040401 140706603112192 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
# Allow up to 200 characters per column when pandas renders a DataFrame.
pd.options.display.max_colwidth = 200
# Limit TensorFlow logging to WARN and more severe messages.
tf.logging.set_verbosity(tf.logging.WARN)

In [3]:
# Directory used for the input dataset and for the saved results.
data_dir = "data"

## Load IMDB Data

In [5]:
# Build the path of the gzipped pickle under data_dir, then load it.
# NOTE(review): unpickling executes code from the file; only load trusted data.
imdb_pickle_path = "{}/imdb_data.pickle.gz".format(data_dir)
imdb_data = pd.read_pickle(imdb_pickle_path)

In [6]:
# Display two randomly chosen rows as a quick look at the data.
imdb_data.sample(n=2)

Unnamed: 0,data_set,polarity,sentence,movie_id
4271,test,0,"Poorly acted, poorly written and poorly directed. Special effects are cheap. Best performance is by Yvette Napir, but that's not saying much. Story is a confusing mess about corporate greed leadin...",tt0323491
15511,test,1,"I would give this television series a 10 plus if i could. The writers were ""smack on"" and I think the best actors and actresses were a bonus to the show.These characters were so real. One could te...",tt0161233


Retrieving the text embedding for all samples may take a long time.
For demo purposes it is useful to limit the size.

In [7]:
# Keep at most 800 reviews for the demo. The size is capped at the number of
# available rows (sampling without replacement raises otherwise), and the seed
# is fixed so that re-running the notebook selects the same reviews.
sample_size = min(800, len(imdb_data))
imdb_data = imdb_data.sample(n=sample_size, random_state=42)

## Cleaning Text

Before retrieving the text embeddings, the reviews are cleaned up and limited in length to speed up processing.

The code is a little more complex than strictly necessary so that it also runs on Windows, where downloading spaCy's `en` model doesn't seem to work; in that case lemmatization falls back to NLTK.

In [8]:
# Maximum number of tokens kept per review after lemmatization.
max_words = 128

# Pre-compiled patterns used by clean_text.
text_http_re  = re.compile(r'http\S+')          # URLs: "http" up to the next whitespace
text_digit_re = re.compile(r'[0-9]')            # single ASCII digits
text_html_re  = re.compile(r'<[^>]{0,20}>')     # short HTML tags such as <br />
# Punctuation to strip; note that ',' and '.' are not in the list.
text_punc_re  = re.compile('[' + re.escape('\'!"#$%&()*+-/:;<=>?@[\\]^_`{|}~') + ']')
# Raw string: '\s' in a plain string literal is an invalid escape sequence.
text_ws_re    = re.compile(r'\s+')

def clean_text(text):
    """Normalize a raw review string.

    Steps, in order: lower-case, drop URLs, replace HTML tags and digits
    with a space, drop the punctuation matched by ``text_punc_re``,
    collapse whitespace runs to a single space, and strip both ends.

    Args:
        text: the raw review text.

    Returns:
        The cleaned text as a single-spaced, lower-case string.
    """
    text = text.lower()
    text = text_http_re.sub('', text)
    # Substitute a space rather than nothing so that the words on either side
    # of a tag (e.g. "end.<br /><br />start") are not fused together; surplus
    # spaces are collapsed further down.
    text = text_html_re.sub(' ', text)
    text = text_digit_re.sub(' ', text)
    text = text_punc_re.sub('', text)
    text = text_ws_re.sub(' ', text)
    text = text.strip()
    return text

def create_lemmatizer_spacy():
    """Build a lemmatize function backed by spaCy's 'en' model.

    The model is loaded once, with the 'parser' and 'ner' components
    disabled, and is shared by every call of the returned function.

    Returns:
        A function mapping a text to the space-joined lemmas of its first
        ``max_words`` tokens.
    """
    pipeline = spacy.load('en', disable=['parser', 'ner'])

    def lemmatize(text):
        lemmas = []
        for token in pipeline(text):
            lemmas.append(token.lemma_)
        # Truncate after lemmatizing the whole text.
        return ' '.join(lemmas[:max_words])

    return lemmatize

def create_lemmatizer_nltk():
    """Build a lemmatize function backed by NLTK's WordNetLemmatizer.

    NLTK is imported here, so it is only required when this function is
    actually called.

    Returns:
        A function that splits a text on whitespace, lemmatizes each word
        and returns the first ``max_words`` lemmas joined by single spaces.
    """
    from nltk.stem import WordNetLemmatizer
    wordnet = WordNetLemmatizer()

    def lemmatize(text):
        lemmas = list(map(wordnet.lemmatize, text.split()))
        return ' '.join(lemmas[:max_words])

    return lemmatize

# Set up the lemmatize function. spacy.load may fail on Windows for 'en', in
# which case the NLTK-based lemmatizer is used instead.
try:
    lemmatize = create_lemmatizer_spacy()
except Exception as load_error:
    # Catch Exception rather than using a bare except, so that
    # KeyboardInterrupt / SystemExit still propagate, and report the cause.
    print("spacy lemmatizer unavailable: {}".format(load_error))
    print("Using nltk for lemmatization.")
    lemmatize = create_lemmatizer_nltk()
            
def process_text(text):
    """Clean a raw review with clean_text, then lemmatize the result."""
    cleaned = clean_text(text)
    return lemmatize(cleaned)

In [9]:
# Run every review through process_text and keep the result in a new column.
imdb_data['clean_review'] = imdb_data['sentence'].apply(process_text)

## Extract ELMo Embeddings

In [11]:
def batches(sentences, batch_size):
    """Yield the items of ``sentences`` in consecutive lists.

    Passing in a single large batch can run into memory issues, but passing
    one item at a time can slow down the overall process.

    Args:
        sentences: any iterable of items.
        batch_size: number of items per yielded list. A list is emitted as
            soon as it holds at least this many items, so a value below 1
            produces single-item lists.

    Yields:
        Lists of items in their original order; the last list may be
        shorter than ``batch_size``. Nothing is yielded for empty input.
    """
    chunk = []
    for s in sentences:
        chunk.append(s)
        if len(chunk) >= batch_size:
            yield chunk
            chunk = []
    # Emit whatever is left over after the last full batch.
    if chunk:
        yield chunk

def elmo_create_embedding_extractor(module, batch_size=20):
    """Create a function that embeds sentences with a TF-Hub module.

    A dedicated graph is built with a string placeholder feeding the hub
    module, and a MonitoredSession is opened on it. The session is kept by
    the returned closure and is never closed here.

    Args:
        module: module handle passed to ``hub.Module``.
        batch_size: number of sentences sent through the session per run.

    Returns:
        A function taking an iterable of sentences and returning a list
        with one embedding per sentence, in input order.
    """
    graph = tf.Graph()
    with graph.as_default():
        sentences = tf.placeholder(tf.string)
        # NOTE(review): trainable=True is kept as is; nothing in this
        # function trains the module.
        elmo_module = hub.Module(module, trainable=True)
        embeddings = elmo_module(sentences)
        session = tf.train.MonitoredSession()

    def extract(_sentences):
        collected = []
        for s_batch in batches(_sentences, batch_size):
            batch_embeddings = session.run(embeddings, feed_dict={sentences: s_batch})
            collected.extend(batch_embeddings)
            # Progress line with the running total of extracted embeddings.
            print("[{}] Extracted {}".format(datetime.now(), len(collected)))
        return collected

    return extract

# Build the extractor for the ELMo v2 hub module, embedding 100 sentences per run.
elmo_module_url = "https://tfhub.dev/google/elmo/2"
elmo_get_embedding = elmo_create_embedding_extractor(elmo_module_url, batch_size=100)

Instructions for updating:
Colocations handled automatically by placer.


W0606 22:41:32.518659 140706603112192 deprecation.py:323] From /anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [None]:
# Embed every cleaned review and report how long the extraction took.
current_time = datetime.now()
review_texts = imdb_data.clean_review.values
imdb_data['embedding'] = elmo_get_embedding(review_texts)
elapsed = datetime.now() - current_time
print("Extraction took time ", elapsed)

[2019-06-06 22:44:06.130025] Extracted 100
[2019-06-06 22:44:16.655238] Extracted 200
[2019-06-06 22:44:27.226981] Extracted 300
[2019-06-06 22:44:37.887041] Extracted 400
[2019-06-06 22:44:48.439659] Extracted 500
[2019-06-06 22:44:59.009956] Extracted 600


## Save embedding values to disk.

In [11]:
# Write the frame, including the new columns, to a gzipped pickle under data_dir.
embedding_pickle_path = "{}/imdb_data_w_elmo_embedding.pickle.gz".format(data_dir)
imdb_data.to_pickle(embedding_pickle_path)