# ELMo

In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import time
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

W0605 05:42:06.724507 140617860892416 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
# Let pandas show up to 200 characters per column value before truncating.
pd.set_option('display.max_colwidth', 200)
# Set TensorFlow's logging verbosity threshold to WARN.
tf.logging.set_verbosity(tf.logging.WARN)

In [3]:
# Directory (relative to the working directory) that the notebook reads its
# pickled input from and writes its pickled output to.
data_dir = "data"

## Load IMDB Data

In [4]:
# Load the IMDB review DataFrame from a pickle file (the .gz suffix suggests
# gzip compression, which pandas presumably infers from the file name).
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
imdb_data = pd.read_pickle(f"{data_dir}/imdb_data.pickle.gz")

In [5]:
# Preview two randomly chosen rows.
imdb_data.sample(2)

Unnamed: 0,data_set,polarity,sentence,movie_id
31288,train,0,"Gayniggers from Outer Space is a short foreign film about black, gay aliens who explore the galaxy until they stumble upon Earth. Being gay, their goal is to have a male-only universe in which all...",tt0274518
5967,test,0,"Based on the 2007 spy novel by David Ignatius, Body of Lies tells the story of a CIA operative Roger Ferris (DiCaprio) who is sent to Jordan to track down an Al-Qaeda mastermind, all the while tre...",tt0758774


In [6]:
# Continue with a random subset of at most 800 reviews (presumably to keep the
# embedding extraction below quick).
# - random_state pins the subset so a Restart & Run All reproduces the same rows.
# - min() avoids the ValueError sample() raises when asked for more rows than exist.
imdb_data = imdb_data.sample(n=min(800, len(imdb_data)), random_state=42)

## Cleaning Text

In [7]:
# Maximum number of lemmas kept per text by the lemmatizers defined below.
max_words = 128

# Pre-compiled patterns used by clean_text.
text_http_re  = re.compile(r'http\S+')        # "http" followed by any non-whitespace run
text_digit_re = re.compile(r'[0-9]')          # a single ASCII digit
text_html_re  = re.compile(r'<[^>]{0,20}>')   # short tags such as <br />
# Punctuation to strip; note that ',' and '.' are not in the set and are kept.
text_punc_re  = re.compile('[' + re.escape('\'!"#$%&()*+-/:;<=>?@[\\]^_`{|}~') + ']')
# Raw string: '\s' inside a plain string literal is an invalid escape sequence.
text_ws_re    = re.compile(r'\s+')

def clean_text(text):
    """Normalize a raw review string.

    Steps, in order: lower-case, drop URLs, replace HTML tags and digits with
    a space, delete the punctuation matched by ``text_punc_re``, collapse
    whitespace runs to a single space and strip both ends.

    Args:
        text: the raw review text (str).

    Returns:
        The cleaned text (str); empty if nothing survives the cleaning.
    """
    text = text.lower()
    text = text_http_re.sub('', text)
    # Substitute a space (not '') so words separated only by a tag, e.g.
    # "end.<br /><br />start", are not glued together. Extra spaces are
    # collapsed by the whitespace pass below.
    text = text_html_re.sub(' ', text)
    text = text_digit_re.sub(' ', text)
    text = text_punc_re.sub('', text)
    text = text_ws_re.sub(' ', text)
    return text.strip()

def create_lemmatizer_spacy():
    """Build a spaCy-backed lemmatizer.

    Loads the 'en' spaCy model with the parser and NER components disabled.

    Returns:
        A function mapping a text to a space-joined string of the lemmas of
        its first ``max_words`` tokens.
    """
    pipeline = spacy.load('en', disable=['parser', 'ner'])

    def lemmatize(text):
        lemmas = [token.lemma_ for token in pipeline(text)]
        return ' '.join(lemmas[:max_words])

    return lemmatize

def create_lemmatizer_nltk():
    """Build an nltk WordNet-backed lemmatizer.

    nltk is imported inside the function, so it is only required when this
    lemmatizer is actually created.

    Returns:
        A function that splits a text on whitespace, lemmatizes each word and
        returns the first ``max_words`` lemmas joined by single spaces.
    """
    from nltk.stem import WordNetLemmatizer

    wordnet = WordNetLemmatizer()

    def lemmatize(text):
        lemmas = list(map(wordnet.lemmatize, text.split()))
        return ' '.join(lemmas[:max_words])

    return lemmatize

# Set up the module-level `lemmatize` function. spaCy is tried first; loading
# its 'en' model may fail (e.g. on Windows), in which case nltk is used instead.
try:
    lemmatize = create_lemmatizer_spacy()
except Exception as err:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt
    # and SystemExit still propagate, and report why spaCy was skipped.
    print("Using nltk for lemmatization.")
    print(f"(spaCy lemmatizer unavailable: {err!r})")
    lemmatize = create_lemmatizer_nltk()
            
def process_text(text):
    """Run ``clean_text`` on ``text`` and pass the result to ``lemmatize``."""
    cleaned = clean_text(text)
    return lemmatize(cleaned)

In [8]:
# Clean and lemmatize every review and keep the result in a new column.
imdb_data['clean_review'] = imdb_data['sentence'].apply(process_text)

## Extract ELMo Embeddings

In [9]:
def batches(sentences, batch_size):
    """Yield consecutive lists of up to ``batch_size`` items from ``sentences``.

    Args:
        sentences: any iterable of items.
        batch_size: number of items per yielded list.

    Yields:
        Lists of ``batch_size`` items in input order; the final list may be
        shorter. Nothing is yielded for an empty iterable.
    """
    chunk = []
    for sentence in sentences:
        chunk.append(sentence)
        if len(chunk) >= batch_size:
            yield chunk
            chunk = []
    # Flush the final, partially filled batch.
    if chunk:
        yield chunk

def elmo_create_embedding_extractor(module, batch_size=20):
    """Create a function that computes embeddings with a TF-Hub text module.

    A dedicated TensorFlow graph holding the module is built, and a session
    bound to that graph is opened. Both are kept alive by the returned
    closure.

    Args:
        module: TF-Hub module handle passed to ``hub.Module``.
        batch_size: number of sentences fed to the session per run.

    Returns:
        A function ``extract(sentences)`` that feeds the sentences through the
        module in batches, prints a progress line after each batch and
        returns a list with the module's default output for every sentence
        (presumably one fixed-size vector per sentence -- TODO confirm against
        the module documentation). ``extract.close()`` closes the underlying
        session.
    """
    with tf.Graph().as_default():
        sentences = tf.placeholder(tf.string)
        # The module is only used for inference here, so its variables do not
        # need to be registered as trainable.
        embed = hub.Module(module, trainable=False)
        embeddings = embed(sentences)
        session = tf.train.MonitoredSession()

    def extract(_sentences):
        results = []
        for s_batch in batches(_sentences, batch_size):
            results.extend(session.run(embeddings, { sentences: s_batch }))
            print(f"[{datetime.now()}] Extracted {len(results)}")
        return results

    # Give callers a way to release the session; previously it could never be
    # closed because it was only reachable from inside the closure.
    extract.close = session.close
    return extract

# Build the extractor for the ELMo v2 TF-Hub module, 50 sentences per session run.
elmo_get_embedding = elmo_create_embedding_extractor(
    module="https://tfhub.dev/google/elmo/2",
    batch_size=50,
)

Instructions for updating:
Colocations handled automatically by placer.


W0605 05:42:27.012676 140617860892416 deprecation.py:323] From /anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [10]:
# Wall-clock timing: the start timestamp is taken right before the extraction
# call and the elapsed time is printed right after it, so keep this order.
current_time = datetime.now()
# Embed every cleaned review and store the returned list in a new
# 'embedding' column (the extractor prints one progress line per batch).
imdb_data['embedding'] = elmo_get_embedding(imdb_data.clean_review.values)
print("Extraction took time ", datetime.now() - current_time)

[2019-06-05 05:42:36.879758] Extracted 50
[2019-06-05 05:42:44.167155] Extracted 100
[2019-06-05 05:42:51.343977] Extracted 150
[2019-06-05 05:42:58.480070] Extracted 200
[2019-06-05 05:43:05.660219] Extracted 250
[2019-06-05 05:43:12.939412] Extracted 300
[2019-06-05 05:43:20.127130] Extracted 350
[2019-06-05 05:43:27.281750] Extracted 400
[2019-06-05 05:43:34.351196] Extracted 450
[2019-06-05 05:43:41.441912] Extracted 500
[2019-06-05 05:43:48.537823] Extracted 550
[2019-06-05 05:43:55.594576] Extracted 600
[2019-06-05 05:44:02.647957] Extracted 650
[2019-06-05 05:44:09.644633] Extracted 700
[2019-06-05 05:44:16.726220] Extracted 750
[2019-06-05 05:44:23.815015] Extracted 800
Extraction took time  0:01:55.273435


In [11]:
# Save the DataFrame, which now also carries the 'clean_review' and
# 'embedding' columns, as a pickle file under data_dir.
imdb_data.to_pickle(f"{data_dir}/imdb_data_w_elmo_embedding.pickle.gz")