# ELMo

In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import time
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

W0604 19:13:19.210821 140382938605312 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
# Notebook-wide settings: show up to 200 characters per DataFrame cell and
# restrict TensorFlow logging to warnings and above.
pd.options.display.max_colwidth = 200
tf.logging.set_verbosity(tf.logging.WARN)

In [3]:
# Directory (relative to the working directory) that the pickled input
# DataFrame is read from and the result with embeddings is written to.
data_dir = "data"

## Load IMDB Data

In [4]:
# Load the pickled IMDB review DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
imdb_data = pd.read_pickle(f"{data_dir}/imdb_data.pickle.gz")

In [5]:
# Peek at two randomly chosen rows; no random_state is passed, so the rows
# displayed differ from run to run.
imdb_data.sample(2)

Unnamed: 0,data_set,polarity,sentence,movie_id
35859,train,0,"The saddest part of this is the fact that these are 87 minutes I'll never get back. I knew this was terrible from the get-go, with the guy dressed as a lunatic Indian chief on top of the roof. (Se...",tt0077668
2469,test,0,This movie was really stupid and I thought that it wasn't so bad and I could tolerate a movie about a bed eating people. Then the part near the end where the guy has skeleton hands ended up being ...,tt0385639


In [6]:
# Work on a random subset of the reviews (the recorded extraction run below
# took roughly a minute per 250 reviews).
# - random_state pins the subset so a re-run embeds the same reviews.
# - min(...) avoids the ValueError that sample() raises when asked for more
#   rows than the frame holds.
sample_size = 800
random_seed = 42
imdb_data = imdb_data.sample(n=min(sample_size, len(imdb_data)), random_state=random_seed)

## Cleaning Text

In [7]:
# Maximum number of lemmatized tokens kept per review (used by lemmatize).
max_words = 128

# Pre-compiled patterns used by clean_text.
# URLs: "http" plus everything up to the next whitespace.
text_http_re  = re.compile(r'http\S+')
# Any single ASCII digit.
text_digit_re = re.compile(r'[0-9]')
# Short HTML-like tags: at most 20 characters between '<' and '>'.
text_punc_re  = re.compile('[' + re.escape('\'!"#$%&()*+-/:;<=>?@[\\]^_`{|}~') + ']')
text_html_re  = re.compile(r'<[^>]{0,20}>')
# text_punc_re above matches the listed punctuation characters; note that
# ',' and '.' are not part of the set.
# Whitespace runs. Raw string: '\s' inside a plain literal is an invalid
# escape sequence (DeprecationWarning since Python 3.6).
text_ws_re    = re.compile(r'\s+')

# Load spaCy's 'en' model with the parser and NER components disabled; this
# notebook only reads token lemmas from the pipeline (see lemmatize).
nlp = spacy.load('en', disable=['parser', 'ner'])

def clean_text(text):
    """Normalize raw review text ahead of lemmatization.

    Steps, in order: lowercase; remove URLs; replace short HTML-like tags
    with a space; replace digits with a space; remove the punctuation matched
    by ``text_punc_re``; collapse whitespace runs to one space; strip the ends.

    Args:
        text: The raw review text (``str``).

    Returns:
        The cleaned, lowercased text.
    """
    text = text.lower()
    text = text_http_re.sub('', text)
    # Tags such as "<br />" often sit between words with no surrounding
    # whitespace. Substituting a space (rather than '') keeps
    # "end.<br /><br />next" from fusing into the single token "end.next";
    # any extra spaces are collapsed by text_ws_re below.
    text = text_html_re.sub(' ', text)
    text = text_digit_re.sub(' ', text)
    text = text_punc_re.sub('', text)
    return text_ws_re.sub(' ', text).strip()

def lemmatize(text):
    """Return the lemmas of the first ``max_words`` tokens of ``text``.

    The text is run through the module-level spaCy pipeline ``nlp``; the
    lemmas of the leading tokens are joined with single spaces.
    """
    lemmas = [tok.lemma_ for tok in nlp(text)]
    kept = lemmas[:max_words]
    return ' '.join(kept)

def process_text(text):
    """Clean ``text`` with ``clean_text``, then lemmatize the cleaned string."""
    cleaned = clean_text(text)
    return lemmatize(cleaned)

In [8]:
# Clean and lemmatize every review; the result is stored in a new column.
imdb_data['clean_review'] = imdb_data['sentence'].apply(process_text)

## Extract ELMo Embeddings

In [9]:
def batches(sentences, batch_size):
    """Yield consecutive lists of up to ``batch_size`` items from ``sentences``.

    Args:
        sentences: Any iterable; items keep their original order.
        batch_size: Number of items per batch. For ``batch_size >= 1`` every
            batch except possibly the last holds exactly this many items.

    Yields:
        A new list for each batch. Leftover items are yielded as a final,
        shorter list; nothing is yielded for an empty input.
    """
    chunk = []
    for item in sentences:
        chunk.append(item)
        if len(chunk) >= batch_size:
            yield chunk
            # Start a fresh list so the batch already handed out is not mutated.
            chunk = []
    if chunk:
        yield chunk

def elmo_create_embedding_extractor(module, batch_size=20):
    """Build a function that embeds sentences with a TF-Hub module.

    A dedicated graph is created containing a string placeholder and the hub
    module applied to it, and a ``tf.train.MonitoredSession`` is opened on
    that graph. The session is kept open so the returned function can be
    called repeatedly.

    Args:
        module: Module handle passed to ``hub.Module`` (e.g. a tfhub.dev URL).
        batch_size: Number of sentences fed to the session per ``run`` call.

    Returns:
        A function ``extract(sentences)`` that runs the module over
        ``sentences`` in batches, prints a timestamped progress line after
        each batch, and returns a list with one entry per row of the module's
        output. Call ``extract.close()`` to release the underlying session
        once no more embeddings are needed.
    """
    with tf.Graph().as_default():
        sentences = tf.placeholder(tf.string)
        # The extractor is inference-only (nothing here runs an optimizer), so
        # the module's variables are not registered as trainable.
        embed = hub.Module(module, trainable=False)
        embeddings = embed(sentences)
        session = tf.train.MonitoredSession()

    def extract(_sentences):
        """Embed ``_sentences`` batch by batch and return the collected rows."""
        results = []
        for s_batch in batches(_sentences, batch_size):
            results.extend(session.run(embeddings, { sentences: s_batch }))
            print(f"[{datetime.now()}] Extracted {len(results)}")
        return results

    # The session otherwise lives as long as the closure; give callers a way
    # to free it explicitly.
    extract.close = session.close

    return extract

# Build the ELMo (v2) extractor; sentences are fed to the session 250 at a time.
elmo_get_embedding = elmo_create_embedding_extractor(
    module="https://tfhub.dev/google/elmo/2",
    batch_size=250,
)

Instructions for updating:
Colocations handled automatically by placer.


W0604 19:19:01.673768 140382938605312 deprecation.py:323] From /anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [10]:
# Embed every cleaned review and report how long the extraction took.
# current_time holds the start timestamp; the printed value is the elapsed
# wall-clock time as a timedelta.
current_time = datetime.now()
# One embedding per review is stored in the new 'embedding' column.
imdb_data['embedding'] = elmo_get_embedding(imdb_data.clean_review.values)
print("Extraction took time ", datetime.now() - current_time)

[2019-06-04 19:22:05.545622] Extracted 250
[2019-06-04 19:23:07.639448] Extracted 500
[2019-06-04 19:24:09.621259] Extracted 750
[2019-06-04 19:24:26.948774] Extracted 800
Extraction took time  0:03:24.387041


In [None]:
# Persist the sampled reviews together with their cleaned text and ELMo
# embeddings for later use.
imdb_data.to_pickle(f"{data_dir}/imdb_data_w_elmo_embedding.pickle.gz")