# Using ELMo Embeddings In Another Classifier

This example uses the extracted ELMo text embedding values from the IMDB dataset as features for an XGBoost classifier.

In [1]:
import html

import numpy as np
import pandas as pd
from IPython.core.display import HTML
from IPython.display import display
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

In [2]:
# Directory holding the notebook's input files (a relative path, so it is
# resolved against the current working directory).
data_dir = "data"

## Load Data

In [3]:
# Load the pickled IMDB DataFrame that carries the extracted ELMo embeddings.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a source you trust.
imdb_data = pd.read_pickle("{}/imdb_data_w_elmo_embedding.pickle.gz".format(data_dir))

In [4]:
# Partition the rows using the split label stored in the `data_set` column.
train = imdb_data.loc[imdb_data["data_set"] == "train"]
test = imdb_data.loc[imdb_data["data_set"] == "test"]

## Train Classifier

In [5]:
def emb_to_ndarray(column):
    """Convert a column of per-row value sequences into a NumPy array.

    Args:
        column: Iterable (for example a pandas Series) whose items are
            themselves iterables of values, one item per row.

    Returns:
        The ``np.ndarray`` that ``np.array`` builds from one Python list per
        row; it is two-dimensional when every row has the same length.
    """
    rows = []
    for row in column:
        # Materialise each row as a plain list so np.array sees nested lists.
        rows.append(list(row))
    return np.array(rows)

In [6]:
# Train a depth-6 XGBoost classifier on the training embeddings, with the
# `polarity` column as the label. The fit call is left as the cell's last
# expression so the estimator's repr is still shown as output.
xgb = XGBClassifier(max_depth=6)
xgb.fit(X=emb_to_ndarray(train["embedding"]), y=train["polarity"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

## Test Classifier

In [7]:
# Score the held-out test rows; the second predict_proba column is the one
# this notebook treats as the positive-class probability.
predictions = xgb.predict_proba(emb_to_ndarray(test["embedding"]))
score = roc_auc_score(y_true=test["polarity"], y_score=predictions[:, 1])
print('prediction auc of xgb is {}.'.format(score))

prediction auc of xgb is 0.7828920101312457.


## Look At Movies (for fun)

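Ranks movies by the average positive-sentiment probability that the classifier assigns to their reviews. Note that the scores are computed for the whole dataset, including the reviews the classifier was trained on.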

In [8]:
# Predict class probabilities for every row of the dataset (training rows
# included) and store the second column on the frame as a new `score` column.
scores = xgb.predict_proba(emb_to_ndarray(imdb_data["embedding"]))
imdb_data['score'] = scores[:, 1]

In [9]:
# Summarise the per-review positive-sentiment probability for each movie:
# mean (used for ranking), minimum, maximum and the max-min spread.
# groupby().agg() computes these in one vectorised pass instead of a Python
# loop over groups, and yields an empty frame (rather than failing in
# sort_values) when there are no rows to aggregate.
movie_scores = (
    imdb_data.groupby('movie_id')['score']
    .agg(['mean', 'min', 'max'])
    .rename(columns={'mean': 'score', 'min': 'min_score', 'max': 'max_score'})
    .assign(score_range=lambda stats: stats['max_score'] - stats['min_score'])
    .reset_index()  # turn the movie_id group key back into a regular column
    .sort_values(by='score', ascending=False)  # highest mean score first
)

In [11]:
def display_movies(movie_scores):
    """Render a linked heading and up to two review sentences for each movie.

    For every row of ``movie_scores`` this displays an IMDB title link with
    the movie's score, followed by the first two ``sentence`` values found
    for that movie in the notebook-level ``imdb_data`` frame.

    Args:
        movie_scores: DataFrame with at least ``movie_id`` and ``score``
            columns, one row per movie to show.

    Returns:
        None. Output is produced through IPython's ``display``.
    """
    for _, movie in movie_scores.iterrows():
        # Escape the id before placing it in the href and link text so that
        # unexpected characters cannot break out of the markup.
        movie_id = html.escape(str(movie.movie_id))
        display(HTML("<a href='https://www.imdb.com/title/{}'>{}</a> Score: {}<br/>".format(
            movie_id,
            movie_id,
            movie.score
        )))
        reviews = imdb_data.loc[imdb_data.movie_id == movie.movie_id, 'sentence']
        # .iloc makes the "first two rows" selection explicitly positional.
        for sentence in reviews.iloc[:2]:
            # Review text is external data: escape it so any markup it
            # contains is shown literally instead of being rendered.
            display(HTML("<i>{}</i><p/>".format(html.escape(str(sentence)))))

In [12]:
# Show two randomly chosen movies whose mean positive probability is >= 0.9.
# A fixed random_state keeps the selection identical on every full re-run.
display_movies(movie_scores[movie_scores.score >= 0.9].sample(2, random_state=0))

In [18]:
# Show two randomly chosen movies whose mean positive probability is <= 0.1.
# A fixed random_state keeps the selection identical on every full re-run.
display_movies(movie_scores[movie_scores.score <= 0.1].sample(2, random_state=0))