# ELMo: Create a Classifier From Embeddings

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from IPython.core.display import HTML

In [2]:
# Directory (relative path) that the pickled input data is read from.
data_dir = "data"

In [3]:
# Load the review data frame. Later cells read its data_set, embedding,
# polarity, movie_id and sentence columns.
# NOTE(review): read_pickle unpickles the file, so only load it from a trusted source.
imdb_data = pd.read_pickle(f"{data_dir}/imdb_data_w_elmo_embedding.pickle.gz")

In [4]:
# Split the rows into train and test subsets using the data_set marker column.
train = imdb_data[imdb_data.data_set == "train"]
test  = imdb_data[imdb_data.data_set == "test"]

In [5]:
def emb_to_ndarray(column):
    """Convert a column of per-row embeddings into a single NumPy array.

    Each element of ``column`` is expanded into a plain list of its cells,
    and the resulting list of lists is handed to ``np.array``.
    """
    rows_as_lists = [list(row) for row in column]
    return np.array(rows_as_lists)

In [6]:
# Fit an XGBoost classifier (max tree depth 6) on the training embeddings,
# using the polarity column as the label.
xgb = XGBClassifier(max_depth=6)
xgb.fit(emb_to_ndarray(train.embedding), train.polarity) 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [7]:
# Score the test rows and report ROC AUC against their polarity labels.
# Column 1 of predict_proba is used as the score; presumably this is the
# positive class. Verify against xgb.classes_.
predictions = xgb.predict_proba(emb_to_ndarray(test.embedding)) 
score = roc_auc_score(test.polarity, predictions[:,1])
print(f'prediction auc of xgb is {score}.')

prediction auc of xgb is 0.7955555555555556.


## Look At Movies

In [8]:
# Score every row of imdb_data (train and test alike) and store column 1 of
# predict_proba as the 'score' column.
# NOTE(review): training rows are scored in-sample here, so their scores are
# presumably optimistic. Confirm this is intended.
scores = xgb.predict_proba(emb_to_ndarray(imdb_data.embedding))
imdb_data['score'] = scores[:,1]

In [9]:
# Summarise the predicted positive-sentiment probability per movie:
# mean, minimum, maximum, and the spread between the extremes.
movie_scores = pd.DataFrame([
    {
        'movie_id': movie_id,
        'score': reviews.score.mean(),
        'min_score': reviews.score.min(),
        'max_score': reviews.score.max(),
        'score_range': reviews.score.max() - reviews.score.min(),
    }
    for movie_id, reviews in imdb_data.groupby('movie_id')
])
# Highest average score first.
movie_scores = movie_scores.sort_values(by='score', ascending=False)

In [10]:
def display_movies(movie_scores):
    """Render each movie as an IMDb link with its score and sample sentences.

    For every row of ``movie_scores`` this displays a link built from the
    row's ``movie_id`` followed by its ``score``, then the first two entries
    of the ``sentence`` column for that movie, looked up in the notebook-level
    ``imdb_data`` frame. Values are interpolated into the HTML without
    escaping.
    """
    for _, movie in movie_scores.iterrows():
        movie_id = movie.movie_id
        link_html = f"<a href='https://www.imdb.com/title/{movie_id}'>{movie_id}</a> Score: {movie.score}<br/>"
        display(HTML(link_html))
        sentences = imdb_data[imdb_data.movie_id == movie_id].sentence
        for sentence in sentences[0:2]:
            display(HTML(f"<i>{sentence}</i><p/>"))

In [11]:
# Show two randomly sampled movies whose mean score is at least 0.9.
# NOTE(review): sample(2) is unseeded, so the selection changes between runs,
# and it should raise ValueError if fewer than two movies pass the filter.
display_movies(movie_scores[movie_scores.score >= 0.9].sample(2))

In [12]:
# Show two randomly sampled movies whose mean score is at most 0.1.
# NOTE(review): sample(2) is unseeded, so the selection changes between runs,
# and it should raise ValueError if fewer than two movies pass the filter.
display_movies(movie_scores[movie_scores.score <= 0.1].sample(2))