# Attribute embedding using Neural Networks

### This notebook demonstrates the use of learned embeddings to understand relationships between different attributes associated with movies. 

### --- Attribute Embedding ---

In [4]:
# imports
from gensim.models.word2vec import Word2Vec

# Show the single nearest neighbour of each attribute token in the embedding space.
# NOTE(review): newer gensim releases expose this text-format loader on
# KeyedVectors instead of Word2Vec -- confirm against the installed version.
w2v = Word2Vec.load_word2vec_format('attr_embedding.txt', binary=False)

# (section heading, query tokens) pairs; a list keeps the print order fixed.
attribute_groups = [
    ('--- Actors ---', [
        'jackiechan', 'brucewillis', 'benstiller', 'willsmith',
        'johnnydepp', 'camerondiaz', 'charlizetheron',
    ]),
    ('--- Genres ---', [
        'comedy', 'action', 'drama', 'romance',
        'horror', 'adventure', 'animation',
    ]),
    ('--- Directors ---', [
        'stevenspielberg', 'stanleykubrick', 'christophernolan', 'jamescameron',
        'peterjackson', 'martinscorsese', 'quentintarantino',
    ]),
]

# print(<single expression>) produces the same output on Python 2 and Python 3.
for heading, tokens in attribute_groups:
    print(heading)
    for token in tokens:
        print(w2v.most_similar(positive=[token], topn=1))

--- Actors ---
[(u'bradbird', 0.5468278527259827)]
[(u'colinfarrell', 0.677302896976471)]
[(u'owenwilson', 0.8045462965965271)]
[(u'barrysonnenfeld', 0.7483898401260376)]
[(u'timburton', 0.7376018762588501)]
[(u'elizabethbanks', 0.6934854984283447)]
[(u'susansarandon', 0.6847547888755798)]
--- Genres ---
[(u'benstiller', 0.4251762628555298)]
[(u'crime', 0.5248401761054993)]
[(u'williamhurt', 0.3950503468513489)]
[(u'woodyallen', 0.47666582465171814)]
[(u'wescraven', 0.575240969657898)]
[(u'johnnydepp', 0.48994776606559753)]
[(u'davidschwimmer', 0.557050883769989)]
--- Directors ---
[(u'harrisonford', 0.6215734481811523)]
[(u'orsonwelles', 0.5145497918128967)]
[(u'ridleyscott', 0.7558119893074036)]
[(u'lindahamilton', 0.700534462928772)]
[(u'ianmckellen', 0.7576411366462708)]
[(u'leonardodicaprio', 0.7167000770568848)]
[(u'samuelljackson', 0.6927003264427185)]


In [5]:
# Combine attribute vectors and show the closest token to the result:
# `positive` tokens are passed as positive terms, `negative` ones as negative terms.
analogy_queries = [
    dict(positive=['jackiechan', 'action'], negative=['comedy']),
    dict(positive=['comedy', 'romance']),
    dict(positive=['stevenspielberg', 'horror']),
    dict(positive=['stevenspielberg', 'georgelucas'], negative=['harrisonford']),
]

# Keyword expansion keeps each call identical to spelling the arguments out by hand.
for query in analogy_queries:
    print(w2v.most_similar(topn=1, **query))

[(u'jetli', 0.45532864332199097)]
[(u'dianekeaton', 0.4479560852050781)]
[(u'samneill', 0.501855731010437)]
[(u'ewanmcgregor', 0.45728009939193726)]


### --- Movie Embedding ---

In [9]:
# import required modules
from gensim.models.doc2vec import Doc2Vec

# Load the trained document-embedding model from disk.
pvdbow_model = Doc2Vec.load('pvdbow_doc_embedding')

# (section heading, film titles) pairs; a list keeps the print order fixed.
movie_groups = [
    ('--- Movies ---', [
        'Forrest Gump', 'Inception', 'The Shawshank Redemption',
        'Pulp Fiction', 'Fight Club',
    ]),
    ('--- Movies with sequels ---', [
        'Rush Hour', 'Die Hard', 'Alien', 'The Matrix', 'Men in Black',
    ]),
]

# For every title, print its single most similar document tag.
# print(<single expression>) produces the same output on Python 2 and Python 3.
for heading, titles in movie_groups:
    print(heading)
    for film_title in titles:
        print(pvdbow_model.docvecs.most_similar(film_title, topn=1))

--- Movies ---
[('Cast Away', 0.9283027648925781)]
[('The Dark Knight Rises', 0.9137897491455078)]
[('Mystic River', 0.8028160929679871)]
[('Reservoir Dogs', 0.8898608684539795)]
[('The Curious Case of Benjamin Button', 0.811210036277771)]
--- Movies with sequels ---
[('Rush Hour 2', 0.890998363494873)]
[('Die Hard 2', 0.9552138447761536)]
[('Alien: Resurrection', 0.7708348631858826)]
[('The Matrix Reloaded', 0.891198992729187)]
[('Men in Black II', 0.9043750166893005)]


### Understanding Movie Success

In [10]:
# imports
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

Embedding extraction functions

In [12]:
def create_embedding_text(attribute):
    """
    Build one space-separated token string per film from comma-separated entity lists.
    Every entity is reduced to its alphanumeric characters only, so
    'Tom Hanks, Robin Wright' becomes 'TomHanks RobinWright'.
    :param attribute: Iterable of attribute values (actors, directors or genres), one per film.
                      Each value is converted with str() before it is split on commas.
    :return: List of strings, one per input value, with entities joined by single spaces.
    """
    def _alnum_only(entity):
        # Whitespace is never alphanumeric, so this filter also removes leading,
        # trailing and inner spaces; no separate strip/replace step is needed.
        return ''.join(ch for ch in entity if ch.isalnum())

    return [
        ' '.join(_alnum_only(piece) for piece in str(value).split(','))
        for value in attribute
    ]

def create_vocabulary(embedding_text):
    """
    Creates vocabulary from list of text.
    :param embedding_text: List of strings whose tokens are separated by whitespace.
    :return: Sorted list of the unique tokens found across all strings.
    """
    vocabulary = []
    for text in embedding_text:
        try:
            vocabulary.extend(text.split())
        except AttributeError:
            # Entries without a .split method (i.e. non-strings) are reported and
            # skipped. print(<single expression>) behaves the same on Python 2 and 3,
            # unlike the bare print statement, which is a syntax error on Python 3.
            print(text)
    # np.unique both de-duplicates and sorts; tolist() converts back to plain Python values.
    vocabulary = np.unique(vocabulary).tolist()
    return vocabulary

def extract_doc_embeddings(model, success_labels, budget_labels):
    """
    Split the model's document vectors into film, success-label and budget-label groups.
    Every document tag that is not listed as a success or budget label is treated as a film.
    :param model: Trained embedding model exposing `docvecs` (indexable by tag) and `docvecs.doctags`.
    :param success_labels: Success label names.
    :param budget_labels: Budget label names.
    :return: Tuple of three dictionaries mapping tag to vector:
             (film embeddings, success label embeddings, budget label embeddings).
    """
    success_label_embeddings = {label: model.docvecs[label] for label in success_labels}
    budget_label_embeddings = {label: model.docvecs[label] for label in budget_labels}

    # Exclude labels through a set instead of list.remove(): dict.keys() is a view
    # without .remove() on Python 3, and a label present in both label sequences
    # would make a second remove() fail.
    label_names = set(success_label_embeddings) | set(budget_label_embeddings)
    film_embeddings = {
        tag: model.docvecs[tag]
        for tag in model.docvecs.doctags
        if tag not in label_names
    }
    return film_embeddings, success_label_embeddings, budget_label_embeddings


def extract_word_embeddings(model, names, tolow=False):
    """
    Look up the embedding vector of each name and collect the ones the model knows.
    :param model: Trained embedding model, indexable by name (`model[name]`).
    :param names: List of entity names (actors, directors or genres).
    :param tolow: If True, lower-case each name before the lookup; the lower-cased
                  form is then also used as the dictionary key.
    :return: Dictionary mapping name to embedding; names that raise KeyError on lookup are left out.
    """
    found = {}
    for raw_name in names:
        key = raw_name.lower() if tolow else raw_name
        try:
            vector = model[key]
        except KeyError:
            # Best-effort lookup: names unknown to the model are skipped silently.
            continue
        found[key] = vector
    return found

def find_max_index(values, top_n):
    """
    Return the positions of the N largest entries, largest first.
    :param values: Array of values; it is flattened before sorting (axis=None),
                   so the returned indices refer to the flattened array.
    :param top_n: Number of highest values to return.
    :return: 1-d array of indices for the N largest values.
    """
    ascending_order = np.argsort(values, axis=None)
    # Reverse the ascending order to get largest-first, then keep the head.
    descending_order = ascending_order[::-1]
    return descending_order[:top_n]

def find_most_similar(embeddings, target_embedding, top_n=5):
    """
    Find the N most similar embeddings to a given embedding using cosine similarity.
    :param embeddings: Dictionary mapping name to embedding vector.
    :param target_embedding: Embedding vector (numpy array) against which similarity is computed.
    :param top_n: Number of most similar embeddings that should be returned.
    :return: 1-d array of names of the N most similar embeddings, most similar first.
    """
    # Materialise names and vectors as lists in one consistent order. On Python 3,
    # dict.keys()/dict.values() are views that neither np.array nor scikit-learn
    # treat as sequences, so passing them directly breaks.
    names = list(embeddings.keys())
    vectors = [embeddings[name] for name in names]
    # reshape(1, -1) turns the target into a single-row 2-d array for cosine_similarity.
    sims = cosine_similarity(vectors, target_embedding.reshape(1, -1))
    # find_max_index flattens `sims`, so its indices line up with `names`.
    top_index = find_max_index(sims, top_n)
    return np.array(names)[top_index]

Load model and extract embeddings

In [14]:
# Read the movie table and the trained embedding model from disk.
movie_df = pd.read_csv('movie_data.csv')
model = Doc2Vec.load('movie_success_embedding')

# For each attribute column: normalise the text, collect its vocabulary,
# then keep the vectors of the tokens the model knows.
word_embeddings = {}
for name in ['actors', 'director', 'genre']:
    attribute_text = create_embedding_text(movie_df[name])
    attribute_vocabulary = create_vocabulary(attribute_text)
    word_embeddings[name] = extract_word_embeddings(model, attribute_vocabulary)

actor_embeddings = word_embeddings['actors']
director_embeddings = word_embeddings['director']
genre_embeddings = word_embeddings['genre']

# The distinct values of the 'target' and 'budget' columns are passed as the label tags.
success_labels = np.unique(movie_df['target'])
budget_labels = np.unique(movie_df['budget'])
film_embeddings, success_embeddings, budget_embeddings = extract_doc_embeddings(
    model, success_labels, budget_labels)

Examine embeddings

In [15]:
# For each embedding family, print the three names closest to the 'hit' label vector.
hit_vector = model.docvecs['hit']
embedding_families = (
    actor_embeddings,
    director_embeddings,
    genre_embeddings,
    budget_embeddings,
    film_embeddings,
)
for candidates in embedding_families:
    print(find_most_similar(candidates, hit_vector, top_n=3))

['DennisQuaid' 'JamieLeeCurtis' 'KurtRussell']
['StevenSpielbergD' 'MartinScorseseD' 'StevenSoderberghD']
['Documentary' 'Drama' 'Short']
['budget_<10' 'budget_90to100' 'budget_70to80']
['Dead Poets Society' 'Indiana Jones and the Last Crusade'
 'Good Will Hunting']
