# Word Embeddings
Examples of training word-embedding models and exploring some of their applications.

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

## Create corpora based on genre

In [2]:
# Movie genres for which a separate corpus (and, later, a separate
# embedding model) is built. Order is preserved by the loops below.
genres = [
    'crime',
    'mystery',
    'romance',
    'thriller',
    'action',
    'drama',
    'comedy',
]

In [3]:
# Data source handed to MovieDialogCollection in the corpus-building cell.
db_name = 'movie-dialogs'
collection = 'lines'

# Reusable pipeline stages (they look like MongoDB aggregation stages --
# TODO confirm against MovieDialogCollection).
# `u`: unwind the genres array found under character.movie.genres.
u = {
    '$unwind': '$character.movie.genres',
}
# `m`: match on a None genre value.
# NOTE(review): not referenced by any cell visible in this notebook.
m = {
    '$match': {
        'character.movie.genres': None,
    },
}
# `p`: keep only `id` and `text`, dropping `_id`.
p = {
    '$project': {
        '_id': 0,
        'id': 1,
        'text': 1,
    },
}

In [4]:
from langmodels.corpora.moviedialog import MovieDialogCollection

In [5]:
# Build one MovieDialogCollection per genre, keyed by genre name.
corpora = {}
for genre in tqdm_notebook(genres):
    # Stage that keeps only documents tagged with the current genre.
    genre_match = {'$match': {'character.movie.genres': genre}}
    # The same match stage is applied both before and after the unwind
    # stage `u`, followed by the projection `p`.
    pipeline = [genre_match, u, genre_match, p]
    corpora[genre] = MovieDialogCollection(
        db_name,
        collection,
        use_pos=False,
        pipeline=pipeline,
    )

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




In [6]:
# For every genre, collect the token triples produced by
# get_skip_tokens(n=3, s=2) as a flat list of 3-item lists.
sequences = {}
for genre in tqdm_notebook(genres):
    skip_tokens = corpora[genre].get_skip_tokens(n=3, s=2)
    # Each yielded item is a (doc, tokens) pair; the doc part is not needed
    # here, and every element of `tokens` unpacks into exactly three values.
    sequences[genre] = [
        [a, b, c]
        for _doc, tokens in skip_tokens
        for a, b, c in tokens
    ]

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




# Report how many token sequences were collected for each genre.
for genre_name, genre_sequences in sequences.items():
    print(genre_name, len(genre_sequences))

## Training models

In [33]:
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

In [22]:
# Train one Word2Vec model per genre on that genre's token sequences.
# min_count=50 makes gensim ignore words that occur fewer than 50 times,
# so rare words will be missing from a genre's vocabulary.
models = {}
for genre, sequence in sequences.items():
    # Use the value yielded by items() directly instead of looking the
    # same entry up again through sequences[genre].
    models[genre] = Word2Vec(sequence, min_count=50)

In [27]:
# Compare the five nearest neighbours of a single word across the
# per-genre models.
word = 'killer'
for genre, model in models.items():
    if word not in model.wv:
        # Words pruned from this genre's vocabulary would make
        # most_similar raise KeyError and abort the whole comparison.
        print(genre, '<%r not in vocabulary>' % word)
        continue
    # Ask for exactly five neighbours rather than fetching the default
    # ten and slicing the result.
    neighbours = model.wv.most_similar(positive=word, topn=5)
    print(genre, [neighbour for neighbour, _score in neighbours])

crime ['vicious', 'hostage', 'enemy', 'speck', 'male']
mystery ['monster', 'racket', 'murderer', 'toon', 'existence']
romance ['headache', 'phony', 'background', 'genius', 'sales']
thriller ['speck', 'businessman', 'lunatic', 'psycho', 'male']
action ['cigarette', 'huge', 'citizen', 'drug', 'thief']
drama ['nazi', 'winner', 'waiter', 'member', 'junkie']
comedy ['shitty', 'professional', 'thick', 'heroic', 'victory']


## Pre-trained vectors

In [34]:
# Load pre-trained word2vec vectors stored in the binary word2vec format.
# NOTE(review): this is a machine-specific absolute path; point it at your
# local copy of the vectors before running.
file = '/Users/alfio/Dati/wordembeddings/word2vec/GoogleNews-vectors-negative300.bin'
# Bind the vectors to `gen` only: `model` is the loop variable used for the
# per-genre models elsewhere in this notebook and should not be aliased to
# the pre-trained vectors.
gen = KeyedVectors.load_word2vec_format(file, binary=True)

In [36]:
# Nearest neighbours of 'killer' in the pre-trained vectors, for comparison
# with the per-genre models.
gen.most_similar(positive=['killer'])

[('killers', 0.76349937915802),
 ('murderer', 0.6645797491073608),
 ('Killer', 0.6424514055252075),
 ('serial_killer', 0.5998541712760925),
 ('rapist', 0.5730096697807312),
 ('slayer', 0.5356388092041016),
 ('strangler', 0.5315794944763184),
 ('cop_killer', 0.5244623422622681),
 ('serial_rapist', 0.5186691880226135),
 ('poisoner', 0.5164259076118469)]

In [40]:
# Analogy query on the pre-trained vectors: king - man + woman.
gen.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
)

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [43]:
# Run the same analogy (king - man + woman) on every per-genre model and
# print only the best-ranked answer.
analogy_positive = ['king', 'woman']
analogy_negative = ['man']
for genre, model in models.items():
    missing = [w for w in analogy_positive + analogy_negative
               if w not in model.wv]
    if missing:
        # A word pruned from this genre's vocabulary would make
        # most_similar raise KeyError and abort the whole comparison.
        print(genre, '<not in vocabulary: %s>' % ', '.join(missing))
        continue
    # Only the top answer is shown, so request a single neighbour instead
    # of the default ten.
    best = model.wv.most_similar(positive=analogy_positive,
                                 negative=analogy_negative,
                                 topn=1)
    print(genre, best[0][0])

crime bomb
mystery birds
romance queen
thriller cheese
action queen
drama child
comedy patient
