# Word Embeddings
Examples of training word-embedding models, followed by some of their applications.

In [1]:
import os

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Create corpora based on genre

In [2]:
# Movie genres iterated over by the corpus-building cells below.
genres = 'crime mystery romance thriller action drama comedy'.split()

In [3]:
# Database and collection names handed to MovieDialogCollection.
db_name, collection = 'movie-dialogs', 'lines'

# Pipeline stages, named after their operator: u = $unwind, m = $match, p = $project.
# `m` matches a null genre and is not referenced by the other visible cells.
u, m, p = (
    {'$unwind': '$character.movie.genres'},
    {'$match': {'character.movie.genres': None}},
    {'$project': {'_id': 0, 'id': 1, 'text': 1}},
)

In [4]:
from langmodels.corpora.moviedialog import MovieDialogCollection

In [5]:
# Build one MovieDialogCollection per genre, keyed by genre name.
corpora = {}
for genre in tqdm(genres):
    # The same $match stage is applied both before and after the $unwind stage.
    genre_match = {'$match': {'character.movie.genres': genre}}
    corpora[genre] = MovieDialogCollection(
        db_name,
        collection,
        use_pos=False,
        pipeline=[genre_match, u, genre_match, p],
    )

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [6]:
# For every genre, flatten the token triples yielded by get_skip_tokens(n=3, s=2)
# into one list of 3-item lists.
sequences = {}
for genre in tqdm(genres):
    sequences[genre] = [
        [a, b, c]
        for _doc, tokens in corpora[genre].get_skip_tokens(n=3, s=2)
        for a, b, c in tokens
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))




# Report how many training sequences were collected for each genre.
for genre_name, genre_sequences in sequences.items():
    print(genre_name, len(genre_sequences))

## Training models

In [7]:
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

In [8]:
# Train one Word2Vec model per genre on that genre's sequences.
# min_count=50 is passed straight to Word2Vec.
models = {}
for genre, sequence in sequences.items():
    # Use the loop variable directly instead of re-indexing `sequences`.
    models[genre] = Word2Vec(sequence, min_count=50)

In [11]:
# Show, for each genre model, the five terms most similar to `word`
# with the similarity rounded to two decimals.
word = 'love'
for genre, model in models.items():
    # Ask for exactly five neighbours rather than slicing a longer default result.
    neighbours = model.wv.most_similar(positive=word, topn=5)
    print(genre, [(term, round(score, 2)) for term, score in neighbours])

crime [('touch', 0.56), ('trust', 0.55), ('hate', 0.54), ('loved', 0.52), ('believe', 0.51)]
mystery [('hate', 0.67), ('remind', 0.62), ('charge', 0.61), ('touch', 0.6), ('believe', 0.6)]
romance [('touch', 0.61), ('hate', 0.58), ('interested', 0.51), ('speak', 0.51), ('trusted', 0.47)]
thriller [('hate', 0.61), ('touch', 0.55), ('trust', 0.54), ('believe', 0.51), ('happy', 0.51)]
action [('hate', 0.61), ('asking', 0.57), ('cry', 0.54), ('watched', 0.53), ('bullshit', 0.53)]
drama [('hate', 0.62), ('trust', 0.59), ('touch', 0.54), ('loved', 0.54), ('believe', 0.51)]
comedy [('hate', 0.56), ('loved', 0.53), ('contact', 0.52), ('talked', 0.49), ('touch', 0.49)]


## Pre-trained vectors

In [12]:
# Path of the word2vec binary to load. Set the WORD2VEC_PATH environment
# variable to point at a local copy; the original hard-coded location is
# kept as the fallback so existing setups keep working.
file = os.environ.get(
    'WORD2VEC_PATH',
    '/Users/alfio/Dati/wordembeddings/word2vec/GoogleNews-vectors-negative300.bin')
# `model` is bound to the same object as `gen`; the per-genre loops in other
# cells rebind `model`, so use `gen` to refer to these vectors.
gen = model = KeyedVectors.load_word2vec_format(
    file, binary=True)

In [18]:
# Terms closest to 'doctor' in the vectors loaded into `gen`.
gen.most_similar('doctor')

[('physician', 0.7806021571159363),
 ('doctors', 0.7476574182510376),
 ('gynecologist', 0.6947518587112427),
 ('surgeon', 0.6793397665023804),
 ('dentist', 0.6785441637039185),
 ('pediatrician', 0.664313793182373),
 ('pharmacist', 0.653485894203186),
 ('neurologist', 0.6517742872238159),
 ('cardiologist', 0.635229766368866),
 ('nurse', 0.6319523453712463)]

In [16]:
# Query with four sport names supplied together as positive examples.
gen.most_similar(positive=['soccer', 'football', 'rugby', 'tennis'])

[('basketball', 0.7433048486709595),
 ('fooball', 0.6796388626098633),
 ('volleyball', 0.6659425497055054),
 ('hockey', 0.654636800289154),
 ('Soccer', 0.6531394720077515),
 ('baseball', 0.650175929069519),
 ('softball', 0.6397534012794495),
 ('baskeball', 0.6388416290283203),
 ('sports', 0.6365107893943787),
 ('athletics', 0.6245771646499634)]

In [17]:
# Analogy-style query: 'king' and 'woman' as positive terms, 'man' as negative.
gen.most_similar(positive=['king', 'woman'], negative=['man'])

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [23]:
# Raw embedding vectors for 'class' and 'school'.
# get_vector replaces word_vec, which is deprecated in gensim 4.
cl = gen.get_vector('class')
sc = gen.get_vector('school')

In [24]:
# Neighbours of the 'class' vector itself; the word it came from is expected to rank first.
gen.similar_by_vector(cl)

[('class', 1.0),
 ('classes', 0.6023270487785339),
 ('classs', 0.560172438621521),
 ('Faithfully_Fit_exercise', 0.5586338043212891),
 ('Class', 0.5171001553535461),
 ('Ramla_mixed', 0.48645803332328796),
 ('grade', 0.4762037992477417),
 ('middle', 0.46737974882125854),
 ('IKON_integrates', 0.4630744457244873),
 ('runic_setup_signifies', 0.45063671469688416)]

In [25]:
# Neighbours of the 'school' vector itself; the word it came from is expected to rank first.
gen.similar_by_vector(sc)

[('school', 1.0),
 ('elementary', 0.7868632078170776),
 ('schools', 0.7411909103393555),
 ('shool', 0.6692329049110413),
 ('elementary_schools', 0.6597154140472412),
 ('kindergarten', 0.6529810428619385),
 ('eighth_grade', 0.6488089561462402),
 ('School', 0.6477997899055481),
 ('teacher', 0.63824063539505),
 ('students', 0.6301522850990295)]

In [28]:
# Element-wise mean of the 'class' and 'school' vectors.
stacked = np.stack([cl, sc], axis=0)
mx = stacked.mean(axis=0)

In [31]:
# Neighbours of mx, the averaged 'class'/'school' vector.
gen.similar_by_vector(mx)

[('school', 0.8400036096572876),
 ('class', 0.819097101688385),
 ('elementary', 0.6654949188232422),
 ('classes', 0.6472808122634888),
 ('eighth_grade', 0.6427940130233765),
 ('shool', 0.6210751533508301),
 ('schools', 0.6050800085067749),
 ('students', 0.5921556949615479),
 ('classroom', 0.5894216299057007),
 ('classs', 0.5824296474456787)]

In [19]:
# Run the same positive ['king', 'woman'] / negative ['man'] query on every
# genre model and print only the top-ranked word.
for genre, model in models.items():
    ranked = model.wv.most_similar(
        positive=['king', 'woman'], negative=['man'])
    print(genre, ranked[0][0])

crime sting
mystery beauty
romance queen
thriller dumper
action grail
drama child
comedy aware
