In [1]:
# !python -m pip install gensim

In [2]:
from gensim.models import Word2Vec, KeyedVectors

In [3]:
import pandas as pd
import nltk
import json
import string
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Antonio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
pd.set_option('mode.chained_assignment',None)

## Trazendo o Keyed Vectors do arquivo texto

In [6]:
modelo = KeyedVectors.load_word2vec_format("C:/Users/Antonio/Downloads/en_wiki_word2vec_300/en_wiki_word2vec_300.txt")

## Carregando o Dataset

In [7]:
data_set = pd.read_csv("../../dados/movies_metadata.csv")

  data_set = pd.read_csv("../../dados/movies_metadata.csv")


In [8]:
data_set = data_set[["genres", "overview", "original_title", "original_language"]]

In [9]:
movies = data_set.query("original_language == 'en'")

In [53]:
movies["overview"].dropna(inplace=True)
movies["genero"].dropna(inplace=True)

## Acessando o apenas o primeiro genero

In [54]:
def extrair_primeiro_genero( texto ):
    # print(texto)
    txt = texto.replace("'", '"')
    lista = json.loads(txt)
    if len(lista):
        return lista[0]['name']
    else:
        return "unknown"

In [55]:
movies["genero"] = movies["genres"].apply(extrair_primeiro_genero)

In [56]:
lista_generos = list(movies["genero"].unique())

In [57]:
def extrair_genero_id( genero ):
    global lista_generos
    num_genero = 11
    try:
        num_genero = lista_generos.index(genero)
    except: 
        pass
    return num_genero

In [58]:
movies["genero_id"] = movies["genero"].apply(extrair_genero_id)

## Limpar o texto de descrição do filme (overview)

### Remover os caracteres com \n \r

In [59]:
movies["overview"].dropna(inplace=True)

In [60]:
trans = str.maketrans('\n\r\t', '   ')
movies["overview_no_enter"] = movies["overview"].str.translate(trans)

### Remover contrações textuais do inglês

In [61]:
contraction_dict = {
                    "aren't": "are not",
                    "can't": "can not",
                    "could've": "could have",
                    "couldn't": "could not",
                    "daren't": "dare not",
                    "didn't": "did not",
                    "doesn't": "does not",
                    "don't": "do not",
                    "hadn't": "had not",
                    "hasn't": "has not",
                    "haven't": "have not",
                    "he's": "he is",
                    "how'd": "how had",
                    "how're": "how are",
                    "how's": "how is",
                    "how've": "how have",
                    "i'd": "i had",
                    "i'm": "i am",
                    "i've": "i have",
                    "isn't": "is+ not",
                    "it's": "it is",
                    "might've": "might have",
                    "mightn't": "might not",
                    "must've": "must have",
                    "mustn't": "must not",
                    "needn't": "need not",
                    "oughtn't": "ought not",
                    "shan't": "shall not",
                    "she'd": "she had",
                    "she's": "she is",
                    "should've": "should have",
                    "shouldn't": "should not",
                    "that'd": "that had",
                    "thats's": "that is",
                    "there'd": "there had",
                    "there's": "there is",
                    "they'd": "they had",
                    "they're": "you are",
                    "they've": "they have",
                    "wasn't": "was+ not",
                    "we'd": "we had",
                    "we're": "we are",
                    "we've": "we have",
                    "weren't": "were not",
                    "what'd": "what had",
                    "what're": "what are",
                    "what's": "what is",
                    "what've": "what have",
                    "when'd": "when had",
                    "when're": "when are",
                    "when's": "when is",
                    "when've": "when have",
                    "where'd": "where had",
                    "where're": "where are",
                    "where's": "where is",
                    "where've": "where have",
                    "who'd": "who had",
                    "who're": "who are",
                    "who's": "who is",
                    "who've": "who have",
                    "why'd": "why had",
                    "why're": "why are",
                    "why's": "why is",
                    "why've": "why have",
                    "would've": "would have",
                    "wouldn't": "would not",
                    "you're": "you are",
                    "you've": "you have",
                    "'cause": "because", 
                    "ain't": "is not", 
                    "aren't": "are not",
                    "can't": "cannot", 
                    "could've": "could have",
                    "he's": "he is",
                    "how'll": "how will",
                    "i'll": "i will",
                    "it'll": "it will",
                    "it's": "it is", 
                    "she'll": "she will",
                    "she's": "she is",
                    "that'll": "that will",
                    "there'll": "there will",
                    "they'll": "they will",
                    "they're": "they are",
                    "we'll": "we will",
                    "we're": "we are",
                    "what'll": "what will",
                    "when'll": "when will",
                    "where'll": "where will",
                    "who'll": "who will",
                    "yo're": "you are",
                    "you'll": "you will"
}

In [62]:
def remove_contractions( text ):
    # print(text)
    # print(type(text))
    if (type(text) == str):
        text = text.lower()
        # print(text)
        for chave in list(contraction_dict.keys()):
            valor = contraction_dict[chave]
            text = text.replace(chave, valor)
        text = text.replace("\'s", " have")        
        return text
    else:
        return ""

In [63]:
movies["overview_no_contraction"] = movies["overview_no_enter"].apply(remove_contractions)

### Remover as pontuções

In [64]:
translator = str.maketrans('', '', string.punctuation)
movies["overview_no_punctuation"]  = movies["overview_no_contraction"] .str.translate(translator)

In [65]:
stopwords = nltk.corpus.stopwords.words('english')

In [66]:
def remove_stop_words( text ):
    nova_lista = []
    palavras = text.split(" ")
    for palavra in palavras:
        if palavra not in stopwords:
            nova_lista.append(palavra)
    return nova_lista

In [67]:
movies["overview_no_stops"] = movies["overview_no_punctuation"].apply(remove_stop_words)

In [68]:
textos = movies["overview_no_stops"]

## Preparar matriz de Word2Vec

In [29]:
def vetorizador_texto( lista_palavras ): 
    vetor = np.zeros(300)
    w2v_unkown = modelo["unknown"]
    for palavra in lista_palavras: 
        try:
            w2v = modelo[palavra]
            vetor += w2v
        except:
            vetor += w2v_unkown
    return vetor

In [31]:
height = len(textos)
width = 300
matriz_w2v_textos = np.zeros((height, width))

for i in range(height):
    try:
        lista_palavras = textos[i]
        w2v = vetorizador_texto(lista_palavras)
        matriz_w2v_textos[i] = w2v
    except:
        pass
    
print(matriz_w2v_textos.shape)

(32269, 300)


In [85]:
dict_w2v_generos.keys()

dict_keys(['animation', 'adventure', 'romance', 'comedy', 'action', 'family', 'history', 'drama', 'crime', 'science', 'fantasy', 'unknown', 'music', 'horror', 'documentary', 'mystery', 'thriller', 'western', 'tv', 'war', 'foreign'])

In [82]:
def choose_genero(w2v, lista_generos):
    errors = 0 
    vetor_vazio = np.zeros(300)
    comparison = w2v == vetor_vazio
    if not comparison.all():
        less_distance = 1000000
        choose_genero = "unknown"
        # print(less_distance)
        for genero in dict_w2v_generos.keys():
            if genero in lista_generos:
                gen_model = dict_w2v_generos[genero]
                cosine_similarity = np.dot(w2v, gen_model)/(np.linalg.norm(w2v)* np.linalg.norm(gen_model))
                # print(i, "Genero: ", genero, "   Distance: ", cosine_similarity)
                if cosine_similarity < less_distance:
                    less_distance = cosine_similarity
                    choose_genero = genero
        print(i, "Choose Genero: ", choose_genero, "    Distance: ", less_distance)
        return choose_genero
    return None
            

In [83]:
movies["genres"][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [89]:
height = len(textos)
# height = 10
for i in range(height):
    try: 
        lista_palavras = textos[i]
        w2v = vetorizador_texto(lista_palavras)
        txt_generos = movies["genres"][i]
        txt = txt_generos.replace("'", '"')
        lista = json.loads(txt)
        lista_generos = []
        for gen in lista:
            lista_generos.append(gen['name'].lower())
        genero_escolhido = choose_genero(w2v, lista_generos)
        if genero_escolhido is not None:
            matriz_w2v_textos[i] = w2v
            movies["genero"][i] = genero_escolhido
    except:
        pass


0 Choose Genero:  family     Distance:  0.38760465175694453
1 Choose Genero:  family     Distance:  0.4284786156357168
2 Choose Genero:  comedy     Distance:  0.4060711813955403
3 Choose Genero:  drama     Distance:  0.3670643884758953
4 Choose Genero:  comedy     Distance:  0.30855101881632696
5 Choose Genero:  thriller     Distance:  0.3213471943135032
6 Choose Genero:  comedy     Distance:  0.3941676922473062
7 Choose Genero:  drama     Distance:  0.3946303995255812
8 Choose Genero:  thriller     Distance:  0.4364384920005049
9 Choose Genero:  thriller     Distance:  0.3185777397070855
10 Choose Genero:  comedy     Distance:  0.3331086253110074
11 Choose Genero:  comedy     Distance:  0.37600538940095113
12 Choose Genero:  animation     Distance:  0.24133591308434318
13 Choose Genero:  drama     Distance:  0.3967126050684334
14 Choose Genero:  adventure     Distance:  0.473569044565227
15 Choose Genero:  drama     Distance:  0.3469371175801503
16 Choose Genero:  drama     Distance: 

391 Choose Genero:  thriller     Distance:  0.27604485972232107
392 Choose Genero:  thriller     Distance:  0.4058138220079965
393 Choose Genero:  documentary     Distance:  0.5166756359640822
394 Choose Genero:  drama     Distance:  0.3661836195680924
395 Choose Genero:  thriller     Distance:  0.4442484224901041
396 Choose Genero:  thriller     Distance:  0.3290004364636051
397 Choose Genero:  comedy     Distance:  0.5212256916657653
398 Choose Genero:  unknown     Distance:  1000000
399 Choose Genero:  documentary     Distance:  0.35001610631116614
400 Choose Genero:  fantasy     Distance:  0.43212275906750064
401 Choose Genero:  unknown     Distance:  1000000
402 Choose Genero:  thriller     Distance:  0.4272319936203715
403 Choose Genero:  drama     Distance:  0.3327600021536376
404 Choose Genero:  drama     Distance:  0.36569914248868035
405 Choose Genero:  fantasy     Distance:  0.3635813970880671
406 Choose Genero:  comedy     Distance:  0.5309730507407056
407 Choose Genero:  d

780 Choose Genero:  drama     Distance:  0.39849047449947816
781 Choose Genero:  drama     Distance:  0.30054418094900887
782 Choose Genero:  drama     Distance:  0.4480562599103804
783 Choose Genero:  comedy     Distance:  0.3874014749334314
784 Choose Genero:  thriller     Distance:  0.2655088193500627
785 Choose Genero:  drama     Distance:  0.22100664344438195
787 Choose Genero:  adventure     Distance:  0.4766888459827992
788 Choose Genero:  thriller     Distance:  0.2949651818209597
789 Choose Genero:  family     Distance:  0.2113792634003924
791 Choose Genero:  comedy     Distance:  0.3866715190156887
792 Choose Genero:  unknown     Distance:  1000000
793 Choose Genero:  thriller     Distance:  0.3207080991772313
794 Choose Genero:  war     Distance:  0.37923625974972963
795 Choose Genero:  comedy     Distance:  0.37030329741156287
796 Choose Genero:  comedy     Distance:  0.20926037992894828
803 Choose Genero:  thriller     Distance:  0.31746873753324667
804 Choose Genero:  fam

1119 Choose Genero:  unknown     Distance:  1000000
1120 Choose Genero:  unknown     Distance:  1000000
1121 Choose Genero:  comedy     Distance:  0.2988721755032288
1123 Choose Genero:  drama     Distance:  0.3518890933512481
1126 Choose Genero:  drama     Distance:  0.3613501286502485
1127 Choose Genero:  comedy     Distance:  0.33765782185968807
1128 Choose Genero:  horror     Distance:  0.4436864467512081
1129 Choose Genero:  documentary     Distance:  0.3473283710892056
1130 Choose Genero:  action     Distance:  0.5493941385363957
1131 Choose Genero:  drama     Distance:  0.4806009640151342
1133 Choose Genero:  drama     Distance:  0.42950491843982463
1137 Choose Genero:  drama     Distance:  0.3549458102319819
1138 Choose Genero:  drama     Distance:  0.29558331807539695
1139 Choose Genero:  drama     Distance:  0.3317838850226606
1140 Choose Genero:  foreign     Distance:  0.3488301972571439
1141 Choose Genero:  western     Distance:  0.508381066130768
1142 Choose Genero:  drama

1482 Choose Genero:  thriller     Distance:  0.3433483287457812
1483 Choose Genero:  unknown     Distance:  1000000
1484 Choose Genero:  foreign     Distance:  0.2858565694355176
1485 Choose Genero:  drama     Distance:  0.3676724831476865
1486 Choose Genero:  thriller     Distance:  0.4043532816135089
1487 Choose Genero:  drama     Distance:  0.348976516770561
1488 Choose Genero:  comedy     Distance:  0.3387372285548216
1489 Choose Genero:  unknown     Distance:  1000000
1490 Choose Genero:  comedy     Distance:  0.5360695939127498
1491 Choose Genero:  fantasy     Distance:  0.3929390602025318
1492 Choose Genero:  comedy     Distance:  0.34587693822120197
1493 Choose Genero:  drama     Distance:  0.3155934911129476
1494 Choose Genero:  thriller     Distance:  0.3302290163307174
1495 Choose Genero:  comedy     Distance:  0.3755265606877904
1496 Choose Genero:  drama     Distance:  0.3612446482556463
1497 Choose Genero:  drama     Distance:  0.4466689974571348
1498 Choose Genero:  come

1762 Choose Genero:  comedy     Distance:  0.31569652684503735
1763 Choose Genero:  drama     Distance:  0.3400759574333452
1764 Choose Genero:  history     Distance:  0.3018800650768441
1765 Choose Genero:  drama     Distance:  0.3670347472244661
1766 Choose Genero:  drama     Distance:  0.3708826841225032
1767 Choose Genero:  drama     Distance:  0.31774269336281447
1768 Choose Genero:  unknown     Distance:  1000000
1769 Choose Genero:  comedy     Distance:  0.3680961892989188
1770 Choose Genero:  drama     Distance:  0.3574126065053198
1771 Choose Genero:  drama     Distance:  0.35289648300614146
1772 Choose Genero:  animation     Distance:  0.3225569817746099
1773 Choose Genero:  thriller     Distance:  0.31396179277489106
1774 Choose Genero:  drama     Distance:  0.3576212999783415
1775 Choose Genero:  drama     Distance:  0.33054716131088274
1776 Choose Genero:  drama     Distance:  0.3490870462452204
1777 Choose Genero:  comedy     Distance:  0.31052245067873113
1778 Choose Gen

2173 Choose Genero:  comedy     Distance:  0.3475175603301798
2174 Choose Genero:  horror     Distance:  0.40672396571269276
2175 Choose Genero:  horror     Distance:  0.38901783961691994
2176 Choose Genero:  comedy     Distance:  0.42304745454681325
2177 Choose Genero:  drama     Distance:  0.4716578810653341
2178 Choose Genero:  drama     Distance:  0.30096294959477454
2179 Choose Genero:  comedy     Distance:  0.35261595554674946
2180 Choose Genero:  drama     Distance:  0.32491634406885717
2181 Choose Genero:  animation     Distance:  0.3939187321040217
2182 Choose Genero:  comedy     Distance:  0.5370056796427132
2183 Choose Genero:  comedy     Distance:  0.4217758663904272
2184 Choose Genero:  drama     Distance:  0.368966061263491
2185 Choose Genero:  horror     Distance:  0.4227781843691928
2186 Choose Genero:  comedy     Distance:  0.3114103426274136
2187 Choose Genero:  comedy     Distance:  0.5015145071991968
2188 Choose Genero:  comedy     Distance:  0.34413039240015797
218

2580 Choose Genero:  drama     Distance:  0.33176942131783943
2582 Choose Genero:  comedy     Distance:  0.3178222145248814
2583 Choose Genero:  unknown     Distance:  1000000
2584 Choose Genero:  comedy     Distance:  0.3240018200857269
2585 Choose Genero:  animation     Distance:  0.3515403416494967
2586 Choose Genero:  comedy     Distance:  0.35305827833812153
2587 Choose Genero:  thriller     Distance:  0.28236146241429894
2588 Choose Genero:  drama     Distance:  0.32025745488535223
2591 Choose Genero:  comedy     Distance:  0.3303333834060243
2592 Choose Genero:  thriller     Distance:  0.33340183400626777
2594 Choose Genero:  family     Distance:  0.38294327692077407
2595 Choose Genero:  horror     Distance:  0.39180700217618036
2596 Choose Genero:  comedy     Distance:  0.35909901273136663
2597 Choose Genero:  drama     Distance:  0.31712004509188163
2598 Choose Genero:  thriller     Distance:  0.32724148175818896
2599 Choose Genero:  drama     Distance:  0.3909105324033019
260

3020 Choose Genero:  thriller     Distance:  0.2683227582644678
3021 Choose Genero:  drama     Distance:  0.37324477350451174
3022 Choose Genero:  action     Distance:  0.3861058955130436
3023 Choose Genero:  drama     Distance:  0.5371753767169397
3024 Choose Genero:  drama     Distance:  0.45702345801511757
3025 Choose Genero:  documentary     Distance:  0.4104022469154085
3026 Choose Genero:  drama     Distance:  0.3315327875980775
3027 Choose Genero:  comedy     Distance:  0.3535096558855027
3028 Choose Genero:  drama     Distance:  0.5802512997391477
3029 Choose Genero:  comedy     Distance:  0.4157909683729257
3030 Choose Genero:  drama     Distance:  0.3696061340521346
3031 Choose Genero:  drama     Distance:  0.38303643657315445
3032 Choose Genero:  comedy     Distance:  0.3429235947960195
3033 Choose Genero:  drama     Distance:  0.31749862052643835
3034 Choose Genero:  thriller     Distance:  0.215688694154765
3035 Choose Genero:  drama     Distance:  0.25237683261012156
3036

3484 Choose Genero:  comedy     Distance:  0.41794513981005044
3485 Choose Genero:  family     Distance:  0.40172515985058194
3486 Choose Genero:  comedy     Distance:  0.4234507078862278
3487 Choose Genero:  documentary     Distance:  0.5233244190035182
3488 Choose Genero:  drama     Distance:  0.3885033741616538
3489 Choose Genero:  family     Distance:  0.34520325696836984
3490 Choose Genero:  fantasy     Distance:  0.3838415302994996
3491 Choose Genero:  drama     Distance:  0.3903766723038045
3492 Choose Genero:  thriller     Distance:  0.37283874530070243
3493 Choose Genero:  animation     Distance:  0.3447593942544138
3494 Choose Genero:  comedy     Distance:  0.32532725059544954
3495 Choose Genero:  comedy     Distance:  0.3281178260936521
3496 Choose Genero:  comedy     Distance:  0.32179885736012964
3497 Choose Genero:  comedy     Distance:  0.4708003015490168
3498 Choose Genero:  comedy     Distance:  0.31352252806814856
3500 Choose Genero:  comedy     Distance:  0.297185788

3784 Choose Genero:  horror     Distance:  0.540934164342543
3785 Choose Genero:  drama     Distance:  0.44901566520188924
3786 Choose Genero:  drama     Distance:  0.3898195363386826
3787 Choose Genero:  comedy     Distance:  0.3783938305274311
3788 Choose Genero:  comedy     Distance:  0.4093674916676125
3789 Choose Genero:  documentary     Distance:  0.4458203346732732
3790 Choose Genero:  drama     Distance:  0.41005466856056544
3791 Choose Genero:  drama     Distance:  0.36289481006382873
3792 Choose Genero:  drama     Distance:  0.3164364251319899
3793 Choose Genero:  horror     Distance:  0.4181102527435848
3794 Choose Genero:  thriller     Distance:  0.33956114371983526
3795 Choose Genero:  war     Distance:  0.3455944882311742
3797 Choose Genero:  comedy     Distance:  0.40377239660638
3798 Choose Genero:  comedy     Distance:  0.35562141912511264
3800 Choose Genero:  comedy     Distance:  0.36796346727098317
3801 Choose Genero:  drama     Distance:  0.3052850261551151
3802 Ch

4323 Choose Genero:  drama     Distance:  0.4288862064854196
4324 Choose Genero:  thriller     Distance:  0.2696507620867273
4325 Choose Genero:  documentary     Distance:  0.4225801097071885
4327 Choose Genero:  documentary     Distance:  0.41699677808395424
4328 Choose Genero:  thriller     Distance:  0.27170012557244544
4329 Choose Genero:  unknown     Distance:  1000000
4330 Choose Genero:  documentary     Distance:  0.39892101731500096
4331 Choose Genero:  documentary     Distance:  0.3454367601024816
4332 Choose Genero:  documentary     Distance:  0.5007214232048669
4333 Choose Genero:  documentary     Distance:  0.4982991847019925
4334 Choose Genero:  fantasy     Distance:  0.4722695367718978
4335 Choose Genero:  drama     Distance:  0.3466879473400895
4336 Choose Genero:  drama     Distance:  0.3281088283106552
4337 Choose Genero:  drama     Distance:  0.3211838839713358
4338 Choose Genero:  thriller     Distance:  0.31975401171575113
4339 Choose Genero:  action     Distance:  

4850 Choose Genero:  family     Distance:  0.3820135584035949
4851 Choose Genero:  comedy     Distance:  0.3187639481584108
4852 Choose Genero:  horror     Distance:  0.3353915510782286
4853 Choose Genero:  thriller     Distance:  0.2533984677386847
4854 Choose Genero:  drama     Distance:  0.3565049885826517
4855 Choose Genero:  comedy     Distance:  0.376609450924663
4856 Choose Genero:  thriller     Distance:  0.33130544232815784
4857 Choose Genero:  adventure     Distance:  0.4866623507715961
4858 Choose Genero:  drama     Distance:  0.31409126444503455
4859 Choose Genero:  comedy     Distance:  0.395995498180102
4860 Choose Genero:  fantasy     Distance:  0.3574382603899305
4861 Choose Genero:  comedy     Distance:  0.38154339419091227
4862 Choose Genero:  comedy     Distance:  0.31693704809337003
4863 Choose Genero:  fantasy     Distance:  0.42861981560339835
4864 Choose Genero:  drama     Distance:  0.3860393299873765
4865 Choose Genero:  drama     Distance:  0.34750626910920196

5323 Choose Genero:  drama     Distance:  0.44459732712495664
5324 Choose Genero:  drama     Distance:  0.35028775179980814
5325 Choose Genero:  comedy     Distance:  0.35955224811527203
5326 Choose Genero:  family     Distance:  0.38981571991161634
5327 Choose Genero:  drama     Distance:  0.36785422479603536
5328 Choose Genero:  family     Distance:  0.42588647148606645
5329 Choose Genero:  fantasy     Distance:  0.44414817532834244
5330 Choose Genero:  thriller     Distance:  0.3153377312276443
5333 Choose Genero:  comedy     Distance:  0.34216489095341845
5334 Choose Genero:  unknown     Distance:  1000000
5335 Choose Genero:  action     Distance:  0.35654460604454774
5336 Choose Genero:  drama     Distance:  0.2646193314788279
5337 Choose Genero:  drama     Distance:  0.4318623404595734
5338 Choose Genero:  comedy     Distance:  0.31053302798292814
5340 Choose Genero:  drama     Distance:  0.4609858253043355
5342 Choose Genero:  thriller     Distance:  0.3180753192372201
5344 Choo

5872 Choose Genero:  drama     Distance:  0.3301663902065045
5873 Choose Genero:  thriller     Distance:  0.2882454556597932
5874 Choose Genero:  comedy     Distance:  0.4336683925380616
5875 Choose Genero:  comedy     Distance:  0.4013125494980917
5876 Choose Genero:  thriller     Distance:  0.2138108050858265
5879 Choose Genero:  foreign     Distance:  0.32074202465765317
5881 Choose Genero:  thriller     Distance:  0.3051114380326295
5882 Choose Genero:  drama     Distance:  0.3815635372862337
5884 Choose Genero:  drama     Distance:  0.41332487896586556
5886 Choose Genero:  drama     Distance:  0.4206597222332484
5887 Choose Genero:  comedy     Distance:  0.41938405371429743
5889 Choose Genero:  drama     Distance:  0.33348138908041874
5890 Choose Genero:  comedy     Distance:  0.306325999977598
5891 Choose Genero:  comedy     Distance:  0.37198230878002825
5892 Choose Genero:  comedy     Distance:  0.3836748891677139
5893 Choose Genero:  drama     Distance:  0.39634049312487557
58

6344 Choose Genero:  comedy     Distance:  0.42067595204663427
6345 Choose Genero:  unknown     Distance:  1000000
6346 Choose Genero:  thriller     Distance:  0.33359046217594623
6349 Choose Genero:  drama     Distance:  0.32650758809637537
6350 Choose Genero:  history     Distance:  0.3346564267752102
6351 Choose Genero:  horror     Distance:  0.3688236203609835
6352 Choose Genero:  comedy     Distance:  0.367834510761844
6353 Choose Genero:  thriller     Distance:  0.29997875709412786
6354 Choose Genero:  comedy     Distance:  0.2756478674417703
6355 Choose Genero:  crime     Distance:  0.4233217680277096
6356 Choose Genero:  comedy     Distance:  0.3319318637924581
6358 Choose Genero:  comedy     Distance:  0.35769538344414553
6359 Choose Genero:  drama     Distance:  0.3450097017183796
6361 Choose Genero:  drama     Distance:  0.4206540911765131
6362 Choose Genero:  drama     Distance:  0.4417425032934628
6363 Choose Genero:  drama     Distance:  0.4152431891251466
6364 Choose Gen

6776 Choose Genero:  thriller     Distance:  0.34777748108211076
6777 Choose Genero:  comedy     Distance:  0.44423270735235965
6778 Choose Genero:  comedy     Distance:  0.39288431824036846
6779 Choose Genero:  documentary     Distance:  0.460470639063422
6780 Choose Genero:  thriller     Distance:  0.32944656179888215
6781 Choose Genero:  drama     Distance:  0.43897248519266835
6782 Choose Genero:  thriller     Distance:  0.3463444233528684
6783 Choose Genero:  thriller     Distance:  0.27570537761167824
6784 Choose Genero:  documentary     Distance:  0.3644295221713235
6785 Choose Genero:  comedy     Distance:  0.3776938242750332
6786 Choose Genero:  music     Distance:  0.49796684355255466
6787 Choose Genero:  adventure     Distance:  0.5257002275078463
6790 Choose Genero:  unknown     Distance:  1000000
6791 Choose Genero:  comedy     Distance:  0.3428834331869263
6792 Choose Genero:  drama     Distance:  0.48137035413520385
6793 Choose Genero:  comedy     Distance:  0.3643356646

7277 Choose Genero:  comedy     Distance:  0.36590660568385874
7278 Choose Genero:  thriller     Distance:  0.32591039112727854
7280 Choose Genero:  documentary     Distance:  0.46789422923112556
7281 Choose Genero:  comedy     Distance:  0.30330759561000503
7282 Choose Genero:  thriller     Distance:  0.30410854151982775
7283 Choose Genero:  thriller     Distance:  0.2356885580145538
7284 Choose Genero:  comedy     Distance:  0.3537914630847683
7285 Choose Genero:  family     Distance:  0.4334393163257301
7286 Choose Genero:  comedy     Distance:  0.4268643581676518
7287 Choose Genero:  action     Distance:  0.41078994286908593
7288 Choose Genero:  drama     Distance:  0.32692208481927304
7290 Choose Genero:  comedy     Distance:  0.37848918547056354
7291 Choose Genero:  drama     Distance:  0.2925952085883344
7293 Choose Genero:  drama     Distance:  0.45303882872109513
7295 Choose Genero:  drama     Distance:  0.385752285180108
7296 Choose Genero:  drama     Distance:  0.35348298864

7790 Choose Genero:  drama     Distance:  0.33271012398092514
7792 Choose Genero:  drama     Distance:  0.4660154556147642
7793 Choose Genero:  history     Distance:  0.4610013186931503
7794 Choose Genero:  western     Distance:  0.47320692162405886
7795 Choose Genero:  thriller     Distance:  0.3272089814785138
7796 Choose Genero:  drama     Distance:  0.23686015414747394
7798 Choose Genero:  drama     Distance:  0.346879224530197
7799 Choose Genero:  comedy     Distance:  0.3365664396164525
7800 Choose Genero:  comedy     Distance:  0.46666127577661776
7801 Choose Genero:  drama     Distance:  0.43489042570741054
7802 Choose Genero:  thriller     Distance:  0.39990736509229663
7803 Choose Genero:  drama     Distance:  0.3959277313993432
7804 Choose Genero:  drama     Distance:  0.2637592053814293
7805 Choose Genero:  drama     Distance:  0.286588419007155
7806 Choose Genero:  drama     Distance:  0.31192060288773804
7807 Choose Genero:  drama     Distance:  0.42981383836709497
7808 C

8229 Choose Genero:  thriller     Distance:  0.3402250522666356
8230 Choose Genero:  horror     Distance:  0.4507495958431911
8231 Choose Genero:  drama     Distance:  0.36730342995804954
8232 Choose Genero:  thriller     Distance:  0.30102614224411545
8233 Choose Genero:  foreign     Distance:  0.3646742342298722
8234 Choose Genero:  animation     Distance:  0.3794626189605912
8235 Choose Genero:  documentary     Distance:  0.46026016485873233
8236 Choose Genero:  thriller     Distance:  0.374187639749799
8237 Choose Genero:  romance     Distance:  0.4286825340642435
8238 Choose Genero:  animation     Distance:  0.3466661362188644
8239 Choose Genero:  drama     Distance:  0.36689004295660826
8240 Choose Genero:  drama     Distance:  0.6018506389578346
8241 Choose Genero:  drama     Distance:  0.337310950563178
8242 Choose Genero:  comedy     Distance:  0.3580312697855376
8243 Choose Genero:  drama     Distance:  0.4926231128036985
8245 Choose Genero:  thriller     Distance:  0.3059503

8842 Choose Genero:  drama     Distance:  0.41827502199030814
8843 Choose Genero:  drama     Distance:  0.3918825552063819
8844 Choose Genero:  family     Distance:  0.4479018291609931
8845 Choose Genero:  comedy     Distance:  0.36831572259046014
8847 Choose Genero:  drama     Distance:  0.3661775795289624
8848 Choose Genero:  fantasy     Distance:  0.37142581600392954
8849 Choose Genero:  thriller     Distance:  0.3213883124783416
8850 Choose Genero:  comedy     Distance:  0.24679561626160568
8851 Choose Genero:  drama     Distance:  0.32670494508183856
8852 Choose Genero:  unknown     Distance:  1000000
8853 Choose Genero:  comedy     Distance:  0.3502055790458021
8855 Choose Genero:  drama     Distance:  0.35670570286093717
8856 Choose Genero:  thriller     Distance:  0.24909484654364392
8857 Choose Genero:  documentary     Distance:  0.5239236832397547
8860 Choose Genero:  drama     Distance:  0.31060581220349215
8862 Choose Genero:  fantasy     Distance:  0.28214582108001623
8863

9525 Choose Genero:  drama     Distance:  0.4125126499936306
9526 Choose Genero:  drama     Distance:  0.3718473166655792
9529 Choose Genero:  drama     Distance:  0.3461524558697891
9532 Choose Genero:  music     Distance:  0.40558023854063124
9533 Choose Genero:  drama     Distance:  0.2990751492119226
9535 Choose Genero:  thriller     Distance:  0.40383408321939895
9536 Choose Genero:  comedy     Distance:  0.36405808068018947
9537 Choose Genero:  thriller     Distance:  0.3531300775705782
9538 Choose Genero:  comedy     Distance:  0.234267191242133
9540 Choose Genero:  comedy     Distance:  0.2890656995553215
9541 Choose Genero:  drama     Distance:  0.5403096809576531
9542 Choose Genero:  drama     Distance:  0.4420456374237814
9543 Choose Genero:  thriller     Distance:  0.3442625371003125
9544 Choose Genero:  drama     Distance:  0.4593811808912417
9545 Choose Genero:  drama     Distance:  0.2990601475942881
9546 Choose Genero:  drama     Distance:  0.32797188043860226
9547 Choo

9867 Choose Genero:  drama     Distance:  0.38062292998853436
9868 Choose Genero:  drama     Distance:  0.33766705483760495
9869 Choose Genero:  comedy     Distance:  0.4312710388441521
9870 Choose Genero:  comedy     Distance:  0.587667703428459
9871 Choose Genero:  documentary     Distance:  0.4315207961424359
9874 Choose Genero:  drama     Distance:  0.288715248995566
9876 Choose Genero:  drama     Distance:  0.28909723052723074
9878 Choose Genero:  drama     Distance:  0.29517192780808166
9881 Choose Genero:  comedy     Distance:  0.3561511165321844
9884 Choose Genero:  animation     Distance:  0.35575762378029707
9885 Choose Genero:  comedy     Distance:  0.40065036841293644
9886 Choose Genero:  documentary     Distance:  0.3695552688114111
9887 Choose Genero:  drama     Distance:  0.337425781251096
9888 Choose Genero:  thriller     Distance:  0.40874773824218996
9889 Choose Genero:  comedy     Distance:  0.3817003904738397
9891 Choose Genero:  documentary     Distance:  0.3868238

10517 Choose Genero:  drama     Distance:  0.3963561376234015
10518 Choose Genero:  comedy     Distance:  0.36243787723527077
10519 Choose Genero:  comedy     Distance:  0.2645532231336176
10520 Choose Genero:  comedy     Distance:  0.37997622904122286
10521 Choose Genero:  music     Distance:  0.3994279772196709
10522 Choose Genero:  drama     Distance:  0.40428786379483506
10523 Choose Genero:  thriller     Distance:  0.3218211859291825
10525 Choose Genero:  horror     Distance:  0.43837369935388254
10526 Choose Genero:  horror     Distance:  0.40776717234043736
10527 Choose Genero:  drama     Distance:  0.3020711794275667
10528 Choose Genero:  horror     Distance:  0.28830856311975217
10529 Choose Genero:  thriller     Distance:  0.31310896243492053
10530 Choose Genero:  fantasy     Distance:  0.38244718418291246
10531 Choose Genero:  horror     Distance:  0.2862129936888477
10534 Choose Genero:  comedy     Distance:  0.3090004844032773
10535 Choose Genero:  drama     Distance:  0.4

11156 Choose Genero:  adventure     Distance:  0.4632931842974497
11157 Choose Genero:  drama     Distance:  0.3035163518975259
11158 Choose Genero:  thriller     Distance:  0.3315072001091022
11159 Choose Genero:  drama     Distance:  0.3106085498675584
11160 Choose Genero:  thriller     Distance:  0.3025295609062238
11161 Choose Genero:  family     Distance:  0.38993453692305896
11163 Choose Genero:  mystery     Distance:  0.34347418655440665
11164 Choose Genero:  drama     Distance:  0.3253936851009755
11165 Choose Genero:  drama     Distance:  0.3512731150450214
11166 Choose Genero:  thriller     Distance:  0.2896113135302919
11167 Choose Genero:  adventure     Distance:  0.29478594513959605
11168 Choose Genero:  drama     Distance:  0.42623202789303266
11169 Choose Genero:  thriller     Distance:  0.2538766166116601
11170 Choose Genero:  drama     Distance:  0.391657350964726
11171 Choose Genero:  comedy     Distance:  0.39935676404196824
11173 Choose Genero:  fantasy     Distance

11722 Choose Genero:  documentary     Distance:  0.33946809858524574
11724 Choose Genero:  drama     Distance:  0.3585377495354149
11725 Choose Genero:  thriller     Distance:  0.35226464431662446
11726 Choose Genero:  comedy     Distance:  0.38195162677991096
11727 Choose Genero:  family     Distance:  0.3902583071966331
11728 Choose Genero:  music     Distance:  0.5134752643817745
11730 Choose Genero:  drama     Distance:  0.31553851638729014
11731 Choose Genero:  drama     Distance:  0.3475255146677465
11732 Choose Genero:  thriller     Distance:  0.23222704605701788
11733 Choose Genero:  comedy     Distance:  0.2801820836830316
11734 Choose Genero:  drama     Distance:  0.395827356461163
11736 Choose Genero:  drama     Distance:  0.38285726619179966
11737 Choose Genero:  thriller     Distance:  0.30809994090115306
11742 Choose Genero:  animation     Distance:  0.36489215368752564
11744 Choose Genero:  drama     Distance:  0.3711984494392915
11745 Choose Genero:  thriller     Distan

12190 Choose Genero:  thriller     Distance:  0.425780535500145
12191 Choose Genero:  drama     Distance:  0.3117284939444733
12193 Choose Genero:  comedy     Distance:  0.2942326507148772
12194 Choose Genero:  drama     Distance:  0.35375825323918075
12195 Choose Genero:  comedy     Distance:  0.49372775587424517
12196 Choose Genero:  thriller     Distance:  0.29888771542159287
12197 Choose Genero:  drama     Distance:  0.31534187342735426
12198 Choose Genero:  crime     Distance:  0.46628391240245476
12199 Choose Genero:  drama     Distance:  0.28140296730100794
12201 Choose Genero:  western     Distance:  0.3128682066516474
12202 Choose Genero:  comedy     Distance:  0.37277002684852717
12203 Choose Genero:  comedy     Distance:  0.4116941663368258
12206 Choose Genero:  drama     Distance:  0.283169626793827
12207 Choose Genero:  documentary     Distance:  0.4168883039299626
12209 Choose Genero:  thriller     Distance:  0.28815145566398925
12210 Choose Genero:  family     Distance: 

12793 Choose Genero:  drama     Distance:  0.3995184249522054
12795 Choose Genero:  horror     Distance:  0.4205662577254457
12804 Choose Genero:  comedy     Distance:  0.4300946389102344
12806 Choose Genero:  family     Distance:  0.4139572251173663
12807 Choose Genero:  comedy     Distance:  0.3599781584548581
12808 Choose Genero:  thriller     Distance:  0.32981004644868883
12809 Choose Genero:  comedy     Distance:  0.3366525055297684
12810 Choose Genero:  thriller     Distance:  0.3859722740538558
12811 Choose Genero:  animation     Distance:  0.38827801338803986
12813 Choose Genero:  documentary     Distance:  0.3944551414012279
12814 Choose Genero:  adventure     Distance:  0.4387505854855926
12815 Choose Genero:  mystery     Distance:  0.534636256413018
12816 Choose Genero:  war     Distance:  0.525720622012325
12817 Choose Genero:  thriller     Distance:  0.28230516629951746
12818 Choose Genero:  drama     Distance:  0.2945291955638657
12819 Choose Genero:  documentary     Dis

13380 Choose Genero:  drama     Distance:  0.29961948295962504
13381 Choose Genero:  drama     Distance:  0.34850708971198746
13383 Choose Genero:  comedy     Distance:  0.3426375939841298
13385 Choose Genero:  western     Distance:  0.4398074892200339
13386 Choose Genero:  western     Distance:  0.4395263567634329
13387 Choose Genero:  comedy     Distance:  0.38682689130919345
13388 Choose Genero:  comedy     Distance:  0.3148696913720631
13389 Choose Genero:  western     Distance:  0.429285304368211
13390 Choose Genero:  drama     Distance:  0.40697826408414056
13393 Choose Genero:  animation     Distance:  0.32937403808279586
13395 Choose Genero:  comedy     Distance:  0.353775491809981
13396 Choose Genero:  western     Distance:  0.370082546405804
13397 Choose Genero:  thriller     Distance:  0.2995783168727528
13399 Choose Genero:  documentary     Distance:  0.41488731716273136
13400 Choose Genero:  unknown     Distance:  1000000
13401 Choose Genero:  drama     Distance:  0.434758

13991 Choose Genero:  music     Distance:  0.4991692800221202
13992 Choose Genero:  drama     Distance:  0.36189283098220426
13993 Choose Genero:  foreign     Distance:  0.34820992934904377
13995 Choose Genero:  western     Distance:  0.3086299121537901
13996 Choose Genero:  drama     Distance:  0.27975715494701314
13998 Choose Genero:  drama     Distance:  0.4073144946324349
13999 Choose Genero:  documentary     Distance:  0.48513278757815753
14000 Choose Genero:  horror     Distance:  0.42781172186623256
14001 Choose Genero:  western     Distance:  0.3818420340854453
14002 Choose Genero:  western     Distance:  0.3447926273190709
14004 Choose Genero:  unknown     Distance:  1000000
14005 Choose Genero:  unknown     Distance:  1000000
14006 Choose Genero:  western     Distance:  0.26693781871201827
14012 Choose Genero:  drama     Distance:  0.3668704368805327
14013 Choose Genero:  drama     Distance:  0.5431028180470276
14014 Choose Genero:  thriller     Distance:  0.4415111592772198


14615 Choose Genero:  comedy     Distance:  0.3533860400003501
14616 Choose Genero:  drama     Distance:  0.3124958139709579
14617 Choose Genero:  thriller     Distance:  0.3622520277281374
14618 Choose Genero:  drama     Distance:  0.34995414112692036
14619 Choose Genero:  drama     Distance:  0.42273120741371023
14620 Choose Genero:  thriller     Distance:  0.31668110745748473
14622 Choose Genero:  war     Distance:  0.4311202134979607
14623 Choose Genero:  thriller     Distance:  0.3276509513666601
14624 Choose Genero:  thriller     Distance:  0.2927984458004388
14625 Choose Genero:  family     Distance:  0.5004377157858433
14626 Choose Genero:  comedy     Distance:  0.36392166288958455
14627 Choose Genero:  comedy     Distance:  0.3222030169382899
14628 Choose Genero:  documentary     Distance:  0.44050889778970675
14630 Choose Genero:  comedy     Distance:  0.3822775694765109
14631 Choose Genero:  thriller     Distance:  0.3676535242378028
14634 Choose Genero:  crime     Distance:

15293 Choose Genero:  drama     Distance:  0.38313032027357147
15294 Choose Genero:  drama     Distance:  0.44307429590786074
15295 Choose Genero:  drama     Distance:  0.49390655778734943
15296 Choose Genero:  animation     Distance:  0.4846786756190051
15297 Choose Genero:  drama     Distance:  0.43450547112618304
15299 Choose Genero:  comedy     Distance:  0.35951554297401916
15300 Choose Genero:  thriller     Distance:  0.3153492788249229
15307 Choose Genero:  comedy     Distance:  0.3739370249643327
15308 Choose Genero:  foreign     Distance:  0.33075082769183134
15309 Choose Genero:  drama     Distance:  0.3488493185750415
15310 Choose Genero:  comedy     Distance:  0.40413514641184217
15311 Choose Genero:  thriller     Distance:  0.32130851084424367
15312 Choose Genero:  comedy     Distance:  0.4528888627083565
15316 Choose Genero:  comedy     Distance:  0.39088533395539515
15318 Choose Genero:  drama     Distance:  0.4055237718853881
15320 Choose Genero:  thriller     Distance:

15979 Choose Genero:  documentary     Distance:  0.49829381511811605
15980 Choose Genero:  documentary     Distance:  0.4570748067431617
15981 Choose Genero:  romance     Distance:  0.4132371481571869
15982 Choose Genero:  drama     Distance:  0.3626957187551155
15983 Choose Genero:  romance     Distance:  0.4436179250473395
15984 Choose Genero:  drama     Distance:  0.38082428196696994
15985 Choose Genero:  thriller     Distance:  0.27159895586024374
15986 Choose Genero:  drama     Distance:  0.32007206849925524
15987 Choose Genero:  drama     Distance:  0.2881103997915413
15988 Choose Genero:  comedy     Distance:  0.4668179220210647
15989 Choose Genero:  comedy     Distance:  0.29711837207276165
15990 Choose Genero:  unknown     Distance:  1000000
15992 Choose Genero:  drama     Distance:  0.25092180636280426
15993 Choose Genero:  comedy     Distance:  0.40605454292604604
15995 Choose Genero:  thriller     Distance:  0.29265680081696815
15997 Choose Genero:  documentary     Distance

16778 Choose Genero:  documentary     Distance:  0.6595457771354598
16779 Choose Genero:  drama     Distance:  0.3501989164880372
16780 Choose Genero:  drama     Distance:  0.37709208399625643
16783 Choose Genero:  drama     Distance:  0.42267638140345515
16784 Choose Genero:  comedy     Distance:  0.3163624971808409
16785 Choose Genero:  comedy     Distance:  0.43259004598028955
16787 Choose Genero:  animation     Distance:  0.2984443465983194
16788 Choose Genero:  family     Distance:  0.2882724031620819
16789 Choose Genero:  comedy     Distance:  0.5030751892468279
16790 Choose Genero:  thriller     Distance:  0.33182095498356495
16791 Choose Genero:  documentary     Distance:  0.36250386456346334
16792 Choose Genero:  documentary     Distance:  0.39554418515858414
16793 Choose Genero:  drama     Distance:  0.40576272291325877
16795 Choose Genero:  comedy     Distance:  0.2982993904288869
16796 Choose Genero:  thriller     Distance:  0.41757462589458216
16797 Choose Genero:  fantasy

17512 Choose Genero:  drama     Distance:  0.40215043081051427
17513 Choose Genero:  drama     Distance:  0.33828977866077303
17514 Choose Genero:  thriller     Distance:  0.24048555845110256
17515 Choose Genero:  drama     Distance:  0.25129694651624224
17516 Choose Genero:  comedy     Distance:  0.3314506659351775
17517 Choose Genero:  drama     Distance:  0.34745939536666676
17519 Choose Genero:  thriller     Distance:  0.26074413302858296
17520 Choose Genero:  thriller     Distance:  0.4304718772327979
17522 Choose Genero:  animation     Distance:  0.3541467025416793
17523 Choose Genero:  drama     Distance:  0.3172582831486727
17524 Choose Genero:  drama     Distance:  0.44508019496667883
17525 Choose Genero:  comedy     Distance:  0.47519003801512016
17526 Choose Genero:  drama     Distance:  0.3427066155890177
17527 Choose Genero:  thriller     Distance:  0.2883156439928495
17528 Choose Genero:  drama     Distance:  0.3475666180375862
17529 Choose Genero:  romance     Distance: 

18244 Choose Genero:  fantasy     Distance:  0.41700848930625967
18246 Choose Genero:  romance     Distance:  0.4767985860662888
18250 Choose Genero:  horror     Distance:  0.3077702058919999
18252 Choose Genero:  thriller     Distance:  0.32008218235892183
18255 Choose Genero:  thriller     Distance:  0.3140824635893883
18257 Choose Genero:  thriller     Distance:  0.2865922317356042
18258 Choose Genero:  adventure     Distance:  0.4473538308915212
18259 Choose Genero:  documentary     Distance:  0.5389518518811208
18260 Choose Genero:  drama     Distance:  0.44995068634599855
18262 Choose Genero:  foreign     Distance:  0.22348166478061052
18264 Choose Genero:  adventure     Distance:  0.5221106837477285
18267 Choose Genero:  thriller     Distance:  0.24473356367456928
18269 Choose Genero:  western     Distance:  0.397064822583563
18272 Choose Genero:  drama     Distance:  0.429530067654226
18274 Choose Genero:  documentary     Distance:  0.2787365288892299
18278 Choose Genero:  dram

18961 Choose Genero:  documentary     Distance:  0.38407157466426184
18963 Choose Genero:  drama     Distance:  0.300332780363715
18964 Choose Genero:  drama     Distance:  0.40529533021133207
18965 Choose Genero:  drama     Distance:  0.4503227366990752
18969 Choose Genero:  drama     Distance:  0.2771888491609307
18971 Choose Genero:  drama     Distance:  0.3188303357310221
18972 Choose Genero:  comedy     Distance:  0.34513891183075884
18973 Choose Genero:  drama     Distance:  0.31832250119093075
18974 Choose Genero:  unknown     Distance:  1000000
18976 Choose Genero:  unknown     Distance:  1000000
18977 Choose Genero:  drama     Distance:  0.4976243691067439
18978 Choose Genero:  drama     Distance:  0.35240650145567043
18979 Choose Genero:  drama     Distance:  0.35253915925038243
18980 Choose Genero:  thriller     Distance:  0.2710965197596918
18981 Choose Genero:  horror     Distance:  0.4495927804843136
18985 Choose Genero:  thriller     Distance:  0.2187589149661583
18988 C

19677 Choose Genero:  drama     Distance:  0.5268186039873062
19679 Choose Genero:  documentary     Distance:  0.3419820246739347
19680 Choose Genero:  drama     Distance:  0.4085033874002266
19681 Choose Genero:  thriller     Distance:  0.3666961747021046
19682 Choose Genero:  documentary     Distance:  0.5637946715982253
19683 Choose Genero:  animation     Distance:  0.3594302237594968
19684 Choose Genero:  horror     Distance:  0.3648417188103022
19685 Choose Genero:  horror     Distance:  0.37199396435816356
19686 Choose Genero:  comedy     Distance:  0.41466926121066694
19687 Choose Genero:  family     Distance:  0.4603900066599154
19691 Choose Genero:  documentary     Distance:  0.4104620340299437
19692 Choose Genero:  documentary     Distance:  0.36226220185963715
19693 Choose Genero:  drama     Distance:  0.2879080652641827
19694 Choose Genero:  drama     Distance:  0.24437291202412054
19698 Choose Genero:  drama     Distance:  0.2938531933572305
19699 Choose Genero:  adventure

20371 Choose Genero:  unknown     Distance:  1000000
20373 Choose Genero:  drama     Distance:  0.5587132165388499
20374 Choose Genero:  drama     Distance:  0.30586443042192807
20376 Choose Genero:  documentary     Distance:  0.3401590164351677
20377 Choose Genero:  documentary     Distance:  0.45182518358407275
20383 Choose Genero:  drama     Distance:  0.4498541390571909
20384 Choose Genero:  thriller     Distance:  0.42335015967261774
20386 Choose Genero:  drama     Distance:  0.321164614654835
20387 Choose Genero:  horror     Distance:  0.3769317736738252
20388 Choose Genero:  thriller     Distance:  0.268988248813378
20390 Choose Genero:  comedy     Distance:  0.38444204176728175
20391 Choose Genero:  drama     Distance:  0.33871655469048473
20392 Choose Genero:  thriller     Distance:  0.35040367548178375
20396 Choose Genero:  unknown     Distance:  1000000
20398 Choose Genero:  documentary     Distance:  0.45129558815592946
20399 Choose Genero:  drama     Distance:  0.460940268

21047 Choose Genero:  unknown     Distance:  1000000
21048 Choose Genero:  drama     Distance:  0.3258116938583801
21050 Choose Genero:  comedy     Distance:  0.34705852652152136
21051 Choose Genero:  drama     Distance:  0.42826868130523604
21052 Choose Genero:  drama     Distance:  0.3348448400226602
21053 Choose Genero:  drama     Distance:  0.4552388235291598
21054 Choose Genero:  comedy     Distance:  0.3083998215069283
21055 Choose Genero:  drama     Distance:  0.29226392384882893
21056 Choose Genero:  drama     Distance:  0.3618216569113791
21057 Choose Genero:  documentary     Distance:  0.3773311744451591
21058 Choose Genero:  drama     Distance:  0.3978314088832217
21059 Choose Genero:  horror     Distance:  0.4258786052559593
21063 Choose Genero:  comedy     Distance:  0.6008220963531187
21064 Choose Genero:  drama     Distance:  0.31013721100143826
21067 Choose Genero:  comedy     Distance:  0.2994406105954234
21068 Choose Genero:  fantasy     Distance:  0.40736567538940543

21688 Choose Genero:  horror     Distance:  0.27856337189033736
21689 Choose Genero:  drama     Distance:  0.3484277094282238
21690 Choose Genero:  documentary     Distance:  0.4734808788268432
21691 Choose Genero:  documentary     Distance:  0.4037122556418669
21693 Choose Genero:  comedy     Distance:  0.4962910879060415
21694 Choose Genero:  drama     Distance:  0.31863897694979737
21695 Choose Genero:  comedy     Distance:  0.49774821946863274
21697 Choose Genero:  drama     Distance:  0.39907022688207927
21698 Choose Genero:  drama     Distance:  0.41037709714450454
21699 Choose Genero:  documentary     Distance:  0.45544725348606674
21700 Choose Genero:  comedy     Distance:  0.32131959834839846
21701 Choose Genero:  crime     Distance:  0.3779880327232783
21702 Choose Genero:  documentary     Distance:  0.35557978170982096
21703 Choose Genero:  comedy     Distance:  0.5265608933164815
21704 Choose Genero:  fantasy     Distance:  0.3776906685068157
21705 Choose Genero:  thriller 

22414 Choose Genero:  documentary     Distance:  0.401486739588535
22415 Choose Genero:  thriller     Distance:  0.3488413395476304
22416 Choose Genero:  drama     Distance:  0.36611680672676894
22419 Choose Genero:  drama     Distance:  0.39885738772441215
22420 Choose Genero:  thriller     Distance:  0.406381679633295
22421 Choose Genero:  drama     Distance:  0.2823560424089283
22422 Choose Genero:  adventure     Distance:  0.39025449661415124
22423 Choose Genero:  documentary     Distance:  0.46407504410908895
22427 Choose Genero:  horror     Distance:  0.4688317979959619
22428 Choose Genero:  comedy     Distance:  0.2987841116662941
22431 Choose Genero:  horror     Distance:  0.37022460622893805
22432 Choose Genero:  thriller     Distance:  0.22784598873501838
22433 Choose Genero:  action     Distance:  0.4916746741282258
22434 Choose Genero:  thriller     Distance:  0.2826924553831334
22435 Choose Genero:  unknown     Distance:  1000000
22436 Choose Genero:  unknown     Distance:

23100 Choose Genero:  comedy     Distance:  0.5287531782266142
23107 Choose Genero:  unknown     Distance:  1000000
23108 Choose Genero:  thriller     Distance:  0.31624538719466766
23109 Choose Genero:  comedy     Distance:  0.4964733816949108
23110 Choose Genero:  adventure     Distance:  0.4849053485613932
23111 Choose Genero:  crime     Distance:  0.36683715546990014
23112 Choose Genero:  comedy     Distance:  0.5957155557228664
23113 Choose Genero:  comedy     Distance:  0.6224114687380475
23116 Choose Genero:  horror     Distance:  0.37603393095474885
23117 Choose Genero:  foreign     Distance:  0.2950569682838797
23118 Choose Genero:  drama     Distance:  0.278519993009278
23120 Choose Genero:  drama     Distance:  0.434453055454449
23121 Choose Genero:  drama     Distance:  0.39911438297979673
23124 Choose Genero:  unknown     Distance:  1000000
23126 Choose Genero:  thriller     Distance:  0.2926133699013228
23128 Choose Genero:  documentary     Distance:  0.3986955813149141
2

23735 Choose Genero:  drama     Distance:  0.4338770265143599
23736 Choose Genero:  unknown     Distance:  1000000
23737 Choose Genero:  comedy     Distance:  0.3744815558781218
23738 Choose Genero:  drama     Distance:  0.458837522184347
23740 Choose Genero:  comedy     Distance:  0.39131542351288967
23741 Choose Genero:  western     Distance:  0.38373550575872634
23742 Choose Genero:  thriller     Distance:  0.309795314373565
23743 Choose Genero:  drama     Distance:  0.3095151557325492
23746 Choose Genero:  drama     Distance:  0.3219180642897392
23748 Choose Genero:  thriller     Distance:  0.3013056555871637
23749 Choose Genero:  crime     Distance:  0.3398489211145802
23750 Choose Genero:  music     Distance:  0.4320563504706408
23751 Choose Genero:  adventure     Distance:  0.4102987025261873
23752 Choose Genero:  comedy     Distance:  0.3571119411927568
23753 Choose Genero:  adventure     Distance:  0.39886942319746344
23754 Choose Genero:  comedy     Distance:  0.3112092260522

24199 Choose Genero:  documentary     Distance:  0.4361966509549955
24200 Choose Genero:  thriller     Distance:  0.4699049487832002
24201 Choose Genero:  comedy     Distance:  0.39305463067962365
24203 Choose Genero:  thriller     Distance:  0.26447950390625596
24205 Choose Genero:  drama     Distance:  0.3272392634256336
24208 Choose Genero:  comedy     Distance:  0.3856732580065241
24211 Choose Genero:  thriller     Distance:  0.38572320991862286
24212 Choose Genero:  documentary     Distance:  0.4827224713055275
24213 Choose Genero:  documentary     Distance:  0.539123183610922
24215 Choose Genero:  drama     Distance:  0.37147668914174575
24216 Choose Genero:  comedy     Distance:  0.30246051727472006
24217 Choose Genero:  comedy     Distance:  0.32102878930629875
24218 Choose Genero:  adventure     Distance:  0.5012547372849392
24224 Choose Genero:  drama     Distance:  0.45539436481168005
24226 Choose Genero:  comedy     Distance:  0.36057341230094664
24227 Choose Genero:  docum

24867 Choose Genero:  thriller     Distance:  0.25127206941911917
24868 Choose Genero:  horror     Distance:  0.3349862838727792
24869 Choose Genero:  horror     Distance:  0.40359617360864714
24870 Choose Genero:  comedy     Distance:  0.36673070406529246
24871 Choose Genero:  animation     Distance:  0.25939193388779747
24873 Choose Genero:  thriller     Distance:  0.25204755375478743
24874 Choose Genero:  comedy     Distance:  0.3200394261408937
24875 Choose Genero:  thriller     Distance:  0.3504683435208494
24880 Choose Genero:  drama     Distance:  0.43424452894549126
24881 Choose Genero:  documentary     Distance:  0.3441396032239592
24882 Choose Genero:  unknown     Distance:  1000000
24883 Choose Genero:  comedy     Distance:  0.301206719297754
24884 Choose Genero:  fantasy     Distance:  0.30806979665223877
24885 Choose Genero:  thriller     Distance:  0.3016824405364237
24887 Choose Genero:  drama     Distance:  0.37451818570554674
24888 Choose Genero:  drama     Distance:  

25583 Choose Genero:  drama     Distance:  0.36692314763725337
25584 Choose Genero:  animation     Distance:  0.5068688993126264
25586 Choose Genero:  drama     Distance:  0.3977311333450016
25588 Choose Genero:  drama     Distance:  0.3917476969183523
25590 Choose Genero:  comedy     Distance:  0.3600082831888761
25591 Choose Genero:  action     Distance:  0.4709965261366632
25592 Choose Genero:  comedy     Distance:  0.4201340561189661
25593 Choose Genero:  comedy     Distance:  0.3307694432324014
25596 Choose Genero:  unknown     Distance:  1000000
25597 Choose Genero:  drama     Distance:  0.37543536956266527
25598 Choose Genero:  unknown     Distance:  1000000
25599 Choose Genero:  horror     Distance:  0.39836844795557896
25600 Choose Genero:  action     Distance:  0.37099460830562875
25601 Choose Genero:  adventure     Distance:  0.35698625289643376
25602 Choose Genero:  drama     Distance:  0.37094814495601575
25603 Choose Genero:  comedy     Distance:  0.5723324229253012
25604

26256 Choose Genero:  family     Distance:  0.353564264517978
26257 Choose Genero:  comedy     Distance:  0.4010915325362751
26258 Choose Genero:  fantasy     Distance:  0.4381984553512628
26259 Choose Genero:  comedy     Distance:  0.29463468265586423
26261 Choose Genero:  action     Distance:  0.5047348596325578
26263 Choose Genero:  comedy     Distance:  0.4382488055726259
26265 Choose Genero:  drama     Distance:  0.33606592244389805
26266 Choose Genero:  drama     Distance:  0.3475018964450652
26267 Choose Genero:  thriller     Distance:  0.3333967416486833
26268 Choose Genero:  drama     Distance:  0.29765136491696437
26269 Choose Genero:  thriller     Distance:  0.30563075152736535
26270 Choose Genero:  drama     Distance:  0.34023832106874313
26271 Choose Genero:  thriller     Distance:  0.32157659314219383
26272 Choose Genero:  comedy     Distance:  0.3650866502596369
26273 Choose Genero:  thriller     Distance:  0.21499818552700467
26274 Choose Genero:  comedy     Distance:  

26855 Choose Genero:  drama     Distance:  0.33933031037396094
26856 Choose Genero:  animation     Distance:  0.39191347702034174
26857 Choose Genero:  family     Distance:  0.29042708425090086
26858 Choose Genero:  comedy     Distance:  0.35518564799039315
26859 Choose Genero:  comedy     Distance:  0.31764161562689847
26863 Choose Genero:  comedy     Distance:  0.45465471607497
26864 Choose Genero:  western     Distance:  0.4789618514849972
26866 Choose Genero:  history     Distance:  0.2739403742881116
26867 Choose Genero:  drama     Distance:  0.31257637381909065
26869 Choose Genero:  thriller     Distance:  0.4237225378131828
26870 Choose Genero:  comedy     Distance:  0.3708530212809661
26871 Choose Genero:  thriller     Distance:  0.4296027021986432
26872 Choose Genero:  comedy     Distance:  0.4590430096431122
26874 Choose Genero:  comedy     Distance:  0.45856018643796936
26876 Choose Genero:  drama     Distance:  0.36371764535126877
26877 Choose Genero:  comedy     Distance: 

27492 Choose Genero:  thriller     Distance:  0.3332782877487642
27493 Choose Genero:  documentary     Distance:  0.3366069500546381
27494 Choose Genero:  comedy     Distance:  0.44186768236256885
27495 Choose Genero:  drama     Distance:  0.3407603258695833
27496 Choose Genero:  animation     Distance:  0.3976997481029717
27497 Choose Genero:  comedy     Distance:  0.3878044719003463
27498 Choose Genero:  unknown     Distance:  1000000
27499 Choose Genero:  unknown     Distance:  1000000
27501 Choose Genero:  animation     Distance:  0.40522358756961674
27502 Choose Genero:  drama     Distance:  0.3462738670546683
27503 Choose Genero:  family     Distance:  0.3260039372648744
27504 Choose Genero:  animation     Distance:  0.3838559380400957
27505 Choose Genero:  family     Distance:  0.31770381931372677
27506 Choose Genero:  fantasy     Distance:  0.4696472515173424
27507 Choose Genero:  comedy     Distance:  0.34407368482913897
27508 Choose Genero:  animation     Distance:  0.6788556

28302 Choose Genero:  comedy     Distance:  0.3593234863837511
28311 Choose Genero:  drama     Distance:  0.33622044947972696
28317 Choose Genero:  comedy     Distance:  0.3588983197469461
28318 Choose Genero:  documentary     Distance:  0.49343452858390724
28319 Choose Genero:  unknown     Distance:  1000000
28320 Choose Genero:  family     Distance:  0.400555190787979
28322 Choose Genero:  foreign     Distance:  0.28980525046850514
28324 Choose Genero:  drama     Distance:  0.47561668188313483
28325 Choose Genero:  comedy     Distance:  0.7247823574331604
28326 Choose Genero:  unknown     Distance:  1000000
28327 Choose Genero:  comedy     Distance:  0.38786843751178934
28328 Choose Genero:  comedy     Distance:  0.3756504538512981
28329 Choose Genero:  comedy     Distance:  0.5330407156300219
28330 Choose Genero:  comedy     Distance:  0.32398432265877636
28331 Choose Genero:  unknown     Distance:  1000000
28332 Choose Genero:  horror     Distance:  0.3028623439802718
28333 Choose 

29069 Choose Genero:  drama     Distance:  0.31705171769064244
29070 Choose Genero:  thriller     Distance:  0.2294151936769334
29071 Choose Genero:  drama     Distance:  0.30666563762221327
29072 Choose Genero:  drama     Distance:  0.3412103243675793
29073 Choose Genero:  drama     Distance:  0.3441515048567979
29074 Choose Genero:  thriller     Distance:  0.42929754008350374
29077 Choose Genero:  thriller     Distance:  0.2992667156983834
29078 Choose Genero:  history     Distance:  0.39649264546689067
29081 Choose Genero:  comedy     Distance:  0.3470512263894572
29082 Choose Genero:  horror     Distance:  0.414784362996699
29083 Choose Genero:  thriller     Distance:  0.29967134287311475
29086 Choose Genero:  thriller     Distance:  0.30772798018826064
29088 Choose Genero:  thriller     Distance:  0.3622737377599598
29090 Choose Genero:  thriller     Distance:  0.2536781838302004
29091 Choose Genero:  family     Distance:  0.4383588524661449
29092 Choose Genero:  horror     Distan

29785 Choose Genero:  thriller     Distance:  0.28432174125239174
29788 Choose Genero:  drama     Distance:  0.38558874254584913
29789 Choose Genero:  comedy     Distance:  0.262405143614379
29791 Choose Genero:  thriller     Distance:  0.3978542561671883
29793 Choose Genero:  drama     Distance:  0.33096489939344886
29794 Choose Genero:  documentary     Distance:  0.4774023340597927
29795 Choose Genero:  unknown     Distance:  1000000
29796 Choose Genero:  comedy     Distance:  0.4317898474577635
29798 Choose Genero:  comedy     Distance:  0.3374050277686866
29799 Choose Genero:  comedy     Distance:  0.30794959506446656
29800 Choose Genero:  documentary     Distance:  0.5103788538956828
29802 Choose Genero:  western     Distance:  0.4597946400705639
29804 Choose Genero:  documentary     Distance:  0.36943763996517864
29805 Choose Genero:  documentary     Distance:  0.43371299168660005
29806 Choose Genero:  unknown     Distance:  1000000
29811 Choose Genero:  thriller     Distance:  0

30595 Choose Genero:  thriller     Distance:  0.34698318088232294
30596 Choose Genero:  fantasy     Distance:  0.40304704576816724
30598 Choose Genero:  documentary     Distance:  0.41560518241112004
30600 Choose Genero:  unknown     Distance:  1000000
30601 Choose Genero:  unknown     Distance:  1000000
30602 Choose Genero:  music     Distance:  0.5558659809797103
30603 Choose Genero:  unknown     Distance:  1000000
30604 Choose Genero:  documentary     Distance:  0.46207889047925044
30609 Choose Genero:  unknown     Distance:  1000000
30610 Choose Genero:  comedy     Distance:  0.5658523637350001
30611 Choose Genero:  comedy     Distance:  0.5299202673523293
30612 Choose Genero:  comedy     Distance:  0.3870250085483255
30613 Choose Genero:  family     Distance:  0.47999019618377614
30614 Choose Genero:  horror     Distance:  0.5043002246787253
30615 Choose Genero:  documentary     Distance:  0.36969455921887345
30616 Choose Genero:  drama     Distance:  0.35466377376777664
30617 Cho

31314 Choose Genero:  unknown     Distance:  1000000
31315 Choose Genero:  unknown     Distance:  1000000
31316 Choose Genero:  documentary     Distance:  0.39258415157404025
31317 Choose Genero:  unknown     Distance:  1000000
31318 Choose Genero:  unknown     Distance:  1000000
31319 Choose Genero:  unknown     Distance:  1000000
31320 Choose Genero:  thriller     Distance:  0.5086026680679767
31322 Choose Genero:  drama     Distance:  0.2862950433761412
31323 Choose Genero:  drama     Distance:  0.32774552631525244
31324 Choose Genero:  thriller     Distance:  0.36932883693450375
31326 Choose Genero:  drama     Distance:  0.3536233045140104
31327 Choose Genero:  thriller     Distance:  0.2582389746094215
31330 Choose Genero:  fantasy     Distance:  0.41616199137170146
31337 Choose Genero:  fantasy     Distance:  0.28415400750533154
31344 Choose Genero:  unknown     Distance:  1000000
31345 Choose Genero:  documentary     Distance:  0.4123222427094908
31346 Choose Genero:  comedy    

32122 Choose Genero:  drama     Distance:  0.3327131369586586
32123 Choose Genero:  drama     Distance:  0.3203889782640159
32124 Choose Genero:  drama     Distance:  0.2963557788521639
32125 Choose Genero:  comedy     Distance:  0.37156060973749305
32126 Choose Genero:  comedy     Distance:  0.2834124519822374
32128 Choose Genero:  drama     Distance:  0.533465791403816
32130 Choose Genero:  drama     Distance:  0.31181139124163354
32131 Choose Genero:  thriller     Distance:  0.35281638973896484
32133 Choose Genero:  comedy     Distance:  0.5098522673343188
32134 Choose Genero:  documentary     Distance:  0.5007695214097737
32135 Choose Genero:  documentary     Distance:  0.36948359731479724
32136 Choose Genero:  documentary     Distance:  0.48783201766263445
32137 Choose Genero:  documentary     Distance:  0.4495560630507752
32138 Choose Genero:  family     Distance:  0.35440688778323554
32139 Choose Genero:  drama     Distance:  0.3006318728783964
32140 Choose Genero:  unknown     

In [90]:
train_X, test_X, train_Y, test_Y = train_test_split(matriz_w2v_textos, movies["genero"], test_size=0.25, random_state=50)

In [91]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(train_Y)
Test_Y = Encoder.fit_transform(test_Y)

In [92]:
print(train_X.shape)
print(test_X.shape)

(24201, 300)
(8068, 300)


In [93]:
lista_generos

['action', 'science fiction']

In [38]:
# idx = lista_generos.index('Science Fiction')
# lista_generos[idx]='science'
# idx = lista_generos.index('TV Movie')
# lista_generos[idx]='tv'


# dict_w2v_generos = {}
# for genero in lista_generos: 
#     genero = genero.lower()
#     dict_w2v_generos[genero] = modelo[genero]

In [51]:
for genero in dict_w2v_generos.keys():
    print(genero, np.linalg.norm(dict_w2v_generos[genero]))

animation 3.2788527
adventure 3.1866658
romance 2.9361107
comedy 3.4596713
action 3.0095065
family 2.9532094
history 2.6248894
drama 3.4183264
crime 3.108495
science 3.2264402
fantasy 3.4163253
unknown 3.4409328
music 2.8462336
horror 3.3719115
documentary 2.9217546
mystery 2.7685804
thriller 3.7359772
western 3.1287322
tv 2.9879045
war 3.3584478
foreign 3.4518964


In [78]:
# errors = 0 
# vetor_vazio = np.zeros(300)
# # for i in range(len(matriz_w2v_textos)):
# for i in range(55):
#     w2v = matriz_w2v_textos[i]
#     comparison = w2v == vetor_vazio
#     if not comparison.all():
#         less_distance = 1000000
#         choose_genero = "unknown"
#         # print(less_distance)
#         for genero in dict_w2v_generos.keys():
#             gen_model = dict_w2v_generos[genero]
#             cosine_similarity = np.dot(w2v, gen_model)/(np.linalg.norm(w2v)* np.linalg.norm(gen_model))
#             # print(i, "Genero: ", genero, "   Distance: ", cosine_similarity)
#             if cosine_similarity < less_distance:
#                 less_distance = cosine_similarity
#                 choose_genero = genero
#         print(i, "Choose Genero: ", choose_genero, "    Distance: ", less_distance)
    

0 Choose Genero:  science     Distance:  0.22697585832968678
1 Choose Genero:  science     Distance:  0.3255544678249788
2 Choose Genero:  science     Distance:  0.2870006496934407
3 Choose Genero:  science     Distance:  0.25996198228403117
4 Choose Genero:  thriller     Distance:  0.2533441832400287
5 Choose Genero:  thriller     Distance:  0.3213471943135032
6 Choose Genero:  science     Distance:  0.22492624892130925
7 Choose Genero:  science     Distance:  0.27721111130088144
8 Choose Genero:  science     Distance:  0.3114616514230757
9 Choose Genero:  animation     Distance:  0.2399089009527797
10 Choose Genero:  animation     Distance:  0.27975890300554496
11 Choose Genero:  science     Distance:  0.28642889971614655
12 Choose Genero:  music     Distance:  0.2093536793708521
13 Choose Genero:  animation     Distance:  0.3078015602525113
14 Choose Genero:  thriller     Distance:  0.2869797077707812
15 Choose Genero:  foreign     Distance:  0.24399393472689285
16 Choose Genero:  t

In [94]:
# fit the training dataset on the NB classifier
model = SVC()
model.fit(train_X,Train_Y)
# predict the labels on validation dataset
predictions_model = model.predict(test_X)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_model, Test_Y)*100)

Naive Bayes Accuracy Score ->  22.99206742687159


In [115]:
modelo.most_similar(positive=["woman", "king"], negative=["man"])


[('queen', 0.6688153743743896),
 ('princess', 0.552842915058136),
 ('prince', 0.5264531970024109),
 ('nanasipau', 0.4919755756855011),
 ('maathorneferure', 0.49078336358070374),
 ('regnant', 0.49029964208602905),
 ('chandrmondol', 0.4897799491882324),
 ('regent', 0.48750442266464233),
 ('crown', 0.4872990548610687),
 ('elizabeth', 0.48600274324417114)]