<a href="https://colab.research.google.com/github/cbadenes/notebooks/blob/main/nlp/HarryPotter_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from urllib.request import urlopen

def load_text(url):
  """Fetch the resource at `url` and return its content as a list of lines.

  Args:
    url: Address of the text to download; anything `urlopen` accepts.

  Returns:
    A list of `bytes` objects, one per line, each still carrying its line
    terminator (the same items that iterating over the response yields).

  Raises:
    Propagates whatever `urlopen` raises when the resource cannot be opened.
  """
  print("loading text from:",url,"..")
  # Read inside a context manager so the connection is always released,
  # even if reading fails part-way; callers only ever iterate the result.
  with urlopen(url) as response:
    return response.readlines()

In [None]:
# Short links to the seven source texts (numbered 01-07), all under the same
# bit.ly prefix; the order of this list is the order in which they are loaded.
texts = [
    "https://bit.ly/" + slug
    for slug in (
        "01-stone",
        "02-chamber",
        "03-prisoner",
        "04-fire",
        "05-phoenix",
        "06-prince",
        "07-deathly",
    )
]

In [None]:
import spacy
from spacy.lang.en import English

# A bare `English()` pipeline contains only the tokenizer, so there are no
# annotators (tagger, parser, NER) to switch off and tokenization is fast.
# `disable=[...]` is a `spacy.load(...)` option and has no effect on this
# constructor, so it is not passed.
nlp = English()

def tokenize(text):
  """Split `text` into lower-cased token strings using the module-level `nlp`.

  Whitespace-only tokens (for example the newline that ends a line read from
  a file) are dropped so they do not end up as vocabulary entries downstream.

  Args:
    text: The string to tokenize.

  Returns:
    A list with the lower-cased text of every non-whitespace token, in
    document order. An empty or whitespace-only input yields an empty list.
  """
  return [token.text.lower() for token in nlp(text) if not token.is_space]

In [None]:
# Build the training corpus: every line of every source text is decoded as
# UTF-8 and tokenized, and each resulting token list counts as one "sentence".
corpus_sentences = [
    tokenize(raw_line.decode("utf-8"))
    for source_url in texts
    for raw_line in load_text(source_url)
]
print(len(corpus_sentences),"sentences")

loading text from: https://bit.ly/01-stone ..
loading text from: https://bit.ly/02-chamber ..
loading text from: https://bit.ly/03-prisoner ..
loading text from: https://bit.ly/04-fire ..
loading text from: https://bit.ly/05-phoenix ..
loading text from: https://bit.ly/06-prince ..
loading text from: https://bit.ly/07-deathly ..
211035 sentences


In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenized corpus and persist it to disk.
# NOTE(review): `size` is the gensim 3.x keyword for the embedding dimension;
# gensim 4 renamed it to `vector_size` - confirm the installed version.
model = Word2Vec(
    sentences=corpus_sentences,
    size=100,      # dimensionality of each word vector
    window=5,      # context window around the target word
    min_count=1,   # keep every token, even those seen only once
    workers=4,     # training threads
)
model.save("word2vec.model")

In [None]:
# Reload the saved model and keep a handle on its keyed vectors, then report
# the embedding dimension and how many words have a vector.
w2vmodel = Word2Vec.load("word2vec.model")
word_vectors = w2vmodel.wv
for label, value in (
    ("dimension of vectors:", word_vectors.vector_size),
    ("number of vectors:", len(word_vectors.index2word)),
):
  print(label, value)

dimension of vectors: 100
number of vectors: 23511


In [None]:
# Ten nearest neighbours of 'harry' by cosine similarity (topn defaults to 10).
word_vectors.most_similar(positive=['harry'], topn=10)

[('neville', 0.6940442323684692),
 ('snape', 0.6659226417541504),
 ('he', 0.6650320291519165),
 ('malfoy', 0.6603965163230896),
 ('cedric', 0.6530553698539734),
 ('cho', 0.6447467803955078),
 ('hagrid', 0.6433640718460083),
 ('krum', 0.6370573043823242),
 ('hermione', 0.631946325302124),
 ('dudley', 0.6276787519454956)]

In [None]:
# Cosine similarity between the vectors of the two names.
word_vectors.similarity(w1='voldemort', w2='harry')

0.5542105

In [None]:
# Compare 'voldemort' against two other characters; both similarities are
# computed first and then reported.
sim_snape, sim_dumbledore = (
    word_vectors.similarity('voldemort', other)
    for other in ('snape', 'dumbledore')
)
print("Similarity Voldemort-Snape:", sim_snape)
print("Similarity Voldemort-Dumbledore:", sim_dumbledore)

Similarity Voldemort-Snape: 0.76479995
Similarity Voldemort-Dumbledore: 0.8354391


In [None]:
# Similarity between two short phrases: each phrase is tokenized with the same
# tokenizer used for the corpus, then the two token sets are compared.
s1, s2 = map(tokenize, ('sirius black is a wizard', 'the prisoner of azkaban'))
word_vectors.n_similarity(s1, s2)

0.19850536

In [None]:
# Analogy-style query: words close to 'ron' and 'hermione' but far from 'harry'.
word_vectors.most_similar_cosmul(
    positive=['ron', 'hermione'],
    negative=['harry'],
    topn=10,
)

[('ginny', 0.9963164329528809),
 ('luna', 0.9748616218566895),
 ('hagrid', 0.9652337431907654),
 ('neville', 0.9566739797592163),
 ('bill', 0.9371435642242432),
 ('percy', 0.9179114103317261),
 ('fred', 0.9101880788803101),
 ('george', 0.9000527858734131),
 ('lupin', 0.8895122408866882),
 ('dean', 0.8889251351356506)]

In [None]:
# Querying a word with no trained vector raises KeyError in Word2Vec. The error
# is the point of this cell, so it is caught and shown instead of aborting a
# "Run All" before the cells that follow.
try:
  aircraft_neighbours = word_vectors.most_similar('aircraft')
except KeyError as err:
  aircraft_neighbours = None
  print("KeyError:", err)
# Displays the neighbours if the lookup succeeded; shows nothing when it failed.
aircraft_neighbours

KeyError: ignored

In [None]:
from gensim.models import FastText
# Train FastText on the same corpus with the same hyper-parameters as the
# Word2Vec run, then persist it. Rebinds the module-level name `model`.
# NOTE(review): `size` is the gensim 3.x keyword for the embedding dimension;
# gensim 4 renamed it to `vector_size` - confirm the installed version.
model = FastText(
    sentences=corpus_sentences,
    size=100,      # dimensionality of each word vector
    window=5,      # context window around the target word
    min_count=1,   # keep every token, even those seen only once
    workers=4,     # training threads
)
model.save("fasttext.model")

In [None]:
# Reload the saved FastText model and keep a handle on its keyed vectors, then
# report the embedding dimension and how many words have a vector.
ftmodel = FastText.load("fasttext.model")
ft_vectors = ftmodel.wv
for label, value in (
    ("dimension of vectors:", ft_vectors.vector_size),
    ("number of vectors:", len(ft_vectors.index2word)),
):
  print(label, value)

dimension of vectors: 100
number of vectors: 23511


In [None]:
# Same query that failed with Word2Vec; the FastText vectors return neighbours
# for 'aircraft' (see the recorded output), ten by default.
ft_vectors.most_similar(positive=['aircraft'], topn=10)

[('eeylops', 0.8819980025291443),
 ('uvula', 0.8461031317710876),
 ('emma', 0.8385815620422363),
 ('edgy', 0.8341172337532043),
 ('gulps', 0.8255451917648315),
 ('oops', 0.8202794790267944),
 ('symbols', 0.8183526992797852),
 ('oval', 0.8111798763275146),
 ('psychology', 0.8092900514602661),
 ('numerology', 0.8039298057556152)]

In [None]:
# Ten nearest neighbours of 'harry' in the FastText space; compare with the
# Word2Vec result for the same query.
ft_vectors.most_similar(positive=['harry'], topn=10)

[('harryv', 0.9873182773590088),
 ('harryy', 0.987255871295929),
 ('harry\\', 0.9871963262557983),
 ('harryl', 0.9871581792831421),
 ('harry-', 0.9865343570709229),
 ('harrys', 0.9843622446060181),
 ('harry’d', 0.9617148637771606),
 ('lolharry', 0.9099390506744385),
 ('louharry', 0.906660258769989),
 ('loloharry', 0.9037372469902039)]