#  Demo - Word Embeddings

## Download embeddings file

In [7]:
import wget
import zipfile
import os

In [2]:
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip'
filename = wget.download(url)
with zipfile.ZipFile(filename,"r") as zip_ref:
    zip_ref.extractall("data")

In [8]:
os.remove(filename)

## Loading the embedding and defining some functions

In [9]:
from typing import Any, Iterable, List, Optional, Set, Tuple

from we_load import load_words
import math
import we_vectors as v
from we_vectors import Vector
from we_word import Word

# Timing info for most_similar (100k words):
# Original version: 7.3s
# Normalized vectors: 3.4s

def most_similar(base_vector: Vector, words: List[Word]) -> List[Tuple[float, Word]]:
    """Finds n words with smallest cosine similarity to a given word"""
    words_with_distance = [(v.cosine_similarity_normalized(base_vector, w.vector), w) for w in words]
    # We want cosine similarity to be as large as possible (close to 1)
    sorted_by_distance = sorted(words_with_distance, key=lambda t: t[0], reverse=True)
    return sorted_by_distance

def print_most_similar(words: List[Word], text: str) -> None:
    base_word = find_word(text, words)
    if not base_word:
        print(f"Uknown word: {text}")
        return
    print(f"Words related to {base_word.text}:")
    sorted_by_distance = [
        word.text for (dist, word) in
            most_similar(base_word.vector, words)
            if word.text.lower() != base_word.text.lower()
        ]
    print(', '.join(sorted_by_distance[:10]))

def read_word() -> str:
    return input("Type a word: ")

def find_word(text: str, words: List[Word]) -> Optional[Word]:
    try:
        return next(w for w in words if text == w.text)
    except StopIteration:
        return None

def closest_analogies(
    left2: str, left1: str, right2: str, words: List[Word]
) -> List[Tuple[float, Word]]:
    word_left1 = find_word(left1, words)
    word_left2 = find_word(left2, words)
    word_right2 = find_word(right2, words)
    if (not word_left1) or (not word_left2) or (not word_right2):
        return []
    vector = v.add(
        v.sub(word_left1.vector, word_left2.vector),
        word_right2.vector)
    closest = most_similar(vector, words)[:10]
    def is_redundant(word: str) -> bool:
        """
        Sometimes the two left vectors are so close the answer is e.g.
        "shirt-clothing is like phone-phones". Skip 'phones' and get the next
        suggestion, which might be more interesting.
        """
        word_lower = word.lower()
        return (
            left1.lower() in word_lower or
            left2.lower() in word_lower or
            right2.lower() in word_lower)
    closest_filtered = [(dist, w) for (dist, w) in closest if not is_redundant(w.text)]
    return closest_filtered

def print_analogy(left2: str, left1: str, right2: str, words: List[Word]) -> None:
    analogies = closest_analogies(left2, left1, right2, words)
    if (len(analogies) == 0):
        print(f"{left2}-{left1} is like {right2}-?")
    else:
        (dist, w) = analogies[0]
        #alternatives = ', '.join([f"{w.text} ({dist})" for (dist, w) in analogies])
        print(f"{left2}-{left1} is like {right2}-{w.text}")



In [10]:
words = load_words('data/wiki-news-300d-1M.vec')

Loading data/wiki-news-300d-1M.vec...
Loaded 999995 words.
Removed stop words, 970448 remain.
Removed duplicates, 921219 remain.


In [11]:
words[1]

and [-0.02372573377498372, 0.011258389593861702, ...]

In [12]:
print("Dimension of the embedding: {:d}".format(len(words[0].vector)))

Dimension of the embedding: 300


## Looking at similar words

In [13]:
print_most_similar(words, words[190].text)
print_most_similar(words, words[230].text)
print_most_similar(words, words[330].text)
print_most_similar(words, words[430].text)

Words related to section:
subsection, sections, paragraph, secion, secton, subheading, subsections, sction, setion, seciton
Words related to question:
answer, quesiton, questions, quesion, queston, qustion, ask, whether, answers, non-question
Words related to development:
growth, developments, implementation, developement, construction, developing, advancement, research, developmental, design
Words related to staff:
staffs, personnel, staffers, faculty, members, employees, assistants, staffer, officers, consultants


In [14]:
text = read_word()
w = find_word(text, words)
if not w:
    print("Sorry, I don't know that word.")
else:
    print_most_similar(words, w.text)

Type a word: Austria
Words related to Austria:
Vienna, Austrian, Germany, Salzburg, Graz, Hungary, Innsbruck, Switzerland, Styria, Bavaria


##  Looking at analogies -- or computing with words

In [15]:
print_analogy('man', 'him' , 'woman', words)
# You'll need to download the pretrained word vectors to complete the analogies
# below:
# https://fasttext.cc/docs/en/english-vectors.html
print_analogy('quick', 'quickest' , 'far', words)
print_analogy('sushi', 'rice', 'pizza', words)
print_analogy('Paris', 'France', 'Rome', words)
print_analogy('dog', 'mammal', 'eagle', words)
print_analogy('German', 'BMW' , 'American', words)
print_analogy('German', 'Opel', 'American', words)

man-him is like woman-her
quick-quickest is like far-furthest
sushi-rice is like pizza-wheat
Paris-France is like Rome-Italy
dog-mammal is like eagle-bird
German-BMW is like American-Lexus
German-Opel is like American-Chrysler


In [16]:
left2 = find_word(read_word(), words)
if not left2:
    print("Sorry, I don't know that word.")
left1 = find_word(read_word(), words)
if not left1:
    print("Sorry, I don't know that word.")
right2 = find_word(read_word(), words)
if not right2:
    print("Sorry, I don't know that word.")
print_analogy(left2.text, left1.text, right2.text, words)

Type a word: a
Sorry, I don't know that word.
Type a word: q
Sorry, I don't know that word.
Type a word: 
Sorry, I don't know that word.


AttributeError: 'NoneType' object has no attribute 'text'