## Seminar 1: Fun with Word Embeddings

In [0]:
import numpy as np
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
from gensim.models import Word2Vec
import gensim.downloader as api
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook

In [0]:
# download the data:
!wget https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1 -O ./quora.txt
# alternative download link: https://yadi.sk/i/BPQrUu1NaTduEw

--2019-01-22 18:45:59--  https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.8.1, 2620:100:6018:1::a27d:301
Connecting to www.dropbox.com (www.dropbox.com)|162.125.8.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/dl/obaitrix9jyu84r/quora.txt [following]
--2019-01-22 18:45:59--  https://www.dropbox.com/s/dl/obaitrix9jyu84r/quora.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uccd60065e95341db88a3789997a.dl.dropboxusercontent.com/cd/0/get/AZ5ga8c0_OXTVjUHlu0UpTIO81vUX1pvb-DbAbA0UeYQqKrlvIApc8Gp3jeW1NN6zSxfuuJrOF_Hl6nzMyh60yQttYK6w-xv4Pa6yPkmGg6GnyaMsQDpGm3kewt0PnGsFCYyc2zStviOu0oRR2NBOhI3D4NhEhXxrJ6eiv6KqPVaYwPb3Ixa838AxXlMuSVuJe4/file?dl=1# [following]
--2019-01-22 18:45:59--  https://uccd60065e95341db88a3789997a.dl.dropboxusercontent.com/cd/0/get/AZ5ga8c0_OXTVjUHlu0UpTIO81vUX1pvb-DbAbA0UeYQqKrlvIApc8Gp3

In [0]:
# Read the corpus into a list with one raw line (trailing newline included) per entry.
# The context manager closes the file handle as soon as the lines are read.
with open("./quora.txt") as corpus_file:
    data = list(corpus_file)
data[:10]

["Can I get back with my ex even though she is pregnant with another guy's baby?\n",
 'What are some ways to overcome a fast food addiction?\n',
 'Who were the great Chinese soldiers and leaders who fought in WW2?\n',
 'What are ZIP codes in the Bay Area?\n',
 'Why was George RR Martin critical of JK Rowling after losing the Hugo award?\n',
 'What can I do to improve my immune system?\n',
 'How is your relationship with your mother in law?\n',
 'How does one get Free PSN codes/Vita Codes?\n',
 'What is your review of osquery?\n',
 'How can I look smart and act smart?\n']

### Tokenization

In [0]:
# TASK: lowercase everything and extract tokens with tokenizer. 
# data_tok should be a list of lists of tokens for each line in data.

data_tok = [tokenizer.tokenize(sent.lower()) for sent in data]
print(data_tok[:10])

[['can', 'i', 'get', 'back', 'with', 'my', 'ex', 'even', 'though', 'she', 'is', 'pregnant', 'with', 'another', 'guy', "'", 's', 'baby', '?'], ['what', 'are', 'some', 'ways', 'to', 'overcome', 'a', 'fast', 'food', 'addiction', '?'], ['who', 'were', 'the', 'great', 'chinese', 'soldiers', 'and', 'leaders', 'who', 'fought', 'in', 'ww2', '?'], ['what', 'are', 'zip', 'codes', 'in', 'the', 'bay', 'area', '?'], ['why', 'was', 'george', 'rr', 'martin', 'critical', 'of', 'jk', 'rowling', 'after', 'losing', 'the', 'hugo', 'award', '?'], ['what', 'can', 'i', 'do', 'to', 'improve', 'my', 'immune', 'system', '?'], ['how', 'is', 'your', 'relationship', 'with', 'your', 'mother', 'in', 'law', '?'], ['how', 'does', 'one', 'get', 'free', 'psn', 'codes', '/', 'vita', 'codes', '?'], ['what', 'is', 'your', 'review', 'of', 'osquery', '?'], ['how', 'can', 'i', 'look', 'smart', 'and', 'act', 'smart', '?']]


In [0]:
assert all(isinstance(row, (list, tuple)) for row in data_tok), "please convert each line into a list of tokens (strings)"
assert all(all(isinstance(tok, str) for tok in row) for row in data_tok), "please convert each line into a list of tokens (strings)"
is_latin = lambda tok: all('a' <= x.lower() <= 'z' for x in tok)
assert all(map(lambda l: not is_latin(l) or l.islower(), map(' '.join, data_tok))), "please make sure to lowercase the data"

In [0]:
print([' '.join(row) for row in data_tok[:2]])

["can i get back with my ex even though she is pregnant with another guy ' s baby ?", 'what are some ways to overcome a fast food addiction ?']


### Word vectors

In [0]:
# Train a small Word2Vec model on the tokenized corpus and keep only its word vectors (.wv).
# NOTE(review): `size=` looks like the gensim 3.x keyword (renamed in later releases) — confirm against the installed version.
model = Word2Vec(data_tok, 
                 size=32,      # embedding vector size
                 min_count=5,  # consider words that occurred at least 5 times
                 window=5).wv  # define context as a 5-word window around the target word

In [0]:
# now you can get word vectors !
model.get_vector('anything')

array([ 1.0774256 ,  1.625316  , -5.2086143 , -1.2453322 ,  3.7453845 ,
       -2.7668033 , -0.04672458, -0.9482076 ,  0.2411085 ,  1.1070505 ,
       -2.2975726 ,  2.8542068 ,  1.8881707 ,  1.662626  , -1.1923248 ,
       -2.0222774 ,  3.5789979 ,  0.353161  , -1.6864799 ,  0.11050456,
        1.5138984 ,  2.0701733 , -2.6106458 ,  0.39736873, -3.3654737 ,
       -2.1171901 , -0.29967487, -0.6144483 ,  0.7566752 ,  1.5979873 ,
        0.5317121 , -1.0954815 ], dtype=float32)

In [0]:
# or query similar words directly. Go play with it!
model.most_similar('bread')

  if np.issubdtype(vec.dtype, np.int):


[('rice', 0.9577526450157166),
 ('pasta', 0.9195684790611267),
 ('sauce', 0.9181680679321289),
 ('butter', 0.9147717356681824),
 ('cheese', 0.9129592776298523),
 ('fruit', 0.9093804955482483),
 ('chocolate', 0.9054661393165588),
 ('corn', 0.9047466516494751),
 ('chicken', 0.898668646812439),
 ('beans', 0.8984371423721313)]

### Using pre-trained model

That took a while, huh? Now imagine training life-sized (100~300D) word embeddings on gigabytes of text: wikipedia articles or twitter posts.

Thankfully, nowadays you can get a pre-trained word embedding model in 2 lines of code (no sms required, promise).

In [0]:
model = api.load('glove-twitter-100')



In [0]:
model.most_similar(positive=["coder", "money"], negative=["brain"])

  if np.issubdtype(vec.dtype, np.int):


[('broker', 0.5820155739784241),
 ('bonuses', 0.5424473285675049),
 ('banker', 0.538511335849762),
 ('designer', 0.5197198390960693),
 ('merchandising', 0.4964233934879303),
 ('treet', 0.49220192432403564),
 ('shopper', 0.4920561909675598),
 ('part-time', 0.49128279089927673),
 ('freelance', 0.4843311905860901),
 ('aupair', 0.4796452522277832)]

### Visualizing word vectors

One way to see if our vectors are any good is to plot them. Thing is, those vectors are in 30D+ space and we humans are more used to 2-3D.

Luckily, we machine learners know about __dimensionality reduction__ methods.

Let's use that to plot the 1000 most frequent words.

In [0]:
words = sorted(model.vocab.keys(), 
               key=lambda word: model.vocab[word].count,
               reverse=True)[:1000]

print(words[::100])

['<user>', '_', 'please', 'apa', 'justin', 'text', 'hari', 'playing', 'once', 'sei']


In [0]:
# for each word, look up its vector with model and stack the results into one array (one row per word)
word_vectors = np.array([model.get_vector(word) for word in words])

In [0]:
assert isinstance(word_vectors, np.ndarray)
assert word_vectors.shape == (len(words), 100)
assert np.isfinite(word_vectors).all()

#### Linear projection: PCA

In [0]:
# Project the word vectors onto a 2d plane with PCA (plain sklearn fit/transform API),
# then standardize the projection so that every axis has zero mean and unit variance.
pca = PCA(n_components=2)
scaler = StandardScaler()
word_vectors_pca = scaler.fit_transform(pca.fit_transform(word_vectors))

In [0]:
assert word_vectors_pca.shape == (len(word_vectors), 2), "there must be a 2d vector for each word"
assert max(abs(word_vectors_pca.mean(0))) < 1e-5, "points must be zero-centered"
assert max(abs(1.0 - word_vectors_pca.std(0))) < 1e-2, "points must have unit variance"

#### Let's draw it!

In [0]:
def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """Draw an interactive scatter plot of points, with auxiliary info shown on hover.

    :param x, y: coordinates of the points
    :param radius: marker size passed to the scatter glyph
    :param alpha: marker opacity
    :param color: a single color name (applied to every point) or one color per point
    :param width, height: figure dimensions
    :param show: if True, display the figure before returning it
    :param kwargs: extra per-point columns; each becomes a hover tooltip entry
    :returns: the bokeh figure
    """
    # a single color name is repeated so that every point gets its own entry
    point_colors = [color] * len(x) if isinstance(color, str) else color

    columns = {'x': x, 'y': y, 'color': point_colors}
    columns.update(kwargs)
    data_source = bm.ColumnDataSource(columns)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    # one tooltip row per extra column, referencing it via bokeh's "@column" syntax
    hover_rows = [(name, "@" + name) for name in kwargs]
    fig.add_tools(bm.HoverTool(tooltips=hover_rows))

    if show:
        pl.show(fig)
    return fig

In [0]:
output_notebook()
draw_vectors(word_vectors_pca[:, 0], word_vectors_pca[:, 1], token=words)

# hover a mouse over there and see if you can identify the clusters

### Visualizing neighbors with t-SNE
PCA is nice but it's strictly linear and thus only able to capture coarse high-level structure of the data.

If we instead want to focus on keeping neighboring points near, we could use TSNE, which is itself an embedding method. Here you can read __[more on TSNE](https://distill.pub/2016/misread-tsne/)__.

In [0]:
# map word vectors onto 2d plane with TSNE. hint: use verbose=100 to see what it's doing.
# normalize them just like with pca
# NOTE(review): no random_state is passed, so the layout presumably differs between runs — confirm if reproducibility matters

tsne = TSNE(n_components=2) # for some reason tsne seems to run faster without verbose

word_tsne = tsne.fit_transform(word_vectors)
word_tsne = scaler.fit_transform(word_tsne) # reuses the scaler object created earlier; fit_transform refits it on this data

In [0]:
output_notebook()
draw_vectors(word_tsne[:, 0], word_tsne[:, 1], color='green', token=words)

### Visualizing phrases

Word embeddings can also be used to represent short phrases. The simplest way is to take __an average__ of vectors for all tokens in the phrase with some weights.

This trick is useful for understanding what data you are working with: finding out whether there are any outliers, clusters or other artefacts.

Let's try this new hammer on our data!


In [0]:
def get_phrase_embedding(phrase):
    """
    Convert a phrase to a vector by averaging its word embeddings.

    The phrase is lowercased and split with the notebook-level `tokenizer`;
    tokens that are missing from `model`'s vocabulary are skipped.

    :param phrase: text to embed (str)
    :returns: 1-D array of length `model.vector_size` holding the mean of the
        vectors of all in-vocabulary tokens, or float32 zeros if no token of
        the phrase is in the vocabulary
    """
    tokens = tokenizer.tokenize(phrase.lower())

    # keep only the vectors of tokens the embedding model knows about
    known_vectors = [model[tok] for tok in tokens if tok in model.vocab]

    if not known_vectors:
        # nothing to average: fall back to the all-zeros vector
        return np.zeros([model.vector_size], dtype='float32')

    # average over tokens in one shot instead of growing an array with vstack
    # on every iteration (which re-copies all previous rows each time)
    return np.mean(known_vectors, axis=0)
        
    

In [0]:
vector = get_phrase_embedding("I'm very sure. This never happened to me before...")

assert np.allclose(vector[::10],
                   np.array([ 0.31807372, -0.02558171,  0.0933293 , -0.1002182 , -1.0278689 ,
                             -0.16621883,  0.05083408,  0.17989802,  1.3701859 ,  0.08655966],
                              dtype=np.float32))

In [0]:
# let's only consider ~1k phrases for a first run: take every (len(data) // 1000)-th line.
# max(1, ...) keeps the slice step valid when data has fewer than 1000 lines
# (a step of 0 would raise ValueError).
phrase_step = max(1, len(data) // 1000)
chosen_phrases = data[::phrase_step]

# compute vectors for chosen phrases
phrase_vectors = np.array([get_phrase_embedding(phrase) for phrase in chosen_phrases])

In [0]:
assert isinstance(phrase_vectors, np.ndarray) and np.isfinite(phrase_vectors).all()
assert phrase_vectors.shape == (len(chosen_phrases), model.vector_size)

In [0]:
# map vectors into 2d space with pca, tsne or your other method of choice
# don't forget to normalize

phrase_vectors_2d = tsne.fit_transform(phrase_vectors)

#phrase_vectors_2d = (phrase_vectors_2d - phrase_vectors_2d.mean(axis=0)) / phrase_vectors_2d.std(axis=0)
# not sure why they do this by hand when we already have a scaler for it
phrase_vectors_2d = scaler.fit_transform(phrase_vectors_2d)

In [0]:
output_notebook()
draw_vectors(phrase_vectors_2d[:, 0], phrase_vectors_2d[:, 1],
             phrase=[phrase[:50] for phrase in chosen_phrases],
             radius=20,)

### Similarity
Finally, let's build a simple "similar question" engine with phrase embeddings we've built.

In [0]:
# compute vector embedding for all lines in data
data_vectors = np.array([get_phrase_embedding(l) for l in data])

In [0]:
def find_nearest(query, k=10):
    """
    Given a text line (query), return the k most similar lines from data,
    sorted from most to least similar.

    Similarity is the cosine between the query embedding and each line
    embedding. Uses the notebook-level globals `data` and `data_vectors`
    (row i of `data_vectors` is the embedding of `data[i]`).

    :param query: text to search for (str)
    :param k: number of lines to return
    :returns: list of at most k lines from `data`
    """
    query_vec = get_phrase_embedding(query).reshape(1, -1)

    # one vectorized call over the whole matrix instead of one sklearn call per row
    sims = cosine_similarity(query_vec, data_vectors)[0]

    # stable sort on negated similarities: descending order, and equally similar
    # lines keep their original order in `data`
    top_idx = np.argsort(-sims, kind='stable')[:k]

    return [data[i] for i in top_idx]

In [0]:
results = find_nearest(query="How do i enter the matrix?", k=10)

print(''.join(results))

assert len(results) == 10 and isinstance(results[0], str)
assert results[0] == 'How do I get to the dark web?\n'
assert results[3] == 'What can I do to save the world?\n'

How do I get to the dark web?
What should I do to enter hollywood?
How do I use the Greenify app?
What can I do to save the world?
How do I win this?
How do I think out of the box? How do I learn to think out of the box?
How do I find the 5th dimension?
How do I use the pad in MMA?
How do I estimate the competition?
What do I do to enter the line of event management?



In [0]:
find_nearest(query="How does Trump?", k=10)

['What does Donald Trump think about Israel?\n',
 'What books does Donald Trump like?\n',
 'What does India think of Donald Trump?\n',
 'What does Donald Trump think of India?\n',
 'What does Donald Trump think of China?\n',
 'What does Donald Trump think about Pakistan?\n',
 'What companies does Donald Trump own?\n',
 'What does Dushka Zapata think about Donald Trump?\n',
 'How does it feel to date Ivanka Trump?\n',
 'What does salesforce mean?\n']

In [0]:
find_nearest(query="Why don't i ask a question myself?", k=10)

["Why don't I get a date?\n",
 "Why do you always answer a question with a question? I don't, or do I?\n",
 "Why can't I ask a question anonymously?\n",
 "Why don't I get a girlfriend?\n",
 "Why don't I have a boyfriend?\n",
 "I don't have no question?\n",
 "Why can't I take a joke?\n",
 "Why don't I ever get a girl?\n",
 "Can I ask a girl out that I don't know?\n",
 "Why don't I have a girlfriend?\n"]