### Word Embeddings

- We'll use the [pymagnitude](https://github.com/plasticityai/magnitude) library to load and query pre-trained word embeddings

In [1]:
from pymagnitude import *

In [197]:
# Choose ONE .magnitude embedding file to load; the commented-out assignments
# are alternative files that can be swapped in by moving the comment marker.
#path = 'data/fasttext-wiki-news-300d-1M.magnitude'
#path = 'data/glove.6B.300d.magnitude'
path = 'data/GoogleNews-vectors-negative300.magnitude'
# this isn't working: path = 'data/elmo_2x4096_512_2048cnn_2xhighway_weights.magnitude'

# Open the selected file; `vectors` is the embedding lookup object that the
# following cells query. `path` is printed again next to the final accuracy.
vectors = Magnitude(path)

In [198]:
# Number of entries stored in the loaded embedding file.
len(vectors)

3000000

In [199]:
vectors.dim # dimensionality of each word vector (number of components per word)

300

In [200]:
# Membership test: is there an embedding stored for the key "cat"?
"cat" in vectors

True

In [201]:
# Peek at ten consecutive entries of the embedding file: slicing yields
# (word, embedding) pairs, and only the first three components are printed.
for word, embedding in vectors[500:510]:
    leading_components = embedding[:3]
    print(word, leading_components)

doing [ 0.0281802  0.0825745 -0.021299 ]
face [0.0627915 0.0505594 0.1051962]
low [ 0.0799475 -0.0323317 -0.0850422]
higher [-0.0591503 -0.0751052 -0.0233488]
site [-0.0102047 -0.0349875  0.0920935]
once [-0.0143685  0.0190645  0.0356058]
yet [ 0.0641931 -0.0712143  0.0529092]
hours [-0.1092402  0.0376956  0.0030291]
America [0.0156636 0.1196859 0.0214872]
control [ 0.0571143  0.0257574 -0.0354632]


In [202]:
# Look up the embedding of a single word; [:3] shows only its first three components.
vectors.query("cat")[:3]

array([ 0.0040587,  0.0671903, -0.0938735], dtype=float32)

In [203]:
# Querying with a list looks up several words at once; [0] picks the result
# for the first word ("cat") and [:3] its first three components.
vectors.query(["cat","dog"])[0][:3]

array([ 0.0040587,  0.0671903, -0.0938735], dtype=float32)

In [204]:
# Distance between the "cat" and "dog" embeddings (smaller means closer).
vectors.distance("cat", "dog")

0.69145405

In [205]:
# Same measure for a less related pair; compare with the "cat"/"dog" value.
vectors.distance("cat", "car")

1.2527715

In [206]:
# From the candidate list, return the word whose embedding is closest to "cat".
vectors.most_similar_to_given("cat", ["dog", "television", "laptop"]) 

'dog'

In [207]:
# Return the word from the list that fits least well with the others.
vectors.doesnt_match(["breakfast", "cereal", "dinner", "lunch"])

'cereal'

In [208]:
#vectors.most_similar("cat", topn = 5)

In [209]:
#vectors.most_similar(positive = ["woman", "king"], negative = ["man"])

### Topic Modeling

- Given a document, determine its topic
- For this task, we'll use the Brown corpus (accessible via NLTK) and treat each document's Brown category as its topic label

In [210]:
from collections import defaultdict

# numpy is imported explicitly: this cell previously relied on `np` being
# pulled into the namespace by the star import at the top of the notebook.
import numpy as np
from nltk.corpus import brown
# tqdm.auto selects the notebook progress-bar widget when running in Jupyter
# (replaces the deprecated `tqdm_notebook` alias and the shadowed `import tqdm`).
from tqdm.auto import tqdm

# One (category, summed_embedding) pair per Brown document.
category_vectors = []

cats = brown.categories()

# for each category
for cat in cats:
    print(cat)
    # grab all of the documents
    for fileid in tqdm(brown.fileids(categories=[cat])):
        # lower-case every token of this document
        words = list(map(str.lower, brown.words(fileids=[fileid])))
        # Look up the embedding of every token that has one and add them up.
        # Each query is given a one-element list, so each result is a 2-D array
        # with a single row; summing over axis=0 adds across words and leaves a
        # single-row 2-D array (later cells take row [0] as the feature vector).
        word_sum = np.sum([vectors.query([w]) for w in words if w in vectors], axis=0) # why axis=0?
        # add the now summed embedding to the list for this category
        category_vectors.append((cat, word_sum))
    

adventure


HBox(children=(IntProgress(value=0, max=29), HTML(value='')))


belles_lettres


HBox(children=(IntProgress(value=0, max=75), HTML(value='')))


editorial


HBox(children=(IntProgress(value=0, max=27), HTML(value='')))


fiction


HBox(children=(IntProgress(value=0, max=29), HTML(value='')))


government


HBox(children=(IntProgress(value=0, max=30), HTML(value='')))


hobbies


HBox(children=(IntProgress(value=0, max=36), HTML(value='')))


humor


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


learned


HBox(children=(IntProgress(value=0, max=80), HTML(value='')))


lore


HBox(children=(IntProgress(value=0, max=48), HTML(value='')))


mystery


HBox(children=(IntProgress(value=0, max=24), HTML(value='')))


news


HBox(children=(IntProgress(value=0, max=44), HTML(value='')))


religion


HBox(children=(IntProgress(value=0, max=17), HTML(value='')))


reviews


HBox(children=(IntProgress(value=0, max=17), HTML(value='')))


romance


HBox(children=(IntProgress(value=0, max=29), HTML(value='')))


science_fiction


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




In [211]:
import pandas as pd

# Transpose the list of (category, summed_vector) pairs into two parallel
# tuples, then store them as the two columns of a DataFrame.
labels, summed_vectors = zip(*category_vectors)

data = pd.DataFrame({'cat': labels, 'vectors': summed_vectors})

In [212]:
# Show the first three rows of the document table.
data.head(3)

Unnamed: 0,cat,vectors
0,adventure,"[[48.89379, 36.945625, 35.379204, 67.52233, -3..."
1,adventure,"[[61.11974, 47.26474, 40.981194, 58.809498, -3..."
2,adventure,"[[55.242455, 49.139767, 35.219963, 48.935223, ..."


In [213]:
# Total number of documents (rows), used as the denominator for the baselines.
total = data.shape[0]

#### compute the baselines

In [214]:
# Chance level: guess one of the categories uniformly at random.
# Uses `cats` (the list of categories). The earlier `len(cat)` measured the
# length of the leftover loop string "science_fiction", which only happened to
# be 15 characters long -- the same as the number of categories.
print('random baseline {}'.format(1.0/len(cats)))

print('most common baseline?')
# Share of all documents in each category; the largest share is the accuracy
# obtained by always predicting the most frequent category.
for cat in cats:
    print(cat, len(data[data.cat==cat])/total)

random baseline 0.06666666666666667
most common baseline?
adventure 0.058
belles_lettres 0.15
editorial 0.054
fiction 0.058
government 0.06
hobbies 0.072
humor 0.018
learned 0.16
lore 0.096
mystery 0.048
news 0.088
religion 0.034
reviews 0.034
romance 0.058
science_fiction 0.012


#### split the data into train/test

In [215]:
# Hold out a reproducible 10% random sample of the documents for testing ...
test = data.sample(frac=0.1, random_state=200)
# ... and keep every row whose index was not sampled as the training set.
train = data[~data.index.isin(test.index)]

(test.shape, train.shape)

((50, 2), (450, 2))

#### train a classifier

In [220]:
from sklearn import preprocessing

# Fit the label encoder on the categories of ALL documents (train and test),
# so both splits share one mapping from category name to integer.
le = preprocessing.LabelEncoder().fit(data.cat)

# Integer-encoded targets for the training rows.
y = le.transform(train.cat)
# Each stored vector is indexed with [0] to take its single row as the features.
X = [doc_vector[0] for doc_vector in train.vectors]

In [221]:
from sklearn.linear_model import LogisticRegression

In [222]:
# Multinomial logistic regression fitted with the L-BFGS solver; every other
# setting is left at the library default.
clfr = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [223]:
# Train on the training-set feature vectors X and their encoded labels y.
clfr.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

#### evaluate 

In [224]:
from sklearn.metrics import accuracy_score

In [225]:
# Encode the held-out labels with the same encoder used for training, and
# take row [0] of each stored vector as the feature vector.
test_y = le.transform(test.cat)
test_X = [x[0] for x in test.vectors]

# accuracy_score is documented as (y_true, y_pred): true labels first, then
# predictions. Accuracy itself is symmetric, so the value is unchanged, but the
# conventional order stays correct if this is swapped for an asymmetric metric.
score = accuracy_score(test_y, clfr.predict(test_X))

In [226]:
# Report which embedding file was used alongside the test-set accuracy.
print(path, score)

data/GoogleNews-vectors-negative300.magnitude 0.4


### Embeddings with `flair` library

- [reference](https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md)
- flair can use your GPU if you have CUDA installed

In [1]:
from flair.embeddings import WordEmbeddings

# init embedding
# In the recorded run, the 'glove' files were not in the local cache, so they
# were downloaded and copied into it (see the log output below).
glove_embedding = WordEmbeddings('glove')

2019-02-21 12:43:12,013 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmp8y3dqzbd


100%|██████████| 160000128/160000128 [00:40<00:00, 3975605.73B/s]

2019-02-21 12:43:53,177 copying /tmp/tmp8y3dqzbd to cache at /home/casey/.flair/embeddings/glove.gensim.vectors.npy





2019-02-21 12:43:54,054 removing temp file /tmp/tmp8y3dqzbd
2019-02-21 12:43:54,941 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmpkicyvxlb


100%|██████████| 21494764/21494764 [00:06<00:00, 3385082.84B/s]

2019-02-21 12:44:02,187 copying /tmp/tmpkicyvxlb to cache at /home/casey/.flair/embeddings/glove.gensim





2019-02-21 12:44:02,329 removing temp file /tmp/tmpkicyvxlb


In [4]:
from flair.data import Sentence

# Example sentence; the final period is written as its own space-separated token.
sentence = Sentence('The grass is green .')

# Attach a GloVe vector to every token of the sentence.
glove_embedding.embed(sentence)

# Print each token followed by the first ten components of its embedding.
for tok in sentence:
    print(tok, tok.embedding[:10], sep='\n')

Token: 1 The
tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
        -0.5755,  0.0875])
Token: 2 grass
tensor([-0.8135,  0.9404, -0.2405, -0.1350,  0.0557,  0.3363,  0.0802, -0.1015,
        -0.5478, -0.3537])
Token: 3 is
tensor([-0.5426,  0.4148,  1.0322, -0.4024,  0.4669,  0.2182, -0.0749,  0.4733,
         0.0810, -0.2208])
Token: 4 green
tensor([-6.7907e-01,  3.4908e-01, -2.3984e-01, -9.9652e-01,  7.3782e-01,
        -6.5911e-04,  2.8010e-01,  1.7287e-02, -3.6063e-01,  3.6955e-02])
Token: 5 .
tensor([-0.3398,  0.2094,  0.4635, -0.6479, -0.3838,  0.0380,  0.1713,  0.1598,
         0.4662, -0.0192])


In [6]:
from flair.embeddings import BertEmbeddings

# init embedding
# NOTE(review): constructed with default arguments; which BERT model that
# selects is not visible in this notebook -- confirm against the flair docs.
embedding = BertEmbeddings()

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
# NOTE: in the recorded run this cell ended with
# "RuntimeError: CUDA error: out of memory", so no embeddings were produced.
embedding.embed(sentence)

RuntimeError: CUDA error: out of memory