### Word Embeddings

- We'll be using the [pymagnitude](https://github.com/plasticityai/magnitude) library

In [1]:
from pymagnitude import *
import nltk
import numpy as np  # used below as `np`; import it explicitly instead of relying on the star import
# Fetch the Brown corpus into the local nltk_data directory (skipped if already up to date).
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [2]:
# Choose which pre-converted .magnitude embedding file to load.
# Uncomment exactly one `path` line; the 50-dimensional GloVe file is the active choice.
#path = 'data/fasttext-wiki-news-300d-1M.magnitude'
#path = 'data/glove.6B.300d.magnitude'
#path = 'data/GoogleNews-vectors-negative300.magnitude'
path = 'data/glove.6B.50d.magnitude'
# NOTE: the ELMo file below did not load when tried (cause not recorded):
# path = 'data/elmo_2x4096_512_2048cnn_2xhighway_weights.magnitude'

# Open the embedding store; every later cell reads word vectors through `vectors`.
vectors = Magnitude(path)

In [3]:
# Number of keys (words) stored in the loaded embedding file.
len(vectors)

400000

In [4]:
# Dimensionality of the embedding space: the number of floats in each word's vector.
vectors.dim # this is how big the vectors are for each word

50

In [5]:
# Membership test: is the key "cat" present in the embedding vocabulary?
"cat" in vectors

True

In [6]:
# Slicing the store yields (key, vector) pairs by position; show ten of them.
for key, vector in vectors[500:510]:
    # Print only the first three components to keep the output readable.
    print(key, vector[:3])

working [ 0.0547345 -0.0305866 -0.0075621]
community [ 0.0276732  0.117468  -0.1533174]
eight [0.0133356 0.0815326 0.1307856]
groups [ 0.0933181 -0.0622403 -0.0163335]
despite [-0.0066614  0.0074928 -0.0322814]
level [-0.0736265  0.1976634  0.0354784]
largest [0.1119611 0.0235172 0.0475007]
whose [ 0.0633574  0.144303  -0.0080723]
attacks [ 0.2780417 -0.1416092  0.1276424]
germany [0.0529495 0.009489  0.0464709]


In [7]:
# Look up the vector for a single key and show its first three components.
vectors.query("cat")[:3]

array([ 0.1027278, -0.1136787, -0.1218595], dtype=float32)

In [8]:
# Passing a list of keys returns one row per key; row 0 is the "cat" vector again.
vectors.query(["cat","dog"])[0][:3]

array([ 0.1027278, -0.1136787, -0.1218595], dtype=float32)

In [9]:
# Distance between two keys in embedding space (a smaller value means more similar).
vectors.distance("cat", "dog")

0.395473

In [10]:
# Same measure for a less related pair; compare with the cat/dog distance above it.
vectors.distance("cat", "car")

1.1279846

In [11]:
# Of the candidate keys in the list, return the one closest to "cat".
vectors.most_similar_to_given("cat", ["dog", "television", "laptop"]) 

'dog'

In [12]:
# Odd-one-out: return the key that fits least well with the rest of the list.
vectors.doesnt_match(["breakfast", "cereal", "dinner", "lunch"])

'cereal'

In [13]:
#vectors.most_similar("cat", topn = 5)

In [14]:
#vectors.most_similar(positive = ["woman", "king"], negative = ["man"])

### Topic Modeling

- Given a document, determine the topic of the document
- For this task, we'll use the Brown corpus of texts accessible via NLTK

In [15]:
from nltk.corpus import brown
from collections import defaultdict
# tqdm displays a progress bar; tqdm.auto picks the notebook widget when available.
# (The previous `import tqdm` was immediately shadowed by the from-import, and
# `tqdm_notebook` is deprecated.)
from tqdm.auto import tqdm

# One (category, summed_embedding) pair per Brown document; each summed embedding
# has shape (1, vectors.dim) so downstream cells can keep indexing it with [0].
category_vectors = []

cats = brown.categories()

# for each category
for cat in cats:
    print(cat)
    # grab all of the documents
    for fileid in tqdm(brown.fileids(categories=[cat])):
        # lower-case every token and keep only those present in the embedding vocabulary
        known_words = [w for w in map(str.lower, brown.words(fileids=[fileid])) if w in vectors]
        if known_words:
            # One batched lookup returns an (n_words, dim) array. axis=0 sums over the
            # word axis, leaving one value per embedding dimension; keepdims retains the
            # leading axis so the result is (1, dim), as before.
            word_sum = np.sum(vectors.query(known_words), axis=0, keepdims=True)
        else:
            # No in-vocabulary tokens: use a zero vector of the right shape instead of
            # the scalar 0.0 that summing an empty list would produce.
            word_sum = np.zeros((1, vectors.dim), dtype=np.float32)
        # add the now summed embedding to the list for this category
        category_vectors.append((cat, word_sum))
    

adventure


HBox(children=(IntProgress(value=0, max=29), HTML(value='')))


belles_lettres


HBox(children=(IntProgress(value=0, max=75), HTML(value='')))


editorial


HBox(children=(IntProgress(value=0, max=27), HTML(value='')))


fiction


HBox(children=(IntProgress(value=0, max=29), HTML(value='')))


government


HBox(children=(IntProgress(value=0, max=30), HTML(value='')))


hobbies


HBox(children=(IntProgress(value=0, max=36), HTML(value='')))


humor


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


learned


HBox(children=(IntProgress(value=0, max=80), HTML(value='')))


lore


HBox(children=(IntProgress(value=0, max=48), HTML(value='')))


mystery


HBox(children=(IntProgress(value=0, max=24), HTML(value='')))


news


HBox(children=(IntProgress(value=0, max=44), HTML(value='')))


religion


HBox(children=(IntProgress(value=0, max=17), HTML(value='')))


reviews


HBox(children=(IntProgress(value=0, max=17), HTML(value='')))


romance


HBox(children=(IntProgress(value=0, max=29), HTML(value='')))


science_fiction


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




In [16]:
import pandas as pd

# category_vectors is a list of (category, summed_embedding) pairs; zip(*...) transposes
# it into one tuple of category labels and one tuple of embeddings.
keys,values=zip(*category_vectors) # unzip using a *

# One row per document: its category label and its summed embedding (stored as an object column).
data = pd.DataFrame({'cat':keys,'vectors':values})

In [17]:
# Preview the first three documents.
data.head(3)

Unnamed: 0,cat,vectors
0,adventure,"[[101.48395, 49.925434, -35.528347, -92.13484,..."
1,adventure,"[[93.14879, 49.615086, -33.30255, -89.50236, 1..."
2,adventure,"[[93.08724, 25.093605, 1.5583314, -87.50236, 1..."


In [18]:
# Total number of documents (rows) in the dataset; used as the denominator for class frequencies.
total = data.shape[0]

#### compute the baselines

In [19]:
# Random-guess baseline: picking uniformly among the categories is right 1/len(cats) of the time.
# (This previously used len(cat), the character length of the last category name left in the
# loop variable, which only matched the number of categories by coincidence.)
print('random baseline {}'.format(1.0/len(cats)))

# Majority-class baseline: the share of documents in each category; the largest share is the
# accuracy of always predicting the most common category.
print('most common baseline?')
for cat in cats:
    print(cat, len(data[data.cat==cat])/total)

random baseline 0.06666666666666667
most common baseline?
adventure 0.058
belles_lettres 0.15
editorial 0.054
fiction 0.058
government 0.06
hobbies 0.072
humor 0.018
learned 0.16
lore 0.096
mystery 0.048
news 0.088
religion 0.034
reviews 0.034
romance 0.058
science_fiction 0.012


#### split the data into train/test

In [20]:
# Hold out a reproducible 10% sample of the documents for testing (fixed seed).
test = data.sample(frac=0.1, random_state=200)
# Everything not selected for the test set becomes training data.
train = data.loc[~data.index.isin(test.index)]

(test.shape, train.shape)

((50, 2), (450, 2))

#### train a classifier

In [21]:
import keras
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Dropout
from keras.optimizers import SGD
from keras import regularizers
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
ohe = preprocessing.OneHotEncoder()
# Learn the category-name -> integer-id mapping from every document, so all classes get an id.
le.fit(data.cat)
# Fit the one-hot encoder on the full set of class ids rather than only those that happen to be
# in the training split; the number of output columns then always equals the number of classes.
# reshape(-1, 1) turns the 1-D id array into a column because OneHotEncoder expects 2-D input.
ohe.fit(le.transform(le.classes_).reshape(-1, 1))
y = le.transform(train.cat).reshape(-1, 1)
# transform() returns a scipy sparse matrix; convert to a dense array for Keras.
y = ohe.transform(y).toarray()

# Each stored embedding has shape (1, dim); take row 0 so X is (n_documents, dim).
X = np.array([x[0] for x in train.vectors])

X.shape, y.shape

Using TensorFlow backend.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


((450, 50), (450, 15))

In [47]:
# Multi-layer perceptron that maps a summed document embedding to a topic category.
model = Sequential()
# Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape:
# here, vectors.dim-dimensional document embeddings.
act = 'tanh'
model.add(Dense(64, activation=act, input_dim=vectors.dim))
model.add(Dropout(.5))
model.add(Dense(48, activation=act))
model.add(Dropout(.5))
model.add(Dense(32, activation=act))
model.add(Dropout(.5))
# One softmax output unit per one-hot label column, derived from y instead of hard-coding 15.
model.add(Dense(y.shape[1], activation='softmax'))

sgd = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])

model.fit(X, y, epochs=10)
# NOTE: this evaluates on the training data itself, so `score` is training loss/accuracy,
# not held-out performance.
score = model.evaluate(X, y)
print(model.metrics_names)
score

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
['loss', 'acc']


[2.5545950253804524, 0.16]

In [1]:
from client.api.notebook import Notebook
# Create the okpy client from the 'a6.ok' assignment file.
ok = Notebook('a6.ok')
# Authenticate with okpy; with inline=True the login code is requested in the cell output.
# NOTE(review): clear this cell's output before sharing, since it records the pasted login code.
ok.auth(inline=True)

Assignment: A6 Topic Modeling with MLPs
OK, version v1.13.11



ERROR  | auth.py:91 | {'error': 'invalid_grant'}



Open the following URL:

https://okpy.org/client/login/

After logging in, copy the code from the web page and paste it into the box.
Then press the "Enter" key on your keyboard.

Paste your code here: <redacted>
Successfully logged in as arjunshukla@u.boisestate.edu


In [2]:
# Save the notebook and upload it to okpy as the submission for this assignment.
ok.submit()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saving notebook... Saved 'A6-mlp-topic.ipynb'.
Submit... 100% complete
Submission successful for user: arjunshukla@u.boisestate.edu
URL: https://okpy.org/boisestate/cs4-533/sp19/a6/submissions/0YDkxX

