# Example 11-1. Word Embeddings with GloVe and Sentiment Analysis 


- GloVe
    - window 안에 속하는 단어들만을 반영하는 word2vec의 단점을 해결하기 위한 아이디어
    - 전체 corpus에서 두 단어가 동시에 등장(co-occurrence)하는 확률을 계산하고, 그 확률이 높을수록 두 단어의 벡터가 가까워지도록 학습
    

## Toy example

In [1]:
!pip install glove_python

Collecting glove_python
  Downloading https://files.pythonhosted.org/packages/3e/79/7e7e548dd9dcb741935d031117f4bed133276c2a047aadad42f1552d1771/glove_python-0.1.0.tar.gz (263kB)
    100% |████████████████████████████████| 266kB 11.3MB/s eta 0:00:01
Building wheels for collected packages: glove-python
  Building wheel for glove-python (setup.py) ... done
  Stored in directory: /home/work/.cache/pip/wheels/88/4b/6d/10c0d2ad32c9d9d68beec9694a6f0b6e83ab1662a90a089a4b
Successfully built glove-python
Installing collected packages: glove-python
Successfully installed glove-python-0.1.0
You are using pip version 19.0.3, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [2]:
# Toy corpus: every sentence is already tokenized into a list of lowercase words.
first_sentence = ['the', 'da', 'vinci', 'code', 'book', 'is', 'just', 'awesome', '.']
second_sentence = ['i', 'liked', 'the', 'da', 'vinci', 'code', 'a', 'lot', '.']
input_text = [first_sentence, second_sentence]

- 입력 데이터는 각 문장을 단어들의 list로 표현하여 준비

In [3]:
# Display the toy corpus: a list of sentences, each a list of word tokens.
input_text

[['the', 'da', 'vinci', 'code', 'book', 'is', 'just', 'awesome', '.'],
 ['i', 'liked', 'the', 'da', 'vinci', 'code', 'a', 'lot', '.']]

In [4]:
from glove import Corpus, Glove
# Fit a Corpus on the tokenized toy sentences; this populates
# corpus.dictionary and corpus.matrix, which the following cells inspect.
corpus = Corpus() 
# NOTE(review): window=10 is presumably the maximum context distance; both toy
# sentences are shorter than that, so every word pair inside a sentence would
# be counted -- confirm against the glove-python documentation.
corpus.fit(input_text, window=10)

- Co-occurrence를 계산

In [6]:
# Inspect the word -> integer id mapping built by corpus.fit.
corpus.dictionary

{'.': 8,
 'a': 11,
 'awesome': 7,
 'book': 4,
 'code': 3,
 'da': 1,
 'i': 9,
 'is': 5,
 'just': 6,
 'liked': 10,
 'lot': 12,
 'the': 0,
 'vinci': 2}

In [5]:
# Print the sparse co-occurrence matrix; each output line is
# "(row id, column id)  weight", with ids taken from corpus.dictionary.
print(corpus.matrix)

  (0, 1)	2.0
  (0, 2)	0.5
  (0, 2)	0.5
  (0, 3)	0.6666666865348816
  (0, 4)	0.25
  (0, 5)	0.20000000298023224
  (0, 6)	0.1666666716337204
  (0, 7)	0.1428571492433548
  (0, 8)	0.2916666865348816
  (0, 9)	0.5
  (0, 10)	1.0
  (0, 11)	0.25
  (0, 12)	0.20000000298023224
  (1, 2)	1.0
  (1, 2)	1.0
  (1, 3)	1.0
  (1, 4)	0.3333333432674408
  (1, 5)	0.25
  (1, 6)	0.20000000298023224
  (1, 7)	0.1666666716337204
  (1, 8)	0.34285715222358704
  (1, 9)	0.3333333432674408
  (1, 10)	0.5
  (1, 11)	0.3333333432674408
  (1, 12)	0.25
  :	:
  (3, 8)	0.5333333611488342
  (3, 9)	0.20000000298023224
  (3, 10)	0.25
  (3, 11)	1.0
  (3, 12)	0.5
  (4, 5)	1.0
  (4, 6)	0.5
  (4, 7)	0.3333333432674408
  (4, 8)	0.25
  (5, 6)	1.0
  (5, 7)	0.5
  (5, 8)	0.3333333432674408
  (6, 7)	1.0
  (6, 8)	0.5
  (7, 8)	1.0
  (8, 9)	0.125
  (8, 10)	0.1428571492433548
  (8, 11)	0.5
  (8, 12)	1.0
  (9, 10)	1.0
  (9, 11)	0.1666666716337204
  (9, 12)	0.1428571492433548
  (10, 11)	0.20000000298023224
  (10, 12)	0.1666666716337204
  (11, 12)	1.0

In [7]:
# Train a GloVe model with 5-dimensional vectors on the toy co-occurrence
# matrix for 30 epochs.
glove = Glove(no_components=5)
glove.fit(corpus.matrix, epochs=30)

- 위에서 생성한 co-occurrence matrix를 입력값으로 받아 glove 학습

In [8]:
# Hand the word -> id mapping to the trained model so that vectors can be
# associated with words (later cells read it back as glove.dictionary).
glove.add_dictionary(corpus.dictionary)

In [9]:
# Learned embeddings; the output below has one 5-dimensional row per
# vocabulary id (13 rows for the 13 distinct tokens of the toy corpus).
glove.word_vectors # embedding matrix

array([[-0.00571506, -0.07490707,  0.06170742, -0.05462285, -0.06623276],
       [-0.09634959,  0.02058038, -0.05428312,  0.076029  ,  0.05931962],
       [-0.09359689,  0.08001433,  0.05762888, -0.0484546 ,  0.06456515],
       [-0.03261592,  0.0912928 ,  0.05551798, -0.06455152,  0.05396033],
       [ 0.0111836 ,  0.08125799,  0.08738593,  0.03845196, -0.02604924],
       [ 0.09827774, -0.00585431, -0.00523   ,  0.00721282,  0.08182504],
       [-0.05834477,  0.01547166,  0.03455233, -0.08697362,  0.04096139],
       [-0.0658658 , -0.01118439,  0.08704402, -0.03216629, -0.03447983],
       [ 0.03785402, -0.08968755, -0.08456509, -0.01508005, -0.05850994],
       [ 0.01515708,  0.06430392,  0.08989079, -0.02950438, -0.07062391],
       [-0.05711068, -0.05089665,  0.03456115, -0.06364022, -0.01460702],
       [-0.00610676,  0.03511082,  0.01297673, -0.01749829, -0.00386495],
       [ 0.04157727,  0.09334284, -0.06312205, -0.08330263, -0.07715888]])

## Sentiment Analysis

#### Data preprocessing


- 문서를 line별로 읽어들이면서 단어의 빈도 계산

In [10]:

# Hyperparameters shared by the following cells.
MAX_FEATURES = 2000         # number of most frequent words kept as vocabulary
MAX_SENTENCE_LENGTH = 40    # length every index sequence is padded to

import collections
import os 
import numpy as np
import nltk
nltk.download('punkt')  # tokenizer data used by nltk.word_tokenize

# First pass over the training file: gather corpus statistics.
maxlen = 0                          # the maximum number of words in a sentence
word_freqs = collections.Counter()  # frequency for each word
num_recs = 0                        # total number of records

# Each line is parsed as "<label>\t<sentence>". The context manager guarantees
# the file is closed even if decoding or tokenization raises part-way through.
with open("data/umich-sentiment-train.txt", 'rb') as ftrain:
    for line in ftrain:
        label, sentence = line.decode('utf8').strip().split("\t")
        words = nltk.word_tokenize(sentence.lower())
        maxlen = max(maxlen, len(words))
        word_freqs.update(words)
        num_recs += 1

[nltk_data] Downloading package punkt to /home/work/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


- 등장 빈도를 기준으로 `MAX_FEATURES` 만큼의 단어를 vocabulary로 결정
- vocabulary에 속하지 않는 단어는 "UNK"로 표시하면서 문장을 단어 단위로 tokenize 하고 list로 저장

In [11]:
# Vocabulary: the MAX_FEATURES most frequent words, in descending frequency.
vocab = [v for v, _ in word_freqs.most_common(MAX_FEATURES)]
# Set copy used only for membership tests: O(1) per token instead of scanning
# the whole vocabulary list for every word of every sentence.
vocab_lookup = set(vocab)

# Second pass over the training file: tokenize each sentence and replace
# every out-of-vocabulary word with the placeholder token "UNK".
sentences = []
with open("data/umich-sentiment-train.txt", 'rb') as ftrain:
    for line in ftrain:
        label, sentence = line.decode('utf8').strip().split("\t")
        words = nltk.word_tokenize(sentence.lower())
        sentences.append(
            [word if word in vocab_lookup else "UNK" for word in words]
        )

In [12]:
# Number of tokenized sentences (one per line read from the training file).
len(sentences)

7086

In [13]:
# First sentence after tokenization and "UNK" replacement.
sentences[0]

['the', 'da', 'vinci', 'code', 'book', 'is', 'just', 'awesome', '.']

<font color="blue">

TO DO: GloVe 알고리즘을 통해 word embedding을 학습하고 embedding matrix를 `embedding_matrix_glove`라는 이름으로 저장하시오.

</font>

In [14]:
# Dimensionality of the word vectors trained on the sentiment corpus.
EMBEDDING_SIZE = 128
# Rebuild the co-occurrence statistics, this time from the tokenized training
# sentences (which include the "UNK" placeholder token).
corpus = Corpus() 
corpus.fit(sentences, window=10)
# Train EMBEDDING_SIZE-dimensional GloVe vectors for 30 epochs and attach the
# word -> id mapping so it can later be read back as glove.dictionary.
glove = Glove(no_components=EMBEDDING_SIZE)
glove.fit(corpus.matrix, epochs=30)
glove.add_dictionary(corpus.dictionary)

In [15]:
# Keep the trained word vectors under the name requested by the exercise.
embedding_matrix_glove = glove.word_vectors

In [23]:
# (rows, EMBEDDING_SIZE) of the embedding matrix.
# NOTE: the output shown for this cell carries execution count 23, i.e. it was
# produced after the later cell that prepends the extra all-zero row at index 0.
embedding_matrix_glove.shape

(2002, 128)

<font color="blue">

TO DO: Word2vec 알고리즘을 사용하는 Lecture 11의 예제와 동일하게 이후 과정 진행

</font>

- Embedding matrix의 맨 앞(index 0)에 padding("PAD")을 나타내는 0 벡터 행을 추가
- Look-up dictionary 생성


In [16]:
# Prepend an all-zero row so that index 0 can serve as the padding index.
# The matrix is rebuilt from glove.word_vectors (rather than from its own
# previous value) so re-running this cell does not stack additional zero rows.
embedding_matrix_glove = np.concatenate(
    [np.zeros((1, EMBEDDING_SIZE)), glove.word_vectors], axis=0
)

# glove.dictionary maps word -> row id in glove.word_vectors. Use the stored
# id (shifted by one for the padding row) instead of the dict's iteration
# order, so each word is guaranteed to point at its own embedding row.
index2word = {idx + 1: word for word, idx in glove.dictionary.items()}
index2word[0] = 'PAD'
word2index = {w: i for i, w in index2word.items()}

# Number of rows the Embedding layer needs (vocabulary + padding entry).
vocab_size = len(index2word)

- Keras embedding layer에 입력하기 위해 단어 인덱스를 사용하여 문장을 list로 변환하여 저장하고 각 문장의 sentiment label 저장

In [17]:
from keras.preprocessing import sequence

# Third pass over the training file: convert every sentence into a list of
# word indices and collect its sentiment label.
X = []
y = np.zeros((num_recs, ))

# The context manager guarantees the file is closed even if a line fails to
# parse.
with open("data/umich-sentiment-train.txt", 'rb') as ftrain:
    for i, line in enumerate(ftrain):
        label, sentence = line.decode('utf8').strip().split("\t")
        words = nltk.word_tokenize(sentence.lower())
        # Words missing from the lookup table fall back to the "UNK" index.
        X.append([
            word2index[word] if word in word2index else word2index["UNK"]
            for word in words
        ])
        y[i] = int(label)

# Bring every sequence to MAX_SENTENCE_LENGTH entries; with the defaults the
# padding value is 0 and is added at the front (see X[0] in the next cell).
# NOTE(review): longer sentences are presumably truncated by the Keras
# defaults -- confirm against the pad_sequences documentation.
X = sequence.pad_sequences(X, maxlen=MAX_SENTENCE_LENGTH)

Using TensorFlow backend.


In [24]:
# First padded index sequence: leading zeros are padding, followed by the
# word indices of the sentence.
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0, 1189, 1761,
        373,  456,  233,  497, 1361, 1843,  406], dtype=int32)

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records as a test set; random_state=0 fixes the shuffle
# so the split is reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=0)
# Sanity-check the resulting array shapes.
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)

(5668, 40) (1418, 40) (5668,) (1418,)


<font color="blue">

TO DO: GloVe에 의해 학습된 embedding matrix를 사용하여, word2vec을 사용해 학습했던 모형과 동일한 구조의 모형을 학습하고 test set에 대한 accuracy를 계산하시오.

</font>

In [19]:
from keras.models import Sequential
from keras.layers import Embedding, Dropout, LSTM, Dense
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

# Training hyperparameters.
BATCH_SIZE = 512
NUM_EPOCHS = 100
EMBEDDING_SIZE = 128

# Frozen embedding layer initialised with the GloVe matrix; index 0 is masked
# so padded positions are skipped by the downstream layers.
embedding_layer = Embedding(
    vocab_size,
    EMBEDDING_SIZE,
    input_length=MAX_SENTENCE_LENGTH,
    mask_zero=True,
    weights=[embedding_matrix_glove],
    trainable=False,
)
# A single LSTM summarises the sequence into one 32-dimensional vector.
recurrent_layer = LSTM(32, recurrent_dropout=0.2, return_sequences=False)
# Sigmoid unit producing the probability of the positive class.
output_layer = Dense(1, activation="sigmoid")

model = Sequential([embedding_layer, recurrent_layer, output_layer])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 128)           256256    
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                20608     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 276,897
Trainable params: 20,641
Non-trainable params: 256,256
_________________________________________________________________


In [20]:
import os
import time

# Timestamp for the TensorBoard run directory. A compact, filesystem-safe
# format is used: the locale format "%c" contains spaces and colons, which are
# awkward in paths and invalid in directory names on Windows.
now = time.strftime("%Y%m%d-%H%M%S")

# ModelCheckpoint writes into this directory; create it up front so saving the
# first checkpoint cannot fail because the directory is missing.
os.makedirs('models', exist_ok=True)

callbacks_list = [
    # Keep only the weights with the lowest validation loss seen so far.
    ModelCheckpoint(filepath='models/sentiment_analysis_glove.h5', monitor='val_loss', save_best_only=True),
    TensorBoard(log_dir='logs/sentiment_analysis_glove/'+now),
    # Stop once validation loss has not improved for 3 consecutive epochs.
    EarlyStopping(monitor='val_loss',patience=3)
]
# NOTE(review): the test split doubles as validation data here, so early
# stopping and checkpoint selection see the test set; the test accuracy
# reported afterwards is therefore optimistic. Consider carving a separate
# validation split out of the training data.
model.fit(Xtrain, ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, validation_data=(Xtest, ytest), callbacks=callbacks_list)

Train on 5668 samples, validate on 1418 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


<keras.callbacks.History at 0x7ff2f022b898>

In [21]:
# Evaluate the in-memory model (the weights as of the last training epoch; the
# checkpoint saved to disk is not reloaded here) on the held-out test split.
loss_test, acc_test = model.evaluate(Xtest, ytest, batch_size=BATCH_SIZE)
# Report both metrics with three decimals.
print("Test loss: %.3f, accuracy: %.3f" % (loss_test, acc_test))

Test loss: 0.326, accuracy: 0.862


In [22]:
# Spot-check the model on five randomly drawn test sentences, printing
# "<rounded prediction>\t<true label>\t<sentence text>" for each one.
for _ in range(5):
    idx = np.random.randint(len(Xtest))
    # Add a batch dimension; -1 follows the actual sequence length instead of
    # hard-coding 40, so this keeps working if MAX_SENTENCE_LENGTH changes.
    xtest = Xtest[idx].reshape(1, -1)
    ylabel = ytest[idx]
    ypred = model.predict(xtest)[0][0]
    # Map indices back to words, skipping index 0 (padding).
    sent = " ".join([index2word[x] for x in xtest[0].tolist() if x != 0])
    print("%.0f\t%d\t%s" % (ypred, ylabel, sent))

1	1	i love harry potter ...
0	0	i hate harry potter , it 's retarted , gay and stupid and there 's only one black guy ...
0	0	by the way , the da vinci code sucked , just letting you know ...
1	1	brokeback mountain was an awesome movie .
1	0	i hate harry potter .
