# 케라스 Word2Vec 구현

* 참고: https://wikidocs.net/69141

## 데이터 전처리

In [1]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                            remove=('headers', 'footers', 'quotes'))

documents = dataset.data

C:\Users\bini\anaconda3\envs\anaconda\lib\site-packages\numpy\.libs\libopenblas.TXA6YQSD3GCQQC22GEQ54J2UDCXDXHWN.gfortran-win_amd64.dll
C:\Users\bini\anaconda3\envs\anaconda\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
print(len(documents))
documents[1]

11314


"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [3]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bini\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bini\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def clean_text(d) :
    pattern = r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', d)
    return text

def clean_stopword(d) :
    stop_words = stopwords.words('english')
    return ' '.join([w.lower() for w in d.split() if w.lower() not in stop_words and len(w)>3])

def tokenize(d) :
    return word_tokenize(d)

In [5]:
import pandas as pd

news_df = pd.DataFrame({'article' : documents})
len(news_df)

11314

In [6]:
news_df.replace("", float("NaN"), inplace=True)
news_df.dropna(inplace=True)
len(news_df)

11096

In [7]:
news_df['article'] = news_df['article'].apply(clean_text)
news_df['article']

0        Well im not sure about the story nad it did se...
1        \n\n\n\n\n\n\nYeah do you expect people to rea...
2        Although I realize that principle is not one o...
3        Notwithstanding all the legitimate fuss about ...
4        Well I will have to change the scoring on my p...
                               ...                        
11309    Danny Rubenstein an Israeli journalist will be...
11310                                                   \n
11311    \nI agree  Home runs off Clemens are always me...
11312    I used HP DeskJet with Orange Micros Grappler ...
11313                                          \nNo arg...
Name: article, Length: 11096, dtype: object

In [8]:
news_df['article'] = news_df['article'].apply(clean_stopword)
news_df['article']

0        well sure story seem biased disagree statement...
1        yeah expect people read actually accept hard a...
2        although realize principle strongest points wo...
3        notwithstanding legitimate fuss proposal much ...
4        well change scoring playoff pool unfortunately...
                               ...                        
11309    danny rubenstein israeli journalist speaking t...
11310                                                     
11311    agree home runs clemens always memorable kinda...
11312    used deskjet orange micros grappler system upd...
11313    argument murphy scared hell came last year han...
Name: article, Length: 11096, dtype: object

In [9]:
tokenized_news = news_df['article'].apply(tokenize)
tokenized_news = tokenized_news.to_list()

In [10]:
import numpy as np

#token개수가 1개 이하인 문장 삭제
drop_news = [index for index, sentence in enumerate(tokenized_news) if len(sentence) <= 1]
news_texts = np.delete(tokenized_news, drop_news, axis=0)
print(len(news_texts))

10939


  return array(a, dtype, copy=False, order=order)


In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer

#너무 오래걸려서 2000개만
news_2000 = news_texts[:2000]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(news_2000)

idx2word = {value:key for key,value in tokenizer.word_index.items()}
sequences = tokenizer.texts_to_sequences(news_2000)

In [12]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

29706


In [13]:
print(sequences[1])

[1232, 443, 2, 56, 113, 405, 57, 1342, 20, 65, 3454, 384, 6822, 1142, 360, 2217, 444, 55, 12419, 444, 1862, 3804, 384, 20, 9, 4279, 8694, 171, 291, 130, 148, 648, 12420, 304, 12421, 14, 12422, 4, 770, 12423, 12424, 4870, 8695]


## Skipgram

**W2V**  
* 단어 간 유사성을 고려하기 위해서는 단어의 의미를 벡터화
* 단어를 밀도 높은 벡터(dense vector)로 나타내는 모델
  
**Skip - gram**은 CBOW와는 반대로 `하나의 단어에서 여러 단어를 예측하는 방법`이다. 즉 중심단어에서 주변단어를 예측하는 방식인데 CBOW보다 성능이 좋아 더 많이 쓰인다.
![image.png](attachment:image.png)

출처 : https://simonezz.tistory.com/35

### skip gram 전처리

* 네거티브 샘플링(Negative Sampling)

  + Word2Vec은 출력층이 내놓는 값에 소프트맥스 함수를 적용해 확률값으로 변환한 후 이를 정답과 비교해 역전파(backpropagation)
  + 소프트맥스를 적용하려면 분모에 해당하는 값, 즉 중심단어와 나머지 모든 단어의 내적을 한 뒤, 이를 다시 exp 계산을 하는데 전체 단어가 많을 경우 엄청난 계산량 발생
  + 네거티브 샘플링은 소프트맥스 확률을 구할 때 전체 단어를 대상으로 구하지 않고, `일부 단어만 뽑아서 계산`을 하는 방식
  + 네거티브 샘플링 동작은 사용자가 지정한 `윈도우 사이즈 내에 등장하지 않는 단어(negative sample)`를 5~20개 정도 뽑고, 이를 정답단어와 합쳐 전체 단어처럼 소프트맥스 확률을 계산하여 파라미터 업데이트

In [14]:
from tensorflow.keras.preprocessing.sequence import skipgrams

skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in sequences[:10]]

In [15]:
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(100) :
    print("{:s}({:d}), {:s}({:d}) -> {:d}".format(
        idx2word[pairs[i][0]], pairs[i][0],
        idx2word[pairs[i][1]], pairs[i][1],
        labels[i]
    ))

government(46), unfortunate(5677) -> 1
treating(3803), makes(201) -> 1
well(9), media(483) -> 1
away(183), raven(12231) -> 0
guilt(6821), brickell(22997) -> 0
power(59), pubavrganithsrctarz(24172) -> 0
reputation(5676), ruin(12415) -> 1
reputation(5676), fryer(24564) -> 0
well(9), statement(433) -> 1
well(9), reputation(5676) -> 1
might(37), existance(2704) -> 1
reason(196), pubspacecdrom(12844) -> 0
europe(1604), mwnglrybgmebhhrberywmbyyxibnorhhryxhr(28933) -> 0
treating(3803), look(68) -> 1
rediculous(12416), whether(174) -> 0
reason(196), report(611) -> 1
races(8693), look(68) -> 1
described(1231), media(483) -> 1
media(483), europe(1604) -> 1
israels(3452), whole(195) -> 1
world(120), described(1231) -> 1
incidences(8690), codex(18134) -> 0
media(483), equivalence(9533) -> 0
inhuman(8692), warentee(25106) -> 0
reason(196), rafiq(27835) -> 0
realize(937), whole(195) -> 1
makes(201), duty(1594) -> 0
world(120), ruin(12415) -> 1
austria(4278), think(6) -> 1
biased(2905), floyd(18094) 

In [16]:
skip_grams = [skipgrams(seq, vocabulary_size=vocab_size, window_size=10) for seq in sequences]

### Skipgram 모델 구성

In [17]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input, Dot
from tensorflow.keras.utils import plot_model

In [18]:
embed_size = 50

In [19]:
def word2vec() :
    target_inputs = Input(shape=(1,), dtype='int32')
    target_embedding = Embedding(vocab_size, embed_size)(target_inputs)
    
    context_inputs = Input(shape=(1,), dtype='int32')
    context_embedding = Embedding(vocab_size, embed_size)(target_inputs)
    
    dot_product = Dot(axes=2)([target_embedding, context_embedding])
    dot_product = Reshape((1,), input_shape=(1,1))(dot_product)
    output = Activation('sigmoid')(dot_product)
    
    model = Model(inputs=[target_inputs, context_inputs], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model

In [20]:
model = word2vec()
model.summary()
plot_model(model, show_shapes=True, show_layer_names=True)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 50)        1485300     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 50)        1485300     input_1[0][0]                    
__________________________________________________________________________________________________
dot (Dot)                       (None, 1, 1)         0           embedding[0][0]                  
                                                                 embedding_1[0][0]            

In [21]:
for epoch in range(1,3) :
    loss = 0 
    for _, elem in enumerate(skip_grams) : 
        first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X,Y)
        
    print('Eopoch : ', epoch, 'Loss : ' , loss)

Eopoch :  1 Loss :  1386.2984467744827
Eopoch :  2 Loss :  1386.305252313614


In [22]:
import gensim

f = open('skipgram.txt', 'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

skipgram = gensim.models.KeyedVectors.load_word2vec_format('skipgram.txt', binary=False)

In [23]:
skipgram.most_similar(positive=['soldiers'])

[('mwrc', 0.5059067010879517),
 ('enrico', 0.5007731318473816),
 ('respiratory', 0.4905298948287964),
 ('mcxcxcxcxcxccscxczdhojdidori', 0.4818585216999054),
 ('adeospolder', 0.47779637575149536),
 ('beta', 0.477295845746994),
 ('regularplusc', 0.4734668731689453),
 ('scopemeters', 0.4682510495185852),
 ('lease', 0.4636659324169159),
 ('daves', 0.4528583884239197)]

## CBOW

### CBOW 전처리

In [32]:
def skipgram2cbow(skipgrams) :
    cbows = []
    flag = 0
    for n in skipgrams :
        temp1 = []
        for t in n :
            if flag == 1:
                flag = 0
                temp1.append(t)
            else : 
                flag = 1
                temp2 = []
                for x in t :
                    temp2.append([x[1], x[0]])
                temp1.append(temp2)
        cbows.append(temp1)
    return cbows

In [33]:
cbows = skipgram2cbow(skip_grams)

In [34]:
pairs, labels = cbows[0][0], cbows[0][1]
for i in range(100) :
    print("{:s}({:d}), {:s}({:d}) -> {:d}".format(
        idx2word[pairs[i][0]], pairs[i][0],
        idx2word[pairs[i][1]], pairs[i][1],
        labels[i]
    ))

proisraeli(12417), incidences(8690) -> 1
degree(1397), austria(4278) -> 1
received(374), inhuman(8692) -> 1
seem(167), rediculous(12416) -> 1
races(8693), blessing(12418) -> 1
jesuitical(17710), daily(1605) -> 0
inhuman(8692), clearly(564) -> 1
least(108), media(483) -> 1
adobedpsextension(22293), media(483) -> 0
accolade(18612), letter(647) -> 0
media(483), europeans(3453) -> 1
food(464), lived(981) -> 0
degree(1397), europeans(3453) -> 1
nolan(7165), israeli(680) -> 0
russcourtnall(24343), lived(981) -> 0
seem(167), sure(60) -> 1
reports(982), might(37) -> 1
austria(4278), acts(1277) -> 1
acts(1277), government(46) -> 1
biased(2905), reputation(5676) -> 1
mtxwcwwtextfxtxxcxxxwocxc(28803), letter(647) -> 0
europeans(3453), shame(3177) -> 1
europe(1604), ignore(1396) -> 1
msnfhtfrtoaxzwpzdfjipfexpte(26275), seem(167) -> 0
extremely(1948), proisraeli(12417) -> 0
mkcxkcxtgcbncbhjbhjwlrscbdyxxibhjgmxxi(28697), government(46) -> 0
realy(5328), israeli(680) -> 0
film(2206), commited(4869) -

In [35]:
print(len(cbows))
print(len(pairs))
print(len(labels))

2000
2220
2220


### CBOW 모델 구성

In [36]:
model = word2vec()
model.summary()
plot_model(model, show_shapes=True, show_layer_names=True)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 50)        1485300     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 50)        1485300     input_5[0][0]                    
__________________________________________________________________________________________________
dot_2 (Dot)                     (None, 1, 1)         0           embedding_4[0][0]                
                                                                 embedding_5[0][0]          

In [37]:
for epoch in range(1,3) :
    loss = 0 
    for _, elem in enumerate(cbows) : 
        first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X,Y)
        
    print('Eopoch : ', epoch, 'Loss : ' , loss)

Eopoch :  1 Loss :  1145.197807699442
Eopoch :  2 Loss :  865.7585501074791


In [38]:
import gensim

f = open('cbow.txt', 'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

cbow = gensim.models.KeyedVectors.load_word2vec_format('cbow.txt', binary=False)

In [39]:
cbow.most_similar(positive = ['soldier'])

[('forged', 0.5784620046615601),
 ('confortable', 0.5206409692764282),
 ('allegiance', 0.4927935004234314),
 ('wrongdoing', 0.48995572328567505),
 ('chiron', 0.48564016819000244),
 ('schorninfethzch', 0.4817042648792267),
 ('wording', 0.4714176058769226),
 ('ahem', 0.46867498755455017),
 ('rwflehighedu', 0.4632822871208191),
 ('procom', 0.459921270608902)]