In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Load the newsgroup posts, with headers, footers and quoted replies removed
# so that only the message bodies remain.
dataset = fetch_20newsgroups(
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data
print('총 샘플 수 :', len(documents))

총 샘플 수 : 11314


In [3]:
# Working frame: raw text in `document`, normalised text in `clean_doc`.
news_df = pd.DataFrame({'document': documents})
news_df['clean_doc'] = (
    news_df['document']
    # Replace every non-letter character with a space. regex=True must be
    # explicit: pandas >= 2.0 treats the pattern as a literal string by
    # default, which would silently leave the text unchanged.
    .str.replace("[^a-zA-Z]", " ", regex=True)
    # Drop short words (3 characters or fewer).
    .apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
    # Lower-case every remaining word.
    .str.lower()
)

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


In [4]:
# Check for null values: count them per column
news_df.isnull().sum()

document     0
clean_doc    0
dtype: int64

In [5]:
# Check for empty values: turn empty strings into NaN so that they are
# counted as missing, then count the missing entries per column.
news_df = news_df.replace("", float("NaN"))
news_df.isnull().sum()

document     218
clean_doc    319
dtype: int64

In [6]:
# Discard every row that has a missing value in any column.
news_df = news_df.dropna()
print('총 샘플 수 :', len(news_df))

총 샘플 수 : 10995


In [7]:
# Split each cleaned document on whitespace and drop English stop words,
# producing a plain list of token lists.
# NOTE(review): stopwords.words() presumably needs the NLTK 'stopwords'
# corpus to be downloaded beforehand — confirm on a fresh environment.
stop_words = stopwords.words('english')
# Set copy for constant-time membership tests; `stop_words` itself stays a list.
stop_word_set = set(stop_words)
tokenized_doc = (
    news_df['clean_doc']
    .apply(lambda x: [item for item in x.split() if item not in stop_word_set])
    .to_list()
)

In [8]:
# Peek at the first tokenised document
tokenized_doc[:1]

[['wondering',
  'anyone',
  'could',
  'enlighten',
  'door',
  'sports',
  'looked',
  'late',
  'early',
  'called',
  'bricklin',
  'doors',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'production',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'please',
  'mail']]

In [9]:
# Remove samples that contain fewer than 2 tokens
tokenized_doc = [tokens for tokens in tokenized_doc if len(tokens) >= 2]
# Preview only the first two documents; echoing the whole list would print
# every remaining document into the notebook output.
tokenized_doc[:2]

[['wondering',
  'anyone',
  'could',
  'enlighten',
  'door',
  'sports',
  'looked',
  'late',
  'early',
  'called',
  'bricklin',
  'doors',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'production',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'please',
  'mail'],
 ['fair',
  'number',
  'brave',
  'souls',
  'upgraded',
  'clock',
  'oscillator',
  'shared',
  'experiences',
  'poll',
  'please',
  'send',
  'brief',
  'message',
  'detailing',
  'experiences',
  'procedure',
  'speed',
  'attained',
  'rated',
  'speed',
  'cards',
  'adapters',
  'heat',
  'sinks',
  'hour',
  'usage',
  'floppy',
  'disk',
  'functionality',
  'floppies',
  'especially',
  'requested',
  'summarizing',
  'next',
  'days',
  'please',
  'network',
  'knowledge',
  'base',
  'done',
  'clock',
  'upgrade',
  'answered',
  'poll',


In [10]:
# Report how many tokenised documents are currently held in tokenized_doc
print('총 샘플 수 :',len(tokenized_doc))

총 샘플 수 : 10940


In [11]:
# Build the vocabulary from the tokenised documents
tkzr = Tokenizer()
tkzr.fit_on_texts(tokenized_doc)

# Show only the first few (word, index) entries; the full mapping is far too
# large to echo into the notebook output.
list(tkzr.word_index.items())[:10]

In [12]:
# Number of distinct words in the vocabulary
len(tkzr.word_index)

64276

In [13]:
# Lookup tables in both directions: word -> index and index -> word
wrd2idx = tkzr.word_index
idx2wrd = dict(zip(wrd2idx.values(), wrd2idx.keys()))

In [14]:
# Integer encoding: turn every token list into a list of vocabulary indices
encoded = tkzr.texts_to_sequences(tokenized_doc)

In [15]:
# Show the integer-encoded form of the first two documents
print(encoded[:2])

[[957, 22, 8, 8530, 875, 1855, 653, 820, 459, 103, 24704, 2786, 33, 188, 746, 503, 5892, 1148, 409, 367, 4, 22, 33568, 374, 71, 620, 2274, 37, 1903, 53, 292, 436, 147, 13686, 151, 28, 55], [1102, 45, 6733, 4730, 4036, 906, 5703, 1606, 2183, 5533, 28, 105, 3135, 207, 9470, 2183, 2223, 228, 12465, 3517, 228, 568, 5704, 1473, 11491, 1692, 1751, 718, 164, 3050, 3759, 381, 3334, 10668, 129, 245, 28, 521, 590, 672, 127, 906, 1308, 2025, 5533, 48]]


In [16]:
# Negative sampling: build skip-gram pairs and their labels per document
import random

from tensorflow.keras.preprocessing.sequence import skipgrams

SEED = 42               # fixed seed so that re-running the cell is repeatable
N_SKIPGRAM_DOCS = 8000  # only the first 8000 samples are used, due to processing-time limits
WINDOW_SIZE = 10        # window size passed to skipgrams

# One more than the number of words in the vocabulary.
# NOTE(review): presumably because the Tokenizer never assigns index 0 — confirm.
vocab_size = len(wrd2idx) + 1

# NOTE(review): skipgrams appears to take its negative samples and shuffle
# order from Python's global `random` module, so seeding it here should make
# the generated pairs reproducible — confirm for the installed Keras version.
random.seed(SEED)
skip_grams = [
    skipgrams(sample, vocabulary_size=vocab_size, window_size=WINDOW_SIZE)
    for sample in encoded[:N_SKIPGRAM_DOCS]
]

In [17]:
# Pairs and labels generated for the first document only
sample, label = skip_grams[0]

print('샘플 : {}, {}'.format(sample, label))

In [18]:
# Show the first five pairs of the first document as
# ((word : index), (word : index)) -> label
for i in range(5):
    (first_idx, second_idx), pair_label = sample[i], label[i]
    print('(({} : {}), ({} : {})) -> {}'.format(
        idx2wrd[first_idx], first_idx,
        idx2wrd[second_idx], second_idx,
        pair_label,
    ))

((sports : 1855), (stylus : 18835)) -> 0
((know : 4), (doors : 2786)) -> 1
((front : 503), (addition : 746)) -> 1
((please : 28), (looking : 151)) -> 1
((looking : 151), (racked : 17431)) -> 0


In [19]:
# Report the number of documents with skip-grams, and the number of pairs and
# labels held in `sample` / `label` (which cover the first document only).
for caption, items in (
    ('전체 샘플 수 :', skip_grams),
    ('전체 샘플 쌍의 수 :', sample),
    ('전체 샘플 쌍의 레이블 수 :', label),
):
    print(caption, len(items))

전체 샘플 수 : 8000
전체 샘플 쌍의 수 : 1260
전체 샘플 쌍의 레이블 수 : 1260


In [20]:
# SGNS (skip-gram with negative sampling) model implementation: Keras imports
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.keras.layers import Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG

In [21]:
# Two-input model: each input is a single word index, each index is looked up
# in its own embedding table, and the sigmoid of the dot product of the two
# embeddings is the output that is trained against the pair label.

# Embedding vector size: 100
embedding_dim = 100

# Embedding table for the center word; output shape is (None, 1, embedding_dim)
w_inputs = Input(shape=(1, ))
word_embedding = Embedding(vocab_size, embedding_dim)(w_inputs)

# Embedding table for the context word (a separate table from the one above)
c_inputs = Input(shape=(1, ))
context_embedding  = Embedding(vocab_size, embedding_dim)(c_inputs)

# Dot product of the two embeddings along the embedding axis (axis 2),
# then reshaped to a single value per pair
dot_product = Dot(axes=2)([word_embedding, context_embedding])
dot_product = Reshape((1,), input_shape=(1, 1))(dot_product)
# The output function is a sigmoid
output = Activation('sigmoid')(dot_product)

# Create the model, print its summary, compile it and save a diagram to model3.png
model = Model(inputs=[w_inputs, c_inputs], outputs=output)
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam')
plot_model(model, to_file='model3.png', show_shapes=True, show_layer_names=True, rankdir='TB')

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 embedding (Embedding)       (None, 1, 100)               6427700   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 1, 100)               6427700   ['input_2[0][0]']             
                                                                                              

In [22]:
# Train for 5 epochs over the sample pairs and labels; the pairs of one
# document form one batch, and the batch losses are summed per epoch.
for epoch in range(1, 6):
    loss = 0
    for pairs, pair_labels in skip_grams:
        # Split the list of pairs into its two columns of word indices.
        first_words, second_words = zip(*pairs)
        X = [np.array(first_words), np.array(second_words)]
        Y = np.array(pair_labels)
        loss += model.train_on_batch(X, Y)
    print('Epoch :',epoch, 'Loss :',loss)

Epoch : 1 Loss : 3532.530283663422
Epoch : 2 Loss : 2634.3835594914854
Epoch : 3 Loss : 2485.282470867038
Epoch : 4 Loss : 2338.6698402129114
Epoch : 5 Loss : 2163.8283067364246


In [23]:
# Save the trained vectors as text (a "<count> <dim>" header line, then one
# "<word> <values...>" line per word) and load them back with gensim.

import gensim

# First weight array of the model, indexed by word index below.
# NOTE(review): presumably the embedding table of the first Embedding layer
# (the center-word one) — confirm the weight order.
vectors = model.get_weights()[0]

# `with` closes (and flushes) the file even if a write fails, so gensim never
# reads a half-written file; UTF-8 is set explicitly instead of relying on
# the platform's default encoding.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    f.write('{} {}\n'.format(vocab_size-1, embedding_dim))
    for word, i in tkzr.word_index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, vectors[i, :]))))

# Load the saved vectors
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [24]:
# Words whose learned vectors are most similar to 'soldiers'
w2v.most_similar(positive=['soldiers'])

[('wounded', 0.8187633156776428),
 ('killed', 0.8153121471405029),
 ('attacking', 0.8088293075561523),
 ('slaughter', 0.8085916638374329),
 ('villages', 0.8076775670051575),
 ('nazis', 0.8050066828727722),
 ('moslem', 0.8040858507156372),
 ('exterminated', 0.8028150200843811),
 ('extermination', 0.802670955657959),
 ('occupation', 0.7969975471496582)]

In [25]:
# Words whose learned vectors are most similar to 'doctor'
w2v.most_similar(positive=['doctor'])

[('symptoms', 0.664776623249054),
 ('stones', 0.6331056952476501),
 ('migraine', 0.6291884183883667),
 ('patient', 0.5791604518890381),
 ('infections', 0.5752955675125122),
 ('antibiotics', 0.5630971193313599),
 ('onset', 0.5615555644035339),
 ('treatments', 0.557520866394043),
 ('hurt', 0.5487589240074158),
 ('vaginal', 0.5400879383087158)]