#### FastText 실습

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from gensim.models.word2vec import Word2Vec
from konlpy.tag import Okt

In [3]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/GaoleMeng/RNN-and-FFNN-textClassification/master/ted_en-20160408.xml",
                           filename="ted_en-20160408.xml")

('ted_en-20160408.xml', <http.client.HTTPMessage at 0x2aeda6bab60>)

In [4]:
import re
import urllib.request
import zipfile
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize

In [5]:
targetXML = open('ted_en-20160408.xml', 'r', encoding='UTF8')
target_text = etree.parse(targetXML)
# xml 파일로부터 <content>와 </content> 사이의 내용만 가져온다.
parse_text = '\n'.join(target_text.xpath('//content/text()'))
# 정규 표현식의 sub 모듈을 통해 content 중간에 등장하는 (Audio), (Laughter) 등의 배경음 부분을 제거.
# 해당 코드는 괄호로 구성된 내용을 제거.
content_text = re.sub(r'\([^)]*\)', '', parse_text) #sub는 치환할 때 사용하는 개념, 소괄호로 시작하는 모든 것들을 빈문자로 치환
# 입력 코퍼스에 대해서 NLTK를 이용하여 문장 토큰화를 수행.
sent_text = sent_tokenize(content_text)
# 각 문장에 대해서 구두점을 제거하고, 대문자를 소문자로 변환.
normalized_text = []
for string in sent_text:
     tokens = re.sub(r"[^a-z0-9]+", " ", string.lower())
     normalized_text.append(tokens)
result = [word_tokenize(sentence) for sentence in normalized_text]

In [6]:
len(result)

273424

In [7]:
from gensim.models import Word2Vec, FastText

In [8]:
model = Word2Vec(sentences = result, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [9]:
model.wv.most_similar("electrofishing")

KeyError: "Key 'electrofishing' not present in vocabulary"

In [10]:
model = FastText(result, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [11]:
model.wv.most_similar("electrofishing")

[('fishing', 0.9186096787452698),
 ('flushing', 0.8999102115631104),
 ('flourishing', 0.8978756070137024),
 ('flashing', 0.8956512808799744),
 ('vanishing', 0.8939927220344543),
 ('ingrid', 0.8936792612075806),
 ('licensing', 0.8930103778839111),
 ('smashing', 0.8919094800949097),
 ('transplanting', 0.8912322521209717),
 ('refreshing', 0.8889642953872681)]

### 한국어 FastText

In [12]:
import re
import pandas as pd
import urllib.request
from tqdm import tqdm
import hgtk
from konlpy.tag import Mecab

In [13]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt",
                           filename="ratings_total.txt")

('ratings_total.txt', <http.client.HTTPMessage at 0x2ae99d02fe0>)

In [14]:
total_data = pd.read_table("ratings_total.txt", names = ['ratings', 'reviews'])
print("전체 리뷰 개수 : ", len(total_data))

전체 리뷰 개수 :  200000


In [15]:
# 한글인지 체크
hgtk.checker.is_hangul('ㄱ')

True

In [16]:
hgtk.checker.is_hangul('28')

False

In [17]:
# 음절을 초성, 중성, 종성으로 분해
hgtk.letter.decompose('남')

('ㄴ', 'ㅏ', 'ㅁ')

In [18]:
# 초성, 중성을 결합
hgtk.letter.compose('ㄴ', 'ㅏ')

'나'

In [19]:
# 초성, 중성, 종성을 결합
hgtk.letter.compose('ㄴ', 'ㅏ', 'ㅁ')

'남'

In [20]:
# 결합할 수 없는 상황에서는 에러 발생
hgtk.letter.compose('ㄴ', 'ㅁ', 'ㅁ')

NotHangulException: No valid Hangul character index

In [21]:
def word_to_jamo(token):
    def to_special_token(jamo):
        if not jamo:
            return '-'
        else:
            return jamo

    decomposed_token = ''
    for char in token:
        try:
            # char(음절)을 초성, 중성, 종성으로 분리
            cho, jung, jong = hgtk.letter.decompose(char)

            # 자모가 빈 문자일 경우 특수문자 -로 대체
            cho = to_special_token(cho)
            jung = to_special_token(jung)
            jong = to_special_token(jong)
            decomposed_token = decomposed_token + cho + jung + jong

        # 만약 char(음절)이 한글이 아닐 경우 자모를 나누지 않고 추가
        except Exception as exception:
            if type(exception).__name__ == 'NotHangulException':
                decomposed_token += char

    # 단어 토큰의 자모 단위 분리 결과를 추가
    return decomposed_token
    

In [22]:
word_to_jamo('남동생')

'ㄴㅏㅁㄷㅗㅇㅅㅐㅇ'

In [23]:
mecab = Mecab(r'C:\mecab\mecab-ko-dic')

In [24]:
print(mecab.morphs('선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다.'))

['선물', '용', '으로', '빨리', '받', '아서', '전달', '했어야', '하', '는', '상품', '이', '었', '는데', '머그', '컵', '만', '와서', '당황', '했', '습니다', '.']


In [25]:
def tokenize_by_jamo(s):
    return [word_to_jamo(token) for token in mecab.morphs(s)]

print(tokenize_by_jamo('선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다.'))

['ㅅㅓㄴㅁㅜㄹ', 'ㅇㅛㅇ', 'ㅇㅡ-ㄹㅗ-', 'ㅃㅏㄹㄹㅣ-', 'ㅂㅏㄷ', 'ㅇㅏ-ㅅㅓ-', 'ㅈㅓㄴㄷㅏㄹ', 'ㅎㅐㅆㅇㅓ-ㅇㅑ-', 'ㅎㅏ-', 'ㄴㅡㄴ', 'ㅅㅏㅇㅍㅜㅁ', 'ㅇㅣ-', 'ㅇㅓㅆ', 'ㄴㅡㄴㄷㅔ-', 'ㅁㅓ-ㄱㅡ-', 'ㅋㅓㅂ', 'ㅁㅏㄴ', 'ㅇㅘ-ㅅㅓ-', 'ㄷㅏㅇㅎㅘㅇ', 'ㅎㅐㅆ', 'ㅅㅡㅂㄴㅣ-ㄷㅏ-', '.']


In [26]:
from tqdm import tqdm

tokenized_data = []

for sample in total_data['reviews'].to_list():
    tokenzied_sample = tokenize_by_jamo(sample) # 자소 단위 토큰화
    tokenized_data.append(tokenzied_sample)

In [27]:
tokenized_data[0]

['ㅂㅐ-ㄱㅗㅇ', 'ㅃㅏ-ㄹㅡ-', 'ㄱㅗ-', 'ㄱㅜㅅ']

In [28]:
def jamo_to_word(jamo_sequence):
    tokenized_jamo = []
    index = 0

    # 1. 초기 입력
    # jamo_sequence = 'ㄴㅏㅁㄷㅗㅇㅅㅐㅇ'

    while index < len(jamo_sequence):
        # 문자가 한글(정상적인 자모)이 아닐 경우
        if not hgtk.checker.is_hangul(jamo_sequence[index]):
            tokenized_jamo.append(jamo_sequence[index])
            index = index + 1

        # 문자가 정상적인 자모라면 초성, 중성, 종성을 하나의 토큰으로 간주.
        else:
            tokenized_jamo.append(jamo_sequence[index:index + 3])
            index = index + 3

    # 2. 자모 단위 토큰화 완료
    # tokenized_jamo : ['ㄴㅏㅁ', 'ㄷㅗㅇ', 'ㅅㅐㅇ']

    word = ''
    try:
        for jamo in tokenized_jamo:
            # 초성, 중성, 종성의 묶음으로 추정되는 경우
            if len(jamo) == 3:
                if jamo[2] == "-":
                    # 종성이 존재하지 않는 경우
                    word = word + hgtk.letter.compose(jamo[0], jamo[1])
                else:
                    # 종성이 존재하는 경우
                    word = word + hgtk.letter.compose(jamo[0], jamo[1], jamo[2])
            # 한글이 아닌 경우
            else:
                word = word + jamo

    # 복원 중(hgtk.letter.compose) 에러 발생 시 초기 입력 리턴.
    # 복원이 불가능한 경우 예시) 'ㄴ!ㅁㄷㅗㅇㅅㅐㅇ'
    except Exception as exception:
        if type(exception).__name__ == 'NotHangulException':
            return jamo_sequence

    # 3. 단어로 복원 완료
    # word : '남동생'

    return word

In [29]:
jamo_to_word('ㄴㅏㅁㄷㅗㅇㅅㅐㅇ')

'남동생'

In [38]:
import fasttext

In [39]:
with open('tokenized_data.txt', 'w', encoding = 'utf-8') as out:
    for line in tqdm(tokenized_data, unit=' line'):
        out.write(' '.join(line) + '\n')

100%|███████████████████████████████████████████████████████████████████████████████████████████| 200000/200000 [00:02<00:00, 90424.94 line/s]


In [40]:
model = fasttext.train_unsupervised('tokenized_data.txt', model='cbow')
model.save_model("fasttext.bin") # 모델 저장
model = fasttext.load_model("fasttext.bin") # 모델 로드

In [41]:
model[word_to_jamo('남동생')]

array([-0.27820098,  0.01429991,  0.31213486, -0.35917503, -0.21511072,
       -0.9479429 , -0.43248788,  0.6337056 ,  0.07769312,  0.04124201,
       -0.44832966, -0.06288446,  0.17858742,  0.36418223,  0.0443569 ,
        0.7182074 ,  0.11097914,  0.88904524,  0.04093075, -0.0245559 ,
        0.37597132,  0.19367361, -0.5170682 , -0.29743513,  0.9607574 ,
        0.51476693, -0.0248489 ,  0.54350096,  0.50881547, -0.544389  ,
       -0.1345461 ,  0.31495535,  0.7024569 , -0.89429885,  0.5910413 ,
        1.1590027 , -0.1177251 ,  0.34002963, -0.01661343,  0.22770078,
       -0.1731957 , -0.22914377, -0.24304502,  0.274633  ,  0.45931977,
        0.26006564, -0.12765561, -0.24375473,  0.33554888,  0.48846737,
       -0.18786666, -0.29633668, -0.43678093,  0.8253616 ,  0.92256624,
       -0.808224  ,  0.01936796, -0.18253934, -0.0726556 , -0.64245796,
        0.0465894 , -0.18543795,  0.36493558, -0.35149637, -0.02741783,
        0.4495129 , -0.8531769 ,  0.11590157, -0.22333945, -0.17

In [42]:
model.get_nearest_neighbors(word_to_jamo('남동생'), k=10)

[(0.8953864574432373, 'ㄷㅗㅇㅅㅐㅇ'),
 (0.8176509141921997, 'ㄴㅏㅁㅊㅣㄴ'),
 (0.7767713069915771, 'ㄴㅏㅁㅍㅕㄴ'),
 (0.7520563006401062, 'ㅊㅣㄴㄱㅜ-'),
 (0.731107771396637, 'ㄴㅏㅁㅇㅏ-'),
 (0.7173229455947876, 'ㅅㅐㅇㅇㅣㄹ'),
 (0.6996015906333923, 'ㅈㅗ-ㅋㅏ-'),
 (0.69289231300354, 'ㄸㅏㄹ'),
 (0.6792193651199341, 'ㄴㅏㄴㅅㅐㅇ'),
 (0.6789605021476746, 'ㄴㅏㅁㅁㅐ-')]

In [43]:
def transform(word_sequence):
    return [(jamo_to_word(word), similarity) for (similarity, word) in word_sequence]

In [44]:
print(transform(model.get_nearest_neighbors(word_to_jamo('남동생'), k=10)))

[('동생', 0.8953864574432373), ('남친', 0.8176509141921997), ('남편', 0.7767713069915771), ('친구', 0.7520563006401062), ('남아', 0.731107771396637), ('생일', 0.7173229455947876), ('조카', 0.6996015906333923), ('딸', 0.69289231300354), ('난생', 0.6792193651199341), ('남매', 0.6789605021476746)]


In [45]:
vocab_size = 20000
output_dim = 128
input_length = 500

v = Embedding(vocab_size, output_dim, input_length=input_length)

NameError: name 'Embedding' is not defined

In [46]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = ['nice great best amazing', 'stop lies', 'pitiful nerd', 'excellent work', 'supreme quality', 'bad', 'highly respectable']
y_train = [1, 0, 0, 1, 1, 0, 1]


In [47]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_index) + 1 # 패딩을 고려하여 +1
print('단어 집합 :',vocab_size)

단어 집합 : 16


In [48]:
X_encoded = tokenizer.texts_to_sequences(sentences)
print('정수 인코딩 결과 :',X_encoded)

정수 인코딩 결과 : [[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]


In [49]:
max_len = max(len(l) for l in X_encoded)
print('최대 길이 :',max_len)

최대 길이 : 4


In [50]:
X_train = pad_sequences(X_encoded, maxlen=max_len, padding='post')
y_train = np.array(y_train)
print('패딩 결과 :')
print(X_train)

패딩 결과 :
[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

embedding_dim = 4

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)




Epoch 1/100
1/1 - 4s - 4s/step - acc: 0.2857 - loss: 0.6950
Epoch 2/100
1/1 - 0s - 257ms/step - acc: 0.2857 - loss: 0.6932
Epoch 3/100
1/1 - 0s - 232ms/step - acc: 0.2857 - loss: 0.6915
Epoch 4/100
1/1 - 0s - 178ms/step - acc: 0.5714 - loss: 0.6897
Epoch 5/100
1/1 - 0s - 306ms/step - acc: 0.5714 - loss: 0.6880
Epoch 6/100
1/1 - 0s - 265ms/step - acc: 0.5714 - loss: 0.6862
Epoch 7/100
1/1 - 0s - 238ms/step - acc: 0.7143 - loss: 0.6845
Epoch 8/100
1/1 - 0s - 234ms/step - acc: 0.7143 - loss: 0.6828
Epoch 9/100
1/1 - 0s - 234ms/step - acc: 0.7143 - loss: 0.6810
Epoch 10/100
1/1 - 0s - 190ms/step - acc: 0.7143 - loss: 0.6793
Epoch 11/100
1/1 - 0s - 134ms/step - acc: 0.7143 - loss: 0.6776
Epoch 12/100
1/1 - 0s - 151ms/step - acc: 0.7143 - loss: 0.6758
Epoch 13/100
1/1 - 0s - 186ms/step - acc: 0.7143 - loss: 0.6741
Epoch 14/100
1/1 - 0s - 184ms/step - acc: 0.7143 - loss: 0.6723
Epoch 15/100
1/1 - 0s - 169ms/step - acc: 0.7143 - loss: 0.6706
Epoch 16/100
1/1 - 0s - 151ms/step - acc: 0.8571 - l

<keras.src.callbacks.history.History at 0x2af085edcc0>

### 사전훈련된 워드임베딩(Pre-trained)

In [3]:
import gensim

word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True)
print('모델의 크기(shape) :', word2vec_model.vectors.shape)

모델의 크기(shape) : (3000000, 300)


In [10]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = ['nice great best amazing', 'stop lies', 'pitiful nerd', 'excellent work', 'supreme quality', 'bad', 'highly respectable']
y_train = [1, 0, 0, 1, 1, 0, 1]

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_index) + 1 # 패딩을 고려하여 +1
print('단어 집합 :',vocab_size)

단어 집합 : 16


In [12]:
import numpy as np

embedding_matrix = np.zeros((vocab_size, 300))
print("임베딩 행렬의 크기(shape) : ", np.shape(embedding_matrix))

임베딩 행렬의 크기(shape) :  (16, 300)


In [13]:
def get_vector(word):
    if word in word2vec_model:
        return word2vec_model[word]

    else:
        return None

In [14]:
for word, index in tokenizer.word_index.items():
    vector_value = get_vector(word)
    if vector_value is not None:
        embedding_matrix[index] = vector_value

In [15]:
print(word2vec_model['nice'])
print(len(word2vec_model['nice']))

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [16]:
print("단어 nice의 맵핑된 정수 : ", tokenizer.word_index['nice'])

단어 nice의 맵핑된 정수 :  1


In [17]:
print(embedding_matrix[1])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [20]:
X_encoded = tokenizer.texts_to_sequences(sentences)
print('정수 인코딩 결과 :',X_encoded)

max_len = max(len(l) for l in X_encoded)
print('최대 길이 :',max_len)

정수 인코딩 결과 : [[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]
최대 길이 : 4


In [22]:
X_train = pad_sequences(X_encoded, maxlen=max_len, padding='post')
y_train = np.array(y_train)
print('패딩 결과 :')
print(X_train)

패딩 결과 :
[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input

model = Sequential()
model.add(Input(shape = (max_len, ), dtype = 'int32'))
e = Embedding(vocab_size, 300, weights = [embedding_matrix], input_length = max_len, trainable = False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])
model.fit(X_train, y_train, epochs = 100, verbose = 2)

Epoch 1/100
1/1 - 1s - 1s/step - acc: 0.7143 - loss: 0.6504
Epoch 2/100
1/1 - 0s - 85ms/step - acc: 0.8571 - loss: 0.6327
Epoch 3/100
1/1 - 0s - 86ms/step - acc: 0.8571 - loss: 0.6156
Epoch 4/100
1/1 - 0s - 98ms/step - acc: 0.8571 - loss: 0.5990
Epoch 5/100
1/1 - 0s - 91ms/step - acc: 0.8571 - loss: 0.5830
Epoch 6/100
1/1 - 0s - 89ms/step - acc: 0.8571 - loss: 0.5675
Epoch 7/100
1/1 - 0s - 81ms/step - acc: 0.8571 - loss: 0.5525
Epoch 8/100
1/1 - 0s - 105ms/step - acc: 0.8571 - loss: 0.5380
Epoch 9/100
1/1 - 0s - 95ms/step - acc: 0.8571 - loss: 0.5240
Epoch 10/100
1/1 - 0s - 85ms/step - acc: 0.8571 - loss: 0.5105
Epoch 11/100
1/1 - 0s - 94ms/step - acc: 1.0000 - loss: 0.4974
Epoch 12/100
1/1 - 0s - 86ms/step - acc: 1.0000 - loss: 0.4848
Epoch 13/100
1/1 - 0s - 82ms/step - acc: 1.0000 - loss: 0.4726
Epoch 14/100
1/1 - 0s - 92ms/step - acc: 1.0000 - loss: 0.4608
Epoch 15/100
1/1 - 0s - 86ms/step - acc: 1.0000 - loss: 0.4495
Epoch 16/100
1/1 - 0s - 65ms/step - acc: 1.0000 - loss: 0.4385
Ep

<keras.src.callbacks.history.History at 0x159a5c81390>