## Lecture_22 Natural Language Processing

In [1]:
# Mount Google Drive so it is reachable from this Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import numpy as np
import sys

### 텍스트의 토큰화

#### 단어로 쪼개기

In [2]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Text preprocessing: break the raw text down into smaller units (words).
# text_to_word_sequence takes a string and hands back a list; in the output
# below the words come back lower-cased and without punctuation.
text = 'Major League Baseball (MLB) is a professional baseball organization and the oldest major professional sports league in the world.'
result = text_to_word_sequence(text)

# Show the input next to its tokenized form, then confirm the result type.
print(
    f'Original Text: {text}',
    '################',
    f'Tokenized Text: {result}',
    type(result),
    sep='\n',
)

Original Text: Major League Baseball (MLB) is a professional baseball organization and the oldest major professional sports league in the world.
################
Tokenized Text: ['major', 'league', 'baseball', 'mlb', 'is', 'a', 'professional', 'baseball', 'organization', 'and', 'the', 'oldest', 'major', 'professional', 'sports', 'league', 'in', 'the', 'world']
<class 'list'>


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Count how often each word occurs across a small corpus of three documents.
# NOTE: with the default Tokenizer settings the em dash is not stripped, which
# is why 'baseball—15' and '—with' show up as single tokens in the output.
docs = [
    'As of 2022, a total of 30 teams play in Major League Baseball—15 teams in the National League (NL) and 15 in the American League (AL)—with 29 in the United States and 1 in Canada.',
    'The NL and AL were formed in 1876 and 1901, respectively. Beginning in 1903, the two leagues signed the National Agreement and cooperated',
    'The first few decades of professional baseball were characterized by rivalries between leagues and by players who often jumped from one team or league to another',
]

token = Tokenizer()
token.fit_on_texts(docs)

# word_counts maps each word to its total number of occurrences.
print(f'word count: {token.word_counts}', type(token.word_counts), sep='\n')

word count: OrderedDict([('as', 1), ('of', 3), ('2022', 1), ('a', 1), ('total', 1), ('30', 1), ('teams', 2), ('play', 1), ('in', 7), ('major', 1), ('league', 4), ('baseball—15', 1), ('the', 7), ('national', 2), ('nl', 2), ('and', 6), ('15', 1), ('american', 1), ('al', 2), ('—with', 1), ('29', 1), ('united', 1), ('states', 1), ('1', 1), ('canada', 1), ('were', 2), ('formed', 1), ('1876', 1), ('1901', 1), ('respectively', 1), ('beginning', 1), ('1903', 1), ('two', 1), ('leagues', 2), ('signed', 1), ('agreement', 1), ('cooperated', 1), ('first', 1), ('few', 1), ('decades', 1), ('professional', 1), ('baseball', 1), ('characterized', 1), ('by', 2), ('rivalries', 1), ('between', 1), ('players', 1), ('who', 1), ('often', 1), ('jumped', 1), ('from', 1), ('one', 1), ('team', 1), ('or', 1), ('to', 1), ('another', 1)])
<class 'collections.OrderedDict'>


In [4]:
# document_count: how many documents were fitted, i.e. the length of the list
# given to fit_on_texts. Periods inside a document do not change this number.
print(f'sentence count: {token.document_count}')

# word_docs: for every word, the number of documents it appears in.
# With three documents fitted, each value falls between 1 and 3.
print(f'How many sentences does each word appear in? {token.word_docs}')

# word_index: the integer index assigned to every word. This is one of the
# most important Tokenizer attributes, since later steps work on these indices.
print(f'word index: {token.word_index}')

sentence count: 3
How many sentences does each word appear in? defaultdict(<class 'int'>, {'teams': 1, 'and': 3, '1': 1, 'as': 1, 'a': 1, 'united': 1, 'of': 2, 'total': 1, 'the': 3, 'american': 1, 'states': 1, 'al': 2, 'baseball—15': 1, '—with': 1, 'canada': 1, 'nl': 2, '2022': 1, 'major': 1, '29': 1, 'league': 2, 'national': 2, 'play': 1, '30': 1, '15': 1, 'in': 2, '1903': 1, 'signed': 1, 'were': 2, 'two': 1, 'cooperated': 1, '1876': 1, 'beginning': 1, 'respectively': 1, 'agreement': 1, 'formed': 1, '1901': 1, 'leagues': 2, 'first': 1, 'one': 1, 'between': 1, 'team': 1, 'decades': 1, 'professional': 1, 'or': 1, 'from': 1, 'jumped': 1, 'often': 1, 'players': 1, 'few': 1, 'to': 1, 'baseball': 1, 'characterized': 1, 'by': 1, 'rivalries': 1, 'another': 1, 'who': 1})
word index: {'in': 1, 'the': 2, 'and': 3, 'league': 4, 'of': 5, 'teams': 6, 'national': 7, 'nl': 8, 'al': 9, 'were': 10, 'leagues': 11, 'by': 12, 'as': 13, '2022': 14, 'a': 15, 'total': 16, '30': 17, 'play': 18, 'major': 19, '

### 원 핫 인코딩

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenize a single sentence and print the index assigned to each word.
text = '최고가 될 때까지 끊임 없이 노력하고 최고가 되어서 떠나라.'

token = Tokenizer()
token.fit_on_texts([text])
print(f'word index: {token.word_index}')

# texts_to_sequences() replaces every token with its index, producing a
# list that contains only integers (one inner list per input text).
x = token.texts_to_sequences([text])
print(x)

word index: {'최고가': 1, '될': 2, '때까지': 3, '끊임': 4, '없이': 5, '노력하고': 6, '되어서': 7, '떠나라': 8}
[[1, 2, 3, 4, 5, 6, 1, 7, 8]]


In [18]:
# One-hot encoding of a tokenized sentence.

from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

text = '최고가 될 때까지 끊임 없이 노력하고 최고가 되어서 떠나라.'

token = Tokenizer()
token.fit_on_texts([text])
x = token.texts_to_sequences([text])

# The number of classes has to be given explicitly. Word indices start at 1,
# so one extra slot is added to leave room for index 0 at the front of each
# one-hot vector.
y = tf.keras.utils.to_categorical(x, num_classes=len(token.word_index) + 1)
print(y)

# Training directly on vectors like these does not work well. Word embedding
# is used instead so that closely related words are placed near each other.

[[[0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1.]]]


#### 워드 임베딩

In [19]:
from tensorflow.keras.layers import Embedding

# Minimal model holding a single embedding layer:
# 16 is the vocabulary size (input_dim), 4 is the size of each word vector (output_dim).
model = Sequential([Embedding(16, 4)])



In [20]:
from tensorflow.keras.layers import Embedding

# Same embedding as before, now with a fixed input sequence length:
# the vocabulary has 16 words, but exactly two are fed in at a time.
# NOTE(review): newer Keras releases may report input_length as deprecated —
# confirm against the installed version.
model = Sequential([
    Embedding(16, 4, input_length=2),
])

### 텍스트 읽고 긍정 부정 예측하기

In [26]:

# Ten short Korean movie reviews: the first five are positive, the last five negative.
reviews = [
    '너무 재밌네요',
    '최고에요',
    '참 잘 만든 영화에요',
    '추천하고 싶은 영화입니다',
    '한번 더 보고싶네요',
    '글쎄요',
    '별로에요',
    '생각보다 지루하네요',
    '연기가 어색해요',
    '재미없어요',
]

# Labels aligned with `reviews` by position: 1 = positive, 0 = negative.
classes = np.array([1] * 5 + [0] * 5)

In [27]:
# Tokenization: build a fresh vocabulary from the review texts.
token = Tokenizer()
token.fit_on_texts(reviews)

# Inspect the word -> index mapping and confirm what type it is.
print(token.word_index, type(token.word_index), sep='\n')

{'너무': 1, '재밌네요': 2, '최고에요': 3, '참': 4, '잘': 5, '만든': 6, '영화에요': 7, '추천하고': 8, '싶은': 9, '영화입니다': 10, '한번': 11, '더': 12, '보고싶네요': 13, '글쎄요': 14, '별로에요': 15, '생각보다': 16, '지루하네요': 17, '연기가': 18, '어색해요': 19, '재미없어요': 20}
<class 'dict'>


In [28]:
# Turn every review into a list of integer word indices.
x = token.texts_to_sequences(reviews)
print(x)

# Fixed sequence length shared by the padding step and the embedding layer.
# The longest review has four tokens, so nothing is truncated.
MAX_LEN = 4

# Pad shorter sequences with zeros (on the left, as the printed array shows)
# so that every row has exactly MAX_LEN entries.
padded_x = pad_sequences(x, maxlen=MAX_LEN)
print(padded_x)

# Vocabulary size for the embedding: word indices start at 1 and index 0 is
# used for padding, so one extra slot is needed.
word_size = len(token.word_index) + 1

# Embedding (8-dimensional vectors) -> flatten -> single sigmoid unit that
# outputs the probability of a positive review.
# The original cell also built an identical Embedding layer here and discarded
# it; that dead statement has been removed.
model = Sequential()
model.add(Embedding(word_size, 8, input_length=MAX_LEN))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(padded_x, classes, epochs=20)

# NOTE: the model is evaluated on the same ten samples it was trained on, so
# this number only shows that the training set was fitted.
print('\n Accuracy: {}'.format(model.evaluate(padded_x, classes)[1]))

[[1, 2], [3], [4, 5, 6, 7], [8, 9, 10], [11, 12, 13], [14], [15], [16, 17], [18, 19], [20]]
[[ 0  0  1  2]
 [ 0  0  0  3]
 [ 4  5  6  7]
 [ 0  8  9 10]
 [ 0 11 12 13]
 [ 0  0  0 14]
 [ 0  0  0 15]
 [ 0  0 16 17]
 [ 0  0 18 19]
 [ 0  0  0 20]]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20





 Accuracy: 1.0


#### 전체코드

In [30]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
import numpy as np

# Ten short Korean movie reviews: the first five are positive, the last five negative.
reviews = [
    '너무 재밌네요',
    '최고에요',
    '참 잘 만든 영화에요',
    '추천하고 싶은 영화입니다',
    '한번 더 보고싶네요',
    '글쎄요',
    '별로에요',
    '생각보다 지루하네요',
    '연기가 어색해요',
    '재미없어요',
]
# Labels aligned with `reviews` by position: 1 = positive, 0 = negative.
classes = np.array([1] * 5 + [0] * 5)

# Build the vocabulary, convert the reviews to index sequences and pad every
# sequence to a length of four.
token = Tokenizer()
token.fit_on_texts(reviews)
x = token.texts_to_sequences(reviews)
padded_x = pad_sequences(x, maxlen=4)

# One more than the number of known words, leaving room for index 0.
word_size = len(token.word_index) + 1

# Embedding -> flatten -> single sigmoid output for binary classification.
model = Sequential([
    Embedding(word_size, 8, input_length=4),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(padded_x, classes, epochs=20)

# Accuracy is measured on the same data that was used for training.
print(f'\n Accuracy: {model.evaluate(padded_x, classes)[1]}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

 Accuracy: 1.0
