In [1]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [2]:
text = "해보지 않으면 해낼 수 없다."

In [3]:
text_to_word_sequence(text)

['해보지', '않으면', '해낼', '수', '없다']

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [5]:
doc = ['먼저 텍스트의 각 단어를 나누어 토큰화합니다.',
       '텍스트의 단어로 토큰화해야 딥러닝에서 인식됩니다.',
       '토큰화한 결과는 딥러닝에서 사용할 수 있습니다.']

In [6]:
token = Tokenizer()

In [7]:
token.fit_on_texts(doc)

In [8]:
token

<keras.src.legacy.preprocessing.text.Tokenizer at 0x1dcfc782bd0>

In [9]:
token.word_counts

OrderedDict([('먼저', 1),
             ('텍스트의', 2),
             ('각', 1),
             ('단어를', 1),
             ('나누어', 1),
             ('토큰화합니다', 1),
             ('단어로', 1),
             ('토큰화해야', 1),
             ('딥러닝에서', 2),
             ('인식됩니다', 1),
             ('토큰화한', 1),
             ('결과는', 1),
             ('사용할', 1),
             ('수', 1),
             ('있습니다', 1)])

In [10]:
token.document_count

3

In [11]:
token.word_docs

defaultdict(int,
            {'각': 1,
             '텍스트의': 2,
             '먼저': 1,
             '토큰화합니다': 1,
             '나누어': 1,
             '단어를': 1,
             '인식됩니다': 1,
             '단어로': 1,
             '토큰화해야': 1,
             '딥러닝에서': 2,
             '있습니다': 1,
             '결과는': 1,
             '토큰화한': 1,
             '사용할': 1,
             '수': 1})

In [12]:
text="오랫동안 꿈꾸는 이는 그 꿈을 닮아간다."

In [13]:
del token

In [14]:
token = Tokenizer()

In [15]:
token.fit_on_texts([text])

In [16]:
token.word_index

{'오랫동안': 1, '꿈꾸는': 2, '이는': 3, '그': 4, '꿈을': 5, '닮아간다': 6}

In [17]:
from tensorflow.keras.utils import to_categorical

In [18]:
text

'오랫동안 꿈꾸는 이는 그 꿈을 닮아간다.'

In [19]:
help(to_categorical)

Help on function to_categorical in module keras.src.utils.numerical_utils:

to_categorical(x, num_classes=None)
    Converts a class vector (integers) to binary class matrix.
    
    E.g. for use with `categorical_crossentropy`.
    
    Args:
        x: Array-like with class values to be converted into a matrix
            (integers from 0 to `num_classes - 1`).
        num_classes: Total number of classes. If `None`, this would be inferred
            as `max(x) + 1`. Defaults to `None`.
    
    Returns:
        A binary matrix representation of the input as a NumPy array. The class
        axis is placed last.
    
    Example:
    
    >>> a = keras.utils.to_categorical([0, 1, 2, 3], num_classes=4)
    >>> print(a)
    [[1. 0. 0. 0.]
     [0. 1. 0. 0.]
     [0. 0. 1. 0.]
     [0. 0. 0. 1.]]
    
    >>> b = np.array([.9, .04, .03, .03,
    ...               .3, .45, .15, .13,
    ...               .04, .01, .94, .05,
    ...               .12, .21, .5, .17],
    ...              

In [20]:
token.texts_to_sequences([text])

[[1, 2, 3, 4, 5, 6]]

In [21]:
# One-hot encoding needs numeric input, so the sentence is first converted into an array of integers, one per token in sentence order.

In [22]:
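# to_categorical then turns that integer array into one-hot rows made up only of 0s and 1s.
# Note: Tokenizer word indices start at 1, so index 0 is never assigned to a word (it stays free, e.g. as the padding value).
# to_categorical expects class values from 0 to num_classes - 1, so num_classes must be the number of words + 1.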

In [23]:
len(token.word_index) + 1

7

In [24]:
word_size = len(token.word_index) + 1

In [25]:
x = token.texts_to_sequences([text])

In [26]:
to_categorical(x, num_classes=word_size)

array([[[0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 1.]]])

In [27]:
from tensorflow.keras.layers import Embedding

In [28]:
from tensorflow.keras.models import Sequential

In [29]:
model = Sequential()

In [30]:
model.add(Embedding(16, 4))

In [31]:
model

<Sequential name=sequential, built=False>

In [32]:
# Read short review texts and predict whether each one is positive or negative

In [33]:
# Ten short movie-review snippets used as the training texts.
docs = [
    # first half of the corpus
    "너무 재밌네요",
    "최고예요",
    "참 잘 만든 영화예요",
    "추천하고 싶은 영화입니다",
    "한번 더 보고싶네요",
    # second half of the corpus
    "글쎄요",
    "별로예요",
    "생각보다 지루하네요",
    "연기가 어색해요",
    "재미없어요",
]

In [55]:
import numpy as np

# Binary labels, one per text: five 1s followed by five 0s.
classes = np.array([1] * 5 + [0] * 5)

In [35]:
token = Tokenizer()

In [36]:
token.fit_on_texts(docs)

In [37]:
token.word_docs

defaultdict(int,
            {'너무': 1,
             '재밌네요': 1,
             '최고예요': 1,
             '영화예요': 1,
             '참': 1,
             '잘': 1,
             '만든': 1,
             '싶은': 1,
             '추천하고': 1,
             '영화입니다': 1,
             '더': 1,
             '한번': 1,
             '보고싶네요': 1,
             '글쎄요': 1,
             '별로예요': 1,
             '생각보다': 1,
             '지루하네요': 1,
             '연기가': 1,
             '어색해요': 1,
             '재미없어요': 1})

In [38]:
x = token.texts_to_sequences(docs)

In [39]:
x

[[1, 2],
 [3],
 [4, 5, 6, 7],
 [8, 9, 10],
 [11, 12, 13],
 [14],
 [15],
 [16, 17],
 [18, 19],
 [20]]

In [40]:
# Number of tokens in each encoded text.
list(map(len, token.texts_to_sequences(docs)))

[2, 1, 4, 3, 3, 1, 1, 2, 2, 1]

In [41]:
# Length of the longest encoded text; this is the width every sequence must be padded to.
max(map(len, token.texts_to_sequences(docs)))

4

In [42]:
from tensorflow.keras.utils import pad_sequences

In [43]:
x = token.texts_to_sequences(docs)

In [44]:
help(pad_sequences)

Help on function pad_sequences in module keras.src.utils.sequence_utils:

pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0)
    Pads sequences to the same length.
    
    This function transforms a list (of length `num_samples`)
    of sequences (lists of integers)
    into a 2D NumPy array of shape `(num_samples, num_timesteps)`.
    `num_timesteps` is either the `maxlen` argument if provided,
    or the length of the longest sequence in the list.
    
    Sequences that are shorter than `num_timesteps`
    are padded with `value` until they are `num_timesteps` long.
    
    Sequences longer than `num_timesteps` are truncated
    so that they fit the desired length.
    
    The position where padding or truncation happens is determined by
    the arguments `padding` and `truncating`, respectively.
    Pre-padding or removing values from the beginning of the sequence is the
    default.
    
    >>> sequence = [[1], [2, 3], [4, 5, 6]]
 

In [45]:
# Pad every encoded text to the length of the longest one. The width is derived
# from the data rather than hard-coded, so this cell stays correct if `docs` changes.
# With the default padding='pre', zeros are inserted at the front of shorter sequences.
max_len = max(len(seq) for seq in x)
padded_x = pad_sequences(x, maxlen=max_len)

In [46]:
padded_x

array([[ 0,  0,  1,  2],
       [ 0,  0,  0,  3],
       [ 4,  5,  6,  7],
       [ 0,  8,  9, 10],
       [ 0, 11, 12, 13],
       [ 0,  0,  0, 14],
       [ 0,  0,  0, 15],
       [ 0,  0, 16, 17],
       [ 0,  0, 18, 19],
       [ 0,  0,  0, 20]])

In [47]:
word_size = len(token.word_index) + 1

In [48]:
word_size

21

In [49]:
# Embedding takes (input_dim, output_dim): the vocabulary size first, then the
# length of the dense vector learned for each word. The token sequences `x` are
# the data fed to the model later, not a constructor argument.
model = Sequential()
model.add(Embedding(input_dim=word_size, output_dim=8))

In [50]:
from tensorflow.keras.layers import Flatten, Dense

In [51]:
del model

In [52]:
# Sentiment classifier: word indices -> 8-dim embeddings -> flattened vector -> sigmoid score.
model = Sequential()
# `input_length` is deprecated in Keras 3 and ignored, so it is not passed here;
# the sequence length is supplied through build() below instead.
model.add(Embedding(input_dim=word_size, output_dim=8))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# Build with the padded sequence width so summary() can report output shapes
# and parameter counts instead of an unbuilt model.
model.build(input_shape=(None, padded_x.shape[1]))
model.summary()



In [56]:

# Configure training for a two-class problem: binary cross-entropy loss, Adam optimizer,
# and accuracy reported after every epoch.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train on the padded sequences for 20 passes over the data; the returned History
# object is the cell's displayed value.
model.fit(x=padded_x, y=classes, epochs=20)

Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 553ms/step - accuracy: 0.4000 - loss: 0.7021
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.5000 - loss: 0.6999
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5000 - loss: 0.6977
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5000 - loss: 0.6955
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5000 - loss: 0.6933
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5000 - loss: 0.6911
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.5000 - loss: 0.6889
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.5000 - loss: 0.6868
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x1dcc204e610>