# 단어의 임베딩
- 빈도수 계산 : 빈도기반 -TF
- TDM : matrix TF를 행렬로 만든 것 , 사전을 이용한 단순빈도
- TF-IDF : TF*IDF
- IDF : 역문서 빈도

In [1]:
text = "John likes to watch movies. Mary likes movies too. Mary also likes to watch football games."
words = text.replace('.', '').split()
words

['John',
 'likes',
 'to',
 'watch',
 'movies',
 'Mary',
 'likes',
 'movies',
 'too',
 'Mary',
 'also',
 'likes',
 'to',
 'watch',
 'football',
 'games']

In [2]:
import numpy as np
word_count = np.unique(words, return_counts=True)
print(word_count)

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
       'to', 'too', 'watch'], dtype='<U8'), array([1, 2, 1, 1, 1, 3, 2, 2, 1, 2]))


In [3]:
# 딕셔너리 TF 생성
word_to_cnt = {}
for word, cnt in zip(*word_count):
    word_to_cnt[word] = cnt
word_to_cnt

{'John': 1,
 'Mary': 2,
 'also': 1,
 'football': 1,
 'games': 1,
 'likes': 3,
 'movies': 2,
 'to': 2,
 'too': 1,
 'watch': 2}

In [4]:
word_to_cnt['movies']

2

In [5]:
# !pip install scikit-learn pandas

# TDM

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
corpus = ["John likes to watch movies. Mary likes movies too." ,
          "Mary also likes to watch football games."]

vector = CountVectorizer()
tdm_array = vector.fit_transform(corpus).toarray()
tf_dic = vector.vocabulary_
print(tdm_array)
print(tf_dic)

[[0 0 0 1 2 1 2 1 1 1]
 [1 1 1 0 1 1 0 1 0 1]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [8]:
import pandas as pd
tf_dic_sorted = dict(sorted(tf_dic.items(), key=lambda item: item[1]))
# tf_dic_sorted
df = pd.DataFrame(tdm_array, columns=tf_dic_sorted.keys())
df

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0,0,0,1,2,1,2,1,1,1
1,1,1,1,0,1,1,0,1,0,1


# TD-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
tfidf_array = tfidf_vec.fit_transform(corpus).toarray()
tfidf_dic = tfidf_vec.vocabulary_
tfidf_dic_sorted = dict(sorted(tfidf_dic.items(),
                               key=lambda item: item[1]))

tfidf_dtm = pd.DataFrame(tfidf_array,
                         columns=tfidf_dic_sorted.keys())
tfidf_dtm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0.0,0.0,0.0,0.323699,0.460629,0.230315,0.647398,0.230315,0.323699,0.230315
1,0.446101,0.446101,0.446101,0.0,0.317404,0.317404,0.0,0.317404,0.0,0.317404


In [10]:
!pip install --upgrade gensim



In [14]:
word_list = []
for word in corpus:
    word_list.append(word.replace('.', '').split())

from gensim.models import Word2Vec
model = Word2Vec(word_list, sg=0, vector_size=100, # sg : 0이면 CBOW, 1이면 Skip-gram방식을 사용
                 window=3, min_count=1) # 윈도우의 크기/ 3이면 앞/뒤 3단어를 포함
# min_count: 사용할 단어의 최소 빈도/ 3이면 3회 이하 단어는 무시

print(model.wv.most_similar('likes','movies'))
print(model.wv.similarity('movies', 'games'))
# similarity는 두 단어 사이의 코사인 유사도를 출력

[('John', 0.17164471745491028), ('also', 0.06594578176736832), ('Mary', 0.008838453330099583), ('watch', -0.06765829026699066), ('games', -0.08544928580522537), ('football', -0.08948154747486115), ('too', -0.11860241740942001), ('to', -0.13643866777420044)]
0.0640898


In [13]:
print(model.wv.most_similar('John','Mary'))

[('likes', 0.15334713459014893), ('football', 0.07839643210172653), ('also', 0.015055425465106964), ('too', 0.007465780712664127), ('movies', -0.006201202515512705), ('games', -0.07736953347921371), ('to', -0.12009607255458832), ('watch', -0.16032634675502777)]


In [5]:
from tensorflow.keras.datasets import imdb
(X_train, y_train),(X_test, y_test) = imdb.load_data(num_words=1000)
(X_train, y_train),(X_test, y_test)

((array([list([1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]),
         list([1, 194, 2, 194, 2, 78, 228, 5, 6, 2, 2, 2, 134, 26, 4, 715, 8, 118, 2, 14, 394, 20, 13, 119, 954, 

In [None]:
from tensorflow.keras import Sequential, layers
model_dnn = Sequential(
   [layers.Input(shape=(80,)), # 80개 단어
    layers.Embedding(input_dim=10000, output_dim=32), #(80,32)
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(2,activation='softmax')    
    ]
)

I0000 00:00:1756966492.834690   30157 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


In [19]:
model_dnn.summary()

In [24]:
# 텍스트 데이터 전처리
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_padding = pad_sequences(X_train, maxlen= 80, padding='post', truncating='post')
X_test_padding = pad_sequences(X_test, maxlen= 80, padding='post', truncating='post')

In [26]:
model_dnn.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
    )

model_dnn.fit(X_train_padding, y_train, batch_size=200, epochs=10)

Epoch 1/10


2025-09-04 15:22:52.073246: I external/local_xla/xla/service/service.cc:163] XLA service 0x7d4504004810 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-09-04 15:22:52.073295: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-09-04 15:22:52.109316: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-09-04 15:22:52.223221: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91200




[1m 76/125[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.5361 - loss: 0.6870

I0000 00:00:1756966972.214642   38750 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6482 - loss: 0.6065
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8013 - loss: 0.4299
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8504 - loss: 0.3473
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9010 - loss: 0.2580
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9453 - loss: 0.1685
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9750 - loss: 0.0974
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9913 - loss: 0.0497
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9982 - loss: 0.0218
Epoch 9/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d4660c49f00>

In [27]:
model_dnn.evaluate(X_test_padding, y_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7174 - loss: 1.4850


[1.4849534034729004, 0.7174400091171265]

In [29]:
from tensorflow.keras import Sequential, layers
model_rnn = Sequential(
   [layers.Input(shape=(80,)), # 80개 단어
    layers.Embedding(input_dim=10000, output_dim=32), #(80,32)
    #layers.Flatten(),
    #layers.Dense(64, activation='relu'),
    layers.SimpleRNN(units=64, dropout=0.2, recurrent_dropout=0.2,
              name="rnn"),
    layers.Dense(2,activation='softmax')    
    ]
)

In [30]:
model_rnn.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
    )

model_rnn.fit(X_train_padding, y_train, batch_size=200, epochs=10)

Epoch 1/10


2025-09-04 15:29:28.893841: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-09-04 15:29:28.893901: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-09-04 15:29:28.893915: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.








[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m -621us/step - accuracy: 0.5024 - loss: 0.7123
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.5018 - loss: 0.6989
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.5037 - loss: 0.6966
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.5134 - loss: 0.6954
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.5174 - loss: 0.6932
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.5200 - loss: 0.6926
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.5323 - loss: 0.6901
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.5467 - loss: 0.6862
Epoch 9/10
[1m125/125[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d4651c12a10>

In [31]:
from tensorflow.keras import Sequential, layers
model_rnn = Sequential(
   [layers.Input(shape=(80,)), # 80개 단어
    layers.Embedding(input_dim=10000, output_dim=32), #(80,32)
    layers.SimpleRNN(64,return_sequences=True ),
    layers.SimpleRNN(128),
    layers.Dense(2,activation='softmax')    
    ]
)
model_rnn.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
    )

model_rnn.fit(X_train_padding, y_train, batch_size=200, epochs=10)

Epoch 1/10


2025-09-04 15:30:59.572642: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-09-04 15:30:59.572709: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-09-04 15:30:59.572723: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-09-04 15:30:59.572744: I external/l

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.4980 - loss: 0.6976
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.5347 - loss: 0.6897
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.6297 - loss: 0.6408
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.7710 - loss: 0.4906
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.8151 - loss: 0.4166
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.8400 - loss: 0.3679
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.8726 - loss: 0.3074
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step - accuracy: 0.8942 - loss: 0.2615
Epoch 9/10
[1m125/125[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d465e265300>

In [32]:
model_rnn.evaluate(X_test_padding, y_test)

2025-09-04 15:31:42.326761: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.6936 - loss: 0.9880


[0.9880043864250183, 0.6936399936676025]

In [33]:
# optimilzer - sgd
model_rnn.compile(loss='sparse_categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy']
    )
model_rnn.fit(X_train_padding, y_train, batch_size=200, epochs=10)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 38ms/step - accuracy: 0.8309 - loss: 0.3799
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.8408 - loss: 0.3537
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.8621 - loss: 0.3145
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.8536 - loss: 0.3340
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.8686 - loss: 0.3035
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step - accuracy: 0.8694 - loss: 0.3029
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.8932 - loss: 0.2561
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.8990 - loss: 0.2503
Epoch 9/10
[1m125/125[0m [32m

<keras.src.callbacks.history.History at 0x7d45f8ed8070>

In [34]:
model_rnn.evaluate(X_test_padding, y_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.5832 - loss: 1.5014


[1.5014039278030396, 0.5831999778747559]

In [38]:
# 텍스트 데이터 전처리
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_padding = pad_sequences(X_train, maxlen= 200, padding='pre', truncating='pre')
X_test_padding = pad_sequences(X_test, maxlen= 200, padding='pre', truncating='pre')

from tensorflow.keras import Sequential, layers
model_rnn1 = Sequential(
   [layers.Input(shape=(200,)), # 80개 단어
    layers.Embedding(input_dim=1000, output_dim=32), #(80,32)
    layers.SimpleRNN(64, return_sequences=True ),
    layers.SimpleRNN(128, return_sequences=True),
    layers.SimpleRNN(128),    
    layers.Dense(2,activation='softmax')    
    ]
)
model_rnn1.compile(loss='sparse_categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy']
    )

model_rnn1.fit(X_train_padding, y_train, batch_size=200, epochs=10)

Epoch 1/10


2025-09-04 15:50:47.417664: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.



[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 84ms/step - accuracy: 0.5007 - loss: 0.7044
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 93ms/step - accuracy: 0.5073 - loss: 0.6956
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 93ms/step - accuracy: 0.5087 - loss: 0.6947
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 78ms/step - accuracy: 0.5179 - loss: 0.6934
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 94ms/step - accuracy: 0.5201 - loss: 0.6919
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 94ms/step - accuracy: 0.5197 - loss: 0.6926
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 81ms/step - accuracy: 0.5255 - loss: 0.6920
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 99ms/step - accuracy: 0.5361 - loss: 0.6898
Epoch 9/10
[1m125/125[0m [32m━━━

<keras.src.callbacks.history.History at 0x7d4651803f70>

In [39]:
model_rnn1.evaluate(X_test_padding, y_test)

2025-09-04 15:53:30.555907: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.5240 - loss: 0.6944


[0.6944477558135986, 0.5239599943161011]

# 1. optimizer를 sgd로 변환

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_padding = pad_sequences(X_train, maxlen= 80, padding='post', truncating='post')
X_test_padding = pad_sequences(X_test, maxlen= 80, padding='post', truncating='post')

from tensorflow.keras import Sequential, layers
model_rnn = Sequential(
   [layers.Input(shape=(80,)), # 80개 단어
    layers.Embedding(input_dim=10000, output_dim=32), #(80,32)
    layers.SimpleRNN(64,return_sequences=True ),
    layers.SimpleRNN(128),
    layers.Dense(2,activation='softmax')    
    ]
)
model_rnn.compile(loss='sparse_categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy']
    )

model_rnn.fit(X_train_padding, y_train, batch_size=200, epochs=10)

I0000 00:00:1756969671.883556   52494 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


Epoch 1/10


2025-09-04 16:07:51.117735: I external/local_xla/xla/service/service.cc:163] XLA service 0x7d5550008250 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-09-04 16:07:51.117758: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-09-04 16:07:51.145520: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-09-04 16:07:51.200618: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91200
2025-09-04 16:07:51.235621: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-09-04 16:07:51.

[1m  4/125[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 44ms/step - accuracy: 0.5202 - loss: 0.7168

I0000 00:00:1756969674.745261   52867 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.5050 - loss: 0.6975
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.5123 - loss: 0.6938
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.5133 - loss: 0.6937
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.5139 - loss: 0.6931
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.5203 - loss: 0.6919
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - accuracy: 0.5258 - loss: 0.6915
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.5294 - loss: 0.6908
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.5378 - loss: 0.6899
Epoch 9/10
[1m125/125[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d5665b30040>

In [7]:
model_rnn.evaluate(X_test_padding, y_test)

2025-09-04 16:09:12.675604: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.5268 - loss: 0.6926


[0.6925668120384216, 0.5267999768257141]

# 2. 전체단어의 갯수 1000개로 변환
언어는 지프의 법칙을 따라 대부분 단어가 희귀하다. 희귀 토큰까지 전부 쓰면 파라미터가 분산되고 노이즈가 커져 과적합/희소성 문제가 생긴다. 상위 빈도 1,000개만 쓰면 정보 손실이 크지 않으면서 표현 공간을 밀도 있게 쓸 수 있어 초기 모델에서 성능이 오르기 쉽다

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_padding = pad_sequences(X_train, maxlen= 80, padding='post', truncating='post')
X_test_padding = pad_sequences(X_test, maxlen= 80, padding='post', truncating='post')

from tensorflow.keras import Sequential, layers
model_rnn2 = Sequential(
   [layers.Input(shape=(80,)), 
    layers.Embedding(input_dim=1000, output_dim=32), #(80,32)
    layers.SimpleRNN(64,return_sequences=True ),
    layers.SimpleRNN(128),
    layers.Dense(2,activation='softmax')    
    ]
)
model_rnn2.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
    )

model_rnn2.fit(X_train_padding, y_train, batch_size=200, epochs=10)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.5836 - loss: 0.6600
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.7434 - loss: 0.5303
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.7838 - loss: 0.4674
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.7966 - loss: 0.4435
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.8164 - loss: 0.4094
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.8381 - loss: 0.3742
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.8603 - loss: 0.3269
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.8902 - loss: 0.2718
Epoch 9/10
[1m125/125[0m [32m

<keras.src.callbacks.history.History at 0x7d565efd5e40>

In [9]:
model_rnn2.evaluate(X_test_padding, y_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.7315 - loss: 0.8607


[0.8607183694839478, 0.7314800024032593]

# 3. 영화평의 길이를 200개로
단순 RNN은 타임스텝이 길어질수록 기울기 소실/폭발에 취약하다. 문맥을 멀리 전달하기 어렵고 초기 정보가 사라진다. 텍스트 감성처럼 문장 후반 신호가 강한 태스크라면 길이만 늘려도 유효 신호 대비 잡음이 늘어 성능이 빠질 수 있다.

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_padding = pad_sequences(X_train, maxlen= 200, padding='post', truncating='post')
X_test_padding = pad_sequences(X_test, maxlen= 200, padding='post', truncating='post')

from tensorflow.keras import Sequential, layers
model_rnn3 = Sequential(
   [layers.Input(shape=(200,)), 
    layers.Embedding(input_dim=10000, output_dim=32), #(80,32)
    layers.SimpleRNN(64,return_sequences=True ),
    layers.SimpleRNN(128),
    layers.Dense(2,activation='softmax')    
    ]
)
model_rnn3.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
    )

model_rnn3.fit(X_train_padding, y_train, batch_size=200, epochs=10)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 65ms/step - accuracy: 0.5024 - loss: 0.6990
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 62ms/step - accuracy: 0.5306 - loss: 0.6892
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 61ms/step - accuracy: 0.5694 - loss: 0.6648
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 44ms/step - accuracy: 0.6104 - loss: 0.6219
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 60ms/step - accuracy: 0.6474 - loss: 0.5726
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 61ms/step - accuracy: 0.6752 - loss: 0.5274
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 61ms/step - accuracy: 0.7005 - loss: 0.4861
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - accuracy: 0.7189 - loss: 0.4477
Epoch 9/10
[1m125/125[0m [32

<keras.src.callbacks.history.History at 0x7d56610b2770>

In [11]:
model_rnn3.evaluate(X_test_padding, y_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.5001 - loss: 1.4467


[1.4467289447784424, 0.5001199841499329]

# 4. pad_sequence의 truncating과 padding을 pre
RNN은 입력을 순차 처리한다. truncating='pre'는 앞부분을 잘라 후반(결론·감정이 모이는 구간)을 보존한다. padding='pre'는 앞쪽에 0 패딩을 채워 유효 토큰이 시퀀스 끝 쪽에 정렬되어, 마지막 타임스텝의 정보를 더 직접적으로 활용하게 된다(특히 return_sequences=False일 때). 실무에서 감성 분류는 후행 단서가 강하므로 이 조합이 종종 유리

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_padding = pad_sequences(X_train, maxlen= 80, padding='pre', truncating='pre')
X_test_padding = pad_sequences(X_test, maxlen= 80, padding='pre', truncating='pre')

from tensorflow.keras import Sequential, layers
model_rnn4 = Sequential(
   [layers.Input(shape=(80,)), 
    layers.Embedding(input_dim=10000, output_dim=32), #(80,32)
    layers.SimpleRNN(64,return_sequences=True ),
    layers.SimpleRNN(128),
    layers.Dense(2,activation='softmax')    
    ]
)
model_rnn4.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
    )

model_rnn4.fit(X_train_padding, y_train, batch_size=200, epochs=10)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - accuracy: 0.6055 - loss: 0.6347
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.7984 - loss: 0.4398
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.8208 - loss: 0.4006
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.8383 - loss: 0.3728
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.8494 - loss: 0.3460
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8686 - loss: 0.3112
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.8903 - loss: 0.2682
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.9069 - loss: 0.2331
Epoch 9/10
[1m125/125[0m [32m

<keras.src.callbacks.history.History at 0x7d5660b3cd00>

In [13]:
model_rnn4.evaluate(X_test_padding, y_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.7687 - loss: 0.7660


[0.7659646272659302, 0.7687199711799622]

# 5. RNN 층(뉴런 128개)을 하나 더 추가
단순 RNN을 심층으로 쌓으면 기울기 소실이 가중된다. 게이트가 없는 구조라 장기 의존성을 누적하기 어렵고, 파라미터 증가로 과적합 리스크도 커진다. LSTM/GRU처럼 게이트가 있는 셀은 깊이를 늘려도 상대적으로 안정적

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_padding = pad_sequences(X_train, maxlen= 80, padding='post', truncating='post')
X_test_padding = pad_sequences(X_test, maxlen= 80, padding='post', truncating='post')

from tensorflow.keras import Sequential, layers
model_rnn5 = Sequential(
   [layers.Input(shape=(80,)), 
    layers.Embedding(input_dim=10000, output_dim=32), #(80,32)
    layers.SimpleRNN(64, return_sequences=True ),
    layers.SimpleRNN(128, return_sequences=True ),    
    layers.SimpleRNN(128),
    layers.Dense(2,activation='softmax')    
    ]
)
model_rnn5.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
    )

model_rnn5.fit(X_train_padding, y_train, batch_size=200, epochs=10)

Epoch 1/10


2025-09-04 16:15:35.628764: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.



[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 43ms/step - accuracy: 0.4993 - loss: 0.7057
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.5060 - loss: 0.6961
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.5282 - loss: 0.6917
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - accuracy: 0.5669 - loss: 0.6787
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.6177 - loss: 0.6487
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.6663 - loss: 0.6067
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.7097 - loss: 0.5611
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.7459 - loss: 0.5106
Epoch 9/10
[1m125/125[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d565f9aa710>

In [15]:
model_rnn5.evaluate(X_test_padding, y_test)

2025-09-04 16:16:29.492149: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.6512 - loss: 0.7239


[0.7238671779632568, 0.6511600017547607]

# CNN 으로 구성

In [None]:
model_cnn = Sequential(
    [   layers.Input(shape=(80,)), 
        layers.Embedding(input_dim=10000, output_dim=32),
        layers.Conv1D(64, 3, activation='relu'),
        layers.GlobalMaxPool1D(),
        layers.Dense(64, activation='softmax'),
        layers.Dense(2, activation='softmax')
    ]
)

In [18]:
model_cnn.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
    )

model_cnn.fit(X_train_padding, y_train, batch_size=200, epochs=10)

Epoch 1/10


2025-09-04 16:34:57.086011: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-09-04 16:34:57.086068: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.





[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5878 - loss: 0.6758
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7346 - loss: 0.5714
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7681 - loss: 0.5252
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7877 - loss: 0.4968
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7999 - loss: 0.4751
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8104 - loss: 0.4566
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8206 - loss: 0.4403
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8318 - loss: 0.4254
Epoch 9/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d565e181e70>

In [19]:
model_cnn.evaluate(X_test_padding, y_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7683 - loss: 0.5164


[0.5164104104042053, 0.7683200240135193]