In [3]:
from tensorflow.keras.datasets import reuters
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Dense network — vocabulary capped with num_words=10000

In [4]:
# Load the Reuters newswire topics dataset with the vocabulary capped via
# num_words=10000 and 20% of the articles held out as the test split.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000, test_split=0.2)

In [5]:
# Load the Reuters vocabulary as a dict mapping each word to its integer index
# (the recorded output shows the JSON file being downloaded on first use).
word_index = reuters.get_word_index(path="reuters_word_index.json")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json


In [6]:

# Invert the vocabulary: every word id is shifted up by 3 so that the ids
# 0, 1 and 2 stay free for the special tokens registered afterwards.
index_to_word = {rank + 3: term for term, rank in word_index.items()}
index_to_word.update({0: "<pad>", 1: "<sos>", 2: "<unk>"})

In [7]:
# Turn every integer-encoded training article back into a space-joined string
# so that the text vectorizer can tokenize it.
# Documents that are already strings are passed through unchanged. This keeps
# the cell safe to re-run: x_train is overwritten with text below, and decoding
# that text a second time would look up single characters in index_to_word and
# raise KeyError.
decoded = [
    doc if isinstance(doc, str) else ' '.join(index_to_word[index] for index in doc)
    for doc in x_train
]

x_train = decoded
print(len(x_train))

8982


In [8]:
# Turn every integer-encoded test article back into a space-joined string,
# mirroring what is done for the training split.
# Documents that are already strings are passed through unchanged. This keeps
# the cell safe to re-run: x_test is overwritten with text below, and decoding
# that text a second time would look up single characters in index_to_word and
# raise KeyError.
decoded_test = [
    doc if isinstance(doc, str) else ' '.join(index_to_word[index] for index in doc)
    for doc in x_test
]

x_test = decoded_test
print(len(x_test))

2246


In [9]:
# Vectorization: document-term matrix (DTM) followed by TF-IDF weighting.

# Fit both transformers on the training articles only.
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(x_train)

tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_dtm)

# Re-use the fitted vocabulary and IDF weights for the test articles.
x_test_dtm = dtmvector.transform(x_test)
x_test_tfidf = tfidf_transformer.transform(x_test_dtm)

In [10]:
# Shape of the training TF-IDF matrix: (documents, vocabulary terms).
x_train_tfidf.shape

(8982, 9670)

In [11]:
# Shape of the test TF-IDF matrix; the column count must match the training matrix.
x_test_tfidf.shape

(2246, 9670)

In [12]:
# Show one decoded training article as a sanity check of the id-to-word mapping.
x_train[3]

"<sos> the farmers home administration the u s agriculture department's farm lending arm could lose about seven billion dlrs in outstanding principal on its severely <unk> borrowers or about one fourth of its farm loan portfolio the general accounting office gao said in remarks prepared for delivery to the senate agriculture committee brian crowley senior associate director of gao also said that a preliminary analysis of proposed changes in <unk> financial eligibility standards indicated as many as one half of <unk> borrowers who received new loans from the agency in 1986 would be <unk> under the proposed system the agency has proposed evaluating <unk> credit using a variety of financial ratios instead of relying solely on <unk> ability senate agriculture committee chairman patrick leahy d vt <unk> the proposed eligibility changes telling <unk> administrator <unk> clark at a hearing that they would mark a dramatic shift in the agency's purpose away from being farmers' lender of last re

In [13]:
# Every name the original cell imported is still imported, but each only once.
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input, LSTM
from tensorflow.keras.models import Model, Sequential

# Fix TensorFlow's global seed before any layer is created so that the
# stochastic parts of the experiment (weight initialisation, dropout) are
# repeatable; 42 matches the random_state used for the LightGBM models.
tf.random.set_seed(42)

# Try changing the Dense layers below and compare the resulting scores.

# The input width is taken from the TF-IDF matrix of the current run
# (x_train_tfidf.shape[1]), so it follows whatever vocabulary cap was used.
inputs = Input(shape=(x_train_tfidf.shape[1],))

x = Dense(512, activation='relu')(inputs)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
outputs = Dense(46, activation='softmax')(x)  # one probability per class (46 classes)

dense_model = Model(inputs=inputs, outputs=outputs)

# The sparse categorical loss takes integer class ids rather than one-hot vectors.
dense_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
dense_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 9670)]            0         
_________________________________________________________________
dense (Dense)                (None, 512)               4951552   
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 46)                5934      
Total params: 5,023,150
Trainable params: 5,023,150
Non-trainable params: 0
___________________________________________________

In [19]:
# Heads-up from the original author: this step is slow (roughly 20 minutes).
# fit() is given a dense NumPy copy of the sparse TF-IDF matrix.
x_train_tfidf_dense = x_train_tfidf.toarray()

# Train for 5 epochs in mini-batches of 32, holding out 20% of the training
# rows for validation.
dense_model.fit(
    x=x_train_tfidf_dense,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7c57c65f3bb0>

In [21]:
# Densify the test matrix the same way the training matrix was densified
# before fit(). The model then receives the same input type at training and
# inference time, and the in-place sort_indices() workaround that sparse input
# required is no longer needed (x_test_tfidf is left unmodified).
x_test_tfidf_dense = x_test_tfidf.toarray()

# Class probabilities per article, then the most probable class id.
y_pred_proba = dense_model.predict(x_test_tfidf_dense)
y_pred = np.argmax(y_pred_proba, axis=1)

# Score the predictions; F1 is averaged over classes weighted by support.
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

In [22]:
# Report test accuracy and weighted F1, one value per line.
print(acc, f1, sep="\n")

0.8076580587711487
0.7985702687067264


# Dense network — vocabulary capped with num_words=5000

In [28]:
# Reload the Reuters dataset with the vocabulary capped via num_words=5000
# and 20% of the articles held out as the test split.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, test_split=0.2)

In [29]:
# Load the Reuters vocabulary as a dict mapping each word to its integer index.
word_index = reuters.get_word_index(path="reuters_word_index.json")

In [30]:

# Invert the vocabulary: every word id is shifted up by 3 so that the ids
# 0, 1 and 2 stay free for the special tokens registered afterwards.
index_to_word = {rank + 3: term for term, rank in word_index.items()}
index_to_word.update({0: "<pad>", 1: "<sos>", 2: "<unk>"})

In [31]:
# Turn every integer-encoded training article back into a space-joined string
# so that the text vectorizer can tokenize it.
# Documents that are already strings are passed through unchanged. This keeps
# the cell safe to re-run: x_train is overwritten with text below, and decoding
# that text a second time would look up single characters in index_to_word and
# raise KeyError.
decoded = [
    doc if isinstance(doc, str) else ' '.join(index_to_word[index] for index in doc)
    for doc in x_train
]

x_train = decoded
print(len(x_train))

8982


In [32]:
# Turn every integer-encoded test article back into a space-joined string,
# mirroring what is done for the training split.
# Documents that are already strings are passed through unchanged. This keeps
# the cell safe to re-run: x_test is overwritten with text below, and decoding
# that text a second time would look up single characters in index_to_word and
# raise KeyError.
decoded_test = [
    doc if isinstance(doc, str) else ' '.join(index_to_word[index] for index in doc)
    for doc in x_test
]

x_test = decoded_test
print(len(x_test))

2246


In [33]:
# Vectorization: document-term matrix (DTM) followed by TF-IDF weighting.

# Fit both transformers on the training articles only.
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(x_train)

tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_dtm)

# Re-use the fitted vocabulary and IDF weights for the test articles.
x_test_dtm = dtmvector.transform(x_test)
x_test_tfidf = tfidf_transformer.transform(x_test_dtm)

In [34]:
# Shape of the training TF-IDF matrix: (documents, vocabulary terms).
x_train_tfidf.shape

(8982, 4867)

In [35]:
# Shape of the test TF-IDF matrix; the column count must match the training matrix.
x_test_tfidf.shape

(2246, 4867)

In [36]:
# Show one decoded training article; with the smaller vocabulary cap more
# words are expected to appear as <unk>.
x_train[3]

"<sos> the farmers home administration the u s agriculture department's farm lending arm could lose about seven billion dlrs in outstanding principal on its severely <unk> borrowers or about one fourth of its farm loan portfolio the general accounting office gao said in remarks prepared for delivery to the senate agriculture committee brian <unk> senior associate director of gao also said that a preliminary analysis of proposed changes in <unk> financial <unk> standards indicated as many as one half of <unk> borrowers who received new loans from the agency in 1986 would be <unk> under the proposed system the agency has proposed <unk> <unk> credit using a variety of financial <unk> instead of <unk> <unk> on <unk> ability senate agriculture committee chairman <unk> <unk> d <unk> <unk> the proposed <unk> changes telling <unk> administrator <unk> clark at a hearing that they would mark a dramatic shift in the <unk> purpose away from being <unk> <unk> of last <unk> toward becoming a big cit

In [37]:
# Every name the original cell imported is still imported, but each only once.
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input, LSTM
from tensorflow.keras.models import Model, Sequential

# Fix TensorFlow's global seed before any layer is created so that the
# stochastic parts of the experiment (weight initialisation, dropout) are
# repeatable; 42 matches the random_state used for the LightGBM models.
tf.random.set_seed(42)

# Try changing the Dense layers below and compare the resulting scores.

# The input width is taken from the TF-IDF matrix of the current run
# (x_train_tfidf.shape[1]), so it follows whatever vocabulary cap was used.
inputs = Input(shape=(x_train_tfidf.shape[1],))

x = Dense(512, activation='relu')(inputs)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
outputs = Dense(46, activation='softmax')(x)  # one probability per class (46 classes)

dense_model = Model(inputs=inputs, outputs=outputs)

# The sparse categorical loss takes integer class ids rather than one-hot vectors.
dense_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
dense_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 4867)]            0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               2492416   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 46)                5934      
Total params: 2,564,014
Trainable params: 2,564,014
Non-trainable params: 0
_________________________________________________

In [39]:
# Heads-up from the original author: this step is slow (roughly 20 minutes).
# fit() is given a dense NumPy copy of the sparse TF-IDF matrix.
x_train_tfidf_dense = x_train_tfidf.toarray()

# Train for 5 epochs in mini-batches of 32, holding out 20% of the training
# rows for validation.
dense_model.fit(
    x=x_train_tfidf_dense,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7c57c6eed190>

In [40]:
# Densify the test matrix the same way the training matrix was densified
# before fit(). The model then receives the same input type at training and
# inference time, and the in-place sort_indices() workaround that sparse input
# required is no longer needed (x_test_tfidf is left unmodified).
x_test_tfidf_dense = x_test_tfidf.toarray()

# Class probabilities per article, then the most probable class id.
y_pred_proba = dense_model.predict(x_test_tfidf_dense)
y_pred = np.argmax(y_pred_proba, axis=1)

# Score the predictions; F1 is averaged over classes weighted by support.
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

In [41]:
# Report test accuracy and weighted F1, one value per line.
print(acc, f1, sep="\n")

0.8040961709706145
0.7948275735129886


# LightGBM — vocabulary capped with num_words=10000

In [42]:
# Reload the Reuters dataset with the vocabulary capped via num_words=10000
# and 20% of the articles held out as the test split.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000, test_split=0.2)

In [43]:
# Load the Reuters vocabulary as a dict mapping each word to its integer index.
word_index = reuters.get_word_index(path="reuters_word_index.json")


In [44]:

# Invert the vocabulary: every word id is shifted up by 3 so that the ids
# 0, 1 and 2 stay free for the special tokens registered afterwards.
index_to_word = {rank + 3: term for term, rank in word_index.items()}
index_to_word.update({0: "<pad>", 1: "<sos>", 2: "<unk>"})

In [47]:
# Turn every integer-encoded training article back into a space-joined string
# so that the text vectorizer can tokenize it.
# Documents that are already strings are passed through unchanged. This keeps
# the cell safe to re-run: x_train is overwritten with text below, and decoding
# that text a second time would look up single characters in index_to_word and
# raise KeyError.
decoded = [
    doc if isinstance(doc, str) else ' '.join(index_to_word[index] for index in doc)
    for doc in x_train
]

x_train = decoded
print(len(x_train))

8982


In [48]:
# Turn every integer-encoded test article back into a space-joined string,
# mirroring what is done for the training split.
# Documents that are already strings are passed through unchanged. This keeps
# the cell safe to re-run: x_test is overwritten with text below, and decoding
# that text a second time would look up single characters in index_to_word and
# raise KeyError.
decoded_test = [
    doc if isinstance(doc, str) else ' '.join(index_to_word[index] for index in doc)
    for doc in x_test
]

x_test = decoded_test
print(len(x_test))

2246


In [49]:
# Vectorization: document-term matrix (DTM) followed by TF-IDF weighting.

# Fit both transformers on the training articles only.
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(x_train)

tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_dtm)

# Re-use the fitted vocabulary and IDF weights for the test articles.
x_test_dtm = dtmvector.transform(x_test)
x_test_tfidf = tfidf_transformer.transform(x_test_dtm)


In [50]:
# Shape of the training TF-IDF matrix: (documents, vocabulary terms).
x_train_tfidf.shape

(8982, 9670)

In [51]:
# Shape of the test TF-IDF matrix; the column count must match the training matrix.
x_test_tfidf.shape

(2246, 9670)

In [52]:
# Show one decoded training article as a sanity check of the id-to-word mapping.
x_train[3]

"<sos> the farmers home administration the u s agriculture department's farm lending arm could lose about seven billion dlrs in outstanding principal on its severely <unk> borrowers or about one fourth of its farm loan portfolio the general accounting office gao said in remarks prepared for delivery to the senate agriculture committee brian crowley senior associate director of gao also said that a preliminary analysis of proposed changes in <unk> financial eligibility standards indicated as many as one half of <unk> borrowers who received new loans from the agency in 1986 would be <unk> under the proposed system the agency has proposed evaluating <unk> credit using a variety of financial ratios instead of relying solely on <unk> ability senate agriculture committee chairman patrick leahy d vt <unk> the proposed eligibility changes telling <unk> administrator <unk> clark at a hearing that they would mark a dramatic shift in the agency's purpose away from being farmers' lender of last re

In [53]:
# Library import for the LightGBM model.
import lightgbm as lgb

# Gradient-boosted tree classifier for the Reuters topic labels.
#   objective     - 'multiclass', because this is multi-class classification
#   num_class     - total number of classes (the Reuters dataset has 46)
#   n_estimators  - number of trees used for boosting
#   learning_rate - how much each boosting step is shrunk
#   num_leaves    - maximum number of leaves a single tree may have
#   random_state  - fixed seed passed to the estimator
lgbm_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=46,
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
)

# Echo the configured estimator under a header line.
print("LightGBM 모델 정의 완료:", lgbm_model, sep="\n")

LightGBM 모델 정의 완료:
LGBMClassifier(learning_rate=0.05, n_estimators=1000, num_class=46,
               objective='multiclass', random_state=42)


In [54]:
# Train the LightGBM classifier on the sparse TF-IDF features.
# No verbose argument is passed to fit().
lgbm_model.fit(X=x_train_tfidf, y=y_train)

print("\nLightGBM 모델 학습 완료!")


LightGBM 모델 학습 완료!


In [55]:
# Predict a class id for every test article.
y_pred_lgbm = lgbm_model.predict(x_test_tfidf)

# Accuracy and F1; 'weighted' averaging is used because the task is multi-class.
# NOTE(review): the recorded run of this cell scored about 0.18 accuracy, far
# below the ~0.80 reported by the other experiments in this notebook —
# presumably the boosting run became unstable or badly overfit; TODO confirm
# by monitoring a validation metric during training.
acc_lgbm, f1_lgbm = (
    accuracy_score(y_test, y_pred_lgbm),
    f1_score(y_test, y_pred_lgbm, average='weighted'),
)

print(
    f"✅ LightGBM Accuracy: {acc_lgbm:.4f}",
    f"✅ LightGBM F1-score: {f1_lgbm:.4f}",
    sep="\n",
)

✅ LightGBM Accuracy: 0.1790
✅ LightGBM F1-score: 0.1893


# LightGBM — full vocabulary (num_words=None)

In [56]:
# Exercise note: try a num_words setting other than 5000 here.
# This run passes num_words=None (no vocabulary cap) and holds out 20% of the
# articles as the test split.

(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)

In [57]:
# Load the Reuters vocabulary as a dict mapping each word to its integer index.
word_index = reuters.get_word_index(path="reuters_word_index.json")

In [58]:

# Invert the vocabulary: every word id is shifted up by 3 so that the ids
# 0, 1 and 2 stay free for the special tokens registered afterwards.
index_to_word = {rank + 3: term for term, rank in word_index.items()}
index_to_word.update({0: "<pad>", 1: "<sos>", 2: "<unk>"})

In [59]:
# Turn every integer-encoded training article back into a space-joined string
# so that the text vectorizer can tokenize it.
# Documents that are already strings are passed through unchanged. This keeps
# the cell safe to re-run: x_train is overwritten with text below, and decoding
# that text a second time would look up single characters in index_to_word and
# raise KeyError.
decoded = [
    doc if isinstance(doc, str) else ' '.join(index_to_word[index] for index in doc)
    for doc in x_train
]

x_train = decoded
print(len(x_train))

8982


In [60]:
# Turn every integer-encoded test article back into a space-joined string,
# mirroring what is done for the training split.
# Documents that are already strings are passed through unchanged. This keeps
# the cell safe to re-run: x_test is overwritten with text below, and decoding
# that text a second time would look up single characters in index_to_word and
# raise KeyError.
decoded_test = [
    doc if isinstance(doc, str) else ' '.join(index_to_word[index] for index in doc)
    for doc in x_test
]

x_test = decoded_test
print(len(x_test))

2246


In [61]:
# Vectorization: document-term matrix (DTM) followed by TF-IDF weighting.

# Fit both transformers on the training articles only.
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(x_train)

tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_dtm)

# Re-use the fitted vocabulary and IDF weights for the test articles.
x_test_dtm = dtmvector.transform(x_test)
x_test_tfidf = tfidf_transformer.transform(x_test_dtm)

In [62]:
# Shape of the training TF-IDF matrix: (documents, vocabulary terms).
x_train_tfidf.shape

(8982, 26506)

In [63]:
# Shape of the test TF-IDF matrix; the column count must match the training matrix.
x_test_tfidf.shape

(2246, 26506)

In [64]:
# Show one decoded training article as a sanity check of the id-to-word mapping.
x_train[3]

"<sos> the farmers home administration the u s agriculture department's farm lending arm could lose about seven billion dlrs in outstanding principal on its severely delinquent borrowers or about one fourth of its farm loan portfolio the general accounting office gao said in remarks prepared for delivery to the senate agriculture committee brian crowley senior associate director of gao also said that a preliminary analysis of proposed changes in fmha's financial eligibility standards indicated as many as one half of fmha borrowers who received new loans from the agency in 1986 would be ineligible under the proposed system the agency has proposed evaluating applicants' credit using a variety of financial ratios instead of relying solely on cashflow ability senate agriculture committee chairman patrick leahy d vt slammed the proposed eligibility changes telling fmha administrator vance clark at a hearing that they would mark a dramatic shift in the agency's purpose away from being farmer

In [65]:
# Library import for the LightGBM model.
import lightgbm as lgb

# Gradient-boosted tree classifier for the Reuters topic labels.
#   objective     - 'multiclass', because this is multi-class classification
#   num_class     - total number of classes (the Reuters dataset has 46)
#   n_estimators  - number of trees used for boosting
#   learning_rate - how much each boosting step is shrunk
#   num_leaves    - maximum number of leaves a single tree may have
#   random_state  - fixed seed passed to the estimator
lgbm_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=46,
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
)

# Echo the configured estimator under a header line.
print("LightGBM 모델 정의 완료:", lgbm_model, sep="\n")

LightGBM 모델 정의 완료:
LGBMClassifier(learning_rate=0.05, n_estimators=1000, num_class=46,
               objective='multiclass', random_state=42)


In [66]:
# Train the LightGBM classifier on the sparse TF-IDF features.
# No verbose argument is passed to fit().
lgbm_model.fit(X=x_train_tfidf, y=y_train)

print("\nLightGBM 모델 학습 완료!")


LightGBM 모델 학습 완료!


In [67]:
# Predict a class id for every test article.
y_pred_lgbm = lgbm_model.predict(x_test_tfidf)

# Accuracy and F1; 'weighted' averaging is used because the task is multi-class.
acc_lgbm, f1_lgbm = (
    accuracy_score(y_test, y_pred_lgbm),
    f1_score(y_test, y_pred_lgbm, average='weighted'),
)

print(
    f"✅ LightGBM Accuracy: {acc_lgbm:.4f}",
    f"✅ LightGBM F1-score: {f1_lgbm:.4f}",
    sep="\n",
)

✅ LightGBM Accuracy: 0.8001
✅ LightGBM F1-score: 0.7904
