In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Load the raw data.
data = pd.read_excel('data.xlsx')

# Rows without a target label cannot be used for supervised training, so they
# are removed before anything is encoded. This also keeps the number of
# encoded classes equal to the number of distinct non-missing labels.
data = data.dropna(subset=['업종']).reset_index(drop=True)

# Tokenizer that matches the pretrained checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Build a single text column from company name and brand name.
# Missing cells are replaced with an empty string first: `str + NaN` would
# produce NaN, and the tokenizer only accepts strings.
company = data['기업명'].fillna('').astype(str)
brand = data['브랜드명'].fillna('').astype(str)
data['texts'] = company + ' ' + brand

# Tokenize the combined text; sequences are padded to the longest one in the
# batch and truncated at 256 tokens.
inputs = tokenizer(
    data['texts'].to_list(),
    return_tensors='tf',
    truncation=True,
    padding=True,
    max_length=256,
)

# Convert the industry label into integer class ids.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(data['업종'])

# Split into train and hold-out sets. All three arrays go through ONE call so
# they are guaranteed to be permuted identically, instead of relying on two
# separate calls happening to use the same seed.
(
    input_ids_train,
    input_ids_test,
    attention_mask_train,
    attention_mask_test,
    labels_train,
    labels_test,
) = train_test_split(
    inputs['input_ids'].numpy(),
    inputs['attention_mask'].numpy(),
    labels,
    test_size=0.2,
    random_state=42,
)

# Wrap the splits in tf.data.Dataset objects. Features are passed as a dict
# keyed by argument name so the model does not depend on positional order.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (
        {'input_ids': input_ids_train, 'attention_mask': attention_mask_train},
        labels_train,
    )
).batch(8)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (
        {'input_ids': input_ids_test, 'attention_mask': attention_mask_test},
        labels_test,
    )
).batch(8)

# Load the pretrained BERT encoder with a new classification head.
# The head size is taken from the fitted encoder so it always matches the
# label ids actually produced; `nunique()` skips NaN by default and could
# disagree with the encoder.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(label_encoder.classes_),
)

# Optimizer, loss and metric. The model outputs raw logits, hence
# `from_logits=True`.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Callbacks: stop once validation loss has not improved for 3 epochs, and keep
# the best weights on disk.
# `restore_best_weights=True` puts the best epoch's weights back into the
# in-memory model when training stops early; without it the model would keep
# the weights of the last (worse) epoch and later predictions would use those.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
checkpoint_path = "checkpoints/cp.ckpt"
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,  # only save model weights
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Train the model. `epochs=100` is only an upper bound; early stopping is
# expected to end training sooner.
history = model.fit(
    train_dataset,
    epochs=100,
    validation_data=test_dataset,
    callbacks=[early_stopping, model_checkpoint],
)


# Visualize the training curves: one figure for loss, one for accuracy.
import matplotlib.pyplot as plt

# Each entry: (training key, validation key, figure title, y-axis label).
curve_specs = (
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
)

for train_key, val_key, chart_title, y_label in curve_specs:
    # Training curve first, validation curve second, matching the legend order.
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(chart_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.show()

# Run the trained model on a single hand-written sample.
test_sample = "네이버페이결제"
test_input = tokenizer(
    test_sample,
    return_tensors='tf',
    padding='max_length',
    max_length=256,
    truncation=True,
)

# Convert the encoded inputs to numpy arrays.
test_input_ids = test_input['input_ids'].numpy()
test_attention_mask = test_input['attention_mask'].numpy()

# Build a one-row dataset for prediction. It gets its own name so the
# hold-out `test_dataset` used for validation is not overwritten.
sample_dataset = tf.data.Dataset.from_tensor_slices(
    {'input_ids': test_input_ids, 'attention_mask': test_attention_mask}
).batch(1)

# The prediction output exposes per-class logits; the arg-max is the class id.
predictions = model.predict(sample_dataset)
predicted_label = np.argmax(predictions.logits, axis=1)[0]

# Map the class id back to the original industry label and show it.
predicted_category = label_encoder.inverse_transform([predicted_label])
print(predicted_category)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/100
  35/1219 [..............................] - ETA: 37:53 - loss: 1.7199 - accuracy: 0.5214

KeyboardInterrupt: 