In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the train and test rating files. read_table parses tab-separated
# text; the separator is spelled out here so the file format is visible.
train_data = pd.read_table('ratings_train.txt', sep='\t')
test_data = pd.read_table('ratings_test.txt', sep='\t')

In [3]:
# Drop every row that has at least one missing value in either split.
# how='any' is dropna's default, written out for readability.
train_data = train_data.dropna(how='any')
test_data = test_data.dropna(how='any')

In [4]:
from transformers import BertTokenizer, TFBertForSequenceClassification

In [5]:
# One checkpoint name is shared by the tokenizer and the classifier.
model_name = "klue/bert-base"
tokenizer = BertTokenizer.from_pretrained(model_name)

# from_pt=True loads the PyTorch checkpoint into the TF model class. The
# 2-label classification head is not in that checkpoint, so it is newly
# initialised and has to be trained before the model is useful.
model = TFBertForSequenceClassification.from_pretrained(
    model_name,
    from_pt=True,
    num_labels=2,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from datasets import Dataset

# Wrap the text and label columns of each split in a Hugging Face Dataset.
# preserve_index=False: after the earlier dropna() the pandas index is no
# longer a contiguous range, and from_pandas would otherwise carry it along
# as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(
    train_data[['document', 'label']],
    preserve_index=False,
)
test_dataset = Dataset.from_pandas(
    test_data[['document', 'label']],
    preserve_index=False,
)

In [7]:
def preprocess_function(examples, max_length=128):
    """Tokenize the 'document' texts of a batch to a fixed sequence length.

    Args:
        examples: Mapping with a 'document' entry holding the texts to
            tokenize (a batch of rows when used with ``map(batched=True)``).
        max_length: Length every sequence is truncated and padded to.
            Without an explicit value, ``padding='max_length'`` pads to the
            tokenizer's model maximum, which makes every example far longer
            than needed and slows training accordingly.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch;
        token type ids are not requested.
    """
    return tokenizer(
        examples['document'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_token_type_ids=False,
    )

# Apply tokenization to both splits, one batch of rows at a time.
train_dataset = train_dataset.map(function=preprocess_function, batched=True)
test_dataset = test_dataset.map(function=preprocess_function, batched=True)

  0%|          | 0/150 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [8]:
import tensorflow as tf

# Convert both splits into batched datasets for TensorFlow training.
# Only the training split is shuffled; the test split keeps its order.
train_tf_dataset = train_dataset.to_tf_dataset(
    batch_size=16,
    shuffle=True,
    columns=['input_ids', 'attention_mask'],
    label_cols='label',
)

test_tf_dataset = test_dataset.to_tf_dataset(
    batch_size=16,
    shuffle=False,
    columns=['input_ids', 'attention_mask'],
    label_cols='label',
)

In [10]:
from transformers import create_optimizer

num_epochs = 2
batch_size = 16  # should match the batch_size passed to to_tf_dataset
learning_rate = 2e-5
weight_decay_rate = 0.01

# Take the step count from the batched training pipeline itself instead of
# int(len(train_dataset) / batch_size). len() of the batched dataset is the
# number of batches run per epoch, so the schedule length stays correct
# whether or not the final partial batch is dropped, and it cannot drift
# from the batch size actually used to build train_tf_dataset.
steps_per_epoch = len(train_tf_dataset)
num_train_steps = steps_per_epoch * num_epochs
num_warmup_steps = int(0.1 * num_train_steps)  # first 10% of all steps

# Create the optimizer together with its learning-rate schedule.
optimizer, schedule = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    weight_decay_rate=weight_decay_rate
)

# Loss and evaluation metric. from_logits=True makes the loss treat the
# predictions as unnormalised scores rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy("accuracy")]

# Compile the model.
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# Train for num_epochs, evaluating on the test split after each epoch.
model.fit(
    x=train_tf_dataset,
    epochs=num_epochs,
    validation_data=test_tf_dataset,
)

Epoch 1/2
Epoch 2/2