- NLTK 영화 리뷰 데이터로 BERT 감성 분석 모델 완성
    - 데이터 로드 및 분할
    - BERT 토큰화
    - PyTorch Dataset 구성
    - 모델 학습
    - 평가

#### 모듈 및 사용할 라이브러리

In [None]:
# %pip install tf-keras

In [None]:
# %pip install datasets

In [None]:
# 라이브러리 불러오기
import warnings
warnings.filterwarnings('ignore')

import nltk
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import datasets

#### 데이터 셋 준비

In [None]:
# Download the NLTK movie_reviews corpus.
# Labels: pos -> 1, neg -> 0
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

# Fetch the file ids once and reuse them so reviews and labels are built
# from the same list; `file_id` avoids shadowing the builtin `id`.
ids = movie_reviews.fileids()
reviews = [movie_reviews.raw(file_id) for file_id in ids]
# Each file's first category ('pos' or 'neg') is used as its label source.
categories = [movie_reviews.categories(file_id)[0] for file_id in ids]
labels = [1 if category == 'pos' else 0 for category in categories]

In [None]:
# Sanity check: number of reviews and number of labels, as a tuple.
tuple(len(seq) for seq in (reviews, labels))

In [None]:
# Split the data: 20% held out for testing, stratified by label, fixed seed.
from sklearn.model_selection import train_test_split
train_text, test_text, train_label, test_label = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [None]:
# Tokenizer
# NOTE: the constant's spelling is kept as-is because the model-loading cell
# refers to it by this exact name.
BERT_MOEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_MOEL_NAME)

# Tokenize the train / test texts: truncate to at most 512 tokens, pad, and
# return PyTorch tensors.
train_encodings = tokenizer(
    train_text,
    truncation=True,
    padding=True,
    return_tensors='pt',
    max_length=512,
)
# Fix: encode the held-out test texts. The original tokenized `train_text`
# a second time, so the "test" encodings were really training data.
test_encodings = tokenizer(
    test_text,
    truncation=True,
    padding=True,
    return_tensors='pt',
    max_length=512,
)
train_encodings['input_ids'].shape, test_encodings['input_ids'].shape

In [None]:
# Build a torch Dataset
class MovieReciewDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an object whose ``[idx]``
            element is a tensor (it must support ``.items()``).
        labels: Sequence of labels, index-aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample: every encoding field at ``idx`` plus 'labels'."""
        sample = {}
        for name, tensor in self.encodings.items():
            # Copy the row so the sample does not share storage with the
            # stored encodings.
            sample[name] = tensor[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
# Wrap each split's encodings together with that split's labels.
train_dataset = MovieReciewDataset(train_encodings, train_label)
# Fix: the test dataset must use test_encodings. The original passed
# train_encodings here, pairing training inputs with test labels.
test_dataset = MovieReciewDataset(test_encodings, test_label)
print(f'훈련 샘플 수 : {len(train_dataset)}')
print(f'테스트 샘플 수 : {len(test_dataset)}')

#### 모델 로드 및 학습

In [None]:
# Load the pretrained model, configured for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MOEL_NAME,
    num_labels=2,
)

# Count all parameters and the subset that requires gradients.
total_params = 0
trainable_params = 0
for param in model.parameters():
    count = param.numel()
    total_params += count
    if param.requires_grad:
        trainable_params += count

print(f'파라미터 수 : {total_params}')
print(f'학습 가능한 파라미터 : {trainable_params}')

In [None]:
# %pip install evaluate

In [None]:
# Evaluation metric
import evaluate

accuracy = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        The result of ``accuracy.compute`` for the argmax predictions.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted, references=references)

In [None]:
from transformers import TrainingArguments

In [None]:
# %pip install transformers[torch]

In [None]:
import accelerate

print(accelerate.__version__)

# Training configuration: evaluate and save once per epoch, and reload the
# best checkpoint when training ends.
training_config = dict(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to='none',  # turn off automatic W&B / TensorBoard reporting
)
training_args = TrainingArguments(**training_config)

# Wire the model, datasets and metric function into the Trainer.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

# Echo the effective settings (learning_rate is the library default here,
# since it is not set above).
print(f'에포크 : {training_args.num_train_epochs}')
print(f'배치크기 : {training_args.per_device_train_batch_size}')
print(f'학습률 : {training_args.learning_rate}')

In [None]:

# Train the model, then print the runtime and loss from the returned metrics.
train_result = trainer.train()
train_metrics = train_result.metrics
print(f'총 학습시간 : {train_metrics["train_runtime"]}')
print(f'최종손실 : {train_metrics["train_loss"]}')
     

In [None]:

# Evaluate on the Trainer's evaluation dataset and print the accuracy.
eval_result = trainer.evaluate()
print(f'테스트 정확도 : {eval_result["eval_accuracy"]}')
# Run prediction on the test dataset; the predicted class is the argmax.
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=-1)
# Classification report
# Fix: the train/test split binds `test_label`; the original referenced the
# undefined name `test_labels`, which raised NameError.
print(classification_report(test_label, pred_labels))

In [None]:

# Hand-written sample sentences (not from the corpus) used for a quick
# qualitative check of the model's predictions.
test_reviews = [
    "This movie is absolutely fantastic! The plot is engaging and the acting is superb.",
    "Terrible film. Waste of time and money. Would not recommend to anyone.",
    "It's an okay movie. Nothing special but not terrible either.",
    "Brilliant masterpiece! One of the best films I've ever seen in my life.",
    "Boring and predictable. I fell asleep halfway through."
]


In [None]:


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)

In [None]:


# Run inference on each sample sentence and print the predicted class with
# the positive / negative probabilities.
model.eval()
for review in test_reviews:
    inputs = tokenizer(
        review,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Move the inputs to the same device the model was moved to.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
        # Single-sentence batch: take row 0 of the softmax output.
        probs = torch.softmax(logits, dim=-1)[0]
        pred_class = torch.argmax(probs).item()
    print(f'문장 : {review}')
    print(f'예측 : {pred_class}')
    print(f'긍정 : {probs[1].item():.4f}')
    print(f'부정 : {probs[0].item():.4f}\n')