In [2]:
#================================================================
# ▶ 모듈불러오기
#================================================================
# 시스템
import os
import sys
from tqdm import tqdm

# 데이터분석
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# 파이토치
import torch
from torch.utils.data import Dataset, DataLoader

# 데이터셋
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# 모델
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# 평가지표
from sklearn.metrics import f1_score


# 데이터 분할
from sklearn.model_selection import train_test_split


# 유틸
import gc
from tqdm.auto import tqdm
from tqdm import tqdm
import random

# 기타 추가(옵션)
import copy
import zipfile
from glob import glob
import time

# NLTK BLEU 점수 계산을 위한 설정
import nltk
from nltk.translate.bleu_score import corpus_bleu

# 기타
import warnings
warnings.filterwarnings(action='ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#===============================================================================
# ▶ 전역설정
#===============================================================================
TOKEN = '---COPY YOUR TOKEN---'
MODEL_NAME = "bert-base-multilingual-cased"

MODEL = 'BERT'
VERSION = 'STEP-0'

MAX_LEN = 8
# MAX_LEN = 128
# MAX_LEN = 250 
BATCH_SIZE =4
EPOCHS = 1
LEARNING_RATE = 5e-5
# LEARNING_RATE = 1e-3
SEED = 2025
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
#===============================================================================
# ▶ 시드설정
#===============================================================================
def set_seed(SEED):
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)

# deterministic 설정
deterministic = True
if deterministic:
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark     = False

set_seed(SEED)

In [5]:
#===============================================================================
# ▶ 작업환경
#===============================================================================
IS_GOOGLE = True if 'google.colab'                 in sys.modules   else False
IS_KAGGLE = True if 'KAGGLE_KERNEL_RUN_TYPE'       in os.environ    else False
IS_LOCAL  = True if  not (IS_GOOGLE or IS_KAGGLE)                   else False

In [6]:
#===============================================================================
# ▶ 데이터 패스설정
#===============================================================================

# 코랩용
if IS_GOOGLE:
    base_path = '/content/drive/MyDrive/프로젝트/2023 교원그룹 AI OCR 챌린지/data/'

# 캐글용
elif IS_KAGGLE :
    base_path = '/kaggle/input/kor-eng/kr-en/'
# 로컬용
elif IS_LOCAL :
    base_path = './data/'

# 트레인&테스트 경로
train_file = base_path+'2_대화체_변환추출_한-영.csv'

# 기타 데이터 경로
sample_submission    = base_path+'submission.csv'

# 결과 저장 경로
result_path = './res/'


In [7]:
#===============================================================================
# ▶ 데이터 불러오기
#===============================================================================
df = pd.read_csv(train_file,encoding='utf-8-sig')

# 훈련 검증 데이터 분할
train_df, val_df = train_test_split(df, test_size=0.2, random_state=SEED)

In [8]:
#===============================================================================
# ▶ 데이터셋 클래스 정의
#===============================================================================
class TranslationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=MAX_LEN):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        source_text = self.data.iloc[index]['영어']
        target_text = self.data.iloc[index]['한국어']

        source_encoding = self.tokenizer(source_text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        target_encoding = self.tokenizer(target_text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)

        return {
            'input_ids': source_encoding['input_ids'].squeeze(),
            'attention_mask': source_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

In [9]:
#===============================================================================
# ▶ 모델 및 토크나이저 초기화
#===============================================================================
model_name = "facebook/bart-base"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [10]:
#===============================================================================
# ▶ 데이터셋 및 데이터로더 생성
#===============================================================================
train_dataset = TranslationDataset(train_df, tokenizer)
val_dataset = TranslationDataset(val_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [11]:
#===============================================================================
# ▶ 학습툴 준비
#===============================================================================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.lm_head.parameters(), lr=LEARNING_RATE)

In [12]:
#===============================================================================
# ▶ 학습함수정의
#===============================================================================
# 학습용
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

# 평가용
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    predictions = []
    references = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            generated = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=MAX_LEN)
            decoded_preds = tokenizer.batch_decode(generated, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            predictions.extend([pred.split() for pred in decoded_preds])
            references.extend([[ref.split()] for ref in decoded_labels])

    bleu_score = corpus_bleu(references, predictions)
    return total_loss / len(dataloader), bleu_score

In [14]:
#===============================================================================
# ▶ 학습 및 평가 루프
#===============================================================================
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train(model, train_loader, optimizer, device)
    val_loss, bleu_score = evaluate(model, val_loader, device)
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, BLEU Score: {bleu_score:.4f}")

# 모델 저장
torch.save(model.state_dict(), './best_models/translation_model.pth')

Epoch 1/1


Training: 100%|██████████| 20000/20000 [17:10<00:00, 19.42it/s]
Evaluating: 100%|██████████| 5000/5000 [11:45<00:00,  7.09it/s]


Train Loss: 1.9707, Validation Loss: 1.4069, BLEU Score: 0.0000


In [13]:
#===============================================================================
# ▶  예측함수정의
#===============================================================================
def translate(text, model, tokenizer, device):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", max_length=MAX_LEN, truncation=True, padding='max_length')
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=MAX_LEN)
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)



In [15]:
# 체크포인트 불러오기
# model.load_state_dict(torch.load('./best_models/translation_model.pth'))


RuntimeError: Error(s) in loading state_dict for BartForConditionalGeneration:
	Missing key(s) in state_dict: "final_logits_bias", "model.shared.weight", "model.encoder.embed_tokens.weight", "model.encoder.embed_positions.weight", "model.encoder.layers.0.self_attn.k_proj.weight", "model.encoder.layers.0.self_attn.k_proj.bias", "model.encoder.layers.0.self_attn.v_proj.weight", "model.encoder.layers.0.self_attn.v_proj.bias", "model.encoder.layers.0.self_attn.q_proj.weight", "model.encoder.layers.0.self_attn.q_proj.bias", "model.encoder.layers.0.self_attn.out_proj.weight", "model.encoder.layers.0.self_attn.out_proj.bias", "model.encoder.layers.0.self_attn_layer_norm.weight", "model.encoder.layers.0.self_attn_layer_norm.bias", "model.encoder.layers.0.fc1.weight", "model.encoder.layers.0.fc1.bias", "model.encoder.layers.0.fc2.weight", "model.encoder.layers.0.fc2.bias", "model.encoder.layers.0.final_layer_norm.weight", "model.encoder.layers.0.final_layer_norm.bias", "model.encoder.layers.1.self_attn.k_proj.weight", "model.encoder.layers.1.self_attn.k_proj.bias", "model.encoder.layers.1.self_attn.v_proj.weight", "model.encoder.layers.1.self_attn.v_proj.bias", "model.encoder.layers.1.self_attn.q_proj.weight", "model.encoder.layers.1.self_attn.q_proj.bias", "model.encoder.layers.1.self_attn.out_proj.weight", "model.encoder.layers.1.self_attn.out_proj.bias", "model.encoder.layers.1.self_attn_layer_norm.weight", "model.encoder.layers.1.self_attn_layer_norm.bias", "model.encoder.layers.1.fc1.weight", "model.encoder.layers.1.fc1.bias", "model.encoder.layers.1.fc2.weight", "model.encoder.layers.1.fc2.bias", "model.encoder.layers.1.final_layer_norm.weight", "model.encoder.layers.1.final_layer_norm.bias", "model.encoder.layers.2.self_attn.k_proj.weight", "model.encoder.layers.2.self_attn.k_proj.bias", "model.encoder.layers.2.self_attn.v_proj.weight", "model.encoder.layers.2.self_attn.v_proj.bias", "model.encoder.layers.2.self_attn.q_proj.weight", "model.encoder.layers.2.self_attn.q_proj.bias", "model.encoder.layers.2.self_attn.out_proj.weight", "model.encoder.layers.2.self_attn.out_proj.bias", "model.encoder.layers.2.self_attn_layer_norm.weight", "model.encoder.layers.2.self_attn_layer_norm.bias", "model.encoder.layers.2.fc1.weight", "model.encoder.layers.2.fc1.bias", "model.encoder.layers.2.fc2.weight", "model.encoder.layers.2.fc2.bias", "model.encoder.layers.2.final_layer_norm.weight", "model.encoder.layers.2.final_layer_norm.bias", "model.encoder.layers.3.self_attn.k_proj.weight", "model.encoder.layers.3.self_attn.k_proj.bias", "model.encoder.layers.3.self_attn.v_proj.weight", "model.encoder.layers.3.self_attn.v_proj.bias", "model.encoder.layers.3.self_attn.q_proj.weight", "model.encoder.layers.3.self_attn.q_proj.bias", "model.encoder.layers.3.self_attn.out_proj.weight", "model.encoder.layers.3.self_attn.out_proj.bias", "model.encoder.layers.3.self_attn_layer_norm.weight", "model.encoder.layers.3.self_attn_layer_norm.bias", "model.encoder.layers.3.fc1.weight", "model.encoder.layers.3.fc1.bias", "model.encoder.layers.3.fc2.weight", "model.encoder.layers.3.fc2.bias", "model.encoder.layers.3.final_layer_norm.weight", "model.encoder.layers.3.final_layer_norm.bias", "model.encoder.layers.4.self_attn.k_proj.weight", "model.encoder.layers.4.self_attn.k_proj.bias", "model.encoder.layers.4.self_attn.v_proj.weight", "model.encoder.layers.4.self_attn.v_proj.bias", "model.encoder.layers.4.self_attn.q_proj.weight", "model.encoder.layers.4.self_attn.q_proj.bias", "model.encoder.layers.4.self_attn.out_proj.weight", "model.encoder.layers.4.self_attn.out_proj.bias", "model.encoder.layers.4.self_attn_layer_norm.weight", "model.encoder.layers.4.self_attn_layer_norm.bias", "model.encoder.layers.4.fc1.weight", "model.encoder.layers.4.fc1.bias", "model.encoder.layers.4.fc2.weight", "model.encoder.layers.4.fc2.bias", "model.encoder.layers.4.final_layer_norm.weight", "model.encoder.layers.4.final_layer_norm.bias", "model.encoder.layers.5.self_attn.k_proj.weight", "model.encoder.layers.5.self_attn.k_proj.bias", "model.encoder.layers.5.self_attn.v_proj.weight", "model.encoder.layers.5.self_attn.v_proj.bias", "model.encoder.layers.5.self_attn.q_proj.weight", "model.encoder.layers.5.self_attn.q_proj.bias", "model.encoder.layers.5.self_attn.out_proj.weight", "model.encoder.layers.5.self_attn.out_proj.bias", "model.encoder.layers.5.self_attn_layer_norm.weight", "model.encoder.layers.5.self_attn_layer_norm.bias", "model.encoder.layers.5.fc1.weight", "model.encoder.layers.5.fc1.bias", "model.encoder.layers.5.fc2.weight", "model.encoder.layers.5.fc2.bias", "model.encoder.layers.5.final_layer_norm.weight", "model.encoder.layers.5.final_layer_norm.bias", "model.encoder.layernorm_embedding.weight", "model.encoder.layernorm_embedding.bias", "model.decoder.embed_tokens.weight", "model.decoder.embed_positions.weight", "model.decoder.layers.0.self_attn.k_proj.weight", "model.decoder.layers.0.self_attn.k_proj.bias", "model.decoder.layers.0.self_attn.v_proj.weight", "model.decoder.layers.0.self_attn.v_proj.bias", "model.decoder.layers.0.self_attn.q_proj.weight", "model.decoder.layers.0.self_attn.q_proj.bias", "model.decoder.layers.0.self_attn.out_proj.weight", "model.decoder.layers.0.self_attn.out_proj.bias", "model.decoder.layers.0.self_attn_layer_norm.weight", "model.decoder.layers.0.self_attn_layer_norm.bias", "model.decoder.layers.0.encoder_attn.k_proj.weight", "model.decoder.layers.0.encoder_attn.k_proj.bias", "model.decoder.layers.0.encoder_attn.v_proj.weight", "model.decoder.layers.0.encoder_attn.v_proj.bias", "model.decoder.layers.0.encoder_attn.q_proj.weight", "model.decoder.layers.0.encoder_attn.q_proj.bias", "model.decoder.layers.0.encoder_attn.out_proj.weight", "model.decoder.layers.0.encoder_attn.out_proj.bias", "model.decoder.layers.0.encoder_attn_layer_norm.weight", "model.decoder.layers.0.encoder_attn_layer_norm.bias", "model.decoder.layers.0.fc1.weight", "model.decoder.layers.0.fc1.bias", "model.decoder.layers.0.fc2.weight", "model.decoder.layers.0.fc2.bias", "model.decoder.layers.0.final_layer_norm.weight", "model.decoder.layers.0.final_layer_norm.bias", "model.decoder.layers.1.self_attn.k_proj.weight", "model.decoder.layers.1.self_attn.k_proj.bias", "model.decoder.layers.1.self_attn.v_proj.weight", "model.decoder.layers.1.self_attn.v_proj.bias", "model.decoder.layers.1.self_attn.q_proj.weight", "model.decoder.layers.1.self_attn.q_proj.bias", "model.decoder.layers.1.self_attn.out_proj.weight", "model.decoder.layers.1.self_attn.out_proj.bias", "model.decoder.layers.1.self_attn_layer_norm.weight", "model.decoder.layers.1.self_attn_layer_norm.bias", "model.decoder.layers.1.encoder_attn.k_proj.weight", "model.decoder.layers.1.encoder_attn.k_proj.bias", "model.decoder.layers.1.encoder_attn.v_proj.weight", "model.decoder.layers.1.encoder_attn.v_proj.bias", "model.decoder.layers.1.encoder_attn.q_proj.weight", "model.decoder.layers.1.encoder_attn.q_proj.bias", "model.decoder.layers.1.encoder_attn.out_proj.weight", "model.decoder.layers.1.encoder_attn.out_proj.bias", "model.decoder.layers.1.encoder_attn_layer_norm.weight", "model.decoder.layers.1.encoder_attn_layer_norm.bias", "model.decoder.layers.1.fc1.weight", "model.decoder.layers.1.fc1.bias", "model.decoder.layers.1.fc2.weight", "model.decoder.layers.1.fc2.bias", "model.decoder.layers.1.final_layer_norm.weight", "model.decoder.layers.1.final_layer_norm.bias", "model.decoder.layers.2.self_attn.k_proj.weight", "model.decoder.layers.2.self_attn.k_proj.bias", "model.decoder.layers.2.self_attn.v_proj.weight", "model.decoder.layers.2.self_attn.v_proj.bias", "model.decoder.layers.2.self_attn.q_proj.weight", "model.decoder.layers.2.self_attn.q_proj.bias", "model.decoder.layers.2.self_attn.out_proj.weight", "model.decoder.layers.2.self_attn.out_proj.bias", "model.decoder.layers.2.self_attn_layer_norm.weight", "model.decoder.layers.2.self_attn_layer_norm.bias", "model.decoder.layers.2.encoder_attn.k_proj.weight", "model.decoder.layers.2.encoder_attn.k_proj.bias", "model.decoder.layers.2.encoder_attn.v_proj.weight", "model.decoder.layers.2.encoder_attn.v_proj.bias", "model.decoder.layers.2.encoder_attn.q_proj.weight", "model.decoder.layers.2.encoder_attn.q_proj.bias", "model.decoder.layers.2.encoder_attn.out_proj.weight", "model.decoder.layers.2.encoder_attn.out_proj.bias", "model.decoder.layers.2.encoder_attn_layer_norm.weight", "model.decoder.layers.2.encoder_attn_layer_norm.bias", "model.decoder.layers.2.fc1.weight", "model.decoder.layers.2.fc1.bias", "model.decoder.layers.2.fc2.weight", "model.decoder.layers.2.fc2.bias", "model.decoder.layers.2.final_layer_norm.weight", "model.decoder.layers.2.final_layer_norm.bias", "model.decoder.layers.3.self_attn.k_proj.weight", "model.decoder.layers.3.self_attn.k_proj.bias", "model.decoder.layers.3.self_attn.v_proj.weight", "model.decoder.layers.3.self_attn.v_proj.bias", "model.decoder.layers.3.self_attn.q_proj.weight", "model.decoder.layers.3.self_attn.q_proj.bias", "model.decoder.layers.3.self_attn.out_proj.weight", "model.decoder.layers.3.self_attn.out_proj.bias", "model.decoder.layers.3.self_attn_layer_norm.weight", "model.decoder.layers.3.self_attn_layer_norm.bias", "model.decoder.layers.3.encoder_attn.k_proj.weight", "model.decoder.layers.3.encoder_attn.k_proj.bias", "model.decoder.layers.3.encoder_attn.v_proj.weight", "model.decoder.layers.3.encoder_attn.v_proj.bias", "model.decoder.layers.3.encoder_attn.q_proj.weight", "model.decoder.layers.3.encoder_attn.q_proj.bias", "model.decoder.layers.3.encoder_attn.out_proj.weight", "model.decoder.layers.3.encoder_attn.out_proj.bias", "model.decoder.layers.3.encoder_attn_layer_norm.weight", "model.decoder.layers.3.encoder_attn_layer_norm.bias", "model.decoder.layers.3.fc1.weight", "model.decoder.layers.3.fc1.bias", "model.decoder.layers.3.fc2.weight", "model.decoder.layers.3.fc2.bias", "model.decoder.layers.3.final_layer_norm.weight", "model.decoder.layers.3.final_layer_norm.bias", "model.decoder.layers.4.self_attn.k_proj.weight", "model.decoder.layers.4.self_attn.k_proj.bias", "model.decoder.layers.4.self_attn.v_proj.weight", "model.decoder.layers.4.self_attn.v_proj.bias", "model.decoder.layers.4.self_attn.q_proj.weight", "model.decoder.layers.4.self_attn.q_proj.bias", "model.decoder.layers.4.self_attn.out_proj.weight", "model.decoder.layers.4.self_attn.out_proj.bias", "model.decoder.layers.4.self_attn_layer_norm.weight", "model.decoder.layers.4.self_attn_layer_norm.bias", "model.decoder.layers.4.encoder_attn.k_proj.weight", "model.decoder.layers.4.encoder_attn.k_proj.bias", "model.decoder.layers.4.encoder_attn.v_proj.weight", "model.decoder.layers.4.encoder_attn.v_proj.bias", "model.decoder.layers.4.encoder_attn.q_proj.weight", "model.decoder.layers.4.encoder_attn.q_proj.bias", "model.decoder.layers.4.encoder_attn.out_proj.weight", "model.decoder.layers.4.encoder_attn.out_proj.bias", "model.decoder.layers.4.encoder_attn_layer_norm.weight", "model.decoder.layers.4.encoder_attn_layer_norm.bias", "model.decoder.layers.4.fc1.weight", "model.decoder.layers.4.fc1.bias", "model.decoder.layers.4.fc2.weight", "model.decoder.layers.4.fc2.bias", "model.decoder.layers.4.final_layer_norm.weight", "model.decoder.layers.4.final_layer_norm.bias", "model.decoder.layers.5.self_attn.k_proj.weight", "model.decoder.layers.5.self_attn.k_proj.bias", "model.decoder.layers.5.self_attn.v_proj.weight", "model.decoder.layers.5.self_attn.v_proj.bias", "model.decoder.layers.5.self_attn.q_proj.weight", "model.decoder.layers.5.self_attn.q_proj.bias", "model.decoder.layers.5.self_attn.out_proj.weight", "model.decoder.layers.5.self_attn.out_proj.bias", "model.decoder.layers.5.self_attn_layer_norm.weight", "model.decoder.layers.5.self_attn_layer_norm.bias", "model.decoder.layers.5.encoder_attn.k_proj.weight", "model.decoder.layers.5.encoder_attn.k_proj.bias", "model.decoder.layers.5.encoder_attn.v_proj.weight", "model.decoder.layers.5.encoder_attn.v_proj.bias", "model.decoder.layers.5.encoder_attn.q_proj.weight", "model.decoder.layers.5.encoder_attn.q_proj.bias", "model.decoder.layers.5.encoder_attn.out_proj.weight", "model.decoder.layers.5.encoder_attn.out_proj.bias", "model.decoder.layers.5.encoder_attn_layer_norm.weight", "model.decoder.layers.5.encoder_attn_layer_norm.bias", "model.decoder.layers.5.fc1.weight", "model.decoder.layers.5.fc1.bias", "model.decoder.layers.5.fc2.weight", "model.decoder.layers.5.fc2.bias", "model.decoder.layers.5.final_layer_norm.weight", "model.decoder.layers.5.final_layer_norm.bias", "model.decoder.layernorm_embedding.weight", "model.decoder.layernorm_embedding.bias", "lm_head.weight". 
	Unexpected key(s) in state_dict: "epoch", "state_dict", "optimizer". 

In [None]:
#===============================================================================
# ▶ 예측테스트
#===============================================================================
test_sentence = "Hello! what have you been up to?"
translated = translate(test_sentence, model, tokenizer, device)
print(f"영어원문: {test_sentence}")
print(f"한글번역: {translated}")

영어원문: Hello! what have you been up to? What have you been doing in the past few months    with   your        computer
한글번역: 여�
