In [1]:
# Load model directly
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint name shared by the tokenizer and the model so the two cannot drift apart.
MODEL_NAME = "NHNDQ/nllb-finetuned-en2ko"

# The tokenizer holds no weights, so it takes no device placement argument;
# only the model is placed with device_map="auto".
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, device_map="auto")

# Kept for cells that move tensors with `.to(device)`: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the Punkt sentence-tokenizer resources needed by sent_tokenize.
# Both resource names are requested; presumably different NLTK releases look up
# one or the other (TODO confirm against the installed NLTK version).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /data/ephemeral/home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /data/ephemeral/home/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Run a translation
def translate(text):
    """Translate ``text`` with the notebook-level ``tokenizer`` and ``model``.

    The input is tokenized (with truncation and padding), moved to the device
    the model reports, passed through ``model.generate`` without gradient
    tracking, and the first decoded sequence is returned as a string.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    target_device = model.device
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(target_device)
    with torch.no_grad():
        generated = model.generate(**on_device)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]

In [4]:
def batch_translate(sentences, batch_size=16):
    """Translate a list of sentences in fixed-size batches.

    Args:
        sentences: List of source strings to translate.
        batch_size: Number of sentences tokenized and generated per step.

    Returns:
        A list with one translated string per input sentence, in input order.
        If generation or decoding raises for a batch, the error is printed and
        every sentence of that batch is represented by an empty string.
    """
    translations = []
    for i in tqdm(range(0, len(sentences), batch_size), desc="Translating", unit="batch"):
        batch = sentences[i:i+batch_size]
        # Tokenize and move the inputs to the device the model itself reports,
        # the same target translate() uses, instead of a separately computed
        # notebook-global device that may not match where device_map placed the model.
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True).to(model.device)
        try:
            with torch.no_grad():
                # Run the translation
                outputs = model.generate(**inputs)
            # Decode the generated token ids back to text
            translated_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            translations.extend(translated_batch)
        except Exception as e:
            print(f"Error translating batch {i}: {e}")
            # Best effort: keep output aligned with the input by adding empty strings
            translations.extend([""] * len(batch))
    return translations

In [14]:
import pandas as pd

# Load the preprocessed OpenStax world-history CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/data/ephemeral/home/personal/jinjae/data_preprocess/openstax_world_history2_final.csv',
)

In [15]:
# Preview the loaded frame.
df

Unnamed: 0,section,title,context,context_len
0,developing a global perspective,Learning Objectives,"By the end of this section, you will be able to:",48
1,developing a global perspective,World History as Preparation for Life After Co...,History is more than a series of names and dat...,757
2,developing a global perspective,World History as Preparation for Life After Co...,This world history text has several key featur...,796
3,developing a global perspective,World History as Preparation for Life After Co...,The study of history will also enhance your cr...,657
4,developing a global perspective,World History as Preparation for Life After Co...,"Without question, skills such as critical thin...",1061
...,...,...,...,...
2310,term,offshoring,the process of moving some of a company’s oper...,93
2311,term,outsourcing,"the process of hiring outside contractors, som...",113
2312,term,Paris Agreement,a 2015 treaty among members of the United Nati...,146
2313,term,resource curse,the problem that makes resource-rich developin...,141


In [16]:
# Split each context into sentences with NLTK and record where each one came from
sentences = []
mapping = []  # (row position, sentence position within the row) for every entry in `sentences`
for row_idx, context in enumerate(df['context']):
    # sent_tokenize only accepts strings. A non-string cell (e.g. a missing value,
    # which pandas typically loads as NaN) would raise, so such rows contribute
    # no sentences and end up with an empty translation.
    if not isinstance(context, str):
        continue
    split_sentences = [s.strip() for s in sent_tokenize(context) if s.strip()]
    for sent_idx, sentence in enumerate(split_sentences):
        sentences.append(sentence)
        mapping.append((row_idx, sent_idx))

# Run the batched translation
translated_sentences = batch_translate(sentences, batch_size=16)

# Regroup the translated sentences by their source row (order is preserved by `mapping`)
translated_contexts = [[] for _ in range(len(df))]
for (row_idx, sent_idx), translated in zip(mapping, translated_sentences):
    translated_contexts[row_idx].append(translated)


def _join_translated(sents):
    """Join one row's translated sentences into a single text.

    Empty translations (batch_translate emits "" for a failed batch) are skipped
    so they do not leave stray ". ." fragments. A period is appended only to
    sentences that do not already end in '.', '?' or '!'.
    """
    parts = []
    for sent in sents:
        sent = sent.strip()
        if not sent:
            continue
        parts.append(sent if sent.endswith(('.', '?', '!')) else sent + '.')
    return ' '.join(parts)


# Combine each row's translated sentences back into one text
df['translated_context'] = [_join_translated(sents) for sents in translated_contexts]


Translating: 100%|██████████| 633/633 [08:07<00:00,  1.30batch/s]


In [17]:
# Remove duplicated periods
def _collapse_doubled_punctuation(text):
    """Replace '..' with '.', then '?.' with '?', then '!.' with '!' in ``text``."""
    for doubled, single in (("..", "."), ("?.", "?"), ("!.", "!")):
        text = text.replace(doubled, single)
    return text


df["translated_context"] = df["translated_context"].apply(_collapse_doubled_punctuation)

In [18]:
# Spot-check the translated text stored under index label 0.
df["translated_context"][0]

'이 섹션이 끝날 때까지, 당신은 다음과 같이 할 수 있을 것이다.'

In [19]:
# Write the frame, including the translated_context column, to CSV without the index.
df.to_csv(
    path_or_buf='data_preprocess/openstax_world_history2_final_translated.csv',
    index=False,
)

In [20]:
# Display the frame again, now including the translated_context column.
df

Unnamed: 0,section,title,context,context_len,translated_context
0,developing a global perspective,Learning Objectives,"By the end of this section, you will be able to:",48,"이 섹션이 끝날 때까지, 당신은 다음과 같이 할 수 있을 것이다."
1,developing a global perspective,World History as Preparation for Life After Co...,History is more than a series of names and dat...,757,"역사는 일련의 이름과 날짜 이상의 것이며, 그것들은 단순히 그것의 구성 요소들, 전..."
2,developing a global perspective,World History as Preparation for Life After Co...,This world history text has several key featur...,796,이 세계사 텍스트에는 현재와 관련된 방식으로 과거를 이해하는 데 도움이 되는 몇 가...
3,developing a global perspective,World History as Preparation for Life After Co...,The study of history will also enhance your cr...,657,역사 연구는 또한 고용주가 원하는 상위 10개 기술에 지속적으로 나타나는 비판적 사...
4,developing a global perspective,World History as Preparation for Life After Co...,"Without question, skills such as critical thin...",1061,"의심의 여지 없이 역사 공부를 통해 비판적 사고, 분석, 창의성 등의 기술이 가장 ..."
...,...,...,...,...,...
2310,term,offshoring,the process of moving some of a company’s oper...,93,더 저렴한 노동 시장에 접근하기 위해 회사의 운영 중 일부를 해외로 이전하는 과정.
2311,term,outsourcing,"the process of hiring outside contractors, som...",113,회사가 내부에서 수행한 작업을 수행하기 위해 외부 계약자를 고용하는 과정.
2312,term,Paris Agreement,a 2015 treaty among members of the United Nati...,146,2015년 유엔 회원국 간 지구 온난화를 산업화 당시부터 2°C(3.6°F) 미만으...
2313,term,resource curse,the problem that makes resource-rich developin...,141,"자원 부국 개발 도상국을 권위주의, 높은 갈등률, 낮은 경제 성장률에 취약하게 만드..."
