# 1. polyglot-ko-1.3b를 squarelike/sharegpt_deepl_ko_translation로 파인 튜닝한 모델

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Checkpoint identifier passed to both `from_pretrained` calls below.
model_id = "aeolian83/Gugugo_for_DnD_v0.7"

# Quantization settings: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized causal LM (device placement delegated to device_map="auto")
# and its matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [2]:
# Put the model in evaluation mode; the cells below only run generation.
model.eval()
# Explicitly enable use_cache in the model config for generation.
# NOTE(review): the previous comment here ("silence the warnings. Please
# re-enable for inference!") described turning the cache off and did not
# match this line, which turns it on.
model.config.use_cache = True

In [3]:
from transformers import StoppingCriteria, StoppingCriteriaList

class StoppingCriteriaSub(StoppingCriteria):
    """Stop generation once the sequence ends with one of the given stop sequences.

    Only the first sequence of the batch (``input_ids[0]``) is inspected.
    """

    def __init__(self, stops=None, encounters=1):
        """Store the stop sequences.

        Args:
            stops: Iterable of token-id sequences (tensors or lists of ints).
                ``None`` means "no stop sequences".
            encounters: Accepted for backward compatibility; not used.
        """
        super().__init__()
        # Use None rather than a shared mutable [] default, and flatten every
        # stop to 1-D so that a single-token stop (0-dim after ``squeeze()``)
        # still has a well-defined length.
        stops = [] if stops is None else stops
        self.stops = [torch.as_tensor(stop).reshape(-1) for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the first sequence ends with any stop sequence.

        Args:
            input_ids: Token ids generated so far; indexed as ``input_ids[0]``.
            scores: Unused; part of the stopping-criteria call signature.
            **kwargs: Ignored; accepted so extra arguments supplied by the
                caller do not raise.
        """
        sequence = input_ids[0]
        for stop in self.stops:
            stop_len = stop.numel()
            # An empty stop never matches, and a stop longer than the sequence
            # cannot match yet (comparing would broadcast or raise).
            if stop_len == 0 or stop_len > sequence.shape[0]:
                continue
            tail = sequence[-stop_len:]
            # Compare on the sequence's device so CPU-built stops also work
            # when generation runs on an accelerator.
            if torch.all(tail == stop.to(tail.device)).item():
                return True

        return False

# Marker string that ends a translation; generation stops when it appears.
stop_words = ["</끝>"]
# Take row 0 of the (1, n) id matrix instead of squeeze(): squeeze() would turn
# a single-token stop word into a 0-dim tensor, which has no len().
stop_words_ids = [
    tokenizer(stop_word, return_tensors='pt')['input_ids'][0]
    for stop_word in stop_words
]
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])

In [4]:
def gen(lan="en", x=""):
    """Translate *x* with the loaded model and return the generated text.

    Args:
        lan: Source language. ``"ko"`` builds a Korean-to-English prompt; any
            other value builds an English-to-Korean prompt.
        x: Text to translate.

    Returns:
        The decoded continuation produced after the prompt, with one leading
        space removed if present. The ``</끝>`` marker is kept when the model
        emits it.
    """
    if lan == "ko":
        prompt = f"### 한국어: {x}</끝>\n### 영어:"
    else:
        prompt = f"### 영어: {x}</끝>\n### 한국어:"

    inputs = tokenizer(
        prompt,
        return_tensors='pt',
        return_token_type_ids=False
    )
    prompt_len = inputs["input_ids"].shape[1]

    gened = model.generate(
        **inputs,
        max_new_tokens=2048,
        temperature=0.001,
        no_repeat_ngram_size=10,
        early_stopping=True,
        eos_token_id=2,
        stopping_criteria=stopping_criteria
    )

    # Drop the prompt by token count rather than by string replacement:
    # decoding does not always reproduce the prompt text verbatim (e.g. when
    # the tokenizer normalizes whitespace), in which case a string replace
    # would leave the whole prompt in the result. It would also remove any
    # later repetition of the prompt.
    completion = tokenizer.decode(gened[0][prompt_len:])
    if completion.startswith(" "):
        completion = completion[1:]
    return completion

In [5]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse an XML file and return a dict that maps each contentuid to its text.
# If a contentuid appears more than once, print a message for the duplicate.

def parse_xml_to_dict(script_dir):
    """Parse an XML file into a ``{contentuid: text}`` dictionary.

    Only ``<content>`` elements that are direct children of the root are read.
    The first occurrence of a ``contentuid`` wins; every later occurrence is
    reported with a printed message and otherwise ignored.

    Args:
        script_dir: Path (or file object) of the XML document, as accepted by
            ``ET.parse``.

    Returns:
        dict mapping each ``contentuid`` attribute to the element's text
        (``None`` for an element without text).
    """
    root = ET.parse(script_dir).getroot()

    result_dict = {}
    for content_elem in root.findall('content'):
        contentuid = content_elem.get('contentuid')
        content_text = content_elem.text
        # Membership test instead of `.get(...) == None`: a key whose stored
        # text is None must still count as present, otherwise a duplicate
        # would silently overwrite it.
        if contentuid in result_dict:
            print("Duplicate Key!!", contentuid, content_text)
        else:
            result_dict[contentuid] = content_text

    return result_dict

def detect_language(text):
    """Return the fraction of characters in *text* that are Hangul syllables.

    A character counts as Korean when its code point lies in U+AC00..U+D7A3.

    Args:
        text: String to inspect. ``None`` or an empty string is allowed.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when *text* is empty or ``None``.
    """
    # Empty or missing text (e.g. an XML element without content) has no
    # Korean characters; returning 0.0 avoids a division by zero / TypeError.
    if not text:
        return 0.0

    korean_count = sum(1 for char in text if 0xAC00 <= ord(char) <= 0xD7A3)

    return korean_count / len(text)

# Input files: the user translation (mixed Korean/English) and the English original.
userTranslate_path = "./train_data/BG3(raw)/ht_ko_en.xml"
english_path = "./train_data/BG3(raw)/en.xml"

userTranslate_dict = parse_xml_to_dict(userTranslate_path)
english_dict = parse_xml_to_dict(english_path)

# Parallel lists for entries whose user translation is at least 40% Hangul,
# plus the keys of the remaining entries.
english_corp, korean_corp = [], []
target_key, korean_key = [], []

for key, english_text in english_dict.items():
    user_text = userTranslate_dict[key]
    if detect_language(user_text) < 0.4:
        # Below the threshold: record only the key.
        target_key.append(key)
        continue
    korean_corp.append(user_text)
    english_corp.append(english_text)
    korean_key.append(key)

# Report the sizes, one per line, in the same order as before.
for size in (len(korean_corp), len(english_corp), len(english_dict), len(userTranslate_dict)):
    print(size)

52168
52168
222785
222785


In [20]:
# Translate the English source text of entry 35806 among the keys whose user
# translation was classified as Korean (korean_key).
gen(lan="en", x=english_dict[korean_key[35806]])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'가난하고 어두운 어린 시절을 보낸 후에는, 얼마나 많은 것을 얻을 수 있는지 알고 있을 거요. 당신의 거리의 지혜를 이용하면 앞으로 나아갈 여정에 당신의 영혼을 불어넣을 수 있소.</끝>'

In [7]:
# gen(lan="en", x="While you are wearing armor, you gain a +1 bonus to AC.")

In [8]:
# gen(lan="en", x="Chultans are the human natives of Chult. They have dark skin, black hair, and a rich culture. Driven out of the jungle by monsters and undead, they took refuge behind the stout walls of Port Nyanzaru, gave up their dynasties, and united to become a mercantile power in the southern ocean, eager to do business with anyone who visits their perilous land. A few Chultans migrated northward and established outposts in dis tant cities such as Athkatla, Baldur's Gate, Cali mport, Memnon, and Zazesspur.")

In [9]:
# gen(lan="en", x="The bell tower rises another 10 feet above the thatched roof of the bailey, making it 30 feet tall overa ll. In a normal day, the bell is ru ng twelve times : it's run g every four hours to s ignal the change of watch, and it's rung 10 minutes before each of those to signal that a change of watch is about to occur. At the change of the watch, it 's rung a number of times equaling the number of the watch: once at the start of the first watch, twice at the start of the second watch, and so forth. On what the garrison calls \"the wake-ups,\" it rings once.")

In [10]:
# Sample English-to-Korean run on a multi-sentence descriptive passage.
gen(lan="en", x="Nanny Pu'pu, a green hag, plays the part of an ancient crone to the hilt. She claims that a ll the other villagers were killed over the course of many years by winged creatures that live on the south side of the plateau.")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'녹색의 망아지, 나니푸는, 녹색 망아지가 손에 든 망치로 옛 거인의 역할을 하고 있다. 그녀는 남쪽에 있는 언덕에 살고 있는 날개 달린 생명체들에 의해 수천 년 동안 많은 사람들이 죽었다고 주장한다.</끝>'

In [11]:
# Sample English-to-Korean run on a short note.
gen(lan="en", x="Erik, I've gone in search of the nine shrines. V.")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'에릭, 아홉 개의 사원을 찾으러 왔소. V.</끝>'