In [11]:
import pandas as pd

In [None]:
df = pd.read_csv("data/train.csv")
df

In [13]:
import re

def contains_lowercase_or_special(text):
    # 영어 소문자 포함 여부
    lowercase_present = bool(re.search(r'[a-z]', text))
    # '...' 이외의 특수기호 포함 여부 (한글과 영어, 숫자, 공백, 마침표만 허용)
    special_chars_present = bool(re.search(r'[^A-Z0-9ㄱ-ㅎ가-힣.\s]', text)) \
        and '...' not in text and '…' not in text and '·' not in text \
        and '美' not in text and '中' not in text and '日' not in text
    return lowercase_present or special_chars_present

# 새로운 컬럼 생성
df['need_clean'] = df['text'].apply(contains_lowercase_or_special)

In [14]:
df['need_clean'].value_counts()

need_clean
True     1643
False    1157
Name: count, dtype: int64

In [None]:
df['cleaned_text'] = df['text'].str.replace(r'[^0-9ㄱ-ㅎ가-힣.\s]', '_', regex=True)
df

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM


model_id = "beomi/Llama-3-Open-Ko-8B-Instruct-preview"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
)

model.eval()

Downloading shards: 100%|██████████| 4/4 [09:04<00:00, 136.19s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]
Some parameters are on the meta device because they were offloaded to the cpu.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm)

In [17]:
instruction = """뉴스 기사 제목 빈칸을 채워넣으시오.
오직 답변만 하고, 이외 설명을 절대 붙이지 마시오.

예시: '북한 대_확_기' _격당한 대성동__…
답변: '북한 대북확성기' 직격당한 대성동마을…

### 제목 ###
{}"""

In [18]:
from tqdm import tqdm

In [None]:
def change_noise(id, text):
    messages = [
        {"role": "user", "content": instruction.format(text)},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = model.generate(
        input_ids,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
    )

    output = outputs[0][input_ids.shape[-1]:]
    pred = tokenizer.decode(output, skip_special_tokens=True)
    if text != pred.strip():
        print("changed:", id, "from:", text, "to:", pred.strip())
    
    return pred.strip()

In [None]:
df["text_after"] = ""

for i, row in tqdm(df.iterrows(), total=len(df)):
    if row["need_clean"]:
        df.loc[i, "text_after"] = change_noise(row["ID"], row["cleaned_text"])
    else:
        df.loc[i, "text_after"] = row["cleaned_text"]

In [None]:
df

In [12]:
df = df.drop(columns=["need_clean", "text"])
df = df.rename(columns={"text_after": "text"})
df = df[["ID", "text", "target"]]
df.to_csv("data/train_cleaned.csv", index=False)