In [1]:
import pandas as pd

In [None]:
df = pd.read_csv("data/train.csv")
df

In [3]:
import re

def contains_lowercase_or_special(text):
    # 영어 소문자 포함 여부
    lowercase_present = bool(re.search(r'[a-z]', text))
    # '...' 이외의 특수기호 포함 여부 (한글과 영어, 숫자, 공백, 마침표만 허용)
    special_chars_present = bool(re.search(r'[^A-Z0-9ㄱ-ㅎ가-힣.\s]', text)) \
        and '...' not in text and '…' not in text and '·' not in text \
        and '美' not in text and '中' not in text and '日' not in text
    return lowercase_present or special_chars_present

# 새로운 컬럼 생성
df['need_clean'] = df['text'].apply(contains_lowercase_or_special)

In [4]:
df['need_clean'].value_counts()

need_clean
True     1643
False    1157
Name: count, dtype: int64

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


model_id = "rtzr/ko-gemma-2-9b-it"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

model.eval()

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 10/10 [00:06<00:00,  1.57it/s]


Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 3584, padding_idx=0)
    (layers): ModuleList(
      (0-41): 42 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=3584, out_features=4096, bias=False)
          (k_proj): Linear(in_features=3584, out_features=2048, bias=False)
          (v_proj): Linear(in_features=3584, out_features=2048, bias=False)
          (o_proj): Linear(in_features=4096, out_features=3584, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=3584, out_features=14336, bias=False)
          (up_proj): Linear(in_features=3584, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=3584, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
        (pre_feedforward_layernorm): Gemma2RMSNorm((3584,), 

In [8]:
instruction = """뉴스 기사 제목을 노이즈가 없도록 완전히 복원하시오.
답변에는 설명을 붙이지 마시오.

### 제목 ###
{}"""

In [9]:
from tqdm import tqdm

In [19]:
def change_noise(id, text):
    max_attempts = 5  # Maximum number of attempts to prevent infinite loops
    attempt = 0
    messages = [
        {"role": "user", "content": instruction.format(text)},
    ]

    while attempt < max_attempts:
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<end_of_turn>")
        ]

        outputs = model.generate(
            input_ids,
            max_new_tokens=2048,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.1,
            top_p=0.9,
        )

        pred = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True).strip()

        if text != pred:
            print("changed:", id, "from:", text, "to:", pred)

        if not contains_lowercase_or_special(pred):
            return pred  # Return the corrected text if it passes the test

        # Append "다시 수정하시오." to the messages for the next iteration
        messages.append({"role": "user", "content": f"노이즈가 없도록 수정하시오. 중간의 숫자나 불필요한 기호를 제거하고 풍부한 표현을 생성할 수 있다. \n ### 수정 필요문 ### \n {pred}"})
        attempt += 1

    # Return the last prediction if maximum attempts are reached
    return pred

In [None]:
df["text_after"] = ""

for i, row in tqdm(df.iterrows(), total=len(df)):
    if row["need_clean"]:
        df.loc[i, "text_after"] = change_noise(row["ID"], row["text"])
    else:
        df.loc[i, "text_after"] = row["text"]

In [None]:
df

In [22]:
df = df.drop(columns=["need_clean", "text"])
df = df.rename(columns={"text_after": "text"})
df = df[["ID", "text", "target"]]
df.to_csv("data/train_cleaned_multiturn.csv", index=False)

In [24]:
# \n이 있으면 split해서 -1번째를 가져옴
df['text_after'] = df['text'].apply(lambda x: x.split('\n')[-1])

In [26]:
df_cleaned = df.drop(columns=["text"])
df_cleaned = df_cleaned.rename(columns={"text_after": "text"})
df_cleaned = df_cleaned[["ID", "text", "target"]]
df_cleaned.to_csv("data/train_cleaned_multiturn_algo.csv", index=False)