In [1]:
import pandas as pd

In [None]:
# Load the part-1 training CSV that the generation cells below operate on.
df = pd.read_csv("restored_train_data6_part1.csv")
# Preview only the first rows instead of echoing the whole frame into the notebook.
df.head()

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


# Checkpoint identifier handed to both from_pretrained calls below.
model_id = "rtzr/ko-gemma-2-9b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the weights in bfloat16 and let the loader pick device placement.
load_options = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)

# Put the model in evaluation mode; as the cell's last expression this also
# echoes the module tree.
model.eval()

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 10/10 [00:05<00:00,  1.70it/s]


Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 3584, padding_idx=0)
    (layers): ModuleList(
      (0-41): 42 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=3584, out_features=4096, bias=False)
          (k_proj): Linear(in_features=3584, out_features=2048, bias=False)
          (v_proj): Linear(in_features=3584, out_features=2048, bias=False)
          (o_proj): Linear(in_features=4096, out_features=3584, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=3584, out_features=14336, bias=False)
          (up_proj): Linear(in_features=3584, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=3584, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
        (pre_feedforward_layernorm): Gemma2RMSNorm((3584,), 

In [4]:
# Prompt template for the generation step: "{}" is filled with a source headline via
# str.format inside gen_key. In English the prompt reads: "Generate three article titles
# that would appear in the same category as the following news article title. The answer
# must contain only the three titles; there is no case in which this is impossible."
instruction = """다음 뉴스 기사 제목과 동일한 카테고리에서 나올 기사 제목을 세 개 생성하시오.
답변은 오직 제목 세 개만을 포함해야 하며, 불가능한 경우는 없다.

### 제목 ###
{}"""

In [5]:
from tqdm import tqdm

In [6]:
def gen_key(id, text):
    """Ask the model for new headlines based on one source headline.

    Args:
        id: Row identifier; it is only used in the diagnostic print.
        text: Source headline substituted into the ``instruction`` template.

    Returns:
        The decoded model reply with leading/trailing whitespace removed.
    """
    # Single-turn chat whose only message is the filled-in prompt.
    chat = [{"role": "user", "content": instruction.format(text)}]
    prompt_ids = tokenizer.apply_chat_template(
        chat, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    # Generation stops at either the EOS token or the end-of-turn marker.
    stop_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<end_of_turn>"),
    ]

    generated = model.generate(
        prompt_ids,
        max_new_tokens=2048,
        eos_token_id=stop_ids,
        do_sample=True,
        temperature=0.9,
        top_p=0.9,
    )

    # Decode only the tokens produced after the prompt.
    prompt_len = prompt_ids.shape[-1]
    reply = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True).strip()

    # Log whenever the reply differs from the source headline.
    if text != reply:
        print("changed:", id, "from:", text, "to:", reply)

    return reply

In [None]:
# Start from an empty column; it is filled row by row, so an interrupted run
# keeps whatever has been generated so far.
df["text_generated"] = ""

rows_with_progress = tqdm(df.iterrows(), total=len(df))
for idx, record in rows_with_progress:
    generated_titles = gen_key(record["ID"], record["text"])
    df.loc[idx, "text_generated"] = generated_titles

In [11]:
# Overwrite "text" with the generated titles, drop the helper column, and
# write the result to disk without the index.
df = df.assign(text=df["text_generated"]).drop(columns=["text_generated"])
df.to_csv("data/train_aug_gt8250_part1.csv", index=False)

In [46]:
# Reload generated titles from disk for post-processing.
# NOTE(review): the cell above writes the part1 file but this one reads part2 —
# presumably the generation cells were re-run for the second shard; confirm before
# relying on a fresh Restart & Run All.
df = pd.read_csv("data/train_aug_gt8250_part2.csv")

In [48]:
def process_text(text):
    """Split a raw model reply into a list of cleaned headline strings.

    Each line of ``text`` is treated as one candidate headline. A leading
    list number ("1." through "9.") and a leading bullet dash are removed,
    surrounding whitespace is trimmed, and empty lines are dropped.

    Hyphens inside a headline are preserved; only dashes at the start of
    a line are treated as bullet markers.

    Args:
        text: Multi-line string, one generated headline per line.

    Returns:
        List of non-empty cleaned lines, in their original order.
    """
    numbered_prefixes = tuple(f"{i}." for i in range(1, 10))
    titles = []

    for line in text.split('\n'):
        line = line.strip()

        # Drop a leading list number such as "1." or "2.".
        if line.startswith(numbered_prefixes):
            line = line[line.find('.') + 1:].strip()

        # Drop only leading bullet dashes; a blanket replace would also
        # delete hyphens that belong to the headline itself.
        line = line.lstrip('-').strip()

        if line:
            titles.append(line)

    return titles

In [49]:
# Replace each raw model reply with the list of cleaned titles returned by process_text.
# NOTE(review): not idempotent — re-running this cell would pass lists to process_text,
# which calls .split on its argument.
df["text"] = df["text"].apply(process_text)

In [50]:
# One row per generated title: explode() expands each list in "text" into separate
# rows that keep the source row's other columns and its index label.
exploded_data = df.explode("text")

In [52]:
# Number of exploded rows (one row per generated title).
n = len(exploded_data)

# Rank of each title within its source row (0 = first title, 1 = second, ...).
# explode() repeats the source row's index label, so grouping on the index
# recovers which titles belong together (assumes the pre-explode index is unique).
title_rank = exploded_data.groupby(level=0).cumcount()

# A stable sort on that rank lists every first title in source order, then every
# second title, and so on. With exactly three titles per source row this equals
# the stride-3 order [0, 3, 6, ...] + [1, 4, 7, ...] + [2, 5, 8, ...], but it no
# longer misaligns rows when some headline produced fewer or more than three lines.
new_order = title_rank.to_numpy().argsort(kind="stable").tolist()

# Rearrange the frame into the new order and renumber the index from 0.
reordered_data = exploded_data.iloc[new_order].reset_index(drop=True)

In [None]:
# NOTE(review): this cell performs the same reordering as the preceding cell —
# presumably a leftover copy from processing the other shard; consider removing it.

# Number of exploded rows (one row per generated title).
n = len(exploded_data)

# Rank of each title within its source row (0 = first title, 1 = second, ...).
# explode() repeats the source row's index label, so grouping on the index
# recovers which titles belong together (assumes the pre-explode index is unique).
title_rank = exploded_data.groupby(level=0).cumcount()

# A stable sort on that rank lists every first title in source order, then every
# second title, and so on. With exactly three titles per source row this equals
# the stride-3 order [0, 3, 6, ...] + [1, 4, 7, ...] + [2, 5, 8, ...], but it no
# longer misaligns rows when some headline produced fewer or more than three lines.
new_order = title_rank.to_numpy().argsort(kind="stable").tolist()

# Rearrange the frame into the new order and renumber the index from 0.
reordered_data = exploded_data.iloc[new_order].reset_index(drop=True)

In [54]:
# Load the reordered outputs for both shards.
# NOTE(review): no cell visible in this notebook writes the *_reordered.csv files —
# presumably reordered_data was saved in a cell that has since been removed; confirm.
part1 = pd.read_csv("data/train_aug_gt8250_part1_reordered.csv")
part2 = pd.read_csv("data/train_aug_gt8250_part2_reordered.csv")

In [55]:
# Concatenate the two DataFrames, renumbering the index from 0.

df = pd.concat([part1, part2], ignore_index=True)

In [57]:
# Write the combined data to a single CSV, omitting the index column.
df.to_csv("data/train_aug_gt8250_reordered_all.csv", index=False)