# Imports

In [6]:
import pandas as pd

from sklearn.model_selection import train_test_split

from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch

from tqdm import tqdm

# Data Loading & Preprocessing

In [7]:
# Read the training CSV; later cells read its sentence_0..3 and
# answer_0..3 columns.
train = pd.read_csv("./train.csv")

In [8]:
# Build the model input text and the target text for one row
def make_input(row):
    """Turn one training row into a prompt/target pair.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            ``sentence_0``..``sentence_3`` and ``answer_0``..``answer_3``.

    Returns:
        A dict with ``"input"`` (the instruction prefix followed by the four
        sentences joined with `` </s> ``) and ``"target"`` (the four answer
        values as a space-separated string, e.g. ``"0 3 1 2"``).
    """
    sentence_keys = [f"sentence_{idx}" for idx in range(4)]
    answer_keys = [f"answer_{idx}" for idx in range(4)]

    joined_sentences = " </s> ".join(row[key] for key in sentence_keys)
    prompt = "문장을 순서대로 정렬하세요: " + joined_sentences
    label = " ".join(str(row[key]) for key in answer_keys)
    return {"input": prompt, "target": label}

In [9]:
# Build the prompt/target pairs and split them into train/validation sets
examples = train.apply(make_input, axis=1)
inputs = examples.tolist()

# Hold out 20% for validation; the fixed seed keeps the split reproducible.
train_data, valid_data = train_test_split(inputs, test_size=0.2, random_state=42)

# Wrap each split in a Dataset via an intermediate DataFrame.
train_dataset, valid_dataset = (
    Dataset.from_pandas(pd.DataFrame(split)) for split in (train_data, valid_data)
)

# Model Load

In [None]:
# Load the tokenizer and the model
# NOTE(review): the prompts built in this notebook are Korean; confirm that
# the "t5-small" vocabulary actually covers them.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prefer the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Train

In [None]:
# Tokenization function
def tokenize(example):
    """Tokenize prompt/target text into model inputs with loss-masked labels.

    Works for a single example (``example["input"]`` is a string) and for a
    batch as passed by ``Dataset.map(..., batched=True)`` (lists of strings).

    Args:
        example: Mapping with ``"input"`` and ``"target"`` text.

    Returns:
        The tokenizer output for the input text (padded/truncated to 512
        tokens) with an added ``"labels"`` entry holding the target token ids
        (padded/truncated to 16 tokens). Padding positions in the labels are
        set to -100.
    """
    model_inputs = tokenizer(example["input"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(example["target"], max_length=16, truncation=True, padding="max_length")

    # Labels are padded to a fixed length; without masking, the loss would
    # also be computed on the padding positions. -100 is the index the loss
    # ignores.
    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = labels["input_ids"]
    if isinstance(example["target"], str):
        # Single example: input_ids is a flat list of token ids.
        model_inputs["labels"] = mask_padding(label_ids)
    else:
        # Batched call: input_ids is one list of token ids per example.
        model_inputs["labels"] = [mask_padding(token_ids) for token_ids in label_ids]
    return model_inputs

# Tokenize both dataset splits in batched mode
tokenized_train, tokenized_valid = (
    split.map(tokenize, batched=True) for split in (train_dataset, valid_dataset)
)

In [None]:
# Training configuration
# NOTE(review): no evaluation schedule is set here, so whether eval_dataset
# is used during training depends on the TrainingArguments defaults — confirm.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=30,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Build the Trainer and run training
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_valid,
    train_dataset=tokenized_train,
)
trainer.train()

In [13]:
# Persist the tokenizer first, then the model, into the same directory
# so both can be reloaded together from "./results".
for artifact in (tokenizer, model):
    artifact.save_pretrained("./results")

# Inference

In [None]:
# Load the saved tokenizer and model
model_dir = "./results"
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

# The freshly loaded model is not moved to an accelerator automatically here;
# move it explicitly so inference uses the GPU when one is available. The
# device is recomputed so this cell also works in a fresh session.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Switch to evaluation mode (disables dropout etc.) for inference.
model.eval()

In [None]:
# Test data: one list of four sentences per row
test = pd.read_csv("./test.csv")
sentence_columns = [f"sentence_{i}" for i in range(4)]
sentences = test[sentence_columns].values.tolist()

# Inference function
def predict_order(sent_list):
    """Predict the ordering of the sentences in ``sent_list``.

    Builds the same prompt format used for training, generates a short
    sequence with the model, and parses it as space-separated integers.

    Args:
        sent_list: List of sentence strings (four per group in this notebook).

    Returns:
        A list of ints that is a permutation of ``range(len(sent_list))``.
        If the generated text cannot be parsed as integers, or the parsed
        values are not such a permutation (wrong length, duplicates,
        out-of-range indices), the identity order ``[0, 1, ..., n-1]`` is
        returned instead.
    """
    input_text = "문장을 순서대로 정렬하세요: " + " </s> ".join(sent_list)
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=512
    ).to(model.device)  # keep the inputs on the same device as the model

    # Sampling is enabled, so repeated calls on the same input may return
    # different orders.
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_length=16,
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    fallback = list(range(len(sent_list)))
    try:
        order = [int(token) for token in decoded.split()]
    except ValueError:
        # The model produced something that is not a list of integers.
        return fallback

    # Reject outputs that are not a valid permutation of the sentence indices.
    if sorted(order) != fallback:
        return fallback
    return order

# Predict an order for every group of test sentences (with a progress bar)
predictions = [
    predict_order(sent_group)
    for sent_group in tqdm(sentences, desc="Predicting")
]

# Submission

In [None]:
# Load the sample submission, whose answer_0..answer_3 columns are overwritten
sample_submission = pd.read_csv("./sample_submission.csv")

# Apply the predictions.
# A prediction is used only if it is a permutation of 0..3; a length check
# alone would let duplicates or out-of-range indices through. Anything else
# falls back to the identity order.
identity_order = list(range(4))
valid_predictions = [
    list(pred) if sorted(pred) == identity_order else identity_order
    for pred in predictions
]
for i in range(4):
    sample_submission[f"answer_{i}"] = [pred[i] for pred in valid_predictions]

# Save
sample_submission.to_csv("baseline_submission.csv", index=False)