In [1]:
import os
from tqdm import tqdm
import random
import torch
import pandas as pd
import sklearn
import datasets
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from torch.nn import functional as F
import torch.nn as nn
import torch.optim as optim
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    pipeline,
    logging,
    RobertaTokenizer,
    RobertaForSequenceClassification,
    RobertaConfig
)

In [2]:
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer

# The tokenizer and the classifier are both loaded from the same identifier.
tokenizer = RobertaTokenizer.from_pretrained("Yuhei0531/kaggle_llm_detect_512tokens_top")

# NOTE(review): `model_name` is not referenced by any visible cell — confirm it is still needed.
model_name = "roberta-base"

# Sequence-classification model whose logits are turned into probabilities at inference time.
model = RobertaForSequenceClassification.from_pretrained("Yuhei0531/kaggle_llm_detect_512tokens_top")


In [3]:
import pandas as pd

# Read the essays to score; later cells use the `text` and `id` columns.
df = pd.read_csv(filepath_or_buffer="test_essays.csv")

In [4]:
# Remove every newline character from the essays before they are tokenized.
# `regex=False` pins literal matching: the default of the `regex` flag of
# `Series.str.replace` differs between pandas 1.x and 2.x, and 1.x emits a
# FutureWarning for single-character patterns when the flag is left implicit.
# NOTE(review): the replacement is the empty string, so text on either side of
# a line break is joined without a space — confirm this matches training-time
# preprocessing.
target_word = "\n"
df['text'] = df['text'].str.replace(target_word, '', regex=False)

In [5]:
# Convert the pandas frame into a `datasets.Dataset` for the wrapper class below.
tmp_dataset = datasets.Dataset.from_pandas(df=df)


class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for inference.

    Args:
        dataset: Container that supports ``len()`` and column lookup by name
            (``dataset['text']`` and ``dataset['id']``), where each column is
            indexable by item position.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, dataset, tokenizer, max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Resolve the two columns once. Looking a column up by name inside
        # __getitem__ can rebuild the entire column for every single item on
        # containers such as Hugging Face `datasets.Dataset`, which makes a
        # full pass over the data quadratic.
        self._texts = dataset['text']
        self._ids = dataset['id']

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs with the leading axis removed when it has size 1) and
            ``'id'`` (the matching entry of the ``id`` column, unchanged).
        """
        text = self._texts[idx]
        sample_id = self._ids[idx]

        # Tokenize and encode the text
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading axis: a bare squeeze() would also collapse the
        # sequence axis when max_length == 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'id': sample_id
        }

In [6]:
# Wrap the converted dataset so each item is tokenized on access
# (max_length is left at the class default).
test_dataset = CustomDataset(dataset=tmp_dataset, tokenizer=tokenizer)

# batch_size=1: one essay per forward pass; shuffle=False keeps dataset order.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def test(model, dataloader, device):
    model.to(device)
    model.eval()

    softmax = nn.Softmax(dim=-1)
    id_ary = []
    generated_ary = []
    

    for data in dataloader:
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        id = data["id"]

        outputs = model(input_ids, attention_mask).logits
        generated = softmax(outputs).to('cpu').detach().numpy().copy()

        id_ary.append(id[0])
        generated_ary.append(generated[0][1])

    dict_ = {
        "id": id_ary,
        "generated": generated_ary
    }

    submission = pd.DataFrame(dict_)

    return submission

# Score every essay served by the test dataloader and collect the predictions.
submission = test(model=model, dataloader=test_dataloader, device=device)

In [8]:
# Write the predictions without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)