In [1]:
from transformers import AdamW
import pandas as pd
import torch
import random
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
from sklearn.metrics import roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 超参数
hidden_dropout_prob = 0.3
num_labels = 2
learning_rate = 1e-5
weight_decay = 1e-2
epochs = 3
batch_size = 64
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
class SentimentDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset.iloc[idx]["text"]
        label = self.dataset.iloc[idx]["label"]
        sample = {"text": text, "label": label}
        return sample

# 划分数据集
path_to_file = "llm-detect-ai-generated-text/train_v4_drcat_01.csv"
dataset = pd.read_csv(path_to_file, sep=",", names=["text","label","prompt_name","source","RDizzl3_seven","model"], skiprows=1)
indices = list(range(len(dataset)))
train_indices = random.sample(indices, int(0.9 * len(dataset)))
test_indices = list(set(indices) - set(train_indices))

# 使用方括号而不是圆括号
train_set = SentimentDataset(dataset.iloc[train_indices])
test_set = SentimentDataset(dataset.iloc[test_indices])

# 创建 DataLoader
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=0)

In [4]:
# 定义 tokenizer，传入词汇表
tokenizer = BertTokenizer.from_pretrained("/gemini/pretrain")
config = BertConfig.from_pretrained("/gemini/pretrain", num_labels=num_labels, hidden_dropout_prob=hidden_dropout_prob)
model = BertForSequenceClassification.from_pretrained("/gemini/pretrain", config=config).to(device)

Some weights of the model checkpoint at /gemini/pretrain were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at 

In [5]:
# 定义优化器和损失函数
# Prepare optimizer and schedule (linear warmup and decay)
# 设置 bias 和 LayerNorm.weight 不使用 weight_decay
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

#optimizer = AdamW(model.parameters(), lr=learning_rate)
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
criterion = nn.CrossEntropyLoss()



In [6]:
def train(model, iterator, optimizer, criterion, device, freezeLLM):
    if freezeLLM:
        for param in model.bert.parameters():
            param.requires_grad = False
        total_params = sum(p.numel() for p in model.bert.parameters())
        print(f"Freeze parameters in the model: {total_params}")
    else:
        for param in model.bert.parameters():
            param.requires_grad = True
    model.train()
    epoch_loss = 0
    y_true_train = []
    y_scores_train = []
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        label = batch["label"]
        text = batch["text"]
        tokenized_text = tokenizer(text, max_length=100, add_special_tokens=True, truncation=True, padding=True, return_tensors="pt").to(device)
        label = label.clone().to(device).detach()
        output = model(**tokenized_text, labels=label)
        y_pred_prob = output[1]
        loss = criterion(y_pred_prob.view(-1, 2), label.view(-1))
        y_true_train.extend(label.cpu().numpy())
        y_scores_train.extend(y_pred_prob[:, 1].cpu().detach().numpy())  # Assuming 1 is the positive class index
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

        # 每隔一定步数打印当前 loss 和计算 AUC-ROC
        if (i+1) % 100 == 0:
            auc_train = roc_auc_score(y_true_train, y_scores_train)
            print("epoch", i+1, "\t", "current loss:", epoch_loss / (i+1), "auc-roc:", auc_train)

    return epoch_loss / len(iterator), auc_train

# 其他部分的代码保持不变


def evaluate(model, iterator, device):
    model.eval()
    epoch_loss = 0
    y_true_eval = []
    y_scores_eval = []
    with torch.no_grad():
        for _, batch in enumerate(iterator):
            label = batch["label"]
            text = batch["text"]
            tokenized_text = tokenizer(text, max_length=100, add_special_tokens=True, truncation=True, padding=True, return_tensors="pt").to(device)
            label = label.clone().to(device).detach()
            output = model(**tokenized_text, labels=label)
            y_pred_prob = output[1]
            y_true_eval.extend(label.cpu().numpy())
            y_scores_eval.extend(y_pred_prob[:, 1].cpu().detach().numpy())  # Assuming 1 is the positive class index
            loss = output[0]
            epoch_loss += loss.item()

    auc_eval = roc_auc_score(y_true_eval, y_scores_eval)
    return epoch_loss / len(iterator), auc_eval


In [7]:
# 开始训练和验证
for i in range(epochs):
    if i==0:
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, device, freezeLLM=True)
    else:
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, device, freezeLLM=False)
    valid_loss, valid_acc = evaluate(model, test_loader, device)
    print("final valid loss: ", valid_loss, "\t", "valid auc-roc:", valid_acc)

Freeze parameters in the model: 109482240
epoch 100 	 current loss: 0.6666945070028305 auc-roc: 0.49045136029462894
epoch 200 	 current loss: 0.66067117780447 auc-roc: 0.4971431734240753
epoch 300 	 current loss: 0.6570651801427205 auc-roc: 0.5023685652227351
epoch 400 	 current loss: 0.6537523891031742 auc-roc: 0.5084049594573892
epoch 500 	 current loss: 0.6527262557744979 auc-roc: 0.5136432881052895
epoch 600 	 current loss: 0.6510228065649668 auc-roc: 0.5191831221486636
epoch 700 	 current loss: 0.6483020538943155 auc-roc: 0.5277478325060045
epoch 800 	 current loss: 0.6474259801208972 auc-roc: 0.5315094487464447
epoch 900 	 current loss: 0.646144156522221 auc-roc: 0.5358905600004513
epoch 1000 	 current loss: 0.6435979158878327 auc-roc: 0.5432108096422851
final valid loss:  0.6543782480384992 	 valid auc-roc: 0.7453787793541692
epoch 100 	 current loss: 0.324977592676878 auc-roc: 0.9252474636254882
epoch 200 	 current loss: 0.26979101125150917 auc-roc: 0.947165167146488
epoch 300 

In [None]:
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
# Test data processing
test_inputs = tokenizer(test['text'].tolist(), padding=True, return_tensors='pt')

# Move input tensor to the same device as the model
test_inputs = {key: value.to(device) for key, value in test_inputs.items()}

# Generate predictions using your trained model
with torch.no_grad():
    outputs = model(**test_inputs)
    logits = outputs.logits

# Assuming the first column of logits corresponds to the negative class (non-AI-generated) 
# and the second column corresponds to the positive class (AI-generated)
predictions = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()  # Move predictions back to CPU

# Create a submission DataFrame with essay IDs and corresponding predictions
submission = pd.DataFrame({
    'id': test['id'],
    'generated': predictions
})

# Save the submission DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)