In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the essay table and the prompt table from the competition folder
essays = pd.read_csv('../llm-detect-ai-generated-text/train_essays.csv')
prompts = pd.read_csv('../llm-detect-ai-generated-text/train_prompts.csv')

# Left-join the prompt columns onto every essay via 'prompt_id', then build the
# model input string: prompt name, instructions and essay text, space-separated
data = pd.merge(essays, prompts, how='left', on='prompt_id').assign(
    combined_text=lambda merged: (
        merged['prompt_name'] + " " + merged['instructions'] + " " + merged['text']
    )
)

# Hold out 10% of the rows for validation (fixed seed so the split is repeatable)
train_data, val_data = train_test_split(
    data,
    random_state=42,
    test_size=0.1,
)


In [2]:
from transformers import DebertaTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

# Load the tokenizer that belongs to the 'microsoft/deberta-base' checkpoint
tokenizer = DebertaTokenizer.from_pretrained(
    pretrained_model_name_or_path='microsoft/deberta-base'
)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of raw strings (list, tuple, pandas Series, ...).
        labels: Sequence of integer class labels, one per entry of ``texts``.
        tokenizer: Tokenizer object; it is called as ``tokenizer(text, **kwargs)``
            and must return a mapping containing ``'input_ids'`` and
            ``'attention_mask'``.
        max_len: Passed to the tokenizer as ``max_length`` together with
            ``padding='max_length'`` and ``truncation=True``.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        # Materialise both inputs as plain lists so that __getitem__ always
        # indexes by *position*. A pandas Series with a shuffled index (e.g. the
        # output of train_test_split) would otherwise be indexed by label and
        # raise KeyError or return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(self.texts)} and {len(self.labels)}'
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return it as a dict of ``torch.long`` tensors.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        """
        text = self.texts[idx]
        # Calling the tokenizer directly replaces the deprecated `encode_plus`
        # entry point; the keyword arguments are the same as before.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap the train / validation frames as tokenizing datasets
train_dataset = TextDataset(
    texts=train_data['combined_text'].tolist(),
    labels=train_data['generated'].tolist(),
    tokenizer=tokenizer,
)
val_dataset = TextDataset(
    texts=val_data['combined_text'].tolist(),
    labels=val_data['generated'].tolist(),
    tokenizer=tokenizer,
)

# Batch both datasets in groups of 8; only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=8)


In [3]:
from transformers import DebertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Training configuration: one place for the values the rest of the cell uses
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5
SEED = 42

# Seed torch's global RNG so the freshly initialised classifier head and the
# DataLoader shuffling order are repeatable across runs
torch.manual_seed(SEED)

# Device selection: always a torch.device; CUDA first, then Apple MPS, then CPU.
# `torch.backends.mps` is looked up defensively because older torch builds lack it.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# Model initialization
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=2)
model.to(device)

# Optimizer and scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are pinned to the values the deprecated class used by default
# so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


def _run_epoch(model, loader, device, optimizer=None, scheduler=None):
    """Run one full pass over ``loader`` and return ``(mean_batch_loss, accuracy)``.

    If ``optimizer`` is given, the model is put in train mode and updated after
    every batch (``scheduler``, when given, is stepped right after the
    optimizer). Otherwise the model is put in eval mode and gradient tracking
    is disabled.

    The loss is averaged over batches (sum of batch losses / number of
    batches); the accuracy is averaged over examples.
    """
    is_training = optimizer is not None
    model.train(is_training)  # model.train(False) is the same as model.eval()

    loss_sum = 0.0
    n_correct = 0
    n_examples = 0

    with torch.set_grad_enabled(is_training):
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # 'labels' is part of the batch, so the model also returns the loss
            outputs = model(**batch)
            loss = outputs.loss

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

            loss_sum += loss.item()
            preds = torch.argmax(outputs.logits, dim=-1)
            n_correct += (preds == batch['labels']).sum().item()
            n_examples += batch['labels'].size(0)

    return loss_sum / len(loader), n_correct / n_examples


# Training loop with accuracy tracking
# NOTE(review): accuracy alone can look perfect when one class dominates the
# 'generated' column; confirm the class balance and consider per-class metrics.
for epoch in range(NUM_EPOCHS):
    train_loss, train_accuracy = _run_epoch(model, train_loader, device, optimizer, scheduler)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss}, Train Accuracy: {train_accuracy}')

    # Validation step
    val_loss, val_accuracy = _run_epoch(model, val_loader, device)
    print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')


Using device: mps


  return self.fget.__get__(instance, owner)()
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Train Loss: 0.04732897857142117, Train Accuracy: 0.9935483870967742
Validation Loss: 0.0017261037913461526, Validation Accuracy: 1.0
Epoch 2, Train Loss: 0.017934408784663725, Train Accuracy: 0.9975806451612903
Validation Loss: 0.0010734709018530946, Validation Accuracy: 1.0
Epoch 3, Train Loss: 0.017630333014418402, Train Accuracy: 0.9975806451612903
Validation Loss: 0.0013947509578429163, Validation Accuracy: 1.0
