<a href="https://www.kaggle.com/code/aabdollahii/bert-model-aigen-detection?scriptVersionId=280918694" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [11]:
import re
import unicodedata
import pandas as pd 
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch.nn as nn
from transformers import BertModel


In [12]:
# Load the DAIGT v2 training CSV from the attached Kaggle input dataset.
# The preview further down shows the columns: text, label, prompt_name, source, RDizzl3_seven.
df = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")

In [13]:
import re
import unicodedata

def preprocess_text(text):
    """Normalize one raw essay into a single-line string ready for tokenization.

    Steps, in order: NFKC Unicode normalization, removal of HTML tags and
    comments, removal of backslashes, and collapsing of every whitespace run
    (newlines, tabs, repeated spaces) into a single space.

    Args:
        text: The raw essay. Anything that is not a ``str`` (for example a
            NaN coming from an empty CSV cell) is treated as empty.

    Returns:
        The cleaned text without leading or trailing whitespace, or ``""``
        for non-string input.
    """
    # Guard against missing values so `.apply` never raises on a bad row.
    if not isinstance(text, str):
        return ""

    # NFKC folds compatibility characters (full-width letters, ligatures,
    # non-breaking spaces) into their canonical form. It does NOT turn curly
    # quotes into straight quotes; those pass through unchanged.
    text = unicodedata.normalize('NFKC', text)

    # Strip HTML comments and real tags only. A tag must start with a letter
    # (optionally preceded by '/'), so ordinary prose such as "x < 5 and y > 3"
    # is left intact, while tags that wrap across a newline are still removed.
    text = re.sub(r'<!--.*?-->|</?[A-Za-z][^<>]*>', '', text, flags=re.DOTALL)

    # Backslashes in this corpus are treated as escape artifacts and dropped.
    text = text.replace('\\', '')

    # A single pass over \s+ covers newlines, tabs, carriage returns and
    # repeated spaces alike, so no separate newline step is needed.
    return re.sub(r'\s+', ' ', text).strip()


# Store the cleaned essay in a new column; the raw `text` column is left untouched.
df['clean_text'] = df['text'].apply(preprocess_text)



In [14]:
# Preview the first rows to compare the raw `text` column with the new `clean_text` column.
df.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,clean_text
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False,Phones Modern humans today are always on their...
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False,This essay will explain if drivers should or s...
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False,Driving while the use of cellular devices Toda...
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False,Phones & Driving Drivers should not be able to...
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False,Cell Phone Operation While Driving The ability...


# In the tokenization step, I first use the WordPiece method; if it does not give good results, I will then try BPE.

In [21]:


# --- CONFIGURATION ---
# Hugging Face checkpoint name; the tokenizer and the classifier are both built from it.
MODEL_NAME = 'bert-base-cased'  # Using Cased to capture capitalization signals
# Token length of every encoded essay: the dataset pads shorter texts and truncates longer ones to this size.
MAX_LEN = 512
BATCH_SIZE = 16                 # Adjust based on your GPU RAM (8, 16, or 32). Good for the Kaggle environment.


class AI_Detection_Dataset(Dataset):
    """Map-style dataset that tokenizes one essay per lookup.

    Args:
        texts: Position-indexable collection of essays (e.g. a NumPy array).
        labels: Position-indexable collection of integer class labels,
            aligned with ``texts``.
        tokenizer: A callable Hugging Face-style tokenizer.
        max_len: Token length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of essays."""
        return len(self.texts)

    def __getitem__(self, item):
        """Encode the essay at position ``item``.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` (1-D tensors of
            length ``max_len``) and ``labels`` (a scalar ``torch.long``
            tensor).
        """
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: `encode_plus` is deprecated in
        # Transformers in favour of `__call__`, which accepts the same
        # keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,      # Adds [CLS] and [SEP]
            max_length=self.max_len,      # Upper bound on sequence length
            return_token_type_ids=False,  # Single-segment input; segment ids not needed
            padding='max_length',         # Pad shorter texts up to max_len
            truncation=True,              # Cut longer texts down to max_len
            return_attention_mask=True,
            return_tensors='pt',          # Tensors come back with a leading batch dim of 1
        )

        return {
            # flatten() drops the leading batch dimension added by return_tensors='pt'
            # so the DataLoader can stack samples into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def prepare_dataloaders(df):
    """Split ``df`` 80/20 and wrap both parts in tokenizing DataLoaders.

    Args:
        df: DataFrame providing a ``clean_text`` column (model input) and a
            ``label`` column (target, also used to stratify the split).

    Returns:
        ``(train_loader, val_loader)``. Only the training loader shuffles.
    """
    print(f"Loading Tokenizer: {MODEL_NAME}...")
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

    # Fixed seed plus stratification keeps the split and its class balance reproducible.
    df_train, df_val = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['label']
    )

    def _as_dataset(frame):
        # Hand NumPy arrays to the dataset so it indexes by position, not by DataFrame label.
        return AI_Detection_Dataset(
            texts=frame.clean_text.to_numpy(),
            labels=frame.label.to_numpy(),
            tokenizer=tokenizer,
            max_len=MAX_LEN,
        )

    # Settings shared by both loaders: two worker processes tokenize in the
    # background and batches are placed in pinned memory.
    loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)

    train_loader = DataLoader(_as_dataset(df_train), shuffle=True, **loader_kwargs)
    val_loader = DataLoader(_as_dataset(df_val), shuffle=False, **loader_kwargs)

    return train_loader, val_loader

# Build the train/validation loaders from the cleaned frame; the training cells below read these two globals.
train_dataloader, val_dataloader = prepare_dataloaders(df)
print("Loaders optimized and ready.")


Loading Tokenizer: bert-base-cased...
Loaders optimized and ready.


In [22]:
class BERTClassifier(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a two-class linear head.

    Args:
        model_name: Checkpoint identifier passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()

        # 1. Load the pre-trained encoder weights.
        self.bert = BertModel.from_pretrained(model_name)

        # 2. Dropout: zeroes 30% of the pooled features during training to limit overfitting.
        self.drop = nn.Dropout(p=0.3)

        # 3. Classification head with two outputs (Human vs AI). The input width is
        # read from the encoder config rather than hard-coded to 768, so checkpoints
        # with a different hidden size work as well.
        self.out = nn.Linear(in_features=self.bert.config.hidden_size, out_features=2)

    def forward(self, input_ids, attention_mask):
        """Return the raw class scores for a batch.

        Args:
            input_ids: Token ids, one row per text.
            attention_mask: Mask with the same shape as ``input_ids``.

        Returns:
            Unnormalized scores with two columns, one row per text.
        """
        # encoder_output[0] = per-token states, encoder_output[1] = pooled summary vector.
        encoder_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # The pooled vector is used because one decision is made for the whole text.
        pooled_output = encoder_output[1]

        return self.out(self.drop(pooled_output))

# --- INITIALIZATION ---
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = BERTClassifier(MODEL_NAME)
model = model.to(device)  # Move all parameters onto the selected device

# Report the device that was actually selected instead of always claiming "GPU".
print(f"Model initialized and moved to {'GPU' if device.type == 'cuda' else 'CPU'} successfully.")

Using device: cuda
Model initialized and moved to GPU successfully.


In [23]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
import numpy as np
from tqdm import tqdm

# --- HYPERPARAMETERS ---
EPOCHS = 3
LEARNING_RATE = 2e-5

# 1. Define Optimizer & Scheduler
# A single AdamW instance updates every parameter of the model (encoder and head) at the same base rate.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# train_epoch steps the scheduler once per batch, so the schedule length is
# batches-per-epoch * EPOCHS.
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # no warmup phase
    num_training_steps=total_steps
)

# 2. Define Loss Function
# Applied to the raw two-class scores returned by the model and the integer labels.
loss_fn = CrossEntropyLoss().to(device)

# --- TRAIN FUNCTION (Updated to use d['labels']) ---
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        loss_fn: Callable taking ``(scores, targets)`` and returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device each batch is moved to before the forward pass.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy as a float64 tensor and the mean
        of the per-batch losses.
    """
    model = model.train()
    batch_losses = []
    n_correct = 0

    for batch in tqdm(data_loader):
        targets = batch["labels"].to(device)
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )

        loss = loss_fn(logits, targets)
        predicted = torch.max(logits, dim=1).indices

        n_correct = n_correct + (predicted == targets).sum()
        batch_losses.append(loss.item())

        # Backward pass, then clip the global gradient norm to 1.0 before updating.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        # Clear gradients so the next batch starts from zero.
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    mean_loss = np.mean(batch_losses)
    return accuracy, mean_loss

# --- EVAL FUNCTION (Updated to use d['labels']) ---
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        loss_fn: Callable taking ``(scores, targets)`` and returning a scalar loss.
        device: Device each batch is moved to before the forward pass.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy as a float64 tensor and the mean
        of the per-batch losses.
    """
    model = model.eval()
    batch_losses = []
    n_correct = 0

    # Gradient tracking is disabled for the whole pass.
    with torch.no_grad():
        for batch in data_loader:
            targets = batch["labels"].to(device)
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )

            loss = loss_fn(logits, targets)
            predicted = torch.max(logits, dim=1).indices

            n_correct = n_correct + (predicted == targets).sum()
            batch_losses.append(loss.item())

    accuracy = n_correct.double() / n_examples
    mean_loss = np.mean(batch_losses)
    return accuracy, mean_loss

# --- EXECUTE TRAINING ---
# Per-epoch metrics, one list per series.
history = {key: [] for key in ('train_acc', 'train_loss', 'val_acc', 'val_loss')}

print("Starting training...")

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # Optimisation pass; accuracy is normalised by the size of the training dataset.
    train_acc, train_loss = train_epoch(
        model, train_dataloader, loss_fn, optimizer, device, scheduler,
        len(train_dataloader.dataset),
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Validation pass on the held-out split.
    val_acc, val_loss = eval_model(
        model, val_dataloader, loss_fn, device,
        len(val_dataloader.dataset),
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    # Record this epoch's metrics.
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Overwrite the checkpoint after every epoch, so the file always holds the latest weights.
    torch.save(model.state_dict(), 'bert_model_state.bin')


Starting training...
Epoch 1/3
----------


  1%|          | 24/2244 [00:44<1:08:42,  1.86s/it]


KeyboardInterrupt: 

In [None]:
# NOTE(review): this cell repeats the training loop of the previous cell. The
# scheduler was sized for a single run of EPOCHS epochs (total_steps), so if the
# previous cell already ran to completion, re-create the optimizer and scheduler
# before running this one.
history = {
    'train_acc': [],
    'train_loss': [],
    'val_acc': [],
    'val_loss': []
}

print("Starting training...")

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # 1. Train
    # The split frames are local to prepare_dataloaders and do not exist at
    # notebook scope, so the sample count is taken from the loader's dataset.
    train_acc, train_loss = train_epoch(
        model,
        train_dataloader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_dataloader.dataset)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    # 2. Validate
    val_acc, val_loss = eval_model(
        model,
        val_dataloader,
        loss_fn,
        device,
        len(val_dataloader.dataset)
    )

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    # Store history
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Save the model state after every epoch (overwrites the previous file)
    torch.save(model.state_dict(), 'bert_model_state.bin')
