In [2]:
import re
import unicodedata
import pandas as pd 
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch.nn as nn
from transformers import BertModel


2025-11-21 07:01:15.186504: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763708475.211067     107 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763708475.218328     107 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Read the DAIGT v2 training CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv",
)

In [None]:
import re
import unicodedata

def preprocess_text(text):
    """Normalize one raw text sample into a single-line, whitespace-clean string.

    Steps, in order:
      1. Non-string input (for example NaN or None) becomes "".
      2. Unicode NFKC normalization. This folds compatibility forms such as
         ligatures and full-width characters into their plain equivalents.
         It does NOT turn curly quotes into straight quotes.
      3. HTML-like tags (<br>, </p>, <div class="x">) are removed.
      4. Backslashes, treated as escape artifacts, are dropped.
      5. Every run of whitespace (spaces, newlines, tabs, carriage returns)
         collapses to a single space and the ends are trimmed.

    Args:
        text: The raw value taken from the text column.

    Returns:
        The cleaned string, or "" when `text` is not a `str`.
    """
    # 1. Safety check: missing values arrive from pandas as float NaN / None.
    if not isinstance(text, str):
        return ""

    # 2. Unicode normalization to a single canonical/compatibility form.
    text = unicodedata.normalize('NFKC', text)

    # 3. Remove HTML tags. Only spans that look like a tag are matched:
    #    '<', an optional '/', a letter, then anything up to the closing '>'.
    #    A bare `<.*?>` would also swallow ordinary prose such as
    #    "5 < 10 and 20 > 3", silently deleting real content.
    text = re.sub(r'</?[A-Za-z][^<>]*>', '', text)

    # 4. Drop stray backslashes that are likely escape artifacts.
    text = text.replace('\\', '')

    # 5. Collapse all whitespace. `\s` already covers \n, \t and \r, so no
    #    separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text).strip()

    return text


# Store the cleaned version of every row's text next to the raw column.
df['clean_text'] = df['text'].map(preprocess_text)



In [None]:
# Preview the first five rows to sanity-check the new 'clean_text' column.
df.head(n=5)

# Tokenization plan: start with the WordPiece method; if the results are not good, switch to BPE.

In [None]:


# --- CONFIGURATION ---
# Module-level settings read by prepare_dataloaders and the model initialization.
MODEL_NAME = 'bert-base-cased'  # Using Cased to capture capitalization signals
MAX_LEN = 512  # Maximum sequence length handed to the dataset (pad/truncate target)
BATCH_SIZE = 16                 # Adjust based on your GPU RAM (8, 16, or 32). 16 works well in the Kaggle environment.


class AI_Detection_Dataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable collection of raw texts, accessed by integer position.
        labels: Indexable collection of integer class labels aligned with `texts`.
        tokenizer: Tokenizer object invoked through its ``__call__`` interface
            (a Hugging Face tokenizer in this notebook).
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples (the length of `texts`)."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the sample at position `item`.

        Returns:
            dict with
              - 'input_ids': 1-D tensor of token ids,
              - 'attention_mask': 1-D tensor marking real tokens vs. padding,
              - 'labels': scalar ``torch.long`` tensor holding the class label.
        """
        # str() guards against non-string entries (e.g. numbers) in the text array.
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: `encode_plus` is documented as deprecated
        # in favour of `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,      # Adds [CLS] and [SEP]
            max_length=self.max_len,      # Upper bound on sequence length
            padding='max_length',         # Pad shorter texts up to max_length
            truncation=True,              # Cut longer texts down to max_length
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',          # Tensors come back with a leading batch dim of 1
        )

        return {
            # flatten() drops the leading batch dimension so the DataLoader
            # can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def prepare_dataloaders(df):
    """Build the training and validation DataLoaders from a labelled frame.

    The frame is split 80/20 with stratification on its 'label' column
    (fixed random_state=42), and each part is wrapped in an
    AI_Detection_Dataset that reads the 'clean_text' and 'label' columns.
    Tokenizer name, sequence length and batch size come from the
    module-level MODEL_NAME, MAX_LEN and BATCH_SIZE settings.

    Args:
        df: DataFrame with 'clean_text' and 'label' columns.

    Returns:
        (train_loader, val_loader)
    """
    print(f"Loading Tokenizer: {MODEL_NAME}...")
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

    # Stratifying keeps the class ratio the same in both partitions.
    train_frame, val_frame = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['label']
    )

    print(f"Training Samples: {len(train_frame)}")
    print(f"Validation Samples: {len(val_frame)}")

    def build_dataset(frame):
        # Both partitions are wrapped the same way; only the rows differ.
        return AI_Detection_Dataset(
            texts=frame['clean_text'].to_numpy(),
            labels=frame['label'].to_numpy(),
            tokenizer=tokenizer,
            max_len=MAX_LEN,
        )

    training_set = build_dataset(train_frame)
    validation_set = build_dataset(val_frame)

    # Training batches are reshuffled every epoch; validation order stays fixed.
    train_loader = DataLoader(
        training_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=0
    )
    val_loader = DataLoader(validation_set, batch_size=BATCH_SIZE, shuffle=False)

    return train_loader, val_loader

# --- USAGE ---
# `df` must already contain the 'clean_text' and 'label' columns (built in the preprocessing step).
# train_loader, val_loader = prepare_dataloaders(df)


In [None]:
class BERTClassifier(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a 2-way linear head.

    Args:
        model_name: Checkpoint identifier passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()

        # 1. Load the pre-trained BERT encoder.
        self.bert = BertModel.from_pretrained(model_name)

        # 2. Dropout: zeroes 30% of the pooled features during training
        #    to reduce overfitting.
        self.drop = nn.Dropout(p=0.3)

        # 3. Classification head. The input width is read from the loaded
        #    checkpoint's config instead of being hard-coded to 768, so larger
        #    or smaller BERT variants work without a shape mismatch.
        #    The 2 outputs are the two classes (Human vs AI).
        self.out = nn.Linear(
            in_features=self.bert.config.hidden_size,
            out_features=2,
        )

    def forward(self, input_ids, attention_mask):
        """Compute class scores for a batch of tokenized texts.

        Args:
            input_ids: Token id tensor, expected shape (batch, seq_len).
            attention_mask: Mask tensor with the same shape as `input_ids`.

        Returns:
            Tensor of unnormalized scores with one row per sample and 2 columns.
        """
        # A. Run the encoder.
        #    encoder_out[0] = per-token hidden states
        #    encoder_out[1] = pooled summary vector for the whole input
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Positional indexing works for both tuple and ModelOutput returns.
        pooled_output = encoder_out[1]

        # B. Regularize, then C. project to the two class scores.
        return self.out(self.drop(pooled_output))

# --- INITIALIZATION ---
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Build the classifier and place all of its parameters on the selected device.
model = BERTClassifier(MODEL_NAME)
model = model.to(device)

# Report the device actually used; the run may be on CPU, not a GPU.
print(f"Model initialized and moved to {device} successfully.")