In [1]:
# !pip install evaluate

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import pandas as pd
import numpy as np
import os
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DistilBertForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
import evaluate

In [4]:
# Commented out to avoid memory error
#os.environ["TOKENIZERS_PARALLELISM"] = "true"
#torch.set_float32_matmul_precision("high")

In [5]:
#df = pd.read_csv('data/data.csv').sample(frac=0.05, random_state=42).reset_index(drop=True)
df = pd.read_parquet('/content/drive/MyDrive/human_v_machine_data/data.parquet')
df = df.sample(frac=0.05, random_state=42).reset_index(drop=True)
df = df.rename(columns={"source": "label"})
df["label"] = df["label"].map(lambda x: 1 if x == "Human" else 0)

In [6]:
df.shape

(39446, 5)

In [7]:
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [8]:
train_dataset = train_dataset.remove_columns(
    ['prompt_id', 'text_length', 'word_count', '__index_level_0__']
)

test_dataset = test_dataset.remove_columns(
    ['prompt_id', 'text_length', 'word_count', '__index_level_0__']
)

In [9]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length")

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset  = test_dataset.map(tokenize, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/35501 [00:00<?, ? examples/s]

Map:   0%|          | 0/3945 [00:00<?, ? examples/s]

In [10]:
train_dataset = train_dataset.remove_columns(['text'])

test_dataset = test_dataset.remove_columns(['text'])

In [11]:
train_dataset.set_format(type=None)
test_dataset.set_format(type=None)

In [12]:
class SafeDistilBert(DistilBertForSequenceClassification):
    def forward(self, *args, num_items_in_batch=None, **kwargs):
        return super().forward(*args, **kwargs)


model = SafeDistilBert.from_pretrained(
    model_name,
    num_labels=2
)

# Detect if CUDA is available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# model = torch.compile(model) # Temporarily commented out

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of SafeDistilBert were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SafeDistilBert(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): L

In [13]:
args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # Reduced batch size
    per_device_eval_batch_size=8,  # Reduced batch size
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1", # Corrected to eval_f1
    greater_is_better=True, # Added as a higher f1 is better
    gradient_accumulation_steps=1,
    fp16=False, # Changed to False to debug CUDA illegal memory access error
    dataloader_num_workers=0,
    remove_unused_columns=False,
    report_to='none' # Disable Weights & Biases logging
)

metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    print("compute_metrics called!")
    logits, labels = eval_pred
    print(f"Shape of logits: {logits.shape}, Shape of labels: {labels.shape}")
    preds = logits.argmax(axis=1)
    result = metric.compute(predictions=preds, references=labels)
    print(f"Result from metric.compute: {result}")
    return result

Downloading builder script: 0.00B [00:00, ?B/s]

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator, # Explicitly added data collator
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.2109,No log


KeyError: "The `metric_for_best_model` training argument is set to 'eval_f1', which is not found in the evaluation metrics. The available evaluation metrics are: []. Consider changing the `metric_for_best_model` via the TrainingArguments."

In [None]:
result = trainer.evaluate()
print(result)