In [None]:
!pip install --upgrade datasets transformers

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [None]:
import json
from datasets import Dataset
import random

def load_json_safely(filepath, label):
    """Read a JSON array of strings and attach a class label to each one.

    Args:
        filepath: Path to a JSON file whose top-level value is a list.
        label: Value stored under the "label" key of every returned record.

    Returns:
        A list of ``{"text": <str>, "label": label}`` dicts in file order.
        List entries that are not strings are skipped. An empty list is
        returned (after printing a message) when the file is not valid JSON
        or when its top-level value is not a list.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        try:
            raw = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Error loading {filepath}: {e}")
            return []

    # Iterating a dict would yield its keys and iterating a string its
    # characters; neither is a list of documents, so reject them up front.
    if not isinstance(raw, list):
        print(f"Error loading {filepath}: expected a JSON list, got {type(raw).__name__}")
        return []

    return [
        {"text": entry, "label": label}
        for entry in raw
        if isinstance(entry, str)  # keep only string entries
    ]

# Load both datasets safely (label 1 = AI-generated, label 0 = human-written)
ai_data = load_json_safely("ai_generated.json", label=1)
human_data = load_json_safely("human_written.json", label=0)

# load_json_safely returns an empty list when a file cannot be parsed; stop
# here instead of silently training on a single class.
if not ai_data or not human_data:
    raise ValueError(
        f"Expected examples for both classes, got {len(ai_data)} AI and {len(human_data)} human"
    )

# Combine and shuffle with a dedicated, seeded generator so the order is
# reproducible and the global `random` state is left untouched
full_data = ai_data + human_data
random.Random(42).shuffle(full_data)

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(full_data)

In [None]:
# Hold out 20% of the examples for evaluation. The fixed seed keeps the
# train/test membership identical across runs.
# NOTE: `dataset` is rebound from a Dataset to the split result, so this cell
# is meant to be run once per kernel session.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']

In [None]:
from collections import Counter
# Class balance of the training split. Reading the label column directly
# avoids building a dict (including the text) for every row.
print(Counter(train_dataset['label']))

Counter({1: 305926, 0: 182687})


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.optim import AdamW
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Tokenization: load the tokenizer that matches the checkpoint being fine-tuned
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(batch):
    """Tokenize the "text" column of a batch to a fixed length of 128 tokens.

    Args:
        batch: Mapping with a "text" entry holding the strings to encode
            (this is what ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        batch["text"],
        # Pad every example to max_length. With padding=True each map batch is
        # padded only to its own longest item, so rows coming from different
        # map batches can differ in length and cannot be stacked by collate_fn.
        padding="max_length",
        truncation=True,
        max_length=128,
        # No return_tensors here: the dataset is switched to torch format
        # after mapping, which is where the tensor conversion happens.
    )

# Tokenize both splits in batches
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Expose only the columns the model consumes, as torch tensors
model_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=model_columns)


Map:   0%|          | 0/488613 [00:00<?, ? examples/s]

Map:   0%|          | 0/122154 [00:00<?, ? examples/s]

In [None]:
# Model setup: load the two-class checkpoint and place it on the GPU when one
# is available, otherwise on the CPU
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# Create dataloaders with collate_fn
def collate_fn(batch):
    """Merge a list of per-example dicts into one batch dict for the model.

    Each example supplies 'input_ids', 'attention_mask' and 'label'. The first
    two are stacked along a new leading dimension; the labels are gathered
    into a single int64 tensor stored under the key 'labels'.
    """
    token_ids = []
    masks = []
    targets = []
    for example in batch:
        token_ids.append(example['input_ids'])
        masks.append(example['attention_mask'])
        targets.append(example['label'])

    collated = {}
    collated['input_ids'] = torch.stack(token_ids)
    collated['attention_mask'] = torch.stack(masks)
    # The model call takes the targets under 'labels', not 'label'
    collated['labels'] = torch.tensor(targets, dtype=torch.long)
    return collated

# Training batches are drawn in shuffled order; evaluation uses larger batches
# and no shuffling
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
eval_loader = DataLoader(test_dataset, batch_size=128, collate_fn=collate_fn)

# Optimizer over all model parameters
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Fine-tune for three epochs, scoring the held-out split after each one
for epoch in range(3):
    # --- training pass ---
    model.train()
    step_losses = []
    progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch+1}")

    for batch in progress_bar:
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**on_device).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Read the scalar once and reuse it for the running total and the bar
        step_loss = loss.item()
        step_losses.append(step_loss)
        progress_bar.set_postfix(loss=step_loss)

    train_loss = sum(step_losses)

    # --- evaluation pass (no gradients tracked) ---
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(eval_loader, desc=f"Evaluating Epoch {epoch+1}"):
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            eval_loss += model(**on_device).loss.item()

    # Report the per-batch mean loss of each pass
    print(f"\nEpoch {epoch+1}:")
    print(f"Train Loss: {train_loss/len(train_loader):.4f}")
    print(f"Eval Loss: {eval_loss/len(eval_loader):.4f}")

# Persist the fine-tuned weights and the tokenizer side by side
model.save_pretrained("./results")
tokenizer.save_pretrained("./results")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Epoch 1:   0%|          | 0/7635 [00:00<?, ?it/s]

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

# Reload the fine-tuned model and its tokenizer from the directory written
# after training
saved_dir = "./results"
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)

# Use the GPU when one is available, otherwise the CPU, and switch the model
# to evaluation mode for inference
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
model.eval()

def predict(text, max_length=128):
    """Classify a single text with the module-level model and tokenizer.

    Args:
        text: The string to classify.
        max_length: Token budget used for truncation. Defaults to 128, the
            same limit the training-time ``tokenize`` step in this notebook
            applies, so inference sees inputs prepared like the training data.

    Returns:
        ``(predicted_label, confidence)``: the index of the highest-scoring
        class and its softmax probability as a Python float.
    """
    # Tokenize input with the same truncation limit as training
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    ).to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits

    # Turn logits into probabilities and pick the most likely class
    probs = F.softmax(logits, dim=1)
    predicted_label = torch.argmax(probs, dim=1).item()
    confidence = probs[0][predicted_label].item()

    return predicted_label, confidence


text = "Offshore tax shelters are financial arrangements that are set up in countries other than the one where the individual or company resides, with the goal of reducing or eliminating tax liability. There are several ways in which wealthy individuals or companies can move their money offshore without getting taxed on it, but it's important to note that these activities may be illegal or unethical, depending on the specific circumstances and the laws of the countries involved.\\n\\nOne way to move money offshore is to set up a company or trust in a country with favorable tax laws, such as a low tax rate or tax-free status. This company or trust can then be used to hold financial assets, such as stocks, bonds, or real estate, and the income generated from these assets may not be subject to tax in the country where the individual or company resides.\\n\\nAnother way to move money offshore is to use financial instruments, such as offshore bank accounts or shell corporations, to conceal the ownership of assets and the source of income. These instruments can be used to make it difficult or impossible for authorities in the individual's home country to track and tax the income.\\n\\nThere may be restrictions on using the money that is moved offshore, depending on the specific arrangement and the laws of the country where the money is held. For example, there may be limits on how the money can be spent or invested, or there may be reporting requirements that must be followed.\\n\\nIt's important to note that offshore tax shelters are often used to avoid paying taxes, which is illegal in many countries. It's also important to be aware that offshore tax shelters may be used for illicit purposes, such as money laundering or financing terrorism."
# Classify the sample passage; label ids follow the data-loading cell
# (1 = AI-generated, 0 = human-written)
label, confidence = predict(text)

# Report the predicted class id and its softmax probability to four decimals
print(f"Predicted label: {label} with confidence {confidence:.4f}")

Predicted label: 0 with confidence 1.0000


In [None]:
from google.colab import files

# Download the weights file through Colab. Only model.safetensors is fetched;
# the other files saved to ./results (e.g. the tokenizer) are not included.
# NOTE(review): reloading via from_pretrained presumably needs those too — confirm.
files.download("./results/model.safetensors")