In [1]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import DataLoader, random_split
from transformers import AdamW, get_scheduler
import torch
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random

# Define the class labels
class_labels = [
    "Company", "EducationalInstitution", "Artist", "Athlete", "OfficeHolder", "MeanOfTransportation",
    "Building", "NaturalPlace", "Village", "Animal", "Plant", "Album", "Film", "WrittenWork"
]

# Load tokenizer
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

# Load dataset
dataset = load_dataset("dbpedia_14")

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["content"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["content", "title"])

# Use the entire dataset for training, validation, and testing
full_train_dataset = tokenized_datasets["train"]
full_test_dataset = tokenized_datasets["test"]

# Create a validation set from the training set
val_size = int(0.1 * len(full_train_dataset))  # 10% for validation
train_size = len(full_train_dataset) - val_size
full_train_dataset, full_val_dataset = random_split(full_train_dataset, [train_size, val_size])

# Data collator for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [2]:
# DataLoader settings
batch_size = 128
train_dataloader = DataLoader(full_train_dataset, shuffle=True, batch_size=batch_size, collate_fn=data_collator)
val_dataloader = DataLoader(full_val_dataset, batch_size=batch_size, collate_fn=data_collator)
test_dataloader = DataLoader(full_test_dataset, batch_size=batch_size, collate_fn=data_collator)

# Load model
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=len(class_labels))

# Optimizer settings
optimizer = AdamW(model.parameters(), lr=5e-5)

# Scheduler settings
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Move model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [3]:
# Training loop
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/11814 [00:00<?, ?it/s]

In [6]:

# Validation and Test Evaluation
def evaluate_model(model, dataloader):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items() if k in ["input_ids", "attention_mask", "labels"]}
            outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            y_true.extend(batch['labels'].cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())
    return y_true, y_pred

# Evaluate on validation set
y_true_val, y_pred_val = evaluate_model(model, val_dataloader)
val_accuracy = accuracy_score(y_true_val, y_pred_val)
val_precision = precision_score(y_true_val, y_pred_val, average='weighted')
val_recall = recall_score(y_true_val, y_pred_val, average='weighted')
val_f1 = f1_score(y_true_val, y_pred_val, average='weighted')
val_error = 1 - val_accuracy

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation Error Rate: {val_error:.4f}")

# Evaluate on test set
y_true_test, y_pred_test = evaluate_model(model, test_dataloader)
test_accuracy = accuracy_score(y_true_test, y_pred_test)
test_precision = precision_score(y_true_test, y_pred_test, average='weighted')
test_recall = recall_score(y_true_test, y_pred_test, average='weighted')
test_f1 = f1_score(y_true_test, y_pred_test, average='weighted')
test_error = 1 - test_accuracy

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Error Rate: {test_error:.4f}")

Validation Accuracy: 0.9933
Validation Error Rate: 0.0067
Test Accuracy: 0.9929
Test Error Rate: 0.0071


In [7]:

# Random sample prediction
subset_size = 1000  # Define the subset size
random_test_indices = random.sample(range(len(full_test_dataset)), subset_size)
random_test_dataset = full_test_dataset.select(random_test_indices)
random_test_dataloader = DataLoader(random_test_dataset, batch_size=batch_size, collate_fn=data_collator)

# Evaluate on random sample
y_true_random, y_pred_random = evaluate_model(model, random_test_dataloader)
for true_label, pred_label in zip(y_true_random[:10], y_pred_random[:10]):
    print(f"True Label: {class_labels[true_label]}, Predicted Label: {class_labels[pred_label]}")

True Label: Film, Predicted Label: Film
True Label: NaturalPlace, Predicted Label: NaturalPlace
True Label: EducationalInstitution, Predicted Label: EducationalInstitution
True Label: Album, Predicted Label: Album
True Label: Artist, Predicted Label: Artist
True Label: Album, Predicted Label: Album
True Label: Artist, Predicted Label: Artist
True Label: EducationalInstitution, Predicted Label: EducationalInstitution
True Label: EducationalInstitution, Predicted Label: EducationalInstitution
True Label: Album, Predicted Label: Album
