In [1]:
pip install transformers datasets torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [10]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [11]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Create small dataset
data = {
    "text": [
        "I love this product!", "This is terrible.", "Absolutely fantastic experience.",
        "Worst thing I have ever bought.", "Not bad at all.", "Horrible customer service.",
        "Really good value for money.", "I wouldn’t recommend it.", "Exceeded my expectations.",
        "Complete waste of money."
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
}
df = pd.DataFrame(data)

# Split dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(df["text"], df["label"], test_size=0.2)

train_dataset = Dataset.from_pandas(pd.DataFrame({'text': train_texts, 'label': train_labels}))
val_dataset = Dataset.from_pandas(pd.DataFrame({'text': val_texts, 'label': val_labels}))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function)
val_dataset = val_dataset.map(tokenize_function)

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Remove unsupported arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs'
)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='binary')
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


Evaluation Results: {'eval_loss': 0.6720300912857056, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_precision': 0.5, 'eval_recall': 1.0, 'eval_runtime': 4.7042, 'eval_samples_per_second': 0.425, 'eval_steps_per_second': 0.213, 'epoch': 3.0}


In [12]:
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb logging

from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# 1. Prepare a small custom corpus
texts = [
    "Transformers are amazing tools for natural language processing.",
    "BERT is a transformer-based model developed by Google.",
    "Masked language modeling helps BERT learn bidirectional context.",
    "Fine-tuning allows us to adapt pre-trained models to specific tasks.",
    "Large language models like BERT can perform multiple NLP tasks."
]

# Convert to pandas DataFrame
df = pd.DataFrame({'text': texts})
dataset = Dataset.from_pandas(df)

# 2. Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 3. Tokenization function
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=64)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 4. Data collator for masked language modeling (MLM)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# 5. Load pre-trained BERT model for masked language modeling
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# 6. Set up training arguments
training_args = TrainingArguments(
    output_dir="./bert-mlm",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=5
)

# 7. Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# 8. Train the model
trainer.train()

# 9. Save the model and tokenizer
model.save_pretrained("./bert-mlm")
tokenizer.save_pretrained("./bert-mlm")


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
5,2.4798


('./bert-mlm/tokenizer_config.json',
 './bert-mlm/special_tokens_map.json',
 './bert-mlm/vocab.txt',
 './bert-mlm/added_tokens.json')

In [17]:
pip install datasets transformers scikit-learn




In [18]:
!pip install datasets transformers scikit-learn




In [20]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.4


In [23]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=8f669f752a67a0065c7900cec980905ba0ad5984cfaeeaa954e2111ae833bebf
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [48]:
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    TrainingArguments, Trainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset, DatasetDict
import evaluate
import numpy as np
import torch
from transformers import Seq2SeqTrainingArguments

# 1. Define custom travel itinerary data
custom_data = [
    {
        "query": "Please create a travel plan that starts in Charleston and leads to St. Louis over a span of three days, from March 16th to March 18th, 2022. This trip is designed for one individual with a budget of $900.",
        "reference_information": [
            {
                "Description": "Travel Plan",
                "Content": "Flight from Charleston to St. Louis on Mar 16, return on Mar 18. Accommodation at Budget Inn STL for 2 nights ($210). Visit Gateway Arch ($16), City Museum ($18), and Forest Park (free). Total estimated cost: $886."
            }
        ]
    },
    {
        "query": "Please plan a trip for me starting from Sarasota to Chicago for 3 days, from March 22nd to March 24th, 2022. The budget for this trip is set at $1,900.",
        "reference_information": [
            {
                "Description": "Travel Plan",
                "Content": "Flight from Sarasota to Chicago on Mar 22 (F3984576) for $279, return on Mar 24 (F4010566) for $368. Stay at Big bedroom in Astoria ($110 for 2 nights). Visit Millennium Park, Navy Pier, and Shedd Aquarium ($40). Estimated cost: $1147."
            }
        ]
    },
    {
        "query": "Seeking assistance to develop a travel itinerary for a 3-day trip for one person. The trip will begin in Washington, with Tampa as the destination from March 25th through March 27th, 2022. The budget for this journey is $1,800.",
        "reference_information": [
            {
                "Description": "Travel Plan",
                "Content": "Flight from Washington to Tampa on Mar 25 ($173), return on Mar 27 ($187). Stay at Bright duplex apartment ($164 for 2 nights). Visit Busch Gardens ($110), The Florida Aquarium ($32), and Tampa Bay History Center ($15). Estimated cost: $981."
            }
        ]
    }
]

# 2. Convert to Hugging Face Dataset
dataset = Dataset.from_list(custom_data)
dataset = dataset.train_test_split(test_size=0.33, seed=42)

# 3. Load tokenizer and model
model_checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

# 4. Preprocessing
max_input_length = 512
max_target_length = 1024

def preprocess(example):
    inputs = "generate travel plan: " + example["query"]
    targets = "\n".join([ref["Content"] for ref in example["reference_information"]])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 5. Tokenize
tokenized_dataset = dataset.map(preprocess)

# 6. Metric
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Ensure predictions is the tensor/array of logits
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Convert logits to predicted token IDs by taking the argmax along the last dimension
    if isinstance(predictions, np.ndarray):
        predictions = np.argmax(predictions, axis=-1)
    elif torch.is_tensor(predictions):
        predictions = torch.argmax(predictions, dim=-1).cpu().numpy()

    # Replace -100 with pad_token_id before decoding labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)


    # Convert predictions and labels to lists for batch_decode if they are not already
    if isinstance(predictions, np.ndarray):
        predictions = predictions.tolist()
    if isinstance(labels, np.ndarray):
        labels = labels.tolist()


    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)


    # rouge.compute expects lists of strings
    return rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)


# 7. Training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-travelplanner",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=500,
    save_steps=1000,
    eval_steps=1000,
    save_total_limit=2,
    fp16=False  # Set True only if you're on a compatible GPU
)

# 8. Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 9. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train
trainer.train()

# 11. Evaluate
results = trainer.evaluate()
print("Evaluation Results:", results)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


Evaluation Results: {'eval_loss': 24.69863510131836, 'eval_rouge1': 0.2028985507246377, 'eval_rouge2': 0.029850746268656716, 'eval_rougeL': 0.14492753623188406, 'eval_rougeLsum': 0.14492753623188406, 'eval_runtime': 4.2522, 'eval_samples_per_second': 0.235, 'eval_steps_per_second': 0.235, 'epoch': 3.0}
