In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
from transformers import BertTokenizer, BertModel

# Load pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

print("Model loaded successfully!")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model loaded successfully!


In [16]:
sentence = "Humans have evolved a lot with time."

# Tokenize the sentence
tokens = tokenizer.tokenize(sentence)
print(tokens)  # Output: ['transform', '##ers', 'are', 'revolution', '##izing', 'nl', '##p', '!']

['humans', 'have', 'evolved', 'a', 'lot', 'with', 'time', '.']


In [18]:
inputs = tokenizer(sentence, return_tensors='pt')  # Returns PyTorch tensors
print(inputs)

{'input_ids': tensor([[ 101, 4286, 2031, 7964, 1037, 2843, 2007, 2051, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [10]:
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states.shape)  # Shape: [batch_size, sequence_length, hidden_size]

torch.Size([1, 9, 768])


In [12]:
import torch
from transformers import BertForSequenceClassification

# Load a pre-trained model with a classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Example training loop (simplified)
epochs = 3  # Define number of epochs
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    
    # Dummy inputs and labels for demonstration purposes
    inputs = {
        'input_ids': torch.tensor([[101, 2023, 2003, 1037, 7099, 102]]),  # Example tokenized input
        'attention_mask': torch.tensor([[1, 1, 1, 1, 1, 1]])  # Attention mask
    }
    labels = torch.tensor([1])  # Example label (e.g., positive sentiment)

    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.8110669255256653
Epoch 2, Loss: 0.5848608016967773
Epoch 3, Loss: 0.43619900941848755


## Training a Custom Transformer

In [3]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig

class CustomTransformerConfig(PretrainedConfig):
    def __init__(self, vocab_size=30522, hidden_size=768, num_hidden_layers=12, 
                 num_attention_heads=12, intermediate_size=3072, max_position_embeddings=512, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings

class CustomTransformer(PreTrainedModel):
    config_class = CustomTransformerConfig

    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.positional_encoding = nn.Parameter(torch.zeros(1, config.max_position_embeddings, config.hidden_size))
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=config.hidden_size,
            nhead=config.num_attention_heads,
            dim_feedforward=config.intermediate_size,
            batch_first=True
        )
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=config.num_hidden_layers)
        self.classifier = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, input_ids, attention_mask=None):
        # Embedding + Positional Encoding
        embeddings = self.embedding(input_ids) + self.positional_encoding[:, :input_ids.size(1), :]
        
        # Create causal mask for autoregressive decoding
        seq_len = input_ids.size(1)
        causal_mask = torch.triu(torch.ones(seq_len, seq_len) * float('-inf'), diagonal=1).to(input_ids.device)
        
        # Pass through the transformer decoder
        transformer_output = self.transformer_decoder(tgt=embeddings, memory=embeddings, tgt_mask=causal_mask)
        
        # Classification head
        logits = self.classifier(transformer_output)
        return logits

In [None]:
#Preparing the Dataset
with open('data.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Clean and tokenize the data
dataset = [line.strip() for line in lines if line.strip()]
print(f"Loaded {len(dataset)} sentences.")

Loaded 6 sentences.


In [5]:
#Tokenization

from transformers import BertTokenizer

# Train a tokenizer on your dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_tokens(['<NEW_TOKEN>'])  # Add custom tokens if needed

# Tokenize the dataset
tokenized_data = [tokenizer.encode(sentence, max_length=512, truncation=True, padding='max_length') for sentence in dataset]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
#Setting Up the Training Loop
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

# Convert tokenized data to tensors
input_ids = torch.tensor(tokenized_data)
labels = torch.tensor([sentence[1:] + [tokenizer.pad_token_id] for sentence in tokenized_data])  # Shifted labels

# Create DataLoader
dataset = TensorDataset(input_ids, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Initialize model, optimizer, and loss function
model = CustomTransformer(CustomTransformerConfig())
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop
for epoch in range(3):  # Number of epochs
    model.train()
    total_loss = 0
    for batch_input_ids, batch_labels in dataloader:
        batch_input_ids, batch_labels = batch_input_ids.to(device), batch_labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_input_ids)
        loss = loss_fn(outputs.view(-1, outputs.size(-1)), batch_labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")

Epoch 1, Loss: 10.416277885437012
Epoch 2, Loss: 9.843629837036133
Epoch 3, Loss: 9.438532829284668


In [5]:
# Save the model and tokenizer
model.save_pretrained('./custom_transformer')
tokenizer.save_pretrained('./custom_transformer')

('./custom_transformer/tokenizer_config.json',
 './custom_transformer/special_tokens_map.json',
 './custom_transformer/vocab.txt',
 './custom_transformer/added_tokens.json')

In [6]:
model.save_pretrained('./custom_transformer')

In [7]:
tokenizer.save_pretrained('./custom_transformer')

('./custom_transformer/tokenizer_config.json',
 './custom_transformer/special_tokens_map.json',
 './custom_transformer/vocab.txt',
 './custom_transformer/added_tokens.json')

#### Case studies


In [47]:
# Re-run the medical text summarization with different models after execution state reset

from transformers import T5Tokenizer, T5ForConditionalGeneration

# List of models to test
models = [
    "t5-small",
    "t5-base",
    "t5-large",
    "google/flan-t5-large",
    "google/flan-t5-xl"
]

# Example medical text to summarize
input_text = "Medical summary: Patient diagnosed with hypertension. Prescribed amlodipine. Follow-up in 2 weeks."

# Store results
summaries = {}

for model_name in models:
    try:
        # Load tokenizer and model
        tokenizer = T5Tokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name)

        # Tokenize input
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids

        summary_ids = model.generate(
            input_ids, 
            max_length=30,  # Reduce output length for more concise summaries
            num_beams=7,  # Increase beam search candidates
            temperature=0.7,  # Reduce randomness
            top_k=50,  # Increase diversity
            top_p=0.9,  # Encourage more varied outputs
            repetition_penalty=2.0,  # Stronger penalty to avoid repetition
            early_stopping=True  # Stop when an optimal summary is found
        )

        # Decode output
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries[model_name] = summary

    except Exception as e:
        summaries[model_name] = f"Error: {str(e)}"

summaries


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

{'t5-small': 'Summary Medical summary: Patient diagnosed with hypertension. Prescribed amlodipine. Follow-up in 2 weeks.',
 't5-base': 'Amlodipine. Follow-up in 2 weeks.',
 't5-large': 'diagnosed with hypertension. Prescribed amlodipine. Follow-up in 2 weeks. Medical summary: Patient diagnosed with hypertension',
 'google/flan-t5-large': 'Patient diagnosed with hypertension. Prescribed amlodipine.',
 'google/flan-t5-xl': 'Amlodipine is prescribed to a patient with hypertension.'}

In [48]:
from transformers import BartTokenizer, BartForConditionalGeneration, PegasusTokenizer, PegasusForConditionalGeneration

# List of models to test
models = {
    "facebook/bart-large-cnn": ("BART", BartTokenizer, BartForConditionalGeneration),
    "google/pegasus-xsum": ("PEGASUS", PegasusTokenizer, PegasusForConditionalGeneration),
    "google/pegasus-large": ("PEGASUS", PegasusTokenizer, PegasusForConditionalGeneration),
}

# Example medical text to summarize
input_text = "Medical summary: Patient diagnosed with hypertension. Prescribed amlodipine. Follow-up in 2 weeks."

# Store results
summaries = {}

for model_name, (model_type, TokenizerClass, ModelClass) in models.items():
    try:
        print(f"Testing {model_type} model: {model_name}")
        
        # Load tokenizer and model
        tokenizer = TokenizerClass.from_pretrained(model_name)
        model = ModelClass.from_pretrained(model_name)

        # Tokenize input
        input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, padding="longest").input_ids

        # Generate summary with optimized parameters
        summary_ids = model.generate(
            input_ids, 
            max_length=30,  # Keep it short and precise
            num_beams=7,  # Increase beam search candidates
            temperature=0.7,  # Reduce randomness
            top_k=50,  # Encourage more variety in selection
            top_p=0.9,  # Encourage more diverse outputs
            repetition_penalty=2.0,  # Avoid repeating words
            early_stopping=True  # Stop when an optimal summary is found
        )

        # Decode output
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries[model_name] = summary

    except Exception as e:
        summaries[model_name] = f"Error: {str(e)}"

# Print summaries
for model, summary in summaries.items():
    print(f"\nModel: {model}\nSummary: {summary}")


Testing BART model: facebook/bart-large-cnn


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Testing PEGASUS model: google/pegasus-xsum


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Testing PEGASUS model: google/pegasus-large


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]


Model: facebook/bart-large-cnn
Summary: Medical summary: Patient diagnosed with hypertension. Prescribed amlodipine. Follow-up in 2 weeks. Back to Mail Online

Model: google/pegasus-xsum
Summary: The use of amlodipine in the treatment of hypertension: a case report.

Model: google/pegasus-large
Summary: Follow-up in 2 weeks.


In [50]:
from transformers import AutoTokenizer, AutoModel
from torch.nn.functional import cosine_similarity


# Load pre-trained BERT model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Encode product descriptions
product_1 = tokenizer("Wireless Bluetooth Headphones with Noise Cancellation", return_tensors="pt")
product_2 = tokenizer("Bluetooth Earbuds with Active Noise Cancelling", return_tensors="pt")

# Compute similarity score
embedding_1 = model(**product_1).last_hidden_state.mean(dim=1)
embedding_2 = model(**product_2).last_hidden_state.mean(dim=1)
##similarity_score = (embedding_1 @ embedding_2.T).item()


similarity_score = cosine_similarity(embedding_1, embedding_2).item()
print("Cosine Similarity Score:", similarity_score)

Cosine Similarity Score: 0.9022848606109619


In [52]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Use a pre-trained classification model
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Example transaction description
transaction = "Multiple high-value purchases from various other overseas locations in the last hour."
inputs = tokenizer(transaction, return_tensors="pt")

# Predict fraud probability
outputs = model(**inputs)
fraud_prob = outputs.logits.softmax(dim=1)[0, 1].item()

print("Fraud Probability:", fraud_prob)


Fraud Probability: 0.8868260383605957


### Hands-On Code: End-to-End Example

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [54]:
dataset = load_dataset("imdb")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [55]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [56]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [61]:
import os
from transformers import TrainingArguments

# Disable Weights & Biases API key prompt
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,
    num_train_epochs=1,  # Reduce epochs
    fp16=True,  # Mixed precision training
    logging_steps=500,  # Less frequent logging
    save_steps=1000,
    eval_steps=1000,
    evaluation_strategy="steps",
    gradient_accumulation_steps=2,  # Reduce GPU memory usage
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [60]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss
