In [1]:
# Import necessary libraries
import os
import torch
import numpy as np
import pickle
import math
from transformers import (
    BertTokenizer, BertConfig, BertForMaskedLM, Trainer, TrainingArguments, 
    DataCollatorForLanguageModeling, PreTrainedTokenizerFast, BertForQuestionAnswering, 
    BertForTokenClassification, BertTokenizerFast, DataCollatorForTokenClassification
)
from datasets import load_dataset, load_from_disk, DatasetDict
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score


# Disable W&B logging if not needed
os.environ["WANDB_DISABLED"] = "true"  # Uncomment to disable W&B logging


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Base Model and Tokenizer
tokenizer_type = "Unigram"
tokenizer_file = "tokenizers/unigram-tokenizer.json"
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'mask_token': '[MASK]', 'unk_token': '[UNK]'})

base_model_files = tokenizer_type + "_BERT_Base"
print(base_model_files)

Unigram_BERT_Base


In [31]:
# Finetuning for Text Generation

# Load the dataset
dataset = load_dataset('euclaise/writingprompts')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['story'], padding='max_length', truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define model configuration
config = BertConfig(
    vocab_size=30000,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=512,
    max_position_embeddings=128,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12
)

# Load the pretrained model
model = BertForMaskedLM.from_pretrained(base_model_files, config=config, ignore_mismatched_sizes=True)

# Resize token embeddings
model.resize_token_embeddings(len(tokenizer))

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    save_strategy='no',
    logging_strategy='epoch'
)

# Create a data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained(f"{tokenizer_type}_BERT_TextGeneration")


Map: 100%|██████████| 15620/15620 [00:16<00:00, 948.77 examples/s]
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,5.2315,4.394095
2,4.4873,4.088841
3,4.3078,4.009439


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [32]:
# Evaluate Text Generation

# Load the fine-tuned model
model = BertForMaskedLM.from_pretrained(f"{tokenizer_type}_BERT_TextGeneration")

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the dataset
dataset = load_dataset('euclaise/writingprompts')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['story'], padding='max_length', truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Function to calculate perplexity
def calculate_perplexity(text):
    inputs = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the same device as the model
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs['input_ids'])
    loss = outputs.loss
    perplexity = math.exp(loss.item())
    return perplexity


# Calculate perplexity for the test dataset
perplexities = []
for example in tokenized_datasets['test']:
    text = example['story']
    perplexity = calculate_perplexity(text)
    perplexities.append(perplexity)

# Calculate average perplexity
average_perplexity = sum(perplexities) / len(perplexities)
print(f"Average Perplexity: {average_perplexity}")


Average Perplexity: 2.0459924432245025


In [4]:
# Fine-tuning for Question-Answering

# Load the pretrained model
model = BertForQuestionAnswering.from_pretrained(base_model_files)

# Load the dataset
dataset = load_dataset("squad")

# Tokenize the dataset
def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples['question'],
        examples['context'],
        truncation=True,
        padding='max_length',
        max_length=128,
        return_offsets_mapping=True
    )
    start_positions = []
    end_positions = []
    for i, offsets in enumerate(tokenized_inputs['offset_mapping']):
        start_char = examples['answers'][i]['answer_start'][0]
        end_char = start_char + len(examples['answers'][i]['text'][0])
        sequence_ids = tokenized_inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)
        
        # Find the start and end token positions
        start_token_pos = None
        end_token_pos = None
        for j, (start, end) in enumerate(offsets):
            if start <= start_char < end:
                start_token_pos = j
            if start < end_char <= end:
                end_token_pos = j
        if start_token_pos is None or end_token_pos is None:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_positions.append(start_token_pos)
            end_positions.append(end_token_pos)
    tokenized_inputs['start_positions'] = start_positions
    tokenized_inputs['end_positions'] = end_positions
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='no',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Train the model
trainer.train()

# Save Model
model.save_pretrained(f"{tokenizer_type}_BERT_QnsAns")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at Unigram_BERT_Base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 87599/87599 [00:26<00:00, 3333.66 examples/s]
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,3.2161,3.147525
2,3.0612,3.030453
3,2.9869,2.955954
4,2.9192,2.927953
5,2.9001,2.916831


In [5]:
# Evaluation for Question Answering

import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering
from datasets import load_dataset
from sklearn.metrics import f1_score

# Load the fine-tuned model and tokenizer
model = BertForQuestionAnswering.from_pretrained(f"{tokenizer_type}_BERT_QnsAns")

# Load the SQuAD 1.1 dataset
dataset = load_dataset('squad')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['question'], examples['context'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Function to make predictions
def make_predictions(examples):
    inputs = tokenizer(examples['question'], examples['context'], return_tensors='pt', truncation=True, padding='max_length', max_length=128)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    start_index = torch.argmax(start_logits, dim=1).item()
    end_index = torch.argmax(end_logits, dim=1).item()
    return tokenizer.decode(inputs['input_ids'][0][start_index:end_index+1])

# Make predictions on the validation set
predictions = []
references = []
for example in tokenized_datasets['validation']:
    prediction = make_predictions(example)
    predictions.append(prediction)
    references.append(example['answers']['text'][0])  # Assuming single answer per question

# Compute Exact Match (EM)
def compute_exact_match(predictions, references):
    return sum([1 if pred == ref else 0 for pred, ref in zip(predictions, references)]) / len(references)

# Compute F1-Score
def compute_f1(predictions, references):
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        common_tokens = set(pred_tokens) & set(ref_tokens)
        if len(common_tokens) == 0:
            f1_scores.append(0)
        else:
            precision = len(common_tokens) / len(pred_tokens)
            recall = len(common_tokens) / len(ref_tokens)
            f1_scores.append(2 * (precision * recall) / (precision + recall))
    return sum(f1_scores) / len(f1_scores)

# Calculate metrics
exact_match = compute_exact_match(predictions, references)
f1 = compute_f1(predictions, references)

print(f"Exact Match: {exact_match}")
print(f"F1-Score: {f1}")

Map: 100%|██████████| 87599/87599 [00:15<00:00, 5749.95 examples/s]


Exact Match: 0.005676442762535478
F1-Score: 0.006110285173672118


In [26]:
from transformers import BertForQuestionAnswering, PreTrainedTokenizerFast, pipeline
# Create a question answering pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Define the context and question
context = "Hugging Face is a company based in New York."
question = "Where is Hugging Face based?"

# Get the answer
result = qa_pipeline(question=question, context=context)

# Print the answer
print(f"Answer: {result['answer']}")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Answer: Hugging Face


In [27]:
# Finetuning for Named Entity Recognition

# Load the CoNLL-2003 dataset
dataset = load_dataset("conll2003")

# Tokenize the dataset and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, padding='max_length', max_length=128, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Define model configuration
config = BertConfig.from_pretrained(base_model_files, num_labels=dataset['train'].features['ner_tags'].feature.num_classes)

# Load the pretrained model
model = BertForTokenClassification.from_pretrained(base_model_files, config=config)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Create a data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained(f"{tokenizer_type}_BERT_NER")


Map: 100%|██████████| 14041/14041 [00:03<00:00, 4597.68 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 4722.73 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 5265.86 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at Wordpiece_BERT_Base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3889,0.44917
2,0.3912,0.409253
3,0.3911,0.399283


In [28]:
# Evaluation for NER
import torch
from transformers import BertConfig, BertForTokenClassification, PreTrainedTokenizerFast, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_dataset
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
import numpy as np

# Load the model and tokenizer
model = BertForTokenClassification.from_pretrained(f"{tokenizer_type}_BERT_NER")

# Load the CoNLL-2003 dataset
dataset = load_dataset("conll2003")

# Tokenize the dataset and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, padding='max_length', max_length=128, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Define model configuration
config = BertConfig.from_pretrained(base_model_files, num_labels=dataset['train'].features['ner_tags'].feature.num_classes)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy='no',
)

# Define the evaluation function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[label for label in label if label != -100] for label in labels]
    true_predictions = [
        [pred for pred, label in zip(prediction, label) if label != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Convert integer labels to string labels
    label_list = dataset['train'].features['ner_tags'].feature.names
    true_labels = [[label_list[l] for l in label] for label in true_labels]
    true_predictions = [[label_list[p] for p in prediction] for prediction in true_predictions]

    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Evaluate the model
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)

results = trainer.evaluate()
# Print the precision, recall, and F1 scores 
print(f"Precision: {results['eval_precision']:.4f}")
print(f"Recall: {results['eval_recall']:.4f}")
print(f"F1 Score: {results['eval_f1']:.4f}")


Map: 100%|██████████| 14041/14041 [00:02<00:00, 4957.87 examples/s]
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Precision: 0.3124
Recall: 0.3703
F1 Score: 0.3389
