In [23]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")

pipe = pipeline("ner", model=model, tokenizer=tokenizer)
example = "Demet's Turtles Original Caramel Chocolate Pecan Clusters 9.3 oz Holiday Gift Box"

ner_entity_results = pipe(example)
print(ner_entity_results)

[{'entity': 'I-FOOD', 'score': 0.49999642, 'index': 5, 'word': 'Turtle', 'start': 8, 'end': 14}, {'entity': 'I-FOOD', 'score': 0.6096488, 'index': 6, 'word': '##s', 'start': 14, 'end': 15}, {'entity': 'B-FOOD', 'score': 0.45608267, 'index': 7, 'word': 'Original', 'start': 16, 'end': 24}, {'entity': 'I-FOOD', 'score': 0.6613699, 'index': 8, 'word': 'Cara', 'start': 25, 'end': 29}, {'entity': 'I-FOOD', 'score': 0.5776781, 'index': 9, 'word': '##mel', 'start': 29, 'end': 32}, {'entity': 'I-FOOD', 'score': 0.86556953, 'index': 10, 'word': 'Chocolate', 'start': 33, 'end': 42}, {'entity': 'I-FOOD', 'score': 0.96111995, 'index': 11, 'word': 'P', 'start': 43, 'end': 44}, {'entity': 'I-FOOD', 'score': 0.8003402, 'index': 12, 'word': '##eca', 'start': 44, 'end': 47}, {'entity': 'I-FOOD', 'score': 0.9277613, 'index': 13, 'word': '##n', 'start': 47, 'end': 48}, {'entity': 'I-FOOD', 'score': 0.9217512, 'index': 15, 'word': '##luster', 'start': 50, 'end': 56}]


In [31]:
ner_entity_results = pipe(example)

# Initialize the entity words list with an empty string
entity_words = [""]

# Loop through each dictionary in the list and extract the entity word
for result in ner_entity_results:
    if result["entity"] == "B-FOOD":
        entity_words.append(result["word"])
    elif result["entity"] == "I-FOOD":
        entity_words[-1] += " " + result["word"]

# Remove any remaining ## symbols and extra spaces
entity_words = [word.replace("##", "").strip() for word in entity_words]

# Join the entity words into a single string
output = " ".join(entity_words)

print(output)


Turtle s Original Cara mel Chocolate P eca n luster


In [None]:
import torch
print(torch.cuda.is_available())

In [None]:
from transformers import pipeline
import numpy as np

In [None]:
classifier = pipeline("zero-shot-classification")

In [None]:
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["machine learning", "gym", "food"],
)

In [None]:
from transformers import pipeline
generator = pipeline(task="text-generation", model="bigscience/bloom-1b7", device=0)

In [None]:
from transformers import AutoModelForTokenClassification, AutoModel, AutoTokenizer
import torch

# Define input text and pre-trained model checkpoint
text = "My name is wolfgang and I live in berlin"
checkpoint = "Jean-Baptiste/roberta-large-ner-english"

# Instantiate tokenizer and encode input text
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# Instantiate model and generate output
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs[0].shape)

# Instantiate token classification model and generate predictions
model = AutoModelForTokenClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)
print(model.config.id2label)

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-large")

# prepare input
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')

# forward pass
output = model(**encoded_input)

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-large")

# Define the input sentence with a masked token
text = "I want to <mask> a new car tomorrow."

# Tokenize the input sentence, replacing the masked token with a special [MASK] token
encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

print(output.logits.shape)
print(encoded_input['input_ids'][0].tolist().index(tokenizer.mask_token_id))

# Extract the predicted probabilities for the masked token
predicted_probabilities = output.logits[0, encoded_input['input_ids'][0].tolist().index(tokenizer.mask_token_id)]
predicted_probabilities = torch.nn.functional.softmax(predicted_probabilities, dim=-1)

# Get the top-k most probable predictions for the masked token
k = 5
top_k = torch.topk(predicted_probabilities, k)
for i in range(k):
    token = tokenizer.convert_ids_to_tokens(top_k.indices[i].item())
    score = top_k.values[i].item()
    print(f"Prediction {i+1}: '{token}' with probability {score:.5f}")

In [None]:
%%time
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequences = [
    "Using a Transformer network is simple",
    "The quick brown fox jumps over the lazy dog",
    "To be or not to be, that is the question"
]

# Tokenize the input sequences and convert them to padded and truncated integer token IDs
inputs = tokenizer(
    sequences,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Print the resulting input IDs and attention masks
print(inputs['input_ids'])
print(inputs['attention_mask'])

Huggingface:

1. Understanding how to use the Pipeline (probably most useful) for various tasks, easy to use, and the different subtasks it can do like translation, QA, zero shot, sentiment analysis, token classification, etc. 
2. Understood how pipeline works in more detail by using AutoModel for various tasks as well as AutoTokenizer
3. Load dataset
4. How to finetune
5. How to evaluate
6. 

In [None]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

In [None]:
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")

In [None]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
raw_datasets = load_dataset("glue", "mrpc")

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer")

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

import numpy as np
import evaluate

def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
import numpy as np
import evaluate

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
batch_size=32

# Define the generator function to preprocess the data in batches
def preprocess_generator(examples):
    for i in range(0, len(examples["article"]), batch_size):
        batch = examples["article"][i:i+batch_size]
        targets = examples["highlights"][i:i+batch_size]
        model_inputs = tokenizer(batch, max_length=512, padding="max_length", truncation=True)
        with tokenizer.as_target_tokenizer():
            model_targets = tokenizer(targets, max_length=128, padding="max_length", truncation=True)
        model_inputs["labels"] = model_targets["input_ids"]
        yield model_inputs

def preprocess_function(examples):
    articles = [ex for ex in examples["article"]]
    summaries = [ex for ex in examples["highlights"]]

    model_inputs = tokenizer(articles, max_length=512, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        model_targets = tokenizer(summaries, max_length=128, padding="max_length", truncation=True)
    
    model_inputs["labels"] = model_targets["input_ids"]
    return model_inputs
    
# Load the dataset
raw_datasets = load_dataset("cnn_dailymail", "3.0.0")
preprocessed_datasets = raw_datasets.map(preprocess_function, batched=True, num_proc=4)

# Load the pre-trained model and tokenizer
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Define the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the trainer arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    max_steps=1000,
    weight_decay=0.01,
    push_to_hub=False,
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Start the training
trainer.train()


In [None]:
from datasets import load_metric

In [None]:
preprocessed_datasets

In [None]:
# Load the pre-trained model and tokenizer
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Define the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the trainer arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    max_steps=5000,
    weight_decay=0.01,
    push_to_hub=False,
    evaluation_strategy = "steps",
    eval_steps = 50,
)

# Load the ROUGE metric
metric = load_metric("rouge")

# Define the evaluation function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    scores = metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=["rouge1"])["rouge1"].mid
    
    return {"rouge1_precision": scores.precision, "rouge1_recall": scores.recall, "rouge1_fmeasure": scores.fmeasure}


# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_datasets["train"],
    eval_dataset=preprocessed_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start the training
trainer.train()

In [None]:
!pip install nltk
!pip install rouge_score

# Goal:

1. Implement full training from dataloading (dailycnn dataset), to model training, evaluation, etc, using HF. 
* Right now: stuck on on the fly dataset loading, we don't want to cache because this would take a lot of disk space etc.

2. After we get step 1) working, we want to go deeper on every step, so download the dataset and load it as a custom dataset rather than using huggingface simple API, in order to make it more general. Compare with loading the ds as a custom HF dataset or using pytorch class together with lightning. Speed difference? Convenience? Also we want to use the lightning Trainer so see how we can integrate that. And then compare HF to the lightning + hf model approach and see what we like the most.