In [7]:
# !pip install -q wandb
# !pip install datasets
# !pip install seqeval
# !pip install evaluate
# !pip install datasets transformers==4.28.0
# !pip install transformers[torch]
import pandas as pd
from datasets import load_dataset
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import evaluate
import matplotlib.pyplot as plt
import torch
from collections import Counter
from tqdm import tqdm
import wandb
import time



# Preprocess Data

## Loading in Inspec Dataset with samples

In [8]:
# Download (or read from the local cache) the keyphrase-extraction view of Inspec
dataset = load_dataset("midas/inspec", "extraction")

print("Samples for Keyphrase Extraction\n")

# Keep the first record of every split under its own name so later cells can reuse it
train_sample = dataset["train"][0]
validation_sample = dataset["validation"][0]
test_sample = dataset["test"][0]

# Show the same three facts for each split: field names, tokens and BIO tags
for split_header, split_sample in (
    ("Sample from training data split", train_sample),
    ("Sample from validation data split", validation_sample),
    ("Sample from test data split", test_sample),
):
    print(split_header)
    print("Fields in the sample: ", list(split_sample.keys()))
    print("Tokenized Document: ", split_sample["document"])
    print("Document BIO Tags: ", split_sample["doc_bio_tags"])
    print("\n-----------\n")

print('Type of each dataset:', type(dataset["train"]))

Using the latest cached version of the module from /Users/chasevanamburg/.cache/huggingface/modules/datasets_modules/datasets/midas--inspec/debd18641afb7048a36cee2b7bb8dfbf2cd1a68899118653a42fd760cf84284e (last modified on Wed Oct  4 12:18:46 2023) since it couldn't be found locally at midas/inspec., or remotely on the Hugging Face Hub.
Repo card metadata block was not found. Setting CardData to empty.


Samples for Keyphrase Extraction

Sample from training data split
Fields in the sample:  ['id', 'document', 'doc_bio_tags']
Tokenized Document:  ['A', 'conflict', 'between', 'language', 'and', 'atomistic', 'information', 'Fred', 'Dretske', 'and', 'Jerry', 'Fodor', 'are', 'responsible', 'for', 'popularizing', 'three', 'well-known', 'theses', 'in', 'contemporary', 'philosophy', 'of', 'mind', ':', 'the', 'thesis', 'of', 'Information-Based', 'Semantics', '-LRB-', 'IBS', '-RRB-', ',', 'the', 'thesis', 'of', 'Content', 'Atomism', '-LRB-', 'Atomism', '-RRB-', 'and', 'the', 'thesis', 'of', 'the', 'Language', 'of', 'Thought', '-LRB-', 'LOT', '-RRB-', '.', 'LOT', 'concerns', 'the', 'semantically', 'relevant', 'structure', 'of', 'representations', 'involved', 'in', 'cognitive', 'states', 'such', 'as', 'beliefs', 'and', 'desires', '.', 'It', 'maintains', 'that', 'all', 'such', 'representations', 'must', 'have', 'syntactic', 'structures', 'mirroring', 'the', 'structure', 'of', 'their', 'contents', 

In [9]:
# Build the tag vocabulary from every training document rather than from a single
# sample: a document that happens to lack one of the tags would otherwise yield an
# incomplete mapping and a KeyError later, when labels are converted to ids.
# Sorting keeps the ids deterministic (alphabetical order of the tags).
label_list = sorted({tag for doc_tags in dataset["train"]["doc_bio_tags"] for tag in doc_tags})

# integer id -> tag, and the inverse tag -> integer id
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

print('Mapping doc_bio_tag to integer:\n\n',label2id)
print('\nMapping integer to doc_bio_tag:\n\n',id2label)

Mapping doc_bio_tag to integer:

 {'B': 0, 'I': 1, 'O': 2}

Mapping integer to doc_bio_tag:

 {0: 'B', 1: 'I', 2: 'O'}


## Specifying the base model we want

In [10]:
# Identifier of the pre-trained checkpoint. Both the tokenizer and the
# token-classification model in this notebook are loaded from this name.
#model_checkpoint = "distilroberta-base"
model_checkpoint = "distilbert-base-uncased"

In [11]:
# Use the CUDA device when one is visible to PyTorch, otherwise stay on the CPU
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

if not has_gpu:
    print("No GPU available.")
else:
    print(f"Pytorch version: {torch.__version__}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

No GPU available.


## Getting the same tokenizer that was used in the pre-trained model to preprocess text

General Info:

A tokenizer breaks unstructured data and natural language text into chunks of information that can be considered as discrete elements.

We use add_prefix_space = True to specify that we want to add a space to the first word if there isn’t already one. This lets us treat 'hello' exactly like 'say hello'.

Although the documents are already tokenized, we want to make sure that the tokenization matches the one that our pre-trained model is expecting.

In [12]:
# Load the tokenizer that belongs to `model_checkpoint` so the text is split
# the same way as it was for the pre-trained model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
#check to make sure tokenizer has fast version available
# (the label-alignment function further down calls word_ids() on the tokenizer output)
tokenizer.is_fast

Downloading (…)okenizer_config.json: 100%|████| 28.0/28.0 [00:00<00:00, 176kB/s]
Downloading (…)lve/main/config.json: 100%|██████| 483/483 [00:00<00:00, 746kB/s]
Downloading (…)solve/main/vocab.txt: 100%|███| 232k/232k [00:00<00:00, 8.16MB/s]
Downloading (…)/main/tokenizer.json: 100%|███| 466k/466k [00:00<00:00, 8.24MB/s]


True

## Convert the BIO tags of words to numerical labels (0, 1, or 2) for the corresponding tokens

Each word in the document gets converted into a token/multiple tokens. After the word is converted into a token we need to assign that token a classification index corresponding to the BIO tag of the word.  This is done with the function below.

see https://huggingface.co/docs/transformers/tasks/token_classification

In [13]:
def tokenize_words_with_corresponding_labels(sample):
    """Tokenize a batch of pre-split documents and align their BIO tags to the tokens.

    Intended for ``dataset.map(..., batched=True)``: ``sample["document"]`` is a list
    of word lists and ``sample["doc_bio_tags"]`` the matching list of tag lists.

    Uses the notebook-level ``tokenizer`` and ``label2id`` instead of re-loading the
    tokenizer from the checkpoint on every batch.

    Label alignment rules:
      * special tokens (no source word) get -100 so the loss ignores them;
      * the first word-piece of a word gets the word's tag;
      * later word-pieces of the same word get the word's tag too, except that a
        'B' word continues with 'I' (otherwise one keyphrase start would be encoded
        as several consecutive 'B' tokens, i.e. several keyphrases).

    Returns the tokenizer output with an added ``"labels"`` entry.
    """
    #truncation=True to specify to truncate sequences at the maximum length
    #is_split_into_words = True to specify that our input is already pre-tokenized (e.g., split into words)
    tokenized_inputs = tokenizer(sample["document"], truncation=True, is_split_into_words=True)

    # Only rewrite B -> I when the corpus actually has an 'I' tag to map to
    can_continue_with_inside = 'I' in label2id

    #one list of token labels per document in the batch
    labels = []

    for i, word_tags in enumerate(sample["doc_bio_tags"]):

        #word_ids() gives, for every token, the index of the word it came from
        #(None for special tokens)
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:

            if word_idx is None:
                # -100 is the default ignore_index of torch's cross entropy
                label_ids.append(-100)
            else:
                tag = word_tags[word_idx]
                is_continuation = word_idx == previous_word_idx
                if is_continuation and tag == 'B' and can_continue_with_inside:
                    tag = 'I'
                label_ids.append(label2id[tag])

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [14]:
# Run the tokenize-and-align function over the dataset; batched=True makes `map`
# hand the function lists of examples, which is what its batch_index loop expects.
tokenized_dataset = dataset.map(tokenize_words_with_corresponding_labels, batched=True)

Map: 100%|█████████████████████████| 1000/1000 [00:00<00:00, 2129.20 examples/s]
Map: 100%|███████████████████████████| 500/500 [00:00<00:00, 1715.22 examples/s]
Map: 100%|███████████████████████████| 500/500 [00:00<00:00, 2417.34 examples/s]


# Finetuning the model with the Trainer API

Background reading: the token classification chapter of the Hugging Face NLP course

https://huggingface.co/learn/nlp-course/chapter7/2

## Data collator to pad input sequences and labels

In [15]:
# Collator handed to the Trainer below: it dynamically pads the inputs of a batch,
# as well as the labels, so that they all have the same length.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


## Track metrics during training

To have the Trainer compute a metric every epoch, we will need to define a compute_metrics() function that takes the arrays of predictions and labels, and returns a dictionary with the metric names and values.

This compute_metrics() function takes the argmax of the logits to convert them to predictions. Then we have to convert both labels and predictions from integers to strings. We remove all the values where the label is -100, then pass the results to the metric.compute() method.

In [16]:
# seqeval metric object; compute_metrics() below calls metric.compute() on tag strings
metric = evaluate.load("seqeval")

Downloading builder script: 100%|██████████| 6.34k/6.34k [00:00<00:00, 2.02MB/s]


In [17]:
def compute_metrics(preds):
    """Turn raw model outputs into seqeval scores.

    `preds` unpacks into (logits, labels). The highest-scoring class per token is
    taken as the prediction, positions whose label is -100 are dropped, the
    remaining ids are mapped to tag strings through `id2label`, and the result of
    `metric.compute` is returned unchanged.
    """
    logits, labels = preds
    predictions = np.argmax(logits, axis=-1)

    # Gold tags, one list per sequence, without the ignored (-100) positions
    true_labels = []
    for label_row in labels:
        true_labels.append([id2label[tag_id] for tag_id in label_row if tag_id != -100])

    # Predicted tags at exactly the positions that were kept above
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept_tags = []
        for pred_id, tag_id in zip(pred_row, label_row):
            if tag_id != -100:
                kept_tags.append(id2label[pred_id])
        true_predictions.append(kept_tags)

    return metric.compute(predictions=true_predictions, references=true_labels)

## Weights and Biases Login

In [18]:
# Login to W&B account (prompts for an API key when none is stored, as the output below shows)
wandb.login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/chasevanamburg/.netrc


True

## Defining the model

In [21]:
#hacky way to get around AttributeError: module 'torch.distributed' has no attribute 'is_initialized' error
#setattr(torch.distributed, "is_initialized", lambda : False)

In [27]:
# Seed PyTorch before the model is built so the freshly initialised
# classification head is the same on every run
torch.manual_seed(0)

# Last path component of the checkpoint id, used to name the output directory and the W&B run
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Pre-trained encoder plus a new token-classification head sized by the label maps,
# moved onto the device selected earlier
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
).to(device)

device_ids = [0]
# model = DDP(model, device_ids=device_ids)

# Sanity check: the head should expose one output per BIO tag
print('Number of labels:', model.config.num_labels)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

Number of labels: 3


In [28]:
# Hyper-parameters of the fine-tuning run
batch_size = 8
learning_rate = 4e-6

# `num_epochs` is the name TrainingArguments (and the plotting cells) read; it was
# previously never defined, so this cell failed with a NameError on a fresh kernel.
# `epochs` is kept as an alias because the W&B config cell logs it.
num_epochs = 6
epochs = num_epochs

args = TrainingArguments(
    f"{model_name}_finetuned_keyword_extract",  # output directory for checkpoints
    evaluation_strategy = "epoch",               # evaluate once per epoch
    logging_strategy = 'epoch',                  # log the training loss once per epoch
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    lr_scheduler_type='linear',
    weight_decay=0.01,
    seed=0
)

## Define a custom loss using weighted crossentropy

In [29]:
# Tally how often each label id (0, 1, 2) occurs over all training tokens.
# The ignored id -100 is tallied as well but never read back.
label_counts = Counter()
for listt in tokenized_dataset["train"]["labels"]:
    label_counts.update(listt)

count_0s, count_1s, count_2s = label_counts[0], label_counts[1], label_counts[2]

# Inverse-frequency weights for the weighted cross entropy: the most frequent
# class gets weight 1, rarer classes proportionally more
max_ = max(count_0s, count_1s, count_2s)
weights = [max_ / class_count for class_count in (count_0s, count_1s, count_2s)]

#Trainer subclass that swaps the default loss for class-weighted cross entropy
class CustomTrainer(Trainer):
    """Trainer whose loss is cross entropy weighted by the notebook-level `weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted token-classification loss for one batch.

        Args:
            model: the model to run (possibly wrapped by the Trainer).
            inputs: batch dict; its "labels" entry is removed before the forward pass
                so the model does not compute its own unweighted loss.
            return_outputs: when True, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: accepted for compatibility with Trainer versions that
                pass it; not used, the loss is the per-batch weighted mean.
        """
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Place labels and class weights wherever the logits live. This avoids relying
        # on the global `device` or on `model.device`, which a wrapped model may lack.
        labels = labels.to(logits.device)
        class_weights = torch.tensor(weights, dtype=logits.dtype, device=logits.device)

        # Positions labelled -100 are skipped by CrossEntropyLoss's default ignore_index
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

## Launch training

In [30]:
# def run_learning(bucket, prefix, train_pct, batch_size, n_epochs, base_lr):
#     '''Load basic Resnet50, load train/eval data from S3, 
#     and run transfer learning over n epochs.'''
#     worker_rank = int(dist.get_rank())
    
#     # Initialize a W&B run
#     wandb.init(
#         project = 'ppp-keyword-extraction',
#         config = {
#           "learning_rate": learning_rate,
#           "epochs": epochs,
#           "batch_size": batch_size,
#           "model_name": model_name
#         },
#         name = model_name
#     )
    
#     # Format model and params
#     device = torch.device(0)
#     net = models.resnet50(pretrained=True)
#     model = net.to(device)
#     device_ids = [0]
#     model = DDP(model, device_ids=device_ids)
    
#     criterion = nn.CrossEntropyLoss().cuda()    
#     lr = base_lr * dist.get_world_size()
#     optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
#     scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', patience = 2)
    
#     # Retrieve data for training and eval
#     whole_dataset = prepro_batches(bucket, prefix)
#     train, val = get_splits_parallel(train_pct, whole_dataset, batch_size=batch_size)
#     dataloaders = 

#     # Prepare metrics aggregation
#     count = 0
#     t_count = 0
#     for epoch in range(n_epochs):
#         # Each epoch has a training and validation phase
#         model.train()  # Set model to training mode
#         for inputs, labels in dataloaders["train"]:
#             dt = datetime.datetime.now().isoformat()
#             inputs = inputs.to(device)
#             labels = labels.to(device)
            
#             outputs = model(inputs)
#             _, preds = torch.max(outputs, 1)
#             loss = criterion(outputs, labels)
#             correct = (preds == labels).sum().item()
            
#             # zero the parameter gradients
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#             count += 1
#             # statistics
#             for param_group in optimizer.param_groups:
#                 current_lr = param_group['lr']
#             # Record the results of this model iteration (training sample) for later review.
#             rh.submit_result(
#                 f"worker/.json", 
#                 json.dumps()
#             )
        
#             if (count % 100) == 0 and worker_rank == 0:
#                 # Grab a snapshot of the current state of the model, in case of interruption or need to review
#                 rh.submit_result(f"checkpoint-.pkl", pickle.dumps(model.state_dict()))

#         with torch.no_grad():
#             model.eval()  # Set model to evaluation mode
#             for inputs_t, labels_t in dataloaders["val"]:
#                 dt = datetime.datetime.now().isoformat()
#                 inputs_t = inputs_t.to(device)
#                 labels_t = labels_t.to(device)
            
#                 outputs_t = model(inputs_t)
#                 _,pred_t = torch.max(outputs_t, dim=1)
#                 loss_t = criterion(outputs_t, labels_t)
#                 correct_t = (pred_t == labels_t).sum().item()
#                 t_count += 1

#                 # statistics
#                 for param_group in optimizer.param_groups:
#                     current_lr = param_group['lr']
#                 # Record the results of this model iteration (evaluation sample) for later review.
#                 rh.submit_result(
#                     f"worker/.json", 
#                     json.dumps()
#                 )

#         scheduler.step(loss)

In [31]:
# Initialize a W&B run
wandb.init(
    project = 'ppp-keyword-extraction',
    config = {
      "learning_rate": learning_rate,
      "epochs": epochs,
      "batch_size": batch_size,
      "model_name": model_name
    },
    name = model_name
)


# Train model
start_time = time.time()

# Assume failure until training has completed, so an interrupted or crashed run
# is closed with a non-zero exit code instead of being left open.
run_exit_code = 1
try:
    trainer = CustomTrainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer)
    trainer.train()

    execution_time = (time.time() - start_time)/60.0
    print("Training execution time (mins)",execution_time)

    # Update W&B
    wandb.config.update({"execution_time": execution_time})
    run_exit_code = 0
finally:
    # Close the W&B run even when training raised or was interrupted
    if wandb.run is not None:
        wandb.run.finish(exit_code=run_exit_code)

[34m[1mwandb[0m: Currently logged in as: [33mcvanamburg[0m ([33mac215-ppp[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Per-epoch training and validation loss recorded by the Trainer.
# ("loss" matches only the periodic training logs, not the final "train_loss" summary.)
loss = [entry["loss"] for entry in trainer.state.log_history if "loss" in entry]
val_loss = [entry["eval_loss"] for entry in trainer.state.log_history if "eval_loss" in entry]

# The x axis follows the number of points actually logged. The previous fixed
# range(num_epochs) relied on a name this notebook never defines and failed
# whenever the number of log entries differed from the planned epoch count.
plt.plot(range(len(loss)), loss, 'o--')
plt.plot(range(len(val_loss)), val_loss, 'o--')
plt.legend(["Training Loss", "Validation Loss"])
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.show()

In [None]:
# Overall seqeval scores logged at each evaluation
eval_overall_precision = []
eval_overall_f1 = []
eval_overall_recall = []
eval_overall_accuracy = []


for d in trainer.state.log_history:
    if "eval_overall_precision" in d:
        eval_overall_precision.append(d["eval_overall_precision"])
        eval_overall_f1.append(d["eval_overall_f1"])
        eval_overall_recall.append(d["eval_overall_recall"])
        eval_overall_accuracy.append(d["eval_overall_accuracy"])

# Plot against the number of evaluations actually recorded. The previous fixed
# range(num_epochs) relied on a name this notebook never defines and failed
# whenever the number of evaluations differed from the planned epoch count.
eval_points = range(len(eval_overall_precision))
plt.plot(eval_points, eval_overall_precision,'o--', label = 'precision')
plt.plot(eval_points, eval_overall_f1, 'o--',label = 'f1')
plt.plot(eval_points, eval_overall_recall, 'o--',label = 'recall')
plt.plot(eval_points, eval_overall_accuracy, 'o--',label = 'accuracy')
plt.legend()
plt.xlabel("Epoch")
plt.show()

## Test Set Performance

In [None]:
# Score the fine-tuned model on the held-out test split with the same
# compute_metrics function, then display the resulting metrics dict.
# NOTE(review): this presumably appends another eval_* entry to
# trainer.state.log_history, which the plotting cells above read — confirm
# before re-running those cells after this one.
test_output = trainer.evaluate(tokenized_dataset["test"])
test_output

## Function to extract keyphrases on a sample text

In [None]:
def extract_keyphrases(text, model, tokenizer, text_in_list = False):
    """Predict BIO tags for `text` and group the tagged tokens into keyphrases.

    Args:
        text: a string, or a list of words when `text_in_list` is True.
        model: token-classification model exposing `config.id2label` and `device`.
        tokenizer: tokenizer matching `model`.
        text_in_list: set to True when `text` is a list of words; the words are
            joined with single spaces before tokenization.

    Returns:
        A list of keyphrases, each a list of decoded token strings (word-pieces).

    A keyphrase starts at a 'B' token and is extended by following 'I' tokens.
    It is closed by an 'O' token, by the next 'B' token, or by the end of the
    sequence. Special tokens (e.g. [CLS]/[SEP]) never become part of a keyphrase
    and always close an open one.
    """
    if text_in_list:
        text = ' '.join(text)

    # Truncate to the model's maximum length and run on the model's own device
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    inputs = encoded["input_ids"].to(model.device)

    # Inference must run with dropout disabled; restore the previous mode afterwards
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(inputs).logits
    finally:
        model.train(was_training)

    predicted_class_ids = torch.argmax(logits, dim=2)[0].tolist()
    token_ids = inputs[0].tolist()
    special_ids = set(getattr(tokenizer, "all_special_ids", ()))

    keyphrases = []
    keyphrase = []
    for class_id, token_id in zip(predicted_class_ids, token_ids):
        # Whatever the model predicts for a special token, treat it as outside
        label = 'O' if token_id in special_ids else model.config.id2label[class_id]

        if label == 'B':
            # A new phrase begins: keep the one in progress instead of dropping it
            if keyphrase:
                keyphrases.append(keyphrase)
            keyphrase = [tokenizer.decode(token_id)]
        elif label == 'I' and keyphrase:
            keyphrase.append(tokenizer.decode(token_id))
        elif label == 'O' and keyphrase:
            keyphrases.append(keyphrase)
            keyphrase = []

    # A phrase still open at the end of the sequence also counts
    if keyphrase:
        keyphrases.append(keyphrase)

    return keyphrases

## Sample Text

In [None]:
# Show the first test document as one space-joined string
' '.join(word for word in dataset["test"][0]["document"])

## Model Generated Key Phrases

In [None]:
# Run the fine-tuned model on the same test document; True marks the input as a list of words
extract_keyphrases(dataset["test"][0]["document"], model, tokenizer, True)

## Actual key words from the sample text

In [None]:
# Rebuild the annotated keyphrases of the first test document from its BIO tags
gold_sample = dataset["test"][0]

keyphrases = []
keyphrase = []
for label, word in zip(gold_sample["doc_bio_tags"], gold_sample["document"]):
    if label == 'B':
        # A 'B' directly after another phrase closes that phrase; it must be
        # kept, not thrown away when the new one starts
        if keyphrase:
            keyphrases.append(keyphrase)
        keyphrase = [word]
    elif label == 'I' and keyphrase:
        keyphrase.append(word)
    elif label == 'O' and keyphrase:
        keyphrases.append(keyphrase)
        keyphrase = []

# A phrase that runs up to the last word has no closing 'O'
if keyphrase:
    keyphrases.append(keyphrase)

keyphrases

## Tomorrow

- make code modular
- get it running with Weights & Biases for 3 different base models instead of one at a time
- convert notebook into python script with one container for preprocessing and one for training
- make it multi-gpu compatible