In [None]:
import nltk
nltk.download("wordnet")
from nltk.corpus import wordnet as wn
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
syns = wn.synsets("dog","n")

In [None]:
syns

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01')]

In [None]:
syns[0].hypernyms()[0].hyponyms()[5].lemma_names()

['wild_dog']

In [None]:
def get_distractors(syn, word):
    distractors = []
    word = word.lower()
    orig_word = word
    if len(word.split()) > 0:
        word = word.replace(" ","_")
    hypernym = syn.hypernyms()
    if len(hypernym) == 0:
        #Find the distractors using word2vec

        return distractors
    for item in hypernym[0].hyponyms():
        name = item.lemma_names()[0]
        if name == orig_word:
            continue
        name = name.replace("_"," ")
        name = "".join(w.capitalize() for w in name.split())
        if name is not None and name not in distractors:
            distractors.append(item.lemma_names()[0])

    return distractors

In [None]:
syn = wn.synsets("Grotto","n")[0]
distractors = get_distractors(syn, "Grotto")

In [None]:
distractors

['cavern', 'cove']

In [None]:
%pip install pywsd

In [None]:
from pywsd.lesk import simple_lesk,adapted_lesk
answer = adapted_lesk("These flowers generally grow on river banks and near streams.",ambiguous_word="bank",pos="n")
answer

In [None]:
import re
def get_sense(sent):
    re_result = re.search(r"\[TGT\](.*)\[TGT\]", sent)
    if re_result is None:
        print("Incorrect input format. Please try again.")
    sent = sent.replace("[TGT]"," ")
    sent = " ".join(i for i in sent.split())
    ambiguous_word = re_result.group(1).strip()
    wn_pos = wn.NOUN
    sense = adapted_lesk(sent, ambiguous_word=ambiguous_word,pos=wn_pos)
    meaning = sense.definition()
    return (sense, meaning, ambiguous_word)

In [None]:
sense, meaning, answer = get_sense("Kalki, final avatar (incarnation) of the Hindu god [TGT] Vishnu [TGT], who is yet to appear.")
print(sense,"\t",meaning,"\t",answer)


In [None]:
sense, meaning, answer = get_sense("The river [TGT] bank [TGT] has plenty of fish")
print(sense,"\t",meaning,"\t",answer)

In [None]:
sense, meaning, answer = get_sense("I went to the [TGT] bank [TGT] to deposit my money")
print(sense,"\t",meaning,"\t",answer)

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer("I went to the bank to deposit my money")

In [None]:
tokenizer.decode(tokens["input_ids"], skip_special_tokens=True)

In [None]:
tokens = tokenizer.tokenize("I went to the bank to deposit my money")

In [None]:
tokens = tokens + ["[SEP]"]
segment_ids = [0] * len(tokens)

In [None]:
wn.synsets("bank","n")[1].definition()

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
squad_train = squad_tokenized["train"].shuffle(seed=42).select(range(1000))
squad_eval = squad_tokenized["validation"].shuffle(seed=42).select(range(100))

In [None]:
from transformers import Trainer, TrainingArguments

training_args =  TrainingArguments(
    output_dir="t5_question_generation_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=squad_train,
    eval_dataset=squad_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub()

In [None]:
from transformers import AutoTokenizer, T5Model
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5Model.from_pretrained(checkpoint)

In [None]:
input_ids = tokenizer("Studies have shown that owning a dog is good for you", return_tensors="pt").input_ids
decoder_input_ids = tokenizer("Studies have", return_tensors="pt").input_ids
decoder_input_ids = model._shift_right(decoder_input_ids)

In [None]:
outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

In [None]:
outputs.last_hidden_state

In [None]:
import torch
linear_final = torch.nn.Linear(512, 32128, bias=True)
softmax = torch.nn.Softmax(dim=1)
x = outputs.last_hidden_state[0]
x = linear_final(x)
x = softmax(x)

In [None]:
vocab_idx = torch.argmax(x, axis=1)

In [None]:
vocab_idx

In [None]:
tokenizer.decode(vocab_idx)

In [None]:
input_ids = tokenizer("question: What is your name? context: My name is Tracy Chapman", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
tokenizer.decode(outputs[0])

In [None]:
!pip install datasets evaluate transformers torch pytorch-lightning
# Train Question Generator using pytorch
import torch
# import mlflow
import numpy as np
import pytorch_lightning as pl
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, T5ForConditionalGeneration, T5Tokenizer, AdamW, get_scheduler, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torch import optim, nn

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.4.2-py3-none-any.whl.metadata (19 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.7-py3-none-any.whl.metadata (5.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requi

In [None]:
max_source_length = 512
max_target_length = 128

In [None]:
squad = load_dataset("squad")
squad = squad.flatten()
squad["train"][0]["answers.text"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

['Saint Bernadette Soubirous']

In [None]:
prefix_1 = "answer: "
prefix_2 = " context: "
prefix_3 = "question: "
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def preprocess_function(examples):
  input_1 = [ prefix_1 + i[0] for i in examples["answers.text"] ]
  input_2 = [ prefix_2 + i for i in examples["context"] ]
  assert len(examples["context"]) == len(examples["answers.text"])
  input = [input_1[i] + input_2[i] for i in range(len(examples["context"]))]
  model_inputs = tokenizer(input, padding="longest", max_length = max_source_length , truncation = True, return_tensors = "pt")
  labels = tokenizer([prefix_3 + i for i in examples["question"]], padding="longest", max_length = max_target_length ,truncation = True, return_tensors = "pt")
  labels["input_ids"][labels["input_ids"]==tokenizer.pad_token_id] = -100
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [None]:
prefix_1 = "context: "
prefix_2 = " question: "
prefix_3 = "answer: "
checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
def preprocess_function(examples):
  output_1 = [ prefix_2 + i for i in examples["question"] ]
  output_2 = [ prefix_3 + i[0] for i in examples["answers.text"] ]
  output = [output_2[i] + output_1[i] for i in range(len(examples["question"]))]
  assert len(examples["question"]) == len(examples["answers.text"])
  model_inputs = tokenizer([prefix_1 + i for i in examples["context"]], padding="longest", max_length = max_source_length , truncation = True, return_tensors = "pt")
  labels = tokenizer(output, padding="longest", max_length = max_target_length, truncation = True, return_tensors = "pt")
  labels["input_ids"][labels["input_ids"]==tokenizer.pad_token_id] = -100
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
squad_tokenized = squad.map(preprocess_function, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
squad_tokenized = squad_tokenized.remove_columns(["id","title","question","context","answers.text","answers.answer_start"])
squad_tokenized.set_format("torch")
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = checkpoint)

In [None]:
squad_tokenized = load_from_disk("/content/drive/MyDrive/learnX.ai (study support)/squad")

In [None]:
squad_train = squad_tokenized["train"]
#squad_eval = squad_tokenized["validation"]

In [None]:
eval_test = squad_tokenized["validation"].train_test_split(test_size=0.1, shuffle=False)
squad_eval = eval_test["train"]
squad_test = eval_test["test"]

In [None]:
squad_eval

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9513
})

In [None]:
squad_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10570
    })
})

In [None]:
train_dataloader = DataLoader(squad_train, batch_size=4, shuffle=False)
eval_dataloader = DataLoader(squad_eval, batch_size=4, shuffle=False)
test_dataloader = DataLoader(squad_test, batch_size=4, shuffle=False)

In [None]:
for batch in train_dataloader:
    break
print({k:v.shape for k,v in batch.items()})

{'input_ids': torch.Size([4, 508]), 'attention_mask': torch.Size([4, 508]), 'labels': torch.Size([4, 39]), 'references': torch.Size([4, 21])}


In [None]:
for batch in test_dataloader:
  break
print(tokenizer.decode(batch["input_ids"][3]))
fake_labels = np.where(batch["labels"][3]!=-100, batch["labels"][1], tokenizer.pad_token_id)
# print(tokenizer.decode(batch["references"][3]))
print(tokenizer.decode(fake_labels))

context: Much of the work of the Scottish Parliament is done in committee. The role of committees is stronger in the Scottish Parliament than in other parliamentary systems, partly as a means of strengthening the role of backbenchers in their scrutiny of the government and partly to compensate for the fact that there is no revising chamber. The principal role of committees in the Scottish Parliament is to take evidence from witnesses, conduct inquiries and scrutinise legislation. Committee meetings take place on Tuesday, Wednesday and Thursday morning when Parliament is sitting. Committees can also meet at other locations throughout Scotland.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [None]:
# ! pip install sense2vec==1.0.0a1 matplotlib spacy preshed seqeval rouge_score
!pip install rouge_score rquge

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rquge
  Downloading rquge-0.3-py3-none-any.whl.metadata (5.9 kB)
Downloading rquge-0.3-py3-none-any.whl (7.1 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=d2c38fe0ceebcbb217af57ca93f5f7728abc91cc051cd7fac22a0e84415fd142
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, rquge
Successfully installed rouge_score-0.1.2 rquge-0.3


In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_aggregator=True)

    # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    # result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")
outputs = model(input_ids = batch["input_ids"], attention_mask = batch["attention_mask"], labels = batch["labels"])
print(outputs.loss, outputs.logits.shape)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tensor(3.7189, grad_fn=<NllLossBackward0>) torch.Size([4, 40, 32128])


In [None]:
import evaluate
import torch.nn.functional as F
rouge = evaluate.load("rouge")
rquge = evaluate.load("alirezamsh/rquge")
# precision = evaluate.load("precision")
# f1 = evaluate.load("f1")

x = F.softmax(outputs.logits, dim=-1)
predictions = torch.argmax(x, dim=-1)
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

# For rouge score
converted_labels = np.where(batch["labels"]!=-100, batch["labels"], tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(converted_labels, skip_special_tokens=True)
rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_aggregator=True)

# For rquge score
decoded_contexts = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
decoded_answers = tokenizer.batch_decode(batch["references"], skip_special_tokens=True)
rquge_result = rquge.compute(generated_questions=decoded_preds, contexts=decoded_contexts, answers=decoded_answers)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.65k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/834 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/299 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
rquge_result

{'mean_score': 1.7305294275283813,
 'instance_score': [2.4372000694274902,
  1.7436456680297852,
  1.1143114566802979,
  1.6269605159759521]}

In [None]:
rouge = compute_metrics([predictions, batch["labels"]])

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2TokenizerFast
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
tokens = tokenizer("Lionel Messi,",return_tensors="pt")
outputs = model.generate(input_ids = tokens.input_ids, attention_mask = tokens.attention_mask, do_sample=True, top_k=50, top_p=0.85, max_length=200)
tokenizer.decode(outputs[0])
# softmax = torch.nn.Softmax(dim=1)
# x = softmax(outputs.logits[0])
# predictions = torch.argmax(x, dim=1)
# decoded_preds = tokenizer.decode(predictions, skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Lionel Messi, Juventus manager, says he wants the player to go. "It is a great chance to be here. It is a chance to show our team what we have been trying to do. It is something that we can\'t wait for him to go."\n\nBarcelona\'s manager says he has a message for Messi, who is on the verge of signing with a team that was a bit floundering last season. "We would like to sign him," he said. "He is not an international player. He is not a young player. He is a player that we can sign and not play for. The more we have to do, the more we will see what we have.\n\n"We have to see if he can make a contribution to the team."\n\nA first-team opportunity\n\nLionel Messi\'s signing could prove to be an even bigger boost to the Argentinian\'s progress this summer. And this was the first time that'

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2TokenizerFast, DataCollatorForLanguageModeling, GPT2Config
batch_size = 2
max_length = 256
checkpoint="openai-community/gpt2"
tokenizer = GPT2TokenizerFast.from_pretrained(checkpoint, bos_token='<|startoftext|>', eos_token='<|endoftext|>', unk_token="<|unknown|>", pad_token='<|pad|>')
# tokenizer.add_special_tokens({'pad_token': '<|pad|>', 'unk_token': "<|unknown|>", 'bos_token':"<|startoftext|>"})
def preprocess_function(examples):
  inputs = tokenizer(["<|startoftext|> " + i + " <|endoftext|>" for i in examples["context"]], padding="longest", max_length = max_length, truncation = True, return_overflowing_tokens=True, return_tensors = "pt")
  sample_map = inputs.pop("overflow_to_sample_mapping")
  for key, values in examples.items():
      inputs[key] = [values[i] for i in sample_map]
  inputs["labels"] = inputs["input_ids"]
  inputs["labels"][inputs["labels"]==tokenizer.pad_token_id] = -100
  return inputs

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [None]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
squad_tokenized = squad.map(preprocess_function, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
squad_tokenized

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 94772
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11527
    })
})

In [None]:
squad_tokenized = squad_tokenized.remove_columns(["id","title","question","context","answers"])
squad_tokenized.set_format("torch")
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=False, return_tensors="pt")

In [None]:
squad_train = squad_tokenized["train"]
#squad_eval = squad_tokenized["validation"]
eval_test = squad_tokenized["validation"].train_test_split(test_size=0.1, shuffle=False)
squad_eval = eval_test["train"]
squad_test = eval_test["test"]

In [None]:
train_dataloader = DataLoader(squad_train, batch_size=batch_size, collate_fn=data_collator, shuffle=False)
eval_dataloader = DataLoader(squad_eval, batch_size=batch_size, collate_fn=data_collator, shuffle=False)
test_dataloader = DataLoader(squad_test, batch_size=batch_size, collate_fn=data_collator, shuffle=False)

In [None]:
configuration = GPT2Config.from_pretrained("openai-community/gpt2")
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
model.resize_token_embeddings(len(tokenizer))

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50260, 768)

In [None]:
class ContentGenerator(pl.LightningModule):

  def __init__(self, model_name_or_path:str,
               learning_rate:float=2e-5,
               warmup_steps:int=1e2,
               epsilon:int=1e-8,
               #batch_size:int=2,
               #experiment_name:str="learnX.ai (study support)",
               #tracking_uri:str="databricks"
               ):

    super().__init__()
    self.save_hyperparameters()
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.warmup_steps = warmup_steps
    self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
    # self.train_loss = []
    # self.val_loss = []

  def forward(self,**inputs):
    return self.model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=inputs["input_ids"])

  def training_step(self, batch, batch_idx):
    outputs = self(**batch)
    loss = outputs[0]
    # self.train_loss.append(loss)
    self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=False)
    return loss

#   def on_train_epoch_end(self):
#     loss = sum(self.train_loss)/len(self.train_loss)
#     self.logger.experiment.log_metric(run_id=self.logger.run_id, key="train_loss", value=loss)
#     self.train_loss.clear()

  def validation_step(self, batch, batch_idx):
    outputs = self(**batch)
    val_loss, logits = outputs[:2]
    # self.val_loss.append(val_loss)
    # preds = torch.argmax(logits, dim=2)
    # labels = batch["labels"]
    # self.outputs["val_loss"].append(val_loss)
    # self.outputs["preds"].append(preds)
    # self.outputs["labels"].append(labels)
    self.log("val_loss", val_loss, on_step=False, on_epoch=True, prog_bar=True, logger=False)

#   def on_validation_epoch_end(self):
#     loss = sum(self.val_loss)/len(self.val_loss)
#     self.logger.experiment.log_metric(run_id=self.logger.run_id, key="val_loss", value=loss)
#     self.val_loss.clear()

  def configure_optimizers(self):
    optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, eps=self.epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.trainer.estimated_stepping_batches)
    scheduler  = {"scheduler":scheduler, "interval":"epoch", "frequency":1}
    return [optimizer], [scheduler]

In [None]:
model = ContentGenerator(model_name_or_path="openai-community/gpt2")
# mlf_logger = MLFlowLogger(experiment_name="/learnX.ai (study support)", tracking_uri="databricks")
# timer = Timer(duration="00:02:30:00")
#checkpoint_callback = ModelCheckpoint(dirpath="/content/drive/MyDrive/learnX.ai (study support)/checkpoints", filename="checkpoint-{epoch}-{step}", save_top_k=1, every_n_train_steps=5000)
trainer = pl.Trainer(max_epochs=1, accelerator="cpu", devices="auto") #callbacks=[checkpoint_callback]
trainer.fit(model, train_dataloader, eval_dataloader)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
tokens = tokenizer("Lionel Messi is",return_tensors="pt")
outputs = model.generate(input_ids = tokens.input_ids, attention_mask = tokens.attention_mask, max_length=50)
tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
from transformers import AutoTokenizer, BartForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = BartForCausalLM.from_pretrained("facebook/bart-base", add_cross_attention=False)
assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

logits = outputs.logits
expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
list(logits.shape) == expected_shape

In [None]:
from transformers import AutoTokenizer, PegasusForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForCausalLM.from_pretrained("google/pegasus-large", add_cross_attention=False)
assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

logits = outputs.logits
expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
list(logits.shape) == expected_shape

In [None]:
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/alphaelite10@gmail.com/learnX.ai (study support")

In [None]:
import mlflow
mlflow.login()

2024/08/27 16:12:56 INFO mlflow.utils.credentials: No valid Databricks credentials found, please enter your credentials...


Databricks Host (should begin with https://): https://community.cloud.databricks.com/
Username: alphaelite10@gmail.com
Password: ··········


2024/08/27 16:13:28 INFO mlflow.utils.credentials: Successfully connected to MLflow hosted tracking server! Host: https://community.cloud.databricks.com.


In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print(device)

cuda


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !mkdir -p "/content/drive/MyDrive/learnX.ai (study support)/"

In [None]:
import evaluate
rouge = evaluate.load("rouge")
rquge = evaluate.load("alirezamsh/rquge")

class QAGenerator(pl.LightningModule):

  def __init__(self, model_name_or_path:str,
               learning_rate:float=2e-5,
               train_batch_size:int=4,
               eval_batch_size:int=4,
               warmup_steps:int=0,
               adam_epsilon:int=1e-8,
               experiment_name:str="learnX.ai (study support)",
               tracking_uri:str="databricks"):

    super().__init__()
    self.save_hyperparameters()
    self.model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)
    # self.train_loss = []
    # self.val_loss = []

  def forward(self,**inputs):
    return self.model(input_ids=inputs["input_ids"],
                      attention_mask=inputs["attention_mask"],
                      labels=inputs["labels"])

  def training_step(self, batch, batch_idx):
    outputs = self(**batch)
    loss = outputs[0]
    # self.train_loss.append(loss)
    self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=False)
    return loss

#   def on_train_epoch_end(self):
#     loss = sum(self.train_loss)/len(self.train_loss)
#     self.logger.experiment.log_metric(run_id=self.logger.run_id, key="train_loss", value=loss)
#     self.train_loss.clear()

  def validation_step(self, batch, batch_idx):
    outputs = self(**batch)
    val_loss, logits = outputs[:2]
    predictions = torch.argmax(logits, dim=2)
    rouge_scores, rquge_score = self.compute_metrics([predictions, batch["labels"]])
    # self.val_loss.append(val_loss)
    # preds = torch.argmax(logits, dim=2)
    # labels = batch["labels"]
    # self.outputs["val_loss"].append(val_loss)
    # self.outputs["preds"].append(preds)
    # self.outputs["labels"].append(labels)
    self.log("val_loss", val_loss, on_step=False, on_epoch=True, prog_bar=True, logger=False)
    self.log_dict(rouge_scores, on_step=False, on_epoch=True, prog_bar=True, logger=False)
    self.log("rquge_score", rquge_score, on_step=False, on_epoch=True, prog_bar=True, logger=False)


  def compute_metrics(self, eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # For rouge score
    labels=labels.cpu()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_aggregator=True)

    # For rquge score
    decoded_contexts = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
    decoded_answers = tokenizer.batch_decode(batch["references"], skip_special_tokens=True)
    rquge_result = rquge.compute(generated_questions=decoded_preds, contexts=decoded_contexts, answers=decoded_answers)

    return {k: round(v, 4) for k, v in rouge_result.items()}, round(rquge_result["mean_score"],4)

#   def on_validation_epoch_end(self):
#     loss = sum(self.val_loss)/len(self.val_loss)
#     self.logger.experiment.log_metric(run_id=self.logger.run_id, key="val_loss", value=loss)
#     self.val_loss.clear()

  def configure_optimizers(self):
    optimizer = AdamW(self.model.parameters(), lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.trainer.estimated_stepping_batches)
    scheduler  = {"scheduler":scheduler, "interval":"epoch", "frequency":1, "monitor":"val_loss"}
    return [optimizer], [scheduler]

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import MLFlowLogger
from pytorch_lightning.callbacks import Timer
from pytorch_lightning.callbacks import ModelCheckpoint
# from collections import defaultdict

class QADataModule(pl.LightningDataModule):

  def __init__(self, task_name:str="squad",
               train_batch_size:int=4,
               eval_batch_size:int=4,
               max_source_length:int=512,
               max_target_length:int=128,
               passage="context: ",
               question=" question: ",
               answer ="answer: ",
               data_dir="/content/drive/MyDrive/learnX.ai (study support)/squad"
               ):

    super().__init__()
    self.task_name = task_name
    self.max_source_length = max_source_length
    self.max_target_length = max_target_length
    self.tokenizer = T5Tokenizer.from_pretrained("t5-small")
    self.train_batch_size = train_batch_size
    self.eval_batch_size = eval_batch_size
    self.data_dir = data_dir
    self.passage = passage
    self.question = question
    self.answer = answer

#   def prepare_data(self):
#     dataset = load_dataset(self.task_name)
#     dataset = dataset.flatten()
#     dataset_tokenized = dataset.map(self.convert_to_features, batched=True)
#     dataset_tokenized = dataset_tokenized.remove_columns(["id","title","question","context","answers.text","answers.answer_start"])
#     dataset_tokenized.set_format("torch")
#     dataset_tokenized.save_to_disk(self.data_dir)

  def setup(self, stage:str=None):
    dataset = load_from_disk(self.data_dir)
    self.squad_train, self.squad_eval = dataset["train"], dataset["validation"]

  def train_dataloader(self):
    return DataLoader(self.squad_train, batch_size=self.train_batch_size, shuffle=False)

  def val_dataloader(self):
    return DataLoader(self.squad_eval, batch_size=self.eval_batch_size, shuffle=False)

  def convert_to_features(self, examples):
    output_question = [ self.question + i for i in examples["question"] ]
    output_answer = [ self.answer + i[0] for i in examples["answers.text"] ]
    output = [output_answer[i] + output_question[i] for i in range(len(examples["question"]))]
    assert len(examples["question"]) == len(examples["answers.text"])
    features = self.tokenizer([self.passage + i for i in examples["context"]], padding="longest", max_length = self.max_source_length , truncation = True, return_tensors = "pt")
    labels = self.tokenizer(output, padding="longest", max_length = self.max_target_length, truncation = True, return_tensors = "pt")
    labels["input_ids"][labels["input_ids"]==self.tokenizer.pad_token_id] = -100
    features["labels"] = labels["input_ids"]
    return features

  # def state_dict(self):
  #   state = {"current_train_batch_index": self.current_train_batch_index}
  #   return state

  # def load_state_dict(self, state_dict):
  #   self.current_train_batch_index = state_dict["current_train_batch_index"]

In [None]:
dm = QADataModule()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
dm.prepare_data()

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/87599 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
dm.setup()

In [None]:
for batch in dm.train_dataloader():
    break
print({k:v.shape for k,v in batch.items()})

In [1]:
model = QAGenerator(model_name_or_path="t5-base")
mlf_logger = MLFlowLogger(experiment_name="/learnX.ai (study support)", tracking_uri="databricks")
# timer = Timer(duration="00:02:30:00")
checkpoint_callback = ModelCheckpoint(dirpath="/content/drive/MyDrive/learnX.ai (study support)/checkpoints", filename="checkpoint-{epoch}-{step}", save_top_k=1, save_last=True, every_n_epochs=1)
trainer = pl.Trainer(max_epochs=3, accelerator="gpu", devices="auto" , callbacks=[checkpoint_callback])
trainer.fit(model, train_dataloader, eval_dataloader)

In [None]:
model = QAGenerator.load_from_checkpoint("/content/drive/MyDrive/learnX.ai (study support)/checkpoints/checkpoint-step=15000.ckpt")

In [None]:
from collections import OrderedDict
state_dict = OrderedDict()

for i,j in model.state_dict().items():
    t = i.split(".",maxsplit=1)[1]
    i = i.replace(i,t)
    state_dict[i] = j

In [None]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")
for batch in test_dataloader:
  break
print(tokenizer.decode(batch["input_ids"][1]))
fake_labels = np.where(batch["labels"][1]!=-100, batch["labels"][1], tokenizer.pad_token_id)
print(tokenizer.decode(fake_labels))

context: Much of the work of the Scottish Parliament is done in committee. The role of committees is stronger in the Scottish Parliament than in other parliamentary systems, partly as a means of strengthening the role of backbenchers in their scrutiny of the government and partly to compensate for the fact that there is no revising chamber. The principal role of committees in the Scottish Parliament is to take evidence from witnesses, conduct inquiries and scrutinise legislation. Committee meetings take place on Tuesday, Wednesday and Thursday morning when Parliament is sitting. Committees can also meet at other locations throughout Scotland.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [None]:
text = "context: Kalki, final avatar (incarnation) of the Hindu god Vishnu, who is yet to appear. At the end of the present Kali yuga (age), when virtue and dharma have disappeared and the world is ruled by the unjust, Kalki will appear to destroy the wicked and to usher in a new age. He will be seated on a white horse with a naked sword in his hand, blazing like a comet. He is less commonly represented in painting and sculpture than the other avatars of Vishnu and is shown either on horseback or accompanied by his horse. According to some legends of the end of the world, Kalki’s horse will stamp the earth with its right foot, causing the tortoise which supports the world to drop into the deep. Then the gods will restore the earth once again to its former purity."

In [None]:
outputs = tokenizer(text, return_tensors="pt")

In [None]:
preds = model.generate(input_ids = outputs["input_ids"],
                       attention_mask = outputs["attention_mask"],
                       do_sample=False,
                       num_beams=5, num_beam_groups=5,
                       max_new_tokens=50,
                       diversity_penalty=1.0,
                       num_return_sequences=5
                      )
decoded_preds = [tokenizer.decode(preds[i], skip_special_tokens=True) for i in range(len(preds))]
print(decoded_preds)

['answer: Vishnu question: Who is the Hindu god Vishnu?', 'answer: Kalki question: Who is the final avatar of Vishnu?', "answer: a naked sword in his hand question: What is Kalki's weapon?", 'answer: Kali yuga question: When will the world be ruled by the unjust?', 'answer: Vishnu question: Which Hindu god is Kalki?']


In [None]:
def get_question(context, answer):
  text = "answer: {} context: {}".format(answer, context)
  tokens = tokenizer(text, return_tensors="pt")
  input_ids = tokens.input_ids
  attention_mask = tokens.attention_mask
  outputs = model.generate(input_ids=input_ids)
  question = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return question.replace("question:","").strip()

In [None]:
context = "The old woman was sitting under a tree and sipping coffee."
answer = "sipping coffee"
get_question(context, answer)

In [None]:
def get_mcqs(sent):
  sense, meaning, answer = get_sense(sent)
  if sense is not None:
    distractors = get_distractors(sense, answer)
  else:
    distractors = ["Word not found in Wordnet."]
  sentence_for_t5 = sent.replace("[TGT]"," ")
  sentence_for_t5 = " ".join(sentence_for_t5.split())
  ques = get_question(sentence_for_t5, answer)
  return ques, answer, distractors, meaning

In [None]:
ques, answer, distractors, meaning = get_mcqs("Srivatsan loves to watch [TGT] cricket [TGT] during his free time")


In [None]:
print(ques)
print(answer)
print(distractors)
print(meaning)

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
s2v = nlp.add_pipe("sense2vec")
s2v.from_disk("/path/to/s2v_reddit_2015_md")
doc = nlp("Kalki, final avatar (incarnation) of the Hindu god Vishnu, who is yet to appear.")
assert doc[9].text == "Vishnu"

In [None]:
import spacy
import numpy as np
import evaluate
from spacy import displacy
from datasets import Dataset, load_from_disk, ClassLabel, Features
nlp = spacy.load("en_core_web_sm")
squad = load_dataset("squad")
# doc = nlp(squad["train"]["context"][0])
# displacy.serve(doc, style="ent")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
def bio_tagger(examples):
    id = examples["id"]
    tokens = [[token.text for token in nlp(i)] for i in examples["context"]]
    iob_tags = [[token.ent_iob_ for token in nlp(i)] for i in examples["context"]]
    return {"id":id, "tokens":tokens, "iob_tags":iob_tags}

In [None]:
ds = squad.map(bio_tagger, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
squad_iob = ds.remove_columns(["title","context","question","answers"])

In [None]:
squad_iob.save_to_disk("/content/drive/MyDrive/learnX.ai (study support)/squad_iob")

In [None]:
squad_iob = load_from_disk("/content/drive/MyDrive/learnX.ai (study support)/squad_iob")

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [None]:
# Convert the "iob_tags" column to a ClassLabel
features = Features({'label': ClassLabel(num_classes=3, names=["I","O","B"])})
def convert_to_class_label(example):
  example["iob_ids"] = [features['label'].str2int(label) for label in example["iob_tags"]]
  return example

squad_mod = squad_iob.map(convert_to_class_label)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
example = squad_mod["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], padding=True, truncation=True, is_split_into_words=True, return_tensors="pt")

    labels = []
    for i, label in enumerate(examples[f"iob_ids"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
squad_tc = squad_mod.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
id2label = {
    0: "I",
    1: "O",
    2: "B"
}
label2id = {
    "I": 0,
    "O": 1,
    "B": 2
}

In [None]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(squad_mod["train"])

Unnamed: 0,id,tokens,iob_tags,iob_ids
0,5726b11bf1498d1400e8e79a,"[After, fans, noticed, Mercury, 's, increasingly, gaunt, appearance, in, 1988, ,, rumours, began, to, spread, that, Mercury, was, suffering, from, AIDS, ., Mercury, flatly, denied, this, ,, insisting, he, was, merely, "", exhausted, "", and, too, busy, to, provide, interviews, ., The, band, decided, to, continue, making, albums, ,, starting, with, The, Miracle, in, 1989, and, continuing, with, Innuendo, in, 1991, ., Despite, his, deteriorating, health, ,, the, lead, singer, continued, to, contribute, ., For, the, last, two, albums, made, while, Mercury, was, still, alive, ,, the, band, credited, all, songs, to, Queen, ,, rather, than, specific, members, of, the, ...]","[O, O, O, B, O, O, O, O, O, B, O, O, O, O, O, O, B, O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, O, B, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, O, B, O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, ...]","[1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, ...]"
1,570c25306b8089140040fb6c,"[The, FBI, director, is, responsible, for, the, day, -, to, -, day, operations, at, the, FBI, ., Along, with, his, deputies, ,, the, director, makes, sure, cases, and, operations, are, handled, correctly, ., The, director, also, is, in, charge, of, making, sure, the, leadership, in, any, one, of, the, FBI, field, offices, is, manned, with, qualified, agents, ., Before, the, Intelligence, Reform, and, Terrorism, Prevention, Act, was, passed, in, the, wake, of, the, September, 11, attacks, ,, the, FBI, director, would, directly, brief, the, President, of, the, United, States, on, any, issues, that, arise, from, within, the, FBI, ., Since, ...]","[O, B, O, O, O, O, O, O, O, O, O, B, O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, B, I, I, I, I, I, I, O, O, O, O, O, O, O, B, I, O, O, O, B, O, O, O, O, O, O, O, B, I, I, O, O, O, O, O, O, O, O, B, O, O, ...]","[1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, ...]"
2,572bf4da750c471900ed4c3d,"[The, area, now, known, as, Tennessee, was, first, inhabited, by, Paleo, -, Indians, nearly, 12,000, years, ago, ., The, names, of, the, cultural, groups, that, inhabited, the, area, between, first, settlement, and, the, time, of, European, contact, are, unknown, ,, but, several, distinct, cultural, phases, have, been, named, by, archaeologists, ,, including, Archaic, (, 8000–1000, BC, ), ,, Woodland, (, 1000, BC–1000, AD, ), ,, and, Mississippian, (, 1000–1600, AD, ), ,, whose, chiefdoms, were, the, cultural, predecessors, of, the, Muscogee, people, who, inhabited, the, Tennessee, River, Valley, before, Cherokee, migration, into, the, river, 's, headwaters, .]","[O, O, O, O, O, B, O, O, O, O, B, I, I, B, I, I, I, O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, B, B, O, O, B, O, B, I, I, O, O, O, B, O, B, I, O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, B, I, I, O, B, O, O, O, O, O, O, O]","[1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 0, 0, 2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 0, 0, 1, 1, 1, 2, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 0, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1]"
3,5726e051708984140094d471,"[Medical, facilities, in, Mali, are, very, limited, ,, and, medicines, are, in, short, supply, ., Malaria, and, other, arthropod, -, borne, diseases, are, prevalent, in, Mali, ,, as, are, a, number, of, infectious, diseases, such, as, cholera, and, tuberculosis, ., Mali, 's, population, also, suffers, from, a, high, rate, of, child, malnutrition, and, a, low, rate, of, immunization, ., An, estimated, 1.9, percent, of, the, adult, and, children, population, was, afflicted, with, HIV, /, AIDS, that, year, ,, among, the, lowest, rates, in, Sub, -, Saharan, Africa, ., An, estimated, 85–91, percent, of, Mali, 's, girls, and, women, have, had, ...]","[O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, I, I, I, O, O, O, O, O, O, O, O, O, O, O, O, B, I, O, O, O, O, O, O, B, I, I, I, O, B, I, I, I, O, B, O, O, O, O, O, O, ...]","[1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, 1, 2, 1, 1, 1, 1, 1, 1, ...]"
4,56cf7e194df3c31400b0d851,"[In, September, 2013, ,, West, was, widely, rebuked, by, human, rights, groups, for, performing, in, Kazakhstan, at, the, wedding, of, authoritarian, President, Nursultan, Nazarbayev, 's, grandson, ., He, traveled, to, Kazakhstan, ,, which, has, one, of, the, poorest, human, rights, records, in, the, world, ,, as, a, personal, guest, of, Nazarbayev, ., Other, notable, Western, performers, ,, including, Sting, ,, have, previously, cancelled, performances, in, the, country, over, human, rights, concerns, ., West, was, reportedly, paid, US$, 3, million, for, his, performance, ., West, had, previously, participated, in, cultural, boycotts, ,, joining, Shakira, and, Rage, Against, The, Machine, in, refusing, ...]","[O, B, I, O, B, O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, B, I, O, O, O, O, O, O, B, O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, I, I, O, O, O, O, O, O, O, O, O, O, B, O, O, B, O, O, O, O, O, O, O, ...]","[1, 2, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, ...]"
5,56d4c6b02ccc5a1400d83226,"[In, January, 2013, ,, Destiny, 's, Child, released, Love, Songs, ,, a, compilation, album, of, the, romance, -, themed, songs, from, their, previous, albums, and, a, newly, recorded, track, ,, "", Nuclear, "", ., Beyoncé, performed, the, American, national, anthem, singing, along, with, a, pre, -, recorded, track, at, President, Obama, 's, second, inauguration, in, Washington, ,, D.C., The, following, month, ,, Beyoncé, performed, at, the, Super, Bowl, XLVII, halftime, show, ,, held, at, the, Mercedes, -, Benz, Superdome, in, New, Orleans, ., The, performance, stands, as, the, second, most, tweeted, about, moment, in, history, at, 268,000, tweets, per, minute, ...]","[O, B, I, O, B, O, O, O, B, I, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, B, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, B, O, B, O, O, B, O, B, O, O, O, O, B, O, O, B, I, I, O, O, O, O, O, O, B, I, I, I, I, O, B, I, O, O, O, O, O, O, B, O, O, O, O, O, O, O, B, O, O, O, ...]","[1, 2, 0, 1, 2, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 1, 2, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, ...]"
6,570d5cb5b3d812140066d73b,"[Puberty, occurs, through, a, long, process, and, begins, with, a, surge, in, hormone, production, ,, which, in, turn, causes, a, number, of, physical, changes, ., It, is, the, stage, of, life, characterized, by, the, appearance, and, development, of, secondary, sex, characteristics, (, for, example, ,, a, deeper, voice, and, larger, adam, 's, apple, in, boys, ,, and, development, of, breasts, and, more, curved, and, prominent, hips, in, girls, ), and, a, strong, shift, in, hormonal, balance, towards, an, adult, state, ., This, is, triggered, by, the, pituitary, gland, ,, which, secretes, a, surge, of, hormonal, agents, into, the, blood, stream, ...]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]"
7,5728d3d92ca10214002da8ca,"[The, Alaska, Permanent, Fund, is, a, constitutionally, authorized, appropriation, of, oil, revenues, ,, established, by, voters, in, 1976, to, manage, a, surplus, in, state, petroleum, revenues, from, oil, ,, largely, in, anticipation, of, the, recently, constructed, Trans, -, Alaska, Pipeline, System, ., The, fund, was, originally, proposed, by, Governor, Keith, Miller, on, the, eve, of, the, 1969, Prudhoe, Bay, lease, sale, ,, out, of, fear, that, the, legislature, would, spend, the, entire, proceeds, of, the, sale, (, which, amounted, to, $, 900, million, ), at, once, ., It, was, later, championed, by, Governor, Jay, Hammond, and, Kenai, state, representative, Hugh, ...]","[B, I, I, I, O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, I, I, I, I, O, O, O, O, O, O, O, O, B, I, O, B, I, I, I, B, B, I, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, I, I, O, O, O, O, O, O, O, O, O, O, B, I, O, B, O, O, B, ...]","[2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 2, 0, 0, 0, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 2, 1, 1, 2, ...]"
8,572fa79aa23a5019007fc83d,"[Washington, University, has, been, selected, by, the, Commission, on, Presidential, Debates, to, host, more, presidential, and, vice, -, presidential, debates, than, any, other, institution, in, history, ., United, States, presidential, election, debates, were, held, at, the, Washington, University, Athletic, Complex, in, 1992, ,, 2000, ,, 2004, ,, and, 2016, ., A, presidential, debate, was, planned, to, occur, in, 1996, ,, but, owing, to, scheduling, difficulties, between, the, candidates, ,, the, debate, was, canceled, ., The, university, hosted, the, only, 2008, vice, presidential, debate, ,, between, Republican, Sarah, Palin, and, Democrat, Joe, Biden, ,, on, October, 2, ,, 2008, ,, also, ...]","[B, I, O, O, O, O, B, I, I, I, I, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, I, O, O, O, O, O, O, B, I, I, I, I, O, B, O, B, O, B, O, O, B, O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, I, O, O, O, O, O, B, B, I, O, B, B, I, O, O, B, I, I, I, O, O, ...]","[2, 0, 1, 1, 1, 1, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 2, 2, 0, 1, 2, 2, 0, 1, 1, 2, 0, 0, 0, 1, 1, ...]"
9,570d8c85fed7b91900d46204,"[Following, their, basic, and, advanced, training, at, the, individual, -, level, ,, soldiers, may, choose, to, continue, their, training, and, apply, for, an, "", additional, skill, identifier, "", (, ASI, ), ., The, ASI, allows, the, army, to, take, a, wide, ranging, MOS, and, focus, it, into, a, more, specific, MOS, ., For, example, ,, a, combat, medic, ,, whose, duties, are, to, provide, pre, -, hospital, emergency, treatment, ,, may, receive, ASI, training, to, become, a, cardiovascular, specialist, ,, a, dialysis, specialist, ,, or, even, a, licensed, practical, nurse, ., For, commissioned, officers, ,, ASI, training, includes, pre, -, ...]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, O, B, O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, O, O, O, O, ...]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, ...]"


In [None]:
squad_tc.set_format("torch")

In [None]:
squad_tc

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10570
    })
})

In [None]:
squad_tc = squad_tc.remove_columns(["id", "tokens","iob_ids","iob_tags"])

In [None]:
squad_iob

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'iob_tags'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'tokens', 'iob_tags'],
        num_rows: 10570
    })
})

In [None]:
squad_train = squad_tc["train"]
eval_test = squad_tc["validation"].train_test_split(test_size=0.1, shuffle=False)
squad_eval = eval_test["train"]
squad_test = eval_test["test"]
train_dataloader = DataLoader(squad_train, batch_size=8, collate_fn=data_collator, shuffle=False)
eval_dataloader = DataLoader(squad_eval, batch_size=8, collate_fn=data_collator, shuffle=False)
test_dataloader = DataLoader(squad_test, batch_size=8, collate_fn=data_collator, shuffle=False)

In [None]:
squad_eval

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9513
})

In [None]:
seqeval = evaluate.load("seqeval")
labels = [id2label[i] for i in example[f"ner_tags"]]

def compute_metrics(p):
    predictions, labels = p
    predictions = torch.argmax(predictions, dim=2)

    true_predictions = [
        [id2label[p.item()] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l.item()] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
results = compute_metrics([logits, batch["labels"]])

In [None]:
results

{'precision': 0.1632047477744807,
 'recall': 0.023778642455685256,
 'f1': 0.04150943396226415,
 'accuracy': 0.11612668365489626}

In [None]:
import evaluate
import pytorch_lightning as pl
from pytorch_lightning.loggers import MLFlowLogger
from pytorch_lightning.callbacks import Timer
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

class TokenClassification(pl.LightningModule):
  def __init__(self, model_name_or_path, learning_rate=2e-5, **kwargs):
    super().__init__()
    self.save_hyperparameters()
    self.model = AutoModelForTokenClassification.from_pretrained(model_name_or_path, num_labels=kwargs["num_labels"], id2label=kwargs["id2label"], label2id=kwargs["label2id"])
    self.learning_rate = learning_rate
    self.seqeval = evaluate.load("seqeval")

  def forward(self,**inputs):
    return self.model(**inputs)

  def training_step(self, batch, batch_idx):
    outputs = self(**batch)
    loss = outputs[0]
    # self.train_loss.append(loss)
    self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=False)
    return loss

#   def on_train_epoch_end(self):
#     loss = sum(self.train_loss)/len(self.train_loss)
#     self.logger.experiment.log_metric(run_id=self.logger.run_id, key="train_loss", value=loss)
#     self.train_loss.clear()

  def validation_step(self, batch, batch_idx):
    outputs = self(**batch)
    val_loss, logits = outputs[:2]
    metrics = self.compute_metrics([logits, batch["labels"]])
    # self.val_loss.append(val_loss)
    # preds = torch.argmax(logits, dim=2)
    # labels = batch["labels"]
    # self.outputs["val_loss"].append(val_loss)
    # self.outputs["preds"].append(preds)
    # self.outputs["labels"].append(labels)
    self.log_dict(metrics, on_step=False, on_epoch=True, prog_bar=True, logger=False)
    self.log("val_loss", val_loss, on_step=False, on_epoch=True, prog_bar=True, logger=False)

#   def on_validation_epoch_end(self):
#     loss = sum(self.val_loss)/len(self.val_loss)
#     self.logger.experiment.log_metric(run_id=self.logger.run_id, key="val_loss", value=loss)
#     self.val_loss.clear()

  def compute_metrics(self, p):
    predictions, labels = p
    predictions = torch.argmax(predictions, dim=2)

    true_predictions = [
        [self.hparams.id2label[p.item()] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [self.hparams.id2label[l.item()] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = self.seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  def configure_optimizers(self):
    optimizer = AdamW(model.parameters(), lr=self.hparams.learning_rate)
    # scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=self.trainer.estimated_stepping_batches)
    # scheduler = {"scheduler":scheduler, "interval":"step", "frequency":1, "monitor":"val_loss"}
    return optimizer

In [None]:
model = TokenClassification(model_name_or_path="distilbert/distilbert-base-uncased", num_labels=3, id2label = {0: "I", 1: "B", 2: "O"}, label2id = {"I": 0, "B": 1, "O": 2})
mlf_logger = MLFlowLogger(experiment_name="/learnX.ai (study support)", tracking_uri="databricks")
# timer = Timer(duration="00:02:30:00")
checkpoint_callback = ModelCheckpoint(dirpath="/content/drive/MyDrive/learnX.ai (study support)/checkpoints", filename="checkpoint-tc-{epoch}-{step}", monitor="val_loss", mode="min", save_top_k=1, save_last=True, every_n_epochs=1)
trainer = pl.Trainer(max_epochs=3, accelerator="gpu", devices="auto")
trainer.fit(model, train_dataloader, eval_dataloader)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=f28b1224dc2042ecc26d66ac5c7e546bfa91b14bf958e6e0d14cbae8462d0f8d
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
model = TokenClassification.load_from_checkpoint("/content/drive/MyDrive/learnX.ai (study support)/checkpoints/last-tc.ckpt")

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from collections import OrderedDict
state_dict = OrderedDict()

for i,j in model.state_dict().items():
    t = i.split(".",maxsplit=1)[1]
    i = i.replace(i,t)
    state_dict[i] = j

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
for batch in test_dataloader:
    break
print({k:v.shape for k,v in batch.items()})

{'input_ids': torch.Size([8, 368]), 'attention_mask': torch.Size([8, 368]), 'labels': torch.Size([8, 368])}


In [None]:
outputs = model(**batch)
logits = outputs.logits
predictions = torch.argmax(logits, dim=2)

In [None]:
true_predictions = [
        [id2label[p.item()] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, batch["labels"])
]

In [None]:
true_labels = [
        [id2label[l.item()] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, batch["labels"])
    ]

In [None]:
for token, prediction in zip(true_labels, true_predictions):
  break
print(f"{token}:\n{prediction}")

['O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'B', 'B', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O']:
['O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O', 'I', 'I', 'O

In [None]:
from transformers import pipeline

classifier = pipeline("ner", model=model, tokenizer=tokenizer)
classifier("Samsung is a mobile manufacturer based in South Korea.", aggregation_strategy="first")

[{'entity_group': 'B',
  'score': 0.8286821,
  'word': 'samsung',
  'start': 0,
  'end': 7},
 {'entity_group': 'B',
  'score': 0.9999157,
  'word': 'south',
  'start': 42,
  'end': 47},
 {'entity_group': 'I',
  'score': 0.99983287,
  'word': 'korea',
  'start': 48,
  'end': 53}]

In [None]:
!pip install spacy benepar

Collecting benepar
  Downloading benepar-0.2.0.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-struct>=0.5 (from benepar)
  Downloading torch_struct-0.5-py3-none-any.whl.metadata (4.3 kB)
Downloading torch_struct-0.5-py3-none-any.whl (34 kB)
Building wheels for collected packages: benepar
  Building wheel for benepar (setup.py) ... [?25l[?25hdone
  Created wheel for benepar: filename=benepar-0.2.0-py3-none-any.whl size=37626 sha256=5690af5c92ce02de6b64bcc399b38c290f45778c3f5bbcf38bb652bb2bd83ed6
  Stored in directory: /root/.cache/pip/wheels/8d/4d/c1/a5af726368d5dbaaaa0b2dd36ed39b9da8cec46279a49bd6db
Successfully built benepar
Installing collected packages: torch-struct, benepar
Successfully installed benepar-0.2.0 torch-struct-0.5


In [None]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy, benepar
benepar.download('benepar_en3')
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('benepar', config={'model':'benepar_en3'})

[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Unzipping models/benepar_en3.zip.
  state_dict = torch.load(
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


<benepar.integrations.spacy_plugin.BeneparComponent at 0x796634e57fd0>

In [None]:
doc = nlp('Kalki, final avatar (incarnation) of the Hindu god Vishnu, who is yet to appear')
sent = list(doc.sents)[0]
print(sent._.parse_string)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(NP (NP (NNP Kalki)) (, ,) (NP (NP (JJ final) (NN avatar)) (-LRB- -LRB-) (NP (NN incarnation)) (-RRB- -RRB-) (PP (IN of) (NP (NP (NP (DT the) (JJ Hindu) (NN god)) (NP (NNP Vishnu))) (, ,) (SBAR (WHNP (WP who)) (S (VP (VBZ is) (ADVP (RB yet)) (S (VP (TO to) (VP (VB appear)))))))))))




In [None]:
import nltk
from nltk import tokenize
nltk.download("punkt")
from nltk.tree import Tree

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
tree = Tree.fromstring(sent._.parse_string)
print(tree.pretty_print())

            NP                                                                                                   
   _________|_________________________                                                                            
  |    |                              NP                                                                         
  |    |          ____________________|______________________                                                     
  |    |         |           |        |        |             PP                                                  
  |    |         |           |        |        |     ________|___________________                                 
  |    |         |           |        |        |    |                            NP                              
  |    |         |           |        |        |    |              ______________|________                        
  |    |         |           |        |        |    |             |              |  

In [None]:
temp1 = tree[0]
temp2 = tree[1]
temp3 = tree[-1]
temp1.pretty_print()
temp2.pretty_print()
temp3.pretty_print()

  NP 
  |   
 NNP 
  |   
Kalki

 , 
 |  
 , 

                            NP                                                                         
        ____________________|______________________                                                     
       |           |        |        |             PP                                                  
       |           |        |        |     ________|___________________                                 
       |           |        |        |    |                            NP                              
       |           |        |        |    |              ______________|________                        
       |           |        |        |    |             |              |       SBAR                    
       |           |        |        |    |             |              |    ____|____                   
       |           |        |        |    |             |              |   |         S                 
       |     

In [None]:
#split at rightmost NP or VP
def get_flattened(t):
  sent_str_final = None
  if t is not None:
    sent_str = [" ".join(x.leaves()) for x in list(t)]
    sent_str_final = [" ".join(sent_str)]
    sent_str_final = sent_str_final[0]
  return sent_str_final

In [None]:
def get_rvp_nvp(parse_tree, last_np = None, last_vp = None):
  if len(parse_tree.leaves()) == 1:
    return last_np, last_vp
  last_subtree = parse_tree[-1]
  if last_subtree.label() == 'NP':
    last_np = last_subtree
  elif last_subtree.label() == "VP":
    last_vp = last_subtree
  return get_rvp_nvp(last_subtree, last_np, last_vp)

In [None]:
last_np, last_vp = get_rvp_nvp(tree)
last_np_flattened = get_flattened(last_np)
last_vp_flattened = get_flattened(last_vp)
print(last_np_flattened)
print(last_vp_flattened)

the Hindu god Vishnu , who is yet to appear
appear


In [None]:
import re

def get_termination_portion(main_string, sub_string):
  combined_sub_string = sub_string.replace(" ","")
  main_string_list = main_string.split()
  last_index = len(main_string_list)
  for i in range(last_index):
    check_string_list = main_string_list[i:]
    check_string = "".join(check_string_list)
    check_string = check_string.replace(" ","")
    if check_string == combined_sub_string:
      return " ".join(main_string_list[:i])

  return None

In [None]:
longest_phrase = max(last_np_flattened, last_vp_flattened)
print(longest_phrase)


the Hindu god Vishnu , who is yet to appear


In [None]:
longest_phrase = re.sub(r"-LRB- ", "(", longest_phrase)
longest_phrase = re.sub(r" -RRB-", ")", longest_phrase)

In [None]:
longest_phrase

'the Hindu god Vishnu , who is yet to appear'

In [None]:
split_sentence = get_termination_portion("Kalki, final avatar (incarnation) of the Hindu god Vishnu, who is yet to appear", longest_phrase)

In [None]:
!pip install --quiet transformers

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

GPT2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
GPT2_model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=GPT2_tokenizer.eos_token_id)

In [None]:
partial_sentence = "Kalki, final avatar (incarnation) of"
input_ids = GPT2_tokenizer.encode(partial_sentence, return_tensors='pt')
maximum_length = len(partial_sentence.split())+40


tensor([[   42,   971,    72,    11,  2457, 30919,   357, 13211,   341,     8,
           286]])


In [None]:
#Activate top_k sampling and top_p sampling with only from 90% most likely words
sample_outputs = GPT2_model.generate(input_ids, do_sample=True, max_length=maximum_length, top_k=60, top_p=0.8, repetition_penalty=10.0, num_return_sequences=1)


In [None]:
generated_sentences = []
for i,sample_output in enumerate(sample_outputs):
  decoded_sentence = GPT2_tokenizer.decode(sample_output, skip_special_tokens=True)
  final_sentence = tokenize.sent_tokenize(decoded_sentence)[0]
  generated_sentences.append(final_sentence)
  print(final_sentence)

Kalki, final avatar (incarnation) of an Imperial Soldier.


In [8]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:

In [9]:
!pip install git+https://github.com/boudinfl/pke.git

Collecting git+https://github.com/boudinfl/pke.git
  Cloning https://github.com/boudinfl/pke.git to /tmp/pip-req-build-quup1ixa
  Running command git clone --filter=blob:none --quiet https://github.com/boudinfl/pke.git /tmp/pip-req-build-quup1ixa
  Resolved https://github.com/boudinfl/pke.git to commit 69871ffdb720b83df23684fea53ec8776fd87e63
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting unidecode (from pke==2.0.0)
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pke
  Building wheel for pke (setup.py) ... [?25l[?25hdone
  Created wheel for pke: filename=pke-2.0.0-py3-none-any.whl size=6160627 sha256=6aae879cdd6b55b3782097cbc22b63878548dc82ab6092ec0c78d02a94aa2d81
  Stored in directory: /tmp/pip-ephem-wheel-cache-hnep50cm/wheels/8c/07/

In [10]:
from datasets import load_dataset
squad = load_dataset("squad")
text = squad["train"]["context"][0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [11]:
import pke
import nltk
import string

nltk.download('punkt')
sent = []
extractor = pke.unsupervised.MultipartiteRank()
stoplist = list(string.punctuation)
stoplist += pke.lang.stopwords.get("en")
extractor.load_document(input=text, stoplist=stoplist)
pos = {'NOUN', 'PROPN', 'ADJ'}
extractor.candidate_selection(pos=pos)
extractor.candidate_weighting(alpha=1.1, threshold=0.74, method="average")
keyphrases = extractor.get_n_best(n=2)
cases = ([(i[0].upper(), i[0].lower(), i[0].capitalize(), i[0].title()) for i in keyphrases])
for i,j,k,l in cases:
  text = text.replace(i,"[MASK]").replace(j,"[MASK]").replace(k,"[MASK]").replace(l,"[MASK]")
  for i in nltk.sent_tokenize(text):
    if "[MASK]" in i:
      sent.append(i)
print(sent)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


["Atop the [MASK]'s gold dome is a golden statue of the Virgin Mary.", 'Immediately in front of the [MASK] and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".', 'Next to the [MASK] is the Basilica of the Sacred Heart.', "Atop the [MASK]'s gold dome is a [MASK] of the Virgin Mary.", 'Immediately in front of the [MASK] and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".', 'Next to the [MASK] is the Basilica of the Sacred Heart.']


In [12]:
#YAKE algorithm for match the following
extractor = pke.unsupervised.YAKE()
stoplist = list(string.punctuation)
stoplist = pke.lang.stopwords.get("english")
extractor.load_document(input=text, language="en", stoplist=stoplist, normalization=None)
extractor.candidate_selection(n=1)
extractor.candidate_weighting(window=2, use_stems=False)
keyphrases = extractor.get_n_best(n=10, threshold=0.8)
print(keyphrases)



[('mask', 0.04006231333327488), ('catholic', 0.06866257504855125), ('mary', 0.08821871083923659), ('virgin', 0.12850285481915852), ('basilica', 0.17215703488295134), ('architecturally', 0.1747914809033989), ('gold', 0.17708987746941948), ('dome', 0.17708987746941948), ('grotto', 0.1899926551185555), ('school', 0.19297513463129293)]


In [13]:
!pip install rake_nltk



In [16]:
import string
from nltk.wsd import lesk
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from rake_nltk import Rake, Metric

#Keyword extraction using rake-ntlk
rake_nltk = Rake(max_length = 1, include_repeated_phrases = False, punctuations = string.punctuation)
rake_nltk.extract_keywords_from_text(text)
keywords = rake_nltk.get_ranked_phrases()[:10]

#punctuation and stopwords removal from the text followed by lemmatization
stop_words = set(stopwords.words('english'))
text_no_punc = text.translate(str.maketrans("","",string.punctuation))
word_tokens = word_tokenize(text_no_punc.lower())
#filtered_sentence = [w for w in word_tokens if not w in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_sentence = [lemmatizer.lemmatize(w) for w in word_tokens]

In [15]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [17]:
wn_keywords = []
wn_definitions = []
try:
  for i in keywords:
    wn_keywords.append(i)
    wn_definitions.append(lesk(lemmatized_sentence, i).definition())
except AttributeError:
  pass

In [18]:
wn_definitions

['a person lacking intelligence or common sense',
 'the period of instruction in a school; the time period when school is in session',
 'copy that is not the original; something that has been copied',
 '(mathematics) a transformation in which the direction of one axis is reversed',
 'the act of communicating with a deity (especially as a petition or in adoration or contrition or thanksgiving)',
 'at the time or occasion immediately following',
 'a party of guests wearing costumes and masks',
 'the mother of Jesus; Christians refer to her as the Virgin Mary; she is especially honored by Roman Catholics']

In [19]:
import random
sample = random.sample(wn_keywords, 7)
try:
  for i in range(len(sample)):
    print("{:100}{:10}".format(sample[i], wn_definitions[i]))
except AttributeError:
  pass

prayer                                                                                              a person lacking intelligence or common sense
lourdes                                                                                             the period of instruction in a school; the time period when school is in session
school                                                                                              copy that is not the original; something that has been copied
mask                                                                                                (mathematics) a transformation in which the direction of one axis is reversed
reflection                                                                                          the act of communicating with a deity (especially as a petition or in adoration or contrition or thanksgiving)
mary                                                                                                at the time or occasio

In [24]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(squad["train"]["context"][0])
displacy.serve(doc, style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
