In [2]:
import numpy as np
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_scheduler
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.loggers import MLFlowLogger
from pytorch_lightning.callbacks import ModelCheckpoint

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
max_source_length = 512
max_target_length = 128

In [4]:
squad = load_dataset("squad")
squad = squad.flatten()
squad["train"][0]["answers.text"]

['Saint Bernadette Soubirous']

In [None]:
from tokenizers import AutoTokenizer
prefix_1 = "answer: "
prefix_2 = " context: "
prefix_3 = "question: "
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def preprocess_function(examples):
  input_1 = [ prefix_1 + i[0] for i in examples["answers.text"] ]
  input_2 = [ prefix_2 + i for i in examples["context"] ]
  input = [input_1[i] + input_2[i] for i in range(len(examples["context"]))]
  inputs = [input[0] for i in range(len(examples["context"]))]
  model_inputs = tokenizer(inputs, padding="longest", max_length = max_source_length , truncation = True, return_tensors = "pt")
  labels = tokenizer([prefix_3 + i for i in examples["question"]], padding="longest", max_length = max_target_length ,truncation = True, return_tensors = "pt")
  labels["input_ids"][labels["input_ids"]==tokenizer.pad_token_id] = -100
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [4]:
prefix_1 = "context: "
prefix_2 = " question: "
prefix_3 = "answer: "
checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
def preprocess_function(examples):
  output_1 = [ prefix_2 + i for i in examples["question"] ]
  output_2 = [ prefix_3 + i[0] for i in examples["answers.text"] ]
  output = [output_2[i] + output_1[i] for i in range(len(examples["question"]))]
  assert len(examples["question"]) == len(examples["answers.text"])
  model_inputs = tokenizer([prefix_1 + i for i in examples["context"]], padding="max_length", max_length = max_source_length , truncation = True, return_tensors = "pt")
  labels = tokenizer(output, padding="max_length", max_length = max_target_length, truncation = True, return_tensors = "pt")
  labels["input_ids"][labels["input_ids"]==tokenizer.pad_token_id] = -100
  model_inputs["labels"] = labels["input_ids"]
  model_inputs["target_mask"] = labels["attention_mask"]
  return model_inputs

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
squad_tokenized = squad.map(preprocess_function, batched=True)

In [6]:
squad_tokenized = squad_tokenized.remove_columns(["id","title","question","context","answers.text","answers.answer_start"])
squad_tokenized.set_format("torch")
#data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = checkpoint)

In [7]:
squad_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'target_mask'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'target_mask'],
        num_rows: 10570
    })
})

In [8]:
squad_tokenized["train"][0]["labels"]

tensor([ 1525,    10,  2788,  8942,     9,    26,  1954,   264,  8371,  8283,
          822,    10,   304,  4068,   410,     8, 16823,  3790,     3, 18280,
         2385,    16,   507,  3449,    16,   301,  1211,  1395,  1410,    58,
            1,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100])

In [9]:
# train_dataloader = DataLoader(squad_tokenized["train"], batch_size=4, collate_fn=data_collator, num_workers=4)
# eval_dataloader = DataLoader(squad_tokenized["validation"], batch_size=4, collate_fn=data_collator, num_workers=4)

In [9]:
squad_train = squad_tokenized["train"]
#squad_eval = squad_tokenized["validation"]
eval_test = squad_tokenized["validation"].train_test_split(test_size=0.1, shuffle=False)
squad_eval = eval_test["train"]
squad_test = eval_test["test"]

In [10]:
squad_train = squad_train.select(range(10000))
squad_eval = squad_eval.select(range(1000))
squad_test = squad_test.select(range(10))

In [11]:
train_dataloader = DataLoader(squad_train, batch_size=4, shuffle=False, num_workers=4)
eval_dataloader = DataLoader(squad_eval, batch_size=4, shuffle=False, num_workers=4)
test_dataloader = DataLoader(squad_test, batch_size=4, shuffle=False, num_workers=4)

In [114]:
for batch in train_dataloader:
    break
print({k:v.shape for k,v in batch.items()})
print(tokenizer.decode(batch["input_ids"][3]))
fake_labels = np.where(batch["labels"][3]!=-100, batch["labels"][3], tokenizer.pad_token_id)
print(tokenizer.decode(fake_labels))

{'input_ids': torch.Size([4, 512]), 'attention_mask': torch.Size([4, 512]), 'labels': torch.Size([4, 128]), 'target_mask': torch.Size([4, 128])}
context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

In [13]:
for batch in test_dataloader:
  break
print(tokenizer.decode(batch["input_ids"][0]))
fake_labels = np.where(batch["labels"][0]!=-100, batch["labels"][0], tokenizer.pad_token_id)
print(tokenizer.decode(fake_labels))

context: Much of the work of the Scottish Parliament is done in committee. The role of committees is stronger in the Scottish Parliament than in other parliamentary systems, partly as a means of strengthening the role of backbenchers in their scrutiny of the government and partly to compensate for the fact that there is no revising chamber. The principal role of committees in the Scottish Parliament is to take evidence from witnesses, conduct inquiries and scrutinise legislation. Committee meetings take place on Tuesday, Wednesday and Thursday morning when Parliament is sitting. Committees can also meet at other locations throughout Scotland.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [14]:
squad["validation"][9514]

{'id': '572fc6f204bcaa1900d76cf6',
 'title': 'Scottish_Parliament',
 'context': 'Much of the work of the Scottish Parliament is done in committee. The role of committees is stronger in the Scottish Parliament than in other parliamentary systems, partly as a means of strengthening the role of backbenchers in their scrutiny of the government and partly to compensate for the fact that there is no revising chamber. The principal role of committees in the Scottish Parliament is to take evidence from witnesses, conduct inquiries and scrutinise legislation. Committee meetings take place on Tuesday, Wednesday and Thursday morning when Parliament is sitting. Committees can also meet at other locations throughout Scotland.',
 'question': 'What are committees in the Scottish Parliament compared to other systems?',
 'answers.text': ['stronger',
  'stronger',
  'stronger in the Scottish Parliament than in other parliamentary systems'],
 'answers.answer_start': [92, 92, 92]}

In [16]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokens = tokenizer("Lionel Messi,",return_tensors="pt")
outputs = model.generate(input_ids = tokens.input_ids, attention_mask = tokens.attention_mask, max_length=50)
tokenizer.decode(outputs[0])
# softmax = torch.nn.Softmax(dim=1)
# x = softmax(outputs.logits[0])
# predictions = torch.argmax(x, dim=1)
# decoded_preds = tokenizer.decode(predictions, skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Lionel Messi, who has been linked with a move to Manchester United, has been linked with a move to Barcelona.\n\nThe Argentina international has been linked with a move to Barcelona, with the club reportedly interested in the player.\n\n'

In [64]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")
outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
print(outputs.loss, outputs.logits.shape)

tensor(3.9637, grad_fn=<NllLossBackward0>) torch.Size([4, 128, 32128])


In [18]:
outputs.logits.shape

torch.Size([4, 128, 32128])

In [20]:
import torch
# softmax = torch.nn.Softmax(dim=2)
# x = softmax(outputs.logits)
predictions = torch.argmax(outputs.logits, dim=2)
decoded_preds = tokenizer.batch_decode(predictions)
converted_labels = np.where(batch["labels"]!=-100, batch["labels"], tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(converted_labels)

In [105]:
decoded_labels

['answer: committee question: Where is much of the work of the Scottish Parliament done?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 'answer: stronger question: What are committees in the Scottish Parliament compared to other systems?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [106]:
decoded_preds

['The questions nos</s></s> is the of the work of the Scottish Parliament done in</s></s> The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The back back The The The back back The back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back there back back back back back back back back back back back the the</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>',
 'The questions no in than</s> is thes in the Scottish Parliament?? to in </s></s></s> The The The back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back back 

In [107]:
len(train_dataloader)

2500

In [16]:
optimizer = AdamW(model.parameters(), lr=1e-4)
loss = outputs.loss
loss.backward()
optimizer.step()
optimizer.zero_grad()



In [17]:
num_epochs = 20
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [18]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)
print(device)

cuda


In [43]:
optimizer = AdamW(model.parameters(), lr=1e-4)

In [18]:
# progress_bar = tqdm(range(num_training_steps))

# model.train()
# for epoch in range(num_epochs):
#   for batch in train_dataloader:
#     batch = {k:v.to(device) for k,v in batch.items()}
#     outputs = model(**batch)
#     loss = outputs.loss
#     loss.backward()

#     optimizer.step()
#     lr_scheduler.step()
#     optimizer.zero_grad()
#     progress_bar.update(1)

In [None]:
model.save_pretrained("t5_question_generation_model")

In [50]:
class MCQGenerator(pl.LightningModule):

  def __init__(self, model_name_or_path:str, 
               learning_rate:float=3e-4,
              # batch_size:int=4,
              #  experiment_name:str="learnX.ai (study support)", 
              #  tracking_uri:str="http://127.0.0.1:8080"
              ):
    
    super().__init__()
    self.save_hyperparameters()
    self.model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)
    self.learning_rate=learning_rate
    # self.train_loss = []
    # self.val_loss = []

  def forward(self,**inputs):
    return self.model(input_ids=inputs["input_ids"], 
                      attention_mask=inputs["attention_mask"], 
                      decoder_attention_mask=inputs["target_mask"], 
                      labels=inputs["labels"])

  def training_step(self, batch, batch_idx):
    outputs = self(**batch)
    loss = outputs[0]
    #self.train_loss.append(loss)
    self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
    return loss
  
  # def on_train_epoch_end(self):
  #   loss = sum(self.train_loss)/len(self.train_loss)
  #   self.logger.experiment.log_metric(run_id=self.logger.run_id, key="train_loss", value=loss)
  #   self.train_loss.clear()

  def validation_step(self, batch, batch_idx):
    outputs = self(**batch)
    val_loss, logits = outputs[:2]
    # self.val_loss.append(val_loss)
    # preds = torch.argmax(logits, dim=2)
    # labels = batch["labels"]
    # self.outputs["val_loss"].append(val_loss)
    # self.outputs["preds"].append(preds)
    # self.outputs["labels"].append(labels)
    self.log("val_loss", val_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
    return val_loss

  # def on_validation_epoch_end(self):
  #   loss = sum(self.val_loss)/len(self.val_loss)
  #   self.logger.experiment.log_metric(run_id=self.logger.run_id, key="val_loss", value=loss)
  #   self.val_loss.clear()

  def configure_optimizers(self):
    optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, eps=1e-8)
    return optimizer


In [16]:
model = MCQGenerator(model_name_or_path="t5-small")
mlf_logger = MLFlowLogger(experiment_name="learnX.ai (study support)", tracking_uri="http://127.0.0.1:8080")
# timer = Timer(duration="00:02:30:00")
checkpoint_callback = ModelCheckpoint(dirpath="G:\My Drive\learnX.ai (study support)\checkpoints", filename="checkpoint-{epoch}-{step}", save_top_k=1, save_last=True, every_n_epochs=3)
trainer = pl.Trainer(max_epochs=3, accelerator="gpu", devices="auto", logger=mlf_logger, callbacks=[checkpoint_callback])
trainer.fit(model, train_dataloader, eval_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M | eval
------------------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)
0         Modules in train mode
277       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

d:\learnx.ai\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:419: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

d:\learnx.ai\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:419: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 2: 100%|██████████| 2500/2500 [08:00<00:00,  5.20it/s, v_num=4a79, val_loss=1.850, train_loss=0.790]

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 2500/2500 [08:11<00:00,  5.09it/s, v_num=4a79, val_loss=1.850, train_loss=0.790]


2024/08/30 02:57:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run exultant-horse-117 at: http://127.0.0.1:8080/#/experiments/547642378050026970/runs/6067b533fc374c42bd0a2c04f69a4a79.
2024/08/30 02:57:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/547642378050026970.


In [51]:
model_test = MCQGenerator.load_from_checkpoint("G:\My Drive\learnX.ai (study support)\checkpoints\checkpoint-epoch=2-step=7500.ckpt")

In [53]:
model_test.hparams

"learning_rate":      0.0003
"model_name_or_path": t5-small

In [43]:
trainer.fit(model, train_dataloader, eval_dataloader, ckpt_path="G:\My Drive\learnX.ai (study support)\checkpoints\checkpoint-epoch=2-step=7500.ckpt")

d:\learnx.ai\.venv\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:654: Checkpoint directory ..\547642378050026970\122b6dc49c224959be300874a34506be\checkpoints exists and is not empty.
Restoring states from the checkpoint path at D:/learnx.ai/547642378050026970/122b6dc49c224959be300874a34506be/checkpoints/epoch=0-step=125.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
d:\learnx.ai\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.

  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M | eval
------------------------------------------------------------
60.5 M    Tr

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

d:\learnx.ai\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


                                                                           

`Trainer.fit` stopped: `max_epochs=1` reached.




2024/08/27 10:55:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run adorable-lamb-851 at: http://127.0.0.1:8080/#/experiments/547642378050026970/runs/122b6dc49c224959be300874a34506be.
2024/08/27 10:55:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/547642378050026970.


In [99]:
from collections import OrderedDict
state_dict = OrderedDict()

for i,j in model_test.state_dict().items():
    t = i.split(".",maxsplit=1)[1]
    i = i.replace(i,t)
    state_dict[i] = j

In [101]:
model.load_state_dict(state_dict)

<All keys matched successfully>

In [124]:
model.eval()
for batch in train_dataloader:
    break
preds = model.generate(input_ids = batch["input_ids"][0].expand(1,-1), attention_mask = batch["attention_mask"][0].expand(1,-1), max_length=200, num_beams=5, num_return_sequences=3)
decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in preds]
decoded_preds

['answer: a copper statue of Christ question: What sits next to the Main Building in front of the Basilica of the Sacred Heart?',
 'answer: the Grotto question: What is the name of the grotto at Lourdes, France?',
 'answer: the Grotto question: What is the name of the grotto at Notre Dame?']

In [7]:
import nltk
import string
from nltk.wsd import lesk
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from rake_nltk import Rake, Metric

In [1]:
from datasets import load_dataset
squad = load_dataset("squad")
text = squad["train"]["context"][0]

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
rake_nltk = Rake(max_length=3, 
                include_repeated_phrases = False)

rake_nltk.extract_keywords_from_text(text)
keywords = rake_nltk.get_ranked_phrases()

filtered_keywords = set()
#punctuation removal from the text followed
for i in keywords:
    i = i.translate(str.maketrans("","",string.punctuation)).strip()
    filtered_keywords.add(i)    


In [4]:
import random

sample = random.sample(filtered_keywords, 5)
masked_sentence = []
cased_keywords = ([(i.upper(), i.lower(), i.capitalize(), i.title()) for i in sample])
temp = text

for i,j,k,l in cased_keywords:
  temp = temp.replace(i,"[MASK]").replace(j,"[MASK]").replace(k,"[MASK]").replace(l,"[MASK]")

for i in nltk.sent_tokenize(temp):
  if "[MASK]" in i:
    masked_sentence.append(i)

In [5]:
for i,j in enumerate(masked_sentence):
    print("{:}{:10}{:}".format(i," ",j))

0          Atop the Main Building's [MASK] is a golden statue of the Virgin Mary.
1          Immediately behind the basilica is the Grotto, a [MASK] of [MASK] and reflection.
2          It is a [MASK] of the grotto at [MASK], France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.
3          At the end of the main drive (and in a direct line that connects through 3 statues and the [MASK]), is a simple, modern stone statue of Mary.


In [6]:
#Keyword extraction using rake-nltk
rake_nltk = Rake(max_length = 1, 
                include_repeated_phrases = False,
                punctuations = string.punctuation)

rake_nltk.extract_keywords_from_text(text)
keywords = rake_nltk.get_ranked_phrases()[:10]

#punctuation and stopwords removal from the text followed by lemmatization
stop_words = set(stopwords.words("english"))
text_no_punc = text.translate(str.maketrans("","",string.punctuation))
word_tokens = word_tokenize(text_no_punc.lower())
#filtered_sentence = [w for w in word_tokens if not w in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_sentence = [lemmatizer.lemmatize(w) for w in word_tokens]

In [7]:
wn_keywords = []
wn_definitions = []
try:
    for i in keywords:
        wn_keywords.append(i)
        wn_definitions.append(lesk(lemmatized_sentence, i).definition())
except AttributeError:
    pass

In [8]:
import random

sample = random.sample(wn_keywords, len(wn_keywords)-1)
try:
    for i in range(len(sample)):
        print("{:100}{:10}".format(sample[i], wn_definitions[i]))
except AttributeError:
    pass

replica                                                                                             a person lacking intelligence or common sense
mary                                                                                                the period of instruction in a school; the time period when school is in session
next                                                                                                copy that is not the original; something that has been copied
simple                                                                                              (mathematics) a transformation in which the direction of one axis is reversed
prayer                                                                                              the act of communicating with a deity (especially as a petition or in adoration or contrition or thanksgiving)
lourdes                                                                                             at the time or occasio

In [9]:
from transformers import AutoTokenizer, BertForPreTraining

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertForPreTraining.from_pretrained(checkpoint)

In [10]:
!pip install gensim




[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import re

import nltk
from nltk import tokenize
from nltk.tree import Tree
import spacy
import benepar

nlp = spacy.load("en_core_web_md")
nlp.add_pipe("benepar", config={"model":"benepar_en3"})

  state_dict = torch.load(
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


<benepar.integrations.spacy_plugin.BeneparComponent at 0x1db31537a00>

In [12]:
doc = nlp("Kalki, final avatar (incarnation) of the Hindu god Vishnu, who is yet to appear")
sent = list(doc.sents)[0]
print(sent._.parse_string)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(NP (NP (NNP Kalki)) (, ,) (NP (NP (JJ final) (NN avatar)) (-LRB- -LRB-) (NP (NN incarnation)) (-RRB- -RRB-) (PP (IN of) (NP (NP (NP (DT the) (JJ Hindu) (NN god)) (NP (NNP Vishnu))) (, ,) (SBAR (WHNP (WP who)) (S (VP (VBZ is) (ADVP (RB yet)) (S (VP (TO to) (VP (VB appear)))))))))))




In [13]:
print(sent._.labels)

('NP',)


In [14]:
list((list(sent._.children)[1])._.children)

[]

In [15]:
tree = Tree.fromstring(sent._.parse_string)
print(tree.pretty_print())

            NP                                                                                                   
   _________|_________________________                                                                            
  |    |                              NP                                                                         
  |    |          ____________________|______________________                                                     
  |    |         |           |        |        |             PP                                                  
  |    |         |           |        |        |     ________|___________________                                 
  |    |         |           |        |        |    |                            NP                              
  |    |         |           |        |        |    |              ______________|________                        
  |    |         |           |        |        |    |             |              |  

In [16]:
temp1 = tree[0]
temp2 = tree[1]
temp3 = tree[-1]
temp1.pretty_print()
temp2.pretty_print()
temp3.pretty_print()

  NP 
  |   
 NNP 
  |   
Kalki

 , 
 |  
 , 

                            NP                                                                         
        ____________________|______________________                                                     
       |           |        |        |             PP                                                  
       |           |        |        |     ________|___________________                                 
       |           |        |        |    |                            NP                              
       |           |        |        |    |              ______________|________                        
       |           |        |        |    |             |              |       SBAR                    
       |           |        |        |    |             |              |    ____|____                   
       |           |        |        |    |             |              |   |         S                 
       |     

In [17]:
#split at rightmost NP or VP

def get_flattened(t):
    sent_str_final = None
    if t is not None:
        sent_str = [" ".join(x.leaves()) for x in list(t)]
        sent_str_final = [" ".join(sent_str)]
        sent_str_final = sent_str_final[0]
    return sent_str_final

In [18]:
def get_rvp_nvp(parse_tree, last_np = None, last_vp = None):

    if len(parse_tree.leaves()) == 1:
        return last_np, last_vp
    last_subtree = parse_tree[-1]
    if last_subtree.label() == "NP":
        last_np = last_subtree
    elif last_subtree.label() == "VP":
        last_vp = last_subtree
    return get_rvp_nvp(last_subtree, last_np, last_vp)

In [19]:
last_np, last_vp = get_rvp_nvp(tree)
last_np_flattened = get_flattened(last_np)
last_vp_flattened = get_flattened(last_vp)
print(last_np_flattened)
print(last_vp_flattened)

the Hindu god Vishnu , who is yet to appear
appear


In [20]:
def get_termination_portion(main_string, sub_string):
    combined_sub_string = sub_string.replace(" ","")
    main_string_list = main_string.split()
    last_index = len(main_string_list)
    for i in range(last_index):
        check_string_list = main_string_list[i:]
        check_string = "".join(check_string_list)
        check_string = check_string.replace(" ","")
        if check_string == combined_sub_string:
            return " ".join(main_string_list[:i])
        
    return None

In [21]:
longest_phrase = max(last_np_flattened, last_vp_flattened)
print(longest_phrase)

the Hindu god Vishnu , who is yet to appear


In [22]:
longest_phrase = re.sub(r"-LRB-", "(", longest_phrase)
longest_phrase = re.sub(r"-RRB-", ")", longest_phrase)

In [23]:
longest_phrase

'the Hindu god Vishnu , who is yet to appear'

In [24]:
split_sentence = get_termination_portion("The old woman was sitting under a tree and sipping coffee", longest_phrase)

In [25]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

GPT2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
GPT2_model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=GPT2_tokenizer.eos_token_id)

In [26]:
partial_sentence = "The old woman was sitting under a tree and"
input_ids = GPT2_tokenizer.encode(partial_sentence, return_tensors='pt')
print(input_ids)
maximum_length = len(partial_sentence.split())+40

tensor([[ 464, 1468, 2415,  373, 5586,  739,  257, 5509,  290]])


In [27]:
#Activate top_k sampling and top_p sampling with only from 90% most likely words

sample_outputs = GPT2_model.generate(input_ids, do_sample=True, max_length=maximum_length, top_k=60, top_p=0.8, repetition_penalty=10.0, num_return_sequences=12)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [28]:
generated_sentences = []
for i,sample_output in enumerate(sample_outputs):
  decoded_sentence = GPT2_tokenizer.decode(sample_output, skip_special_tokens=True)
  final_sentence = tokenize.sent_tokenize(decoded_sentence)[0]
  generated_sentences.append(final_sentence)
  print(final_sentence)

The old woman was sitting under a tree and there wasn't much in the way of foliage.
The old woman was sitting under a tree and listening to her husband's sermon.
The old woman was sitting under a tree and her mother stood behind the fence to keep it from falling out.
The old woman was sitting under a tree and there, beside her husband on the edge of his bed.
The old woman was sitting under a tree and she said something to the stranger, "What is it?"
The old woman was sitting under a tree and one of the men in her way asked what he could do for you.
The old woman was sitting under a tree and her husband in the back seat of his truck.
The old woman was sitting under a tree and she kept on repeating the same word: "This is my girl.
The old woman was sitting under a tree and heard that the man who'd been there had come to fetch her some food.
The old woman was sitting under a tree and waiting for him to come along.
The old woman was sitting under a tree and listening to her thoughts.
The o

In [5]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(squad["train"]["context"][0])
displacy.serve(doc, style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
