In [2]:
import spacy
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, T5ForConditionalGeneration, AdamW, get_scheduler, DataCollatorWithPadding
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
max_source_length = 512
max_target_length = 128
prefix_1 = "answer: "
prefix_2 = " context: "
prefix_3 = "question: "
checkpoint = "t5-base"

In [4]:

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def preprocess_function(examples):
  input_1 = [ prefix_1 + i[0] for i in examples["answers.text"] ]
  input_2 = [ prefix_2 + i for i in examples["context"] ]
  input = [input_1[i] + input_2[i] for i in range(len(examples["context"]))]
  inputs = [input[0] for i in range(len(examples["context"]))]
  model_inputs = tokenizer(inputs, padding="longest", max_length = max_source_length , truncation = True, return_tensors = "pt")
  labels = tokenizer([prefix_3 + i for i in examples["question"]], padding="longest", max_length = max_target_length ,truncation = True, return_tensors = "pt")
  labels["input_ids"][labels["input_ids"]==tokenizer.pad_token_id] = -100
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [5]:
squad = load_dataset("squad")
squad = squad.flatten()
squad["train"][0]["answers.text"]

['Saint Bernadette Soubirous']

In [6]:
squad_tokenized = squad.map(preprocess_function, batched=True)

In [7]:
squad_tokenized = squad_tokenized.remove_columns(["id","title","question","context","answers.text","answers.answer_start"])
squad_tokenized.set_format("torch")
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = checkpoint)

In [8]:
squad_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10570
    })
})

In [9]:
train_dataloader = DataLoader(squad_tokenized["train"], batch_size=8, collate_fn=data_collator)
eval_dataloader = DataLoader(squad_tokenized["validation"], batch_size=8, collate_fn=data_collator)

In [10]:
for batch in train_dataloader:
    break
print({k:v.shape for k,v in batch.items()})


{'input_ids': torch.Size([8, 192]), 'attention_mask': torch.Size([8, 192]), 'labels': torch.Size([8, 41])}


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


In [11]:
for batch in train_dataloader:
  break
print(tokenizer.decode(batch["input_ids"][0]))
fake_labels = np.where(batch["labels"][0]!=-100, batch["labels"][0], tokenizer.pad_token_id)
print(tokenizer.decode(fake_labels))

answer: Saint Bernadette Soubirous context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.</s>
question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [12]:
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(4.2687, grad_fn=<NllLossBackward0>) torch.Size([8, 41, 32128])


In [13]:
len(train_dataloader)

10950

In [14]:
optimizer = AdamW(model.parameters(), lr=1e-4)
loss = outputs.loss
loss.backward()
optimizer.step()
optimizer.zero_grad()



In [15]:
num_epochs = 20
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [16]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print(device)

cuda


In [17]:
optimizer = AdamW(model.parameters(), lr=1e-4)

In [18]:
# progress_bar = tqdm(range(num_training_steps))

# model.train()
# for epoch in range(num_epochs):
#   for batch in train_dataloader:
#     batch = {k:v.to(device) for k,v in batch.items()}
#     outputs = model(**batch)
#     loss = outputs.loss
#     loss.backward()

#     optimizer.step()
#     lr_scheduler.step()
#     optimizer.zero_grad()
#     progress_bar.update(1)

In [None]:
model.save_pretrained("t5_question_generation_model")

In [1]:
import nltk
import string
from nltk.wsd import lesk
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from rake_nltk import Rake, Metric

In [2]:
from datasets import load_dataset
squad = load_dataset("squad")
text = squad["train"]["context"][0]

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
rake_nltk = Rake(max_length=3, 
                include_repeated_phrases = False)

rake_nltk.extract_keywords_from_text(text)
keywords = rake_nltk.get_ranked_phrases()

filtered_keywords = set()
#punctuation removal from the text followed
for i in keywords:
    i = i.translate(str.maketrans("","",string.punctuation)).strip()
    filtered_keywords.add(i)    


In [4]:
import random

sample = random.sample(filtered_keywords, 5)
masked_sentence = []
cased_keywords = ([(i.upper(), i.lower(), i.capitalize(), i.title()) for i in sample])
temp = text

for i,j,k,l in cased_keywords:
  temp = temp.replace(i,"[MASK]").replace(j,"[MASK]").replace(k,"[MASK]").replace(l,"[MASK]")

for i in nltk.sent_tokenize(temp):
  if "[MASK]" in i:
    masked_sentence.append(i)

In [5]:
for i,j in enumerate(masked_sentence):
    print("{:}{:10}{:}".format(i," ",j))

0          Atop the Main Building's [MASK] is a golden statue of the Virgin Mary.
1          Immediately behind the basilica is the Grotto, a [MASK] of [MASK] and reflection.
2          It is a [MASK] of the grotto at [MASK], France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.
3          At the end of the main drive (and in a direct line that connects through 3 statues and the [MASK]), is a simple, modern stone statue of Mary.


In [6]:
#Keyword extraction using rake-nltk
rake_nltk = Rake(max_length = 1, 
                include_repeated_phrases = False,
                punctuations = string.punctuation)

rake_nltk.extract_keywords_from_text(text)
keywords = rake_nltk.get_ranked_phrases()[:10]

#punctuation and stopwords removal from the text followed by lemmatization
stop_words = set(stopwords.words("english"))
text_no_punc = text.translate(str.maketrans("","",string.punctuation))
word_tokens = word_tokenize(text_no_punc.lower())
#filtered_sentence = [w for w in word_tokens if not w in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_sentence = [lemmatizer.lemmatize(w) for w in word_tokens]

In [7]:
wn_keywords = []
wn_definitions = []
try:
    for i in keywords:
        wn_keywords.append(i)
        wn_definitions.append(lesk(lemmatized_sentence, i).definition())
except AttributeError:
    pass

In [8]:
import random

sample = random.sample(wn_keywords, len(wn_keywords)-1)
try:
    for i in range(len(sample)):
        print("{:100}{:10}".format(sample[i], wn_definitions[i]))
except AttributeError:
    pass

replica                                                                                             a person lacking intelligence or common sense
mary                                                                                                the period of instruction in a school; the time period when school is in session
next                                                                                                copy that is not the original; something that has been copied
simple                                                                                              (mathematics) a transformation in which the direction of one axis is reversed
prayer                                                                                              the act of communicating with a deity (especially as a petition or in adoration or contrition or thanksgiving)
lourdes                                                                                             at the time or occasio

In [9]:
from transformers import AutoTokenizer, BertForPreTraining

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertForPreTraining.from_pretrained(checkpoint)

In [10]:
!pip install gensim




[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
