In [64]:
import fitz
import spacy
import re

def extract_text(pdf_path):
    doc=fitz.open(pdf_path)#opens pdf at specified path
    text=""
    for page_no in range(doc.page_count):#loops through all pages of pdf
        page=doc[page_no]
        text+=page.get_text()#stores all text content of pdf
    doc.close()
    return(text)
def tokenize(text):
    nlp=spacy.load("en_core_web_sm")
    doc=nlp(text)
    tokens=[token.text for token in doc]#tokenizes extracted text
    return(tokens)
pdf_path='C:/Personal/ML/datasets/chapter-2.pdf'
pdf_text=extract_text(pdf_path)#stores extracted text
tokens=tokenize(pdf_text)#stores extracted tokens
print('Extracted tokens:')
print(tokens[:10])

def paragraphs_org(tokens):
    paragraphs = []
    current_paragraph = []
    
    for i,token in enumerate(tokens):#loops through tokens
        if token == '\n':#on detection of newline character:
            if current_paragraph:
                paragraphs.append({'paragraph': current_paragraph})
                current_paragraph = []
            else:
                current_paragraph.append(token)#appended to present paragraph if does not satify condition
        else:
            current_paragraph.append(token)#appended to present paragraph if does not satify condition

    if current_paragraph:
        paragraphs.append({'paragraph': current_paragraph})#individual paragraphs appended to paragraphs dictionary

    return paragraphs

paragraphs=paragraphs_org(tokens)
print('Total number of paragraphs present:')
print(len(paragraphs))

documents=[' '.join(p['paragraph']) for p in paragraphs]
g_paragraphs=[' '.join(documents[i:i+15]) for i in range(0, len(documents),15)]#grouped 15 extracted paragraphs together

def ner(grouped_paragraphs):#named entity recognition function
    nlp = spacy.load("en_core_web_sm")
    ner_results = []
    for i, group in enumerate(grouped_paragraphs):
        doc = nlp(group)
        entities = [(ent.text, ent.label_) for ent in doc.ents]#helps extract the entities and their corresponding labels
        ner_results.append({'group_number':i+1,'n_entities': entities})
    return ner_results

grouped_ner_results=ner(g_paragraphs)#stores entities of compiled paragraphs

for result in grouped_ner_results[:10]:
    print(f'Group {result["group_number"]} NER results:')
    print(result['n_entities'])

print('Contents of the first grouped paragraph:')
print(g_paragraphs[0])



Extracted tokens:
['From', 'Trade', 'to', 'Territory', '                  \n', 'The', 'Company', 'Establishes', 'Power', '\n']
Total number of paragraphs present:
873
Group 1 NER results:
[('Trade to Territory                   \n The Company Establishes Power 2', 'ORG'), ('Mughal', 'ORG'), ('India', 'GPE'), ('1707', 'DATE'), ('Mughal', 'ORG'), ('India', 'GPE'), ('Delhi', 'GPE'), ('the second half of the eighteenth century', 'DATE'), ('British', 'NORP'), ('British', 'NORP')]
Group 2 NER results:
[('Fig', 'PERSON'), ('1', 'CARDINAL'), ('Mughal', 'ORG'), ('Mughal', 'PERSON'), ('British', 'NORP'), ('1857', 'DATE'), ('Mughal', 'PERSON'), ('Bahadur Shah Zafar', 'PERSON')]
Group 3 NER results:
[('2023 - 24 10', 'DATE'), ('1600', 'DATE'), ('the East India Company', 'ORG'), ('England', 'GPE'), ('Queen Elizabeth', 'PERSON'), ('East', 'LOC'), ('England', 'GPE')]
Group 4 NER results:
[('the East India Company', 'ORG'), ('Company', 'ORG'), ('Europe', 'LOC'), ('Company', 'NORP'), ('English', 'LANGU

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
#uses TF IDF method for keyword extraction
def extract_keywords(grouped_paragraphs):
    vectorizer=TfidfVectorizer()#instance of vectorizer
    tf_idf_matrix=vectorizer.fit_transform(grouped_paragraphs)#calculates a TF IDF matrix with each row correspodning to a paragraph
    #each column corresponds to a word
    feature_names=vectorizer.get_feature_names_out()
    keywords=[]

    for i, group in enumerate(grouped_paragraphs):
        tf_idf_scores=tf_idf_matrix[i].toarray().flatten()
        top_indices=tf_idf_scores.argsort()[-5:][::-1]
        top_keywords=[feature_names[index] for index in top_indices]
        keywords.append({'group_number':i+1,'keywords': top_keywords})#keywords stored in a dictionary

    return keywords

keywords=extract_keywords(g_paragraphs)
for i,p in enumerate(keywords[:5]):
    print(f'Keywords of group {i+1}')
    print(p)#sample keywords printed


Keywords of group 1
{'group_number': 1, 'keywords': ['the', 'regional', 'as', 'powerful', 'kingdoms']}
Keywords of group 2
{'group_number': 2, 'keywords': ['bahadur', 'shah', 'zafar', 'mughal', 'the']}
Keywords of group 3
{'group_number': 3, 'keywords': ['east', 'england', 'the', 'cold', 'compete']}
Keywords of group 4
{'group_number': 4, 'keywords': ['could', 'the', 'charter', 'sell', 'cheap']}
Keywords of group 5
{'group_number': 5, 'keywords': ['the', 'in', 'ocean', 'portuguese', 'too']}


In [84]:
from transformers import TFT5ForConditionalGeneration, T5Tokenizer
#T5 LLM used here
import random
def generate_mcq(context,model,tokenizer,num_choices=4): #generates mcq questions
    text=f"context: {context} generate multiple choice question:" #serves as a guide to the LLM

    encoding=tokenizer.encode_plus(text,max_length=384,pad_to_max_length=False,truncation=True,return_tensors="tf")#converts text to numerical format
    #uses tokenizer to obtain input IDs etc
    input_ids,attention_mask=encoding["input_ids"],encoding["attention_mask"]
    #attention mask sieves out relevant and useful tokens

    outs=model.generate(input_ids=input_ids,attention_mask=attention_mask,early_stopping=True,num_beams=5,
        num_return_sequences=5,no_repeat_ngram_size=2,max_length=100)
    #above snippet generates question 5 possibilities considered parallely, early stopping enabled
    
    dec=[tokenizer.decode(ids, skip_special_tokens=True) for ids in outs] #helps decode the outputted IDs into text
    generated_questions=[]
    for i in range(5):
        generated_questions.append(dec[i].replace("question:", "").strip()) #only 1 sequence outputted so we use dec[0]

    nlp=spacy.load("en_core_web_sm")
    doc=nlp(context)
    entities=set([(ent.text, ent.label_) for ent in doc.ents]) #extracts relevant important entities from context
    #this helps in generating choicws
    #keys=[]
    #for k in keywords:
        #for x in k['keywords']:
            #keys.append(x)

    #combined_entities=entities.union(set(keywords))#combines extracted entities and keywords together
    #shuffled_entities=list(combined_entities)
    #random.shuffle(shuffled_entities)
    entities=list(entities)
    answer_choices=[f"{ent[0]} ({ent[1]})" for ent in entities[:num_choices]]#generates answer choices on basis of extracted entities

    return generated_questions,answer_choices



def get_mca_questions(context):
    if not isinstance(context,str):
        raise TypeError("Received parameter must be a string")
    model_name = "t5-small"
    model=TFT5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer=T5Tokenizer.from_pretrained(model_name)#T5 small model and tokenizers initialized
    
    generated_questions,answer_choices=generate_mcq(context,model,tokenizer)
    mca_questions=[]
    answers=""
    for i in answer_choices:
        answers+=i
    for i in generated_questions:
        x=i+answers
        mca_questions.append(x)
    return(mca_questions)
#we can pass passages present in g_paragraphs as context parameters to above function, it would return mcq questions corresponding 
#to that particular context

ans=get_mca_questions(g_paragraphs[7])
print(ans)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['Mughal officials into giving the Company zamindari rights1696 (DATE)Mughal (ORG)One (CARDINAL)Kalikata (PERSON)', 'Mughal emperor Aurangzeb1696 (DATE)Mughal (ORG)One (CARDINAL)Kalikata (PERSON)', 'Mughal officials1696 (DATE)Mughal (ORG)One (CARDINAL)Kalikata (PERSON)', 'Bengal1696 (DATE)Mughal (ORG)One (CARDINAL)Kalikata (PERSON)', 'Bengal.1696 (DATE)Mughal (ORG)One (CARDINAL)Kalikata (PERSON)']
