In [None]:
!pip install transformers



In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.14


In [None]:
!pip install torch



# GPT-2

In [None]:
import fitz
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output, in page order.
    """
    # The context manager closes the document (and its file handle) even if
    # extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-copying a growing string.
        return "".join(page.get_text() for page in doc)

# Step 2: Fine-tune the GPT-2 Model
def fine_tune_gpt2_on_pdf_text(pdf_text, block_size=128):
    """Fine-tune the pretrained ``gpt2`` checkpoint on raw text as a causal LM.

    The whole text is tokenized and cut into consecutive blocks of at most
    ``block_size`` tokens, so every part of the document becomes a training
    example rather than only its first ``block_size`` tokens.

    Args:
        pdf_text: The training text.
        block_size: Maximum number of tokens per training example.

    Returns:
        ``(model, tokenizer)``: the fine-tuned model and the tokenizer used.

    Raises:
        ValueError: If ``block_size`` is not positive or ``pdf_text`` yields
            no tokens.
    """
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer")

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # GPT-2 has no padding token of its own; reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize WITHOUT truncation so nothing is discarded. The tokenizer may
    # warn that the sequence exceeds the model's maximum length; that is fine
    # here because the ids are split into short blocks before reaching the model.
    token_ids = tokenizer(pdf_text)["input_ids"]
    if not token_ids:
        raise ValueError("pdf_text produced no tokens; nothing to train on")

    # One training example per block. The last block may be shorter; the data
    # collator pads it and masks the padding out of the loss.
    dataset = [
        {"input_ids": token_ids[start:start + block_size]}
        for start in range(0, len(token_ids), block_size)
    ]

    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Define the Training Arguments
    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
        logging_dir='./logs',
        report_to="none"  # Disable WandbCallback
    )

    # mlm=False selects causal (next-token) language modeling; the collator
    # derives the labels from input_ids.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )

    # Fine-tune the model
    trainer.train()
    return model, tokenizer

# Step 3: Generate Answers from Fine-tuned Model
# def generate_answer(question, model, tokenizer, max_length=100):
#     inputs = tokenizer.encode(question, return_tensors="pt")
#     outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1)
#     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return answer
def generate_answer(question, model, tokenizer, max_length=200, num_beams=5, temperature=0.7, top_p=0.9, do_sample=True):
    """Generate a continuation of ``question`` with a causal language model.

    Args:
        question: Prompt text.
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum total length in tokens (prompt plus generated text).
        num_beams: Number of beams used during generation.
        temperature: Sampling temperature; only used when ``do_sample`` is true.
        top_p: Nucleus-sampling threshold; only used when ``do_sample`` is true.
        do_sample: Whether to sample instead of decoding deterministically.

    Returns:
        The decoded first generated sequence (special tokens removed). The
        prompt is part of that sequence, so the result starts with it.
    """
    # A model that has just been trained may still be in training mode;
    # eval() turns dropout off so generation is not perturbed by it.
    model.eval()

    # Tokenize the prompt and place the tensors on the same device as the model
    # (training can leave the model on a GPU while new tensors start on the CPU).
    inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True)
    inputs = inputs.to(model.device)

    generation_kwargs = dict(
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=num_beams,
        do_sample=do_sample,
        pad_token_id=tokenizer.eos_token_id,  # pad with EOS
        no_repeat_ngram_size=3,  # never repeat the same 3-gram
    )
    if do_sample:
        # Sampling-only knobs: passing them with do_sample=False has no effect
        # and only produces configuration warnings.
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    outputs = model.generate(inputs['input_ids'], **generation_kwargs)

    # Decode and return the generated answer
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# Example Usage:
# Runs the GPT-2 pipeline end to end. `model` and `tokenizer` are assigned at
# notebook level and are read by the question cell that follows.
if __name__ == "__main__":
    # Extract text from the PDF (relative path, resolved against the working directory)
    pdf_text = extract_text_from_pdf("SolarSystem.pdf")

    # Fine-tune GPT-2 on the extracted text
    model, tokenizer = fine_tune_gpt2_on_pdf_text(pdf_text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Step,Training Loss


In [None]:
# Ask a question
# Prompts interactively for the question text. `model` and `tokenizer` are the
# notebook-level variables assigned by an earlier cell, so that cell must have
# been run first.
question = input("que:")
answer = generate_answer(question, model, tokenizer)

print(f"Question: {question}")
print(f"Answer: {answer}")

que:solar system
Question: solar system
Answer: solar system.

In a recent study, researchers at the University of California, San Diego, found that the U.S. has the world's highest levels of carbon dioxide (CO 2.5 million metric tons per year ) and other greenhouse gases (NOx) in the atmosphere. The study, published in the journal Nature Geoscience, is the first to show that the Earth's climate is changing at the rate of 1.5 degrees per year, or 1.2 degrees per century.
"This study shows that climate change is happening at a much faster pace than previously thought," said study co-author and University of San Diego geophysicist Johnsen. "It is important to note that this is not the first time that climate has been shown to be changing at such a fast rate."

The study, which was funded by the National Oceanic and Atmospheric Administration (NOAA) and the National Science Foundation (NSF), was conducted by researchers from the


# BERT

In [None]:
import fitz
from transformers import BertForQuestionAnswering, BertTokenizer
import torch

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output, in page order.
    """
    # The context manager closes the document (and its file handle) even if
    # extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-copying a growing string.
        return "".join(page.get_text() for page in doc)

# Step 2: Load Pretrained BERT for Question Answering
# NOTE: `model` and `tokenizer` are notebook-level names that the GPT-2 section
# also assigns; running this cell replaces those objects with the BERT ones.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Step 3: Answer Questions using BERT
def answer_question(question, context, model, tokenizer, max_length=512, stride=128):
    """Extract the span of ``context`` that best answers ``question``.

    The context is scanned in overlapping windows so that text beyond the
    first ``max_length`` tokens is also searched. Each window is fed to the
    model as ``[CLS] question [SEP] window [SEP]``; the best start/end pair
    inside the window's context tokens is scored by ``start_logit + end_logit``
    and the highest-scoring span over all windows is returned.

    Args:
        question: The question text.
        context: The text to search for an answer.
        model: Extractive QA model returning ``start_logits`` and ``end_logits``
            and exposing ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum number of tokens per model input, including the
            question and the special tokens.
        stride: Number of context tokens shared by consecutive windows, so an
            answer lying on a window boundary is still seen whole.

    Returns:
        The answer text, or ``""`` when ``context`` contains no tokens.

    Raises:
        ValueError: If the question leaves no room for context in ``max_length``.
    """
    # Encoding the full context may trigger a "sequence longer than the model
    # maximum" warning; it is harmless because only windows reach the model.
    question_ids = tokenizer.encode(question, add_special_tokens=False)
    context_ids = tokenizer.encode(context, add_special_tokens=False)
    if not context_ids:
        return ""

    # Three special tokens per input: [CLS], [SEP] after the question, final [SEP].
    window_size = max_length - len(question_ids) - 3
    if window_size <= 0:
        raise ValueError("question is too long to fit in max_length tokens")

    # Clamp the overlap so every step advances by at least one token.
    overlap = max(0, min(stride, window_size - 1))
    step = window_size - overlap

    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    context_offset = len(question_ids) + 2  # index of the first context token
    device = model.device

    best_score = float("-inf")
    best_span = []
    with torch.no_grad():
        for window_start in range(0, len(context_ids), step):
            window = context_ids[window_start:window_start + window_size]
            input_ids = [cls_id] + question_ids + [sep_id] + window + [sep_id]
            # Segment 0 = [CLS] + question + [SEP]; segment 1 = window + [SEP].
            token_type_ids = [0] * context_offset + [1] * (len(window) + 1)

            outputs = model(
                input_ids=torch.tensor([input_ids], device=device),
                token_type_ids=torch.tensor([token_type_ids], device=device),
                attention_mask=torch.ones((1, len(input_ids)), dtype=torch.long, device=device),
            )

            # Only context positions are eligible, so the answer can never be
            # taken from the question or from special tokens.
            context_end = context_offset + len(window)
            start_scores = outputs.start_logits[0, context_offset:context_end]
            end_scores = outputs.end_logits[0, context_offset:context_end]

            answer_start = int(torch.argmax(start_scores))
            # The end is searched from the start onwards so it cannot precede it.
            answer_end = answer_start + int(torch.argmax(end_scores[answer_start:]))

            score = float(start_scores[answer_start] + end_scores[answer_end])
            if score > best_score:
                best_score = score
                best_span = window[answer_start:answer_end + 1]

            # This window already reached the end of the context.
            if window_start + window_size >= len(context_ids):
                break

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(best_span))
    return answer

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Example Usage
# `context` is assigned at notebook level and is read by the question cell below.
if __name__ == "__main__":
    # Step 1: Extract text from the PDF
    context = extract_text_from_pdf("SolarSystem.pdf")

In [None]:
# Step 2: Ask a question
# NOTE: the text below is a topic phrase rather than a full question.
question = "solar system"

# Step 3: Get the answer from the BERT model
# `context`, `model` and `tokenizer` come from earlier cells of this section.
answer = answer_question(question, context, model, tokenizer)

print(f"Question: {question}")
print(f"Answer: {answer}")

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Question: solar system
Answer: the solar system is a fascinating and complex structure , governed by the gravitational pull of the sun at its center . this system consists of a variety of celestial bodies , including eight planets , moons , asteroids , comets , and more . each planet has unique characteristics that contribute to the diversity of our cosmic neighborhood . the sun provides the energy and gravitational force that holds the solar system together , while the planets follow elliptical orbits around it . in this detailed exploration , we ’ ll focus on the sun , earth , mars , jupiter , saturn , uranus , and neptune , outlining their key features and importance within the system . the sun the sun is the heart of the solar system and accounts for about 99 . 86 % of its total mass . it is classified as a g - type main - sequence star ( often called a yellow dwarf ) and is located approximately 93 million miles ( 150 million kilometers ) from earth . the sun ' s immense gravity g

# T5

In [None]:
import fitz
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Import the Dataset class from PyTorch
from torch.utils.data import Dataset

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output, in page order.
    """
    # The context manager closes the document (and its file handle) even if
    # extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-copying a growing string.
        return "".join(page.get_text() for page in doc)

# Step 2: Fine-tune the T5 Model
def fine_tune_t5_on_pdf_text(pdf_text, block_size=512):
    """Fine-tune the pretrained ``t5-small`` checkpoint on raw text.

    The whole text is tokenized and cut into consecutive chunks. Each chunk
    becomes one example whose input is ``"question:"`` followed by the chunk
    and whose target is the chunk itself, both terminated by the EOS token.

    NOTE(review): because the target equals the input text, this objective
    teaches the model to reproduce its input; real question/answer pairs are
    needed for genuine question answering.

    Args:
        pdf_text: The training text.
        block_size: Maximum number of input tokens per example, including the
            prefix and the EOS token.

    Returns:
        ``(model, tokenizer)``: the fine-tuned model and the tokenizer used.

    Raises:
        ValueError: If ``block_size`` leaves no room for text after the prefix
            and EOS token, or if ``pdf_text`` yields no tokens.
    """
    tokenizer = T5Tokenizer.from_pretrained('t5-small')

    # Tokenize prefix and text separately, without special tokens, so the text
    # can be chunked freely; EOS is appended explicitly to every example below.
    prefix_ids = tokenizer("question:", add_special_tokens=False)["input_ids"]
    text_ids = tokenizer(pdf_text, add_special_tokens=False)["input_ids"]
    if not text_ids:
        raise ValueError("pdf_text produced no tokens; nothing to train on")

    chunk_size = block_size - len(prefix_ids) - 1  # room left after prefix + EOS
    if chunk_size <= 0:
        raise ValueError("block_size is too small to hold the prefix, text and EOS")

    eos_id = tokenizer.eos_token_id

    # Plain Python lists are stored un-padded: the seq2seq collator pads each
    # batch and fills label padding with its ignore index, so padded positions
    # do not contribute to the loss.
    dataset = []
    for start in range(0, len(text_ids), chunk_size):
        chunk = text_ids[start:start + chunk_size]
        input_ids = prefix_ids + chunk + [eos_id]
        dataset.append({
            'input_ids': input_ids,
            'attention_mask': [1] * len(input_ids),
            'labels': chunk + [eos_id],
        })

    model = T5ForConditionalGeneration.from_pretrained('t5-small')

    # Define the Training Arguments
    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        report_to="none"  # Disable wandb reporting
    )

    # Data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer, model=model,
    )

    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )

    # Fine-tune the model
    trainer.train()
    return model, tokenizer

# Step 3: Generate Answers from the Fine-tuned T5 Model
def generate_answer(question, model, tokenizer, max_length=100, max_input_length=None):
    """Generate an answer for ``question`` with a seq2seq model.

    NOTE: this notebook also defines a ``generate_answer`` in the GPT-2
    section; whichever definition ran last is the one in effect.

    Args:
        question: The question text; it is prefixed with ``"question: "``.
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum length in tokens of the generated output.
        max_input_length: Maximum length in tokens of the encoded input.
            Defaults to ``max_length`` when omitted.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    if max_input_length is None:
        max_input_length = max_length

    # A model that has just been trained may still be in training mode;
    # eval() turns dropout off so generation is not perturbed by it.
    model.eval()

    # No manual "</s>": the tokenizer appends the end-of-sequence token itself.
    input_text = "question: " + question

    # Tokenize and place the tensors on the same device as the model.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    inputs = inputs.to(model.device)

    # Generate answer using the model
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=5,
        early_stopping=True
    )

    # Decode and return the generated answer
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [None]:
# Example Usage
# Runs the T5 pipeline end to end. `model` and `tokenizer` are assigned at
# notebook level (replacing any objects earlier sections stored under those
# names) and are read by the question cell that follows.
if __name__ == "__main__":
    # Extract text from the PDF
    pdf_text = extract_text_from_pdf("SolarSystem.pdf")

    # Fine-tune T5 on the extracted text
    model, tokenizer = fine_tune_t5_on_pdf_text(pdf_text)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss


In [None]:
# Ask a question
# NOTE: the text below is a topic phrase rather than a full question.
# `model` and `tokenizer` are the notebook-level variables assigned by the
# preceding cell.
question = "solar system"
answer = generate_answer(question, model, tokenizer)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: solar system
Answer: solar system
