In [None]:
pip install PyPDF2 sentence-transformers faiss-cpu



In [None]:
pip install transformers



In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.14


In [None]:
import fitz

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF using PyMuPDF (``fitz``).

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        str: The text of all pages, concatenated in page order.

    Side effects:
        Prints the full extracted text to stdout.
    """
    # Open the document in a ``with`` block so it is closed even when text
    # extraction fails part-way through (the previous version never closed it).
    with fitz.open(pdf_path) as doc:
        # join() avoids the quadratic cost of repeated ``+=`` on long documents.
        text = "".join(page.get_text() for page in doc)
    print(text)
    return text

# Usage: extract the text of the sample PDF (path is relative to the
# notebook's working directory). `pdf_text` is the input of the chunking
# cell further down, so this cell must run before it.
pdf_text = extract_text_from_pdf('SolarSystem.pdf')
# print(pdf_text[:500])  # Print first 500 characters of extracted text


The Solar System is a fascinating and complex structure, governed by the
gravitational pull of the Sun at its center. This system consists of a variety
of celestial bodies, including eight planets, moons, asteroids, comets, and
more. Each planet has unique characteristics that contribute to the diversity
of our cosmic neighborhood. The Sun provides the energy and gravitational
force that holds the solar system together, while the planets follow elliptical
orbits around it. In this detailed exploration, we’ll focus on the Sun, Earth,
Mars, Jupiter, Saturn, Uranus, and Neptune, outlining their key features
and importance within the system.
The Sun
The Sun is the heart of the solar system and accounts for about 99.86% of
its total mass. It is classified as a G-type main-sequence star (often called
a yellow dwarf) and is located approximately 93 million miles (150 million
kilometers) from Earth. The Sun's immense gravity governs the orbits of
all the planets and celestial objects in the so

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize Sentence-BERT model for generating embeddings.
# This single module-level instance is used by both generate_embeddings()
# and search_faiss_index(), so chunks and queries are embedded by the same model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to split text into chunks of whole sentences
def split_text_into_chunks(text, chunk_size=50):
    """Split ``text`` into chunks of at most ``chunk_size`` sentences.

    Sentences are delimited by the literal separator ``'. '``. The separator
    is restored when sentences are re-joined, so no period is lost and
    ``' '.join(chunks)`` reproduces the original text.

    Args:
        text: The text to split.
        chunk_size: Maximum number of sentences per chunk (must be >= 1).

    Returns:
        list[str]: The chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    separator = '. '
    sentences = text.split(separator)
    chunks = []
    for start in range(0, len(sentences), chunk_size):
        chunk = separator.join(sentences[start:start + chunk_size])
        # split() consumed the period that ended the chunk's last sentence;
        # put it back unless this is the final chunk of the text.
        if start + chunk_size < len(sentences):
            chunk += '.'
        chunks.append(chunk)
    return chunks

# Embed every chunk of the PDF text
def generate_embeddings(text_chunks):
    """Encode ``text_chunks`` with the module-level ``model``.

    Args:
        text_chunks: The chunks to embed.

    Returns:
        Whatever ``model.encode`` returns for the given chunks.
    """
    return model.encode(text_chunks)

# Create a FAISS index holding the chunk embeddings
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index from a matrix of embeddings.

    The index stores only the vectors; callers keep the text chunks and map
    the row numbers returned by a search back to them.

    Args:
        embeddings: 2-D array-like of shape (n_chunks, dimension).

    Returns:
        faiss.IndexFlatL2: Index containing one vector per row of
        ``embeddings`` (searches report squared Euclidean distances).

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (e.g. no chunks).
    """
    # FAISS only accepts C-contiguous float32 matrices; coerce explicitly so
    # float64 arrays or plain lists do not fail inside the native call.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D (n_chunks, dimension) array, got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

# Search FAISS index with a threshold
def search_faiss_index(query, index, text_chunks, top_k=3, similarity_threshold=0.01):
    """Retrieve the chunks most similar to ``query`` from a FAISS L2 index.

    Args:
        query: The query string; embedded with the module-level ``model``.
        index: Index to search; its ``search`` must return
            ``(distances, indices)`` as FAISS does.
        text_chunks: Chunks in the same order as the vectors in ``index``.
        top_k: Maximum number of chunks to return.
        similarity_threshold: Minimum similarity the best hit must reach.

    Returns:
        tuple: ``(results, similarities)`` where ``results`` is a list of at
        most ``top_k`` chunks (best first) and ``similarities`` the matching
        scores, or ``("I don't know.", None)`` when there is no hit or the
        best hit is below ``similarity_threshold``.
    """
    # FAISS expects float32 query vectors.
    query_embedding = np.asarray(model.encode([query]), dtype=np.float32)

    # Search the FAISS index for the closest embeddings
    distances, indices = index.search(query_embedding, top_k)
    hit_indices = np.asarray(indices)[0]
    hit_distances = np.asarray(distances)[0]

    # When the index holds fewer than top_k vectors FAISS pads the result
    # with index -1. Indexing text_chunks with -1 would silently return the
    # last chunk as a bogus duplicate hit, so drop the padding first.
    valid = hit_indices >= 0
    hit_indices = hit_indices[valid]

    # Squared L2 distance -> cosine similarity: cos = 1 - d^2 / 2.
    # NOTE(review): this identity only holds for unit-length embeddings;
    # confirm the embedding model normalizes its output.
    similarities = 1 - hit_distances[valid] / 2

    if len(hit_indices) == 0 or similarities[0] < similarity_threshold:
        return "I don't know.", None

    results = [text_chunks[idx] for idx in hit_indices]
    return results, similarities

In [None]:
# Example Usage: build the retrieval index from the extracted PDF text.
# Requires `pdf_text` from the extraction cell above.
text_chunks = split_text_into_chunks(pdf_text, chunk_size=50)  # up to 50 sentences per chunk
embeddings = generate_embeddings(text_chunks)  # one embedding per chunk
faiss_index = create_faiss_index(embeddings)  # row i of the index corresponds to text_chunks[i]

# Relevant & Long Answers That Are Also Time-Efficient

1. Using GPT-2 to Generate Long Answers


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer.
# Both are loaded from the same 'gpt2-large' checkpoint name and are used as
# module-level globals by generate_long_answer_gpt2() below.
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2-large')
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')

# Function to generate long answers using GPT-2
def generate_long_answer_gpt2(query, text_chunks, max_length=300, max_new_tokens=5):
    """Generate an answer with GPT-2, prompted with the query and retrieved chunks.

    The prompt is ``"Q: <query>\\nA:"`` followed by every chunk, each preceded
    by a space. The prompt is truncated to ``max_length`` tokens and GPT-2
    then samples up to ``max_new_tokens`` further tokens.

    Args:
        query: The user question.
        text_chunks: Iterable of retrieved context strings.
        max_length: Maximum number of prompt tokens kept after truncation.
        max_new_tokens: Maximum number of tokens to generate after the
            prompt. Defaults to 5, the previously hard-coded value; raise it
            to obtain a genuinely long answer.

    Returns:
        str: The decoded output sequence. It contains the (possibly
        truncated) prompt followed by the generated continuation.
    """
    input_text = f"Q: {query}\nA:" + "".join(f" {chunk}" for chunk in text_chunks)

    # GPT-2 ships without a pad token; generate() needs one, so reuse EOS.
    if gpt2_tokenizer.pad_token_id is None:
        gpt2_tokenizer.pad_token_id = gpt2_tokenizer.eos_token_id

    # Tokenize WITHOUT padding: a single prompt needs none, and padding to
    # max_length appends pad tokens on the right, after which a decoder-only
    # model would be asked to continue from padding instead of the prompt.
    inputs = gpt2_tokenizer(
        input_text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    # Sample a continuation; attention_mask and pad_token_id are passed
    # explicitly so generate() does not have to guess them.
    outputs = gpt2_model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        pad_token_id=gpt2_tokenizer.pad_token_id,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=3,
        do_sample=True,
        temperature=0.8,
        top_k=50,
        top_p=0.9,
    )

    # Decode and return the generated text
    return gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# query = "solar system"
# top_chunks, similarities = search_faiss_index(query, faiss_index, text_chunks, top_k=3, similarity_threshold=0.2)

In [None]:
query = "solar system"
top_chunks, similarities = search_faiss_index(
    query,
    faiss_index,
    text_chunks,
    top_k=3,
    similarity_threshold=0.2,
)

# Example usage: only call GPT-2 when retrieval found something relevant;
# otherwise show the "I don't know." sentinel returned by the search.
if top_chunks != "I don't know.":
    long_answer = generate_long_answer_gpt2(query, top_chunks, max_length=300)
    print(long_answer)
else:
    print(top_chunks)

Q: solar system
A: The Solar System is a fascinating and complex structure, governed by the
gravitational pull of the Sun at its center This system consists of a variety
of celestial bodies, including eight planets, moons, asteroids, comets, and
more Each planet has unique characteristics that contribute to the diversity
of our cosmic neighborhood The Sun provides the energy and gravitational
force that holds the solar system together, while the planets follow elliptical
orbits around it In this detailed exploration, we’ll focus on the Sun, Earth,
Mars, Jupiter, Saturn, Uranus, and Neptune, outlining their key features
and importance within the system.
The Sun
The Sun is the heart of the solar system and accounts for about 99.86% of
its total mass It is classified as a G-type main-sequence star (often called
a yellow dwarf) and is located approximately 93 million miles (150 million
kilometers) from Earth The Sun's immense gravity governs the orbits of
all the planets and celestial obje

# **Other Work:**



**1. Relevant Answers**
(deepset/roberta-base-squad2)

In [None]:
from transformers import pipeline

# Initialize a question-answering pipeline backed by the pre-trained
# deepset/roberta-base-squad2 checkpoint (a RoBERTa model, not T5 or BART).
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Function to get answers from the retrieved chunks
def get_answer_from_chunks(query, text_chunks):
    """Run the module-level ``qa_pipeline`` on every chunk and collect the answers.

    Args:
        query: The question to ask.
        text_chunks: Iterable of context strings, one pipeline call per chunk.

    Returns:
        list[str]: The ``'answer'`` field of each successful pipeline call,
        in chunk order. Chunks whose call fails are skipped (best effort).
    """
    answers = []
    for position, chunk in enumerate(text_chunks):
        try:
            result = qa_pipeline(question=query, context=chunk)
            answers.append(result['answer'])
        except Exception as exc:
            # Stay best-effort, but report the failure instead of hiding it,
            # and let KeyboardInterrupt/SystemExit propagate (a bare
            # ``except:`` would swallow those as well).
            print(f"Skipping chunk {position}: {exc!r}")
            continue
    return answers

# Example usage: relies on `query` and `top_chunks` set by the retrieval cell above.
# NOTE(review): the saved output below ("feature values without any known
# labels") does not look like it came from the solar-system PDF — it is
# probably stale; re-run after Restart & Run All to confirm.
answers = get_answer_from_chunks(query, top_chunks)
for i, answer in enumerate(answers):
    print(f"Answer {i+1}: {answer}")


Answer 1: 
only of feature values without any known labels
Answer 2: 
only of feature values without any known labels
Answer 3: 
only of feature values without any known labels


**2. Using a Summarization Model for Generating Detailed Answers**
(facebook/bart-large-cnn)

In [None]:
from transformers import pipeline

# Initialize a summarization pipeline backed by the pre-trained
# facebook/bart-large-cnn checkpoint; used by generate_long_answer_summarizer().
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Function to summarize retrieved text chunks into a longer answer
def generate_long_answer_summarizer(query, text_chunks, max_length=500, min_length=100):
    """Summarize the retrieved chunks with the module-level ``summarizer``.

    Args:
        query: Accepted for signature symmetry with the other answer
            generators; it is not used by the summarization itself.
        text_chunks: Retrieved context, either a list of strings or a single
            string.
        max_length: Upper bound on the summary length passed to the pipeline.
        min_length: Lower bound on the summary length. Defaults to 100, the
            previously hard-coded value, and is capped at ``max_length``.

    Returns:
        str: The ``'summary_text'`` of the first pipeline result.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(text_chunks, str):
        context = text_chunks
    else:
        context = " ".join(text_chunks)  # Join chunks for summarization

    summary = summarizer(
        context,
        max_length=max_length,
        # A minimum above the maximum is contradictory; cap it.
        min_length=min(min_length, max_length),
        # Joined chunks can exceed the model's input limit; truncate rather
        # than fail inside the model.
        truncation=True,
        do_sample=False,
    )
    return summary[0]['summary_text']

# Example usage: relies on `query` and `top_chunks` from the retrieval cell,
# and rebinds `long_answer` (previously set by the GPT-2 cell).
# NOTE(review): the saved output below discusses overfitting/underfitting,
# not the solar system — it appears to come from a different PDF/run and is
# probably stale; re-run after Restart & Run All to confirm.
long_answer = generate_long_answer_summarizer(query, top_chunks, max_length=500)
print(long_answer)


Overfitting occurs when a model learns the training data too well, capturing noise and irrelevant patterns, leading to poor generalization on unseen data. Underfitting, on the other hand, happens when the model is too simple to capture the underlying structure of the data. Techniques for handling imbalanced datasets in machine learning include resampling and cross-validation. The bias -variance tradeoff refers to the balance between bias and variance in predictive models. High bias can cause underfitting, while high variance can lead to overfitting. It's crucial to find a balance to minimize both errors.


In [None]:
# query = "Ecorouting"
# top_chunks, similarities = search_faiss_index(query, faiss_index, text_chunks, top_k=3, similarity_threshold=0.2)

# if top_chunks == "I don't know.":
#     print(top_chunks)  # Prints "I don't know." if similarity is below threshold
# else:
#     for i, chunk in enumerate(top_chunks):
#         print(f"Result {i+1} (similarity {similarities[i]:.2f}):\n{chunk}\n")

I don't know.
