In [None]:
!pip install requests
!pip install beautifulsoup4
!pip install PyPDF2
!pip install rank-bm25

!pip install sentence-transformers

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0


In [None]:
import requests
from bs4 import BeautifulSoup
import PyPDF2
def extract_text_from_webpage(url, timeout=30):
    """Download an HTML page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The text of every ``<p>`` tag on the page, joined by single spaces.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scraping the paragraphs of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(p.text for p in soup.find_all('p'))
def extract_text_from_pdf(pdf_url, save_path, timeout=30):
    """Download a PDF to ``save_path`` and return the text of all its pages.

    Args:
        pdf_url: Address of the PDF to download.
        save_path: Local file the downloaded bytes are written to; it is then
            reopened for parsing and left on disk.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The extracted text of every page, concatenated in page order with no
        separator.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(pdf_url, timeout=timeout)
    # Without this check an HTML error body would be saved as the "PDF" and
    # only fail later, inside the parser, with a far less helpful message.
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)
    with open(save_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # `or ""` is defensive: tolerate a page whose extraction yields nothing.
        return "".join(page.extract_text() or "" for page in reader.pages)
def preprocess_text(text):
    """Flatten ``text`` onto a single line.

    Every newline and every carriage return is replaced by one space; no other
    character is touched, so a CR+LF pair becomes two spaces.
    """
    line_breaks_to_spaces = str.maketrans({'\n': ' ', '\r': ' '})
    return text.translate(line_breaks_to_spaces)
if __name__ == "__main__":
    # Sources: two HTML pages (first and last entries) and one PDF (middle entry).
    urls = [
        'https://www.lithoguru.com/scientist/lithobasics.html',
        'https://download.e-bookshelf.de/download/0000/5785/21/L-G-0000578521-0015286987.pdf',
        'https://www.cpuc.ca.gov/industries-and-topics/electrical-energy/electric-rates/general-rate-case/southern-california-edison-grc-proceedings'
    ]
    page_1_url, pdf_url, page_3_url = urls

    # Fetch the raw text: both web pages first, then the PDF, which is also
    # saved locally as document.pdf.
    webpage_text_1 = extract_text_from_webpage(page_1_url)
    webpage_text_3 = extract_text_from_webpage(page_3_url)
    pdf_text = extract_text_from_pdf(pdf_url, "document.pdf")

    # Replace line breaks with spaces in each extracted text.
    webpage_text_1, webpage_text_3, pdf_text = map(
        preprocess_text, (webpage_text_1, webpage_text_3, pdf_text)
    )

    # Persist each cleaned text to its own UTF-8 file.
    cleaned_texts = {
        "webpage_text_1.txt": webpage_text_1,
        "pdf_text.txt": pdf_text,
        "webpage_text_3.txt": webpage_text_3,
    }
    for filename, content in cleaned_texts.items():
        with open(filename, "w", encoding='utf-8') as f:
            f.write(content)
    print("Data downloaded and preprocessed successfully!")

Data downloaded and preprocessed successfully!


In [None]:
import os
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk

# Sentence-tokenizer data for nltk.sent_tokenize. Older NLTK releases load the
# pickled 'punkt' model while newer ones look for 'punkt_tab', so fetch both to
# keep the chunking step working on either.
nltk.download('punkt')
nltk.download('punkt_tab')
# Sentence-embedding model used to encode both the document chunks and the queries.
retrieval_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
def load_and_chunk_documents(files=None, chunk_size=10):
    """Read text files and split each one into chunks of consecutive sentences.

    Args:
        files: Paths of UTF-8 text files to load. When omitted, the three
            files written by this notebook's download cell are used.
        chunk_size: Maximum number of sentences per chunk.

    Returns:
        A flat list of chunk strings in file order. Sentences inside a chunk
        are joined by single spaces, and a chunk never spans two files.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")
    if files is None:
        files = ['webpage_text_1.txt', 'pdf_text.txt', 'webpage_text_3.txt']

    documents = []
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read()
        # Tokenize after the file is closed; only the text is needed from here on.
        sentences = nltk.sent_tokenize(text)
        documents.extend(
            " ".join(sentences[start:start + chunk_size])
            for start in range(0, len(sentences), chunk_size)
        )
    return documents

def encode_documents(documents):
    """Embed the given text chunks with the module-level ``retrieval_model``.

    Args:
        documents: The chunk strings to embed.

    Returns:
        The result of ``retrieval_model.encode`` with ``convert_to_tensor=True``.
    """
    embeddings = retrieval_model.encode(documents, convert_to_tensor=True)
    return embeddings
def retrieve_document(query, document_embeddings, documents, top_k=15):
    """Return the chunks most similar to ``query``, best match first.

    Args:
        query: Question text; embedded with the module-level ``retrieval_model``.
        document_embeddings: Embeddings of ``documents``, assumed to be
            row-aligned with that list (as produced by ``encode_documents``).
        documents: The chunk strings the embeddings were computed from.
        top_k: Maximum number of results. Clamped to ``len(documents)``.

    Returns:
        A list of ``{'document': str, 'score': float}`` dicts ordered by
        descending cosine similarity. Empty when there are no documents or
        ``top_k`` is smaller than 1.
    """
    # torch.topk raises when k exceeds the number of candidates, so never ask
    # for more chunks than exist.
    k = min(top_k, len(documents))
    if k < 1:
        return []

    query_embedding = retrieval_model.encode(query, convert_to_tensor=True)
    # NOTE(review): the retrieval model loaded in this notebook is a "-dot-"
    # variant; confirm that cosine similarity is the intended scoring function.
    similarities = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]
    top_results = torch.topk(similarities, k=k)
    # Convert to plain Python numbers so list indexing and the returned scores
    # do not depend on tensor objects.
    return [
        {'document': documents[idx], 'score': score}
        for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist())
    ]

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the multi-gigabyte
# weights are read once per kernel session instead of once per query.
_GENERATIVE_MODEL_CACHE = {}


def _load_generative_model(model_name):
    """Return the cached ``(tokenizer, model)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _GENERATIVE_MODEL_CACHE:
        _GENERATIVE_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSeq2SeqLM.from_pretrained(model_name),
        )
    return _GENERATIVE_MODEL_CACHE[model_name]


def answer_query_with_generative_model(query, context, model_name="google/flan-t5-large"):
    """Generate an answer to ``query`` grounded in ``context``.

    Args:
        query: The question to answer.
        context: Retrieved text that is inserted into the prompt.
        model_name: Seq2seq checkpoint to load. Loaded once and reused on
            later calls.

    Returns:
        The decoded text of the top beam, with special tokens removed.
    """
    tokenizer, model = _load_generative_model(model_name)
    prompt = f"Provide an in-depth, detailed, and well-structured answer: {query} based on the following context: {context}. Make sure to cover all relevant aspects and examples."
    # The prompt is truncated to 700 tokens, so a long context is cut from the end.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=700)
    # `temperature` is intentionally not passed: generation here is beam search
    # without sampling, where that setting has no effect.
    # Unpacking `inputs` forwards the attention mask along with the input ids.
    outputs = model.generate(
        **inputs,
        max_length=500,
        num_beams=7,
        early_stopping=True,
        repetition_penalty=1.2
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
if __name__ == "__main__":
    # Build the retrieval corpus once: chunk the saved texts, then embed every chunk.
    documents = load_and_chunk_documents()
    document_embeddings = encode_documents(documents)

    queries = [
        "What should be the ideal post-bake temperature to ensure thermal stability of resist?",
        "Discuss the role of immersion lithography in modern semiconductor manufacturing.",
        "What are the challenges associated with advanced lithography techniques?"
    ]
    for query in queries:
        print(f"\nQuery: {query}")
        retrieved_docs = retrieve_document(query, document_embeddings, documents, top_k=15)
        # Join the retrieved chunks in ranked order, then keep only the first
        # 1500 characters as the context handed to the generator.
        retrieved_texts = (result['document'] for result in retrieved_docs)
        combined_context = " ".join(retrieved_texts)[:1500]
        answer = answer_query_with_generative_model(query, combined_context)
        print(f"Answer: {answer}")

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Query: What should be the ideal post-bake temperature to ensure thermal stability of resist?


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Answer: The high temperatures used (120°C - 150°C) crosslink the resin polymer in the photoresist, thus making the image more thermally stable

Query: Discuss the role of immersion lithography in modern semiconductor manufacturing.
Answer: Aerial Image Formation – The Basics 2.1.1 Maxwell’s Equations and the Wave Equation 30 2.1.2 General Harmonic Fields and the Plane Wave in a Nonabsorbing Medium 32 2.1.3 Phasors and Wave Propagation in an Absorbing Medium 33 2.1.4 Intensity and the Poynting Vector 36 2.1.5 Intensity and Absorbed Electromagnetic Energy 37viii

Query: What are the challenges associated with advanced lithography techniques?
Answer: Aerial Image Formation – The Basics
