In [None]:
# Install Required Packages
!pip install -q langchain_community langchain_huggingface \
               youtube-transcript-api faiss-cpu tiktoken \
               python-dotenv pymupdf pytube beautifulsoup4 wikipedia streamlit pyngrok

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader, WikipediaLoader
from langchain_community.document_loaders import YoutubeLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
import re
import os
import streamlit as st
from pyngrok import ngrok


In [None]:
# -------- CONFIG --------
# Topic handed to the Wikipedia loader and later reused as the question sent to the chain.
query = "Nuclear Fusion"
# Optional path to a local PDF; leave as None to skip the PDF source.
pdf_path = None
# Optional YouTube video URL; leave as None to skip the transcript source.
youtube_url = None

In [None]:
# -------- LOADERS --------
# Accumulator for every source document (Wikipedia first, optional sources after).
docs = []

# Wikipedia is the baseline source and is always queried.
wiki_loader = WikipediaLoader(query=query, load_max_docs=17)
wiki_docs = wiki_loader.load()
for wiki_doc in wiki_docs:
    # Tag each article with a readable origin used later for citations.
    title = wiki_doc.metadata.get('title', 'Unknown')
    wiki_doc.metadata["source"] = f"Wikipedia: {title}"
docs += wiki_docs

# Optional PDF
if pdf_path:
    try:
        # Actually load the PDF; previously `pdf_docs` was never defined, so this
        # branch always failed with a NameError that the except clause swallowed.
        pdf_docs = PyMuPDFLoader(pdf_path).load()
        for pdf_doc in pdf_docs:
            # Overwrite the loader's source with a label used for citations.
            pdf_doc.metadata["source"] = f"PDF: {pdf_path}"
        docs.extend(pdf_docs)
        print("✅ PDF loaded")
    except Exception as e:
        # Best-effort source: report the problem and continue without the PDF.
        print(f"⚠️ PDF loading failed: {e}")

# Optional YouTube
if youtube_url:
    # Extract the video id from either a `watch?v=<id>` or a `youtu.be/<id>` URL.
    # The dot is escaped so only a literal "youtu.be/" matches.
    match = re.search(r"(?:v=|youtu\.be/)([\w-]+)", youtube_url)
    if match:
        video_id = match.group(1)
        try:
            if hasattr(YouTubeTranscriptApi, "fetch"):
                # Newer youtube-transcript-api releases expose an instance-level
                # `fetch` that yields snippet objects with a `.text` attribute.
                fetched = YouTubeTranscriptApi().fetch(video_id, languages=["en"])
                transcript = " ".join(snippet.text for snippet in fetched)
            else:
                # Older releases: static helper returning a list of dicts.
                transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
                transcript = " ".join(chunk["text"] for chunk in transcript_list)
            doc = Document(page_content=transcript, metadata={"source": f"YouTube: {youtube_url}"})
            docs.append(doc)
            print("✅ YouTube transcript loaded")
        except (TranscriptsDisabled, NoTranscriptFound):
            print("⚠️ No usable English transcript found. Skipping video.")
        except Exception as e: # Catch other potential errors during retrieval/parsing
            print(f"⚠️ Error fetching or parsing YouTube transcript: {e}. Skipping video.")
    else:
        print("⚠️ Invalid YouTube URL")


In [None]:
# -------- TEXT SPLITTING --------
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# Keep the cap of five Wikipedia articles, but always include the optional
# PDF / YouTube documents. They are appended to `docs` after the Wikipedia
# articles, so a plain `docs[:5]` silently dropped them from the index.
docs_to_split = wiki_docs[:5] + docs[len(wiki_docs):]
split_docs = text_splitter.split_documents(docs_to_split)
print(f"🔹 Total Chunks: {len(split_docs)}")

🔹 Total Chunks: 63


In [None]:
# -------- EMBEDDINGS --------
# Embed every chunk with a sentence-transformers model and index them in FAISS.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectors = FAISS.from_documents(split_docs, embeddings)
# The number of hits must be passed through `search_kwargs`; a bare `k=5`
# keyword is not forwarded to the search and the retriever's default is used.
retriever = vectors.as_retriever(search_type="similarity", search_kwargs={"k": 5})


In [None]:
# -------- LLM --------
# Read the Hugging Face token from Colab's secret store when running in Colab;
# otherwise fall back to the HUGGINGFACEHUB_API_TOKEN environment variable so
# the notebook also runs in a plain Jupyter environment.
try:
    from google.colab import userdata
    hf_api_token = userdata.get("huggingfacehub_api_token")
except ImportError:
    hf_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

# Hosted inference endpoint for the instruction-tuned Mistral model.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    huggingfacehub_api_token=hf_api_token
)
# Wrap the raw endpoint in the chat-model interface used by the chain.
model = ChatHuggingFace(llm=llm)

In [None]:
# -------- PROMPT --------
# Template text; `{context}` receives the numbered source excerpts and
# `{query}` the user's question.
PROMPT_TEMPLATE = """
    You are a helpful assistant.
    Use the context below to answer the query, and reference sources using [number] format.
    If the context is insufficient, just say you don't know.

    {context}

    Query: {query}
    Answer:
    """

prompt = PromptTemplate(
    input_variables=["context", "query"],
    template=PROMPT_TEMPLATE,
)


In [None]:
# -------- CHAIN --------
def format_docs_with_citations(docs):
    """Render documents as numbered, source-attributed context blocks.

    Each document becomes ``[i] <page_content>`` followed on the next line by
    ``(Source: <source>)``, with numbering starting at 1. The source is read
    from ``doc.metadata["source"]`` and defaults to ``"Unknown source"``.
    Blocks are separated by a blank line; an empty input yields ``""``.
    """
    return "\n\n".join(
        f"[{number}] {entry.page_content}\n"
        f"(Source: {entry.metadata.get('source', 'Unknown source')})"
        for number, entry in enumerate(docs, start=1)
    )


# Fan the incoming question out into the two prompt inputs: the retrieved and
# formatted context, and the question itself passed through unchanged.
parallel_chain = RunnableParallel(
    context=retriever | RunnableLambda(format_docs_with_citations),
    query=RunnablePassthrough(),
)

# Full pipeline: build prompt inputs -> fill template -> call model -> plain string.
final_chain = parallel_chain | prompt | model | StrOutputParser()

In [None]:
# -------- EXECUTE --------
# Run the whole chain on the configured query; StrOutputParser is the last
# step of the chain, so `answer` is the parsed model output.
answer = final_chain.invoke(query)
print("\n✅ Final Answer:\n", answer)


✅ Final Answer:
  Nuclear fusion is a reaction in which atomic nuclei combine to form a larger nucleus, releasing energy due to the difference in nuclear binding energy before and after the fusion reaction [1]. It is the process that powers all active stars [1]. Applications of fusion include fusion power, thermonuclear weapons, boosted fission weapons, neutron sources, and superheavy element production [2]. The conditions required for fusion processes, such as a high temperature, density, and confinement time, are only found in stellar cores, advanced nuclear weapons, and fusion power experiments [4]. For a more detailed timeline of significant events in the study and use of nuclear fusion, refer to the timeline of nuclear fusion [3].
