In [1]:
import os
# Export a user-agent string through the process environment before the other
# libraries are imported (presumably read by the web loaders - TODO confirm).
os.environ["USER_AGENT"] = "myagent"

In [1]:
!pip install langchain-community langchain-core
!pip install PyMuPDF
!pip install transformers sentence-transformers langchain
!pip install chromadb
!pip install langchain-huggingface
!pip install json-repair
!pip install -U langchain-google-genai  ## Using Chat Models
!pip install langchain_experimental
!pip install langchain
!pip install faiss-gpu

Collecting json-repair
  Using cached json_repair-0.39.1-py3-none-any.whl.metadata (11 kB)
Using cached json_repair-0.39.1-py3-none-any.whl (20 kB)
Installing collected packages: json-repair
Successfully installed json-repair-0.39.1
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.0.11-py3-none-any.whl.metadata (3.6 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.16 (from langchain-google-genai)
  Downloading google_ai_generativelanguage-0.6.16-py3-none-any.whl.metadata (5.7 kB)
Downloading langchain_google_genai-2.0.11-py3-none-any.whl (39 kB)
Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Downloading google_ai_generativelanguage-0.6.16-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: f

Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Downloading langchain_experimental-0.3.4-py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_experimental
Successfully installed langchain_experimental-0.3.4
[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[0m

In [2]:
import requests
import xml.etree.ElementTree as ET
import fitz  # PyMuPDF for working with PDFs
from bs4 import BeautifulSoup

# LangChain modules
from langchain.document_loaders import WebBaseLoader, UnstructuredPDFLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings, OllamaEmbeddings
from langchain.chains import LLMChain, RetrievalQA
from langchain.prompts import ChatPromptTemplate, load_prompt
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# LangChain community and retrievers
from langchain_community.chat_models import ChatOllama
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Hugging Face modules
from langchain_huggingface import HuggingFaceEndpoint
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer
from langchain.llms import HuggingFaceHub
from langchain_google_genai import GoogleGenerativeAI,GoogleGenerativeAIEmbeddings



In [3]:
import os
import json
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_community.graphs.networkx_graph import NetworkxEntityGraph
from langchain.chains import GraphQAChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain import hub
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain_core.documents import Document

In [4]:
# Step 1: Get arXiv Paper URLs
def get_arxiv_paper_urls(query, max_results):
    """Query the arXiv export API and return the entry URL of every hit.

    Args:
        query: Free-text search phrase, searched across all fields (``all:``).
        max_results: Maximum number of entries to request from the API.

    Returns:
        list[str]: The text of each Atom ``<id>`` element in the response
        feed, in feed order. Empty when the feed contains no entries.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeout.
        xml.etree.ElementTree.ParseError: If the body is not valid XML.
    """
    atom_ns = "{http://www.w3.org/2005/Atom}"

    # Let requests build the query string so spaces, '&', '#', ... in the
    # search phrase are encoded instead of corrupting the URL.
    response = requests.get(
        "http://export.arxiv.org/api/query",
        params={
            "search_query": f"all:{query}",
            "start": 0,
            "max_results": max_results,
        },
        timeout=30,  # without a timeout a stalled connection blocks forever
    )
    response.raise_for_status()
    root = ET.fromstring(response.content)

    paper_urls = []
    for entry in root.findall(f"{atom_ns}entry"):
        id_element = entry.find(f"{atom_ns}id")
        # Skip malformed entries rather than failing on `None.text`.
        if id_element is not None and id_element.text:
            paper_urls.append(id_element.text.strip())

    return paper_urls

In [5]:
# Step 2: Extract Paper Metadata
def extract_paper_content(url):
    """Scrape title and abstract from a paper's abstract page and derive its PDF URL.

    Args:
        url: Address of the abstract page (expected to contain an ``/abs/``
            path segment).

    Returns:
        dict: ``{"title", "abstract", "pdf_url"}``. ``title`` falls back to
        ``"Title not found"`` and ``abstract`` to ``"Abstract not found"``
        when the corresponding tag is missing from the page.

    Raises:
        requests.RequestException: On connection problems or timeout.
    """
    # A timeout keeps one unresponsive page from stalling the whole run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    title_tag = soup.find("meta", {"name": "citation_title"})
    title = title_tag["content"] if title_tag else "Title not found"

    abstract_tag = soup.find("blockquote", {"class": "abstract"})
    abstract_text = abstract_tag.text.replace("Abstract: ", "").strip() if abstract_tag else "Abstract not found"

    # Swap only the "/abs/" path segment (once); a bare "abs" -> "pdf"
    # replacement would also rewrite any other "abs" substring in the URL.
    pdf_url = url.replace("/abs/", "/pdf/", 1) + ".pdf"

    return {"title": title, "abstract": abstract_text, "pdf_url": pdf_url}

In [6]:
# Step 3: Download and Extract PDF
def download_and_extract_pdf(pdf_url):
    """Download a PDF into the working directory and return its plain text.

    The file is saved under the last path component of ``pdf_url`` and is
    left on disk after extraction.

    Args:
        pdf_url: Direct URL of the PDF file.

    Returns:
        str: Text of all pages concatenated in page order.

    Raises:
        requests.HTTPError: If the download answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(pdf_url, timeout=60)
    # Fail here instead of saving an HTML error page under a ".pdf" name.
    response.raise_for_status()

    pdf_filename = pdf_url.split("/")[-1]
    with open(pdf_filename, "wb") as pdf_file:
        pdf_file.write(response.content)

    doc = fitz.open(pdf_filename)
    try:
        # join() avoids repeated string re-allocation on long documents.
        return "".join(
            doc.load_page(page_num).get_text("text")
            for page_num in range(doc.page_count)
        )
    finally:
        # Release the document handle even when a page fails to parse.
        doc.close()

In [7]:
def fetch_paper_urls(query, max_results):
    """Look up arXiv paper URLs for ``query``.

    Returns the list produced by ``get_arxiv_paper_urls`` and raises
    ``ValueError`` when that list is empty.
    """
    print("Fetching arXiv paper URLs...")
    found_urls = get_arxiv_paper_urls(query, max_results=max_results)
    if found_urls:
        return found_urls
    raise ValueError("No papers found for the given query.")

In [8]:
def process_single_paper(url):
    """Turn one paper URL into a ``{"title", "page_content"}`` dictionary.

    The page metadata is scraped first, then the PDF is downloaded and its
    text extracted. Any exception raised along the way is printed and the
    function returns ``None`` instead.
    """
    try:
        metadata = extract_paper_content(url)
        paper_title = metadata["title"]
        print(f"Processing: {paper_title}")

        # Fetch the PDF and pull out its text
        pdf_text = download_and_extract_pdf(metadata["pdf_url"])
        print(f"Extracted content length: {len(pdf_text)} characters")
    except Exception as err:
        print(f"Error processing {url}: {err}")
        return None

    # Both steps succeeded: hand back the plain-dict document
    return {"title": paper_title, "page_content": pdf_text}

In [9]:
def process_papers(urls):
    """Run ``process_single_paper`` over ``urls`` and collect the successes.

    Papers for which processing returned ``None`` are dropped. Raises
    ``ValueError`` when nothing is left.
    """
    print("Processing papers...")
    attempts = (process_single_paper(paper_url) for paper_url in urls)
    docs = [paper for paper in attempts if paper is not None]

    # An empty result means every paper failed (or none were given)
    if len(docs) == 0:
        raise ValueError("No valid content extracted from the papers.")
    return docs

In [10]:
def chunk_documents(documents, chunk_size=20000, chunk_overlap=2400):
    """Break ``documents`` into overlapping chunks.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to
    ``RecursiveCharacterTextSplitter``; the splitter's output is returned.
    """
    print("Splitting documents into chunks...")
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = splitter.split_documents(documents)
    return chunks

In [11]:
def initialize_embeddings(api_key, model="models/text-embedding-004"):
    """Build the Google Generative AI embedding client.

    The client is created for ``model`` with ``api_key`` and the
    ``retrieval_document`` task type.
    """
    print("Initializing embeddings...")
    embedding_client = GoogleGenerativeAIEmbeddings(
        model=model,
        google_api_key=api_key,
        task_type="retrieval_document",
    )
    return embedding_client

In [12]:
def setup_FAISS_database(documents, embed):
    """Create a FAISS vector store from ``documents`` using the ``embed`` model."""
    print("Setting up FAISS database...")
    vector_store = FAISS.from_documents(documents=documents, embedding=embed)
    return vector_store

In [13]:
import math
import random
import time
import json
from time import sleep

def process_papers_and_create_database(query, max_results, batch_size=5):
    """
    Orchestrate the pipeline to fetch, process, and summarize academic papers in batches.

    For every batch a FAISS database is built from the chunked paper text, the
    summary prompt is run against it, and the answers are saved to
    ``rag_papers_results_batch_<n>.json``. After the last batch, the answers of
    all batches are combined into ``rag_papers_results_combined.json``.

    Parameters:
        query (str): Search phrase used to look up papers on arXiv.
        max_results (int): Maximum number of papers to request.
        batch_size (int): Number of papers handled per batch; must be >= 1.

    Returns:
        None. Errors are reported on stdout: a paper that fails is skipped, a
        batch that fails is skipped, and anything else is printed as
        "Pipeline error: ...".

    Notes:
        The Google API key is read from the ``GOOGLE_API_KEY`` environment
        variable (empty string when unset) so it is not stored in the notebook.
    """
    try:
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")

        GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

        # Fetch all paper URLs
        all_urls = fetch_paper_urls(query, max_results=max_results)
        total_papers = len(all_urls)

        # Calculate the number of batches
        num_batches = math.ceil(total_papers / batch_size)
        random.shuffle(all_urls)  # Randomize which papers share a batch

        # The prompt is the same for every batch, so build it once.
        queries = ["""
Summarize the main idea and key results of each paper using the provided excerpts and metadata. Include:
1. **Title** (from metadata)
2. **Abstract** (if available)
3. **Main idea and hypothesis**: It should clearly states the hypothesis which the paper is written based on it accordingly.
4. **Summary of Results**: Key findings, conclusions, or implications.
Provide clear and concise summaries for each paper.
"""]

        # The embedding client does not depend on the batch either.
        embed = initialize_embeddings(api_key=GOOGLE_API_KEY)

        all_results = []  # List to store all results from batches

        for batch_num in range(num_batches):
            start_index = batch_num * batch_size
            current_batch_urls = all_urls[start_index:start_index + batch_size]

            print(f"Processing batch {batch_num + 1}/{num_batches}...")

            # Keep every processed paper paired with its own URL. Indexing the
            # URL list by position in the list of successes would attribute a
            # paper to the wrong source as soon as an earlier paper is skipped.
            processed = []
            for url in current_batch_urls:
                try:
                    raw_doc = process_single_paper(url)
                except Exception as e:
                    print(f"Error processing paper {url}: {e}. Skipping this paper.")
                    continue
                if raw_doc:  # Only add successfully processed documents
                    processed.append((url, raw_doc))
                else:
                    print(f"Skipping paper {url} due to processing error.")

            if not processed:  # If no documents were processed in the batch, skip the batch
                print(f"No papers processed in batch {batch_num + 1}, skipping.")
                continue

            # Convert the raw dictionaries to LangChain Document objects
            docs = [
                Document(
                    page_content=raw_doc.get("page_content", ""),
                    metadata={"source": url, "title": raw_doc.get("title", "Untitled Document")},
                )
                for url, raw_doc in processed
            ]

            json_output_path = f"rag_papers_results_batch_{batch_num + 1}.json"
            batch_results = None
            try:
                # Chunk documents, set up the database and run the queries
                chunks = chunk_documents(docs)
                db = setup_FAISS_database(chunks, embed)
                batch_results = setup_rag_pipeline_and_process_queries_single(
                    db, GOOGLE_API_KEY, queries, json_output_path
                )
            except Exception as e:
                # One failing batch must not discard the batches already done.
                print(f"Batch {batch_num + 1} failed: {e}. Continuing with the next batch.")

            if batch_results is not None:
                # Use the returned results directly: the callee swallows file
                # write errors, so re-reading the JSON file could fail.
                all_results.extend(batch_results)
                print(f"Batch {batch_num + 1} complete. Results saved to {json_output_path}.")

            if batch_num + 1 < num_batches:
                sleep(30)  # Pause between batches to stay under API rate limits

        # After processing all batches, save the combined results to a single JSON file
        combined_json_output_path = "rag_papers_results_combined.json"
        with open(combined_json_output_path, 'w') as f:
            json.dump(all_results, f, indent=4)

        print(f"All batches processed. Combined results saved to {combined_json_output_path}.")

    except Exception as e:
        print(f"Pipeline error: {e}")
        return None


In [14]:
import os
import json
import math
from time import sleep

def setup_rag_pipeline_and_process_queries_single(db, api_key, queries, json_output_path):
    """
    Answer a list of queries with a retrieval-augmented chain and store the answers as JSON.

    Parameters:
        db (FAISS database): Vector store to retrieve from; ``None`` aborts early.
        api_key (str): API key for Google Generative AI.
        queries (list): Questions to run through the chain, in order.
        json_output_path (str): File the collected answers are written to.

    Returns:
        list of ``{"query", "response", "sources"}`` dictionaries, one per
        query, or ``None`` when ``db`` is ``None``. A query that raises is
        recorded with an ``"Error: ..."`` response and no sources.
    """
    # Without a vector store there is nothing to retrieve from
    if db is None:
        print("Database is None. Exiting pipeline.")
        return None

    # MMR retriever over the vector store
    print("Setting up retriever...")
    mmr_settings = {"k": 15, "lambda_mult": 0.5}
    retriever = db.as_retriever(search_type="mmr", search_kwargs=mmr_settings)

    # LLM plus retrieval chain
    print("Setting up RAG chain...")
    rag_chain = RetrievalQA.from_chain_type(
        llm=GoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=api_key),
        retriever=retriever,
        return_source_documents=True,
    )

    def answer(question):
        """Run one question through the chain and shape the outcome as a result record."""
        try:
            reply = rag_chain.invoke({"query": question})
            answer_text = reply.get("result", "No result found.")
            cited_sources = [
                source_doc.metadata.get("source", "Unknown source")
                for source_doc in reply.get("source_documents", [])
            ]
            return {"query": question, "response": answer_text, "sources": cited_sources}
        except Exception as err:
            print(f"Error processing question '{question}': {err}")
            return {"query": question, "response": f"Error: {str(err)}", "sources": []}

    # One record per query, successes and failures alike
    results = []
    for question in queries:
        print(f"Processing question: {question}")
        results.append(answer(question))

    # Persist the records; a write failure is reported but not raised
    print(f"Saving results to {json_output_path}...")
    try:
        with open(json_output_path, "w") as out_file:
            json.dump(results, out_file, indent=4)
    except Exception as err:
        print(f"Error saving results to {json_output_path}: {err}")

    print("Processing complete!")
    return results

In [15]:
# Install the CPU build of FAISS, which supplies the `faiss` package needed by
# the FAISS vector store used in setup_FAISS_database.
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [16]:
# Search topic and the cap on how many arXiv papers to request for it.
query = "Carbon Emissions Reduction"
max_results = 200

# Start the batched fetch / index / summarize run for that topic.
process_papers_and_create_database(query, max_results)

Fetching arXiv paper URLs...
Processing batch 1/40...
Processing: Global to local impacts on atmospheric CO2 caused by COVID-19 lockdown
Extracted content length: 50640 characters
Processing: Planning low-carbon distributed power systems: Evaluating the role of energy storage
Extracted content length: 77466 characters
Processing: Carbon Intensity-Aware Adaptive Inference of DNNs
Extracted content length: 11823 characters
Processing: Costs and Benefits of the Paris Climate Targets
Extracted content length: 36461 characters
Processing: A Knowledge-driven Memetic Algorithm for the Energy-efficient Distributed Homogeneous Flow Shop Scheduling Problem
Extracted content length: 62138 characters
Splitting documents into chunks...
Initializing embeddings...
Setting up FAISS database...
Setting up retriever...
Setting up RAG chain...
Processing question: 
Summarize the main idea and key results of each paper using the provided excerpts and metadata. Include:
1. **Title** (from metadata)
2. **Ab

In [17]:
import os
import zipfile

# Folder that is searched (recursively) for JSON files
source_directory = '/content/'

# Archive that receives them
output_zip_file = '/content/sample_data/json_files.zip'

with zipfile.ZipFile(output_zip_file, 'w') as archive:
    for folder, _subfolders, filenames in os.walk(source_directory):
        # Anything that is not a .json file is ignored
        json_names = (name for name in filenames if name.endswith('.json'))
        for name in json_names:
            full_path = os.path.join(folder, name)
            # Store each file under its path relative to the source folder so
            # the directory layout is kept inside the archive
            archive.write(full_path, os.path.relpath(full_path, source_directory))

print(f"All JSON files in '{source_directory}' have been zipped and saved to '{output_zip_file}'.")


All JSON files in '/content/' have been zipped and saved to '/content/sample_data/json_files.zip'.
