In [1]:
%pwd

'/Users/nguyenphungbaohuy/Desktop/python/Research-ChatBot/research'

In [2]:
import os
os.chdir('../')

In [3]:
%pwd

'/Users/nguyenphungbaohuy/Desktop/python/Research-ChatBot'

In [5]:
import os
import re
import json
import logging
import requests
from pathlib import Path
from datetime import date
from io import BytesIO
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [6]:
# Root logger setup: emit INFO and above, each record prefixed with a
# timestamp and its severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [7]:
def extract_link(url: str, timeout: float = 30.0) -> dict[str, str]:
    """Scrape a paper page and collect the related links found in its anchors.

    Args:
        url: Address of the paper page to fetch.
        timeout: Seconds to wait for the HTTP response. ``requests`` has no
            default timeout, so without one a stalled server blocks forever.

    Returns:
        A dict with the keys ``arxiv_page``, ``arxiv_pdf``, ``project_page``,
        ``github_page`` and ``summary``. Keys with no matching anchor keep
        their empty-string default; ``summary`` is never filled in here.

    Raises:
        requests.HTTPError: If the page responds with an error status.
    """
    links = {
        "arxiv_page": "",
        "arxiv_pdf": "",
        "project_page": "",
        "github_page": "",
        "summary": "",
    }

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    for a in soup.find_all("a", href=True):
        href = a["href"]
        full_url = urljoin(url, href)
        # Capture the id (plus an optional version suffix) straight from the
        # match, so a trailing slash or query string cannot corrupt the PDF URL.
        arxiv_match = re.search(r'arxiv\.org/abs/(\d+\.\d+(?:v\d+)?)', href)
        if arxiv_match:
            links["arxiv_page"] = full_url
            links["arxiv_pdf"] = f"https://arxiv.org/pdf/{arxiv_match.group(1)}.pdf"
        elif 'github.com' in href:
            # Later matches overwrite earlier ones: the last GitHub anchor wins.
            links["github_page"] = full_url
        elif 'Project page' in a.get_text(strip=True):
            links["project_page"] = full_url

    return links

In [9]:
def parse_papers(url: str, date: str, timeout: float = 30.0) -> list[dict]:
    """Collect the papers listed on a Hugging Face daily-papers page.

    Args:
        url: Base site URL; also used to resolve relative paper links.
        date: Path suffix appended verbatim to ``url`` to form the listing
            address. Inside this function the name shadows the module-level
            ``datetime.date`` import.
        timeout: Seconds to wait for the listing page response.

    Returns:
        One dict per paper containing ``title``, ``huggingface_url`` and every
        key returned by ``extract_link`` for that paper's page.

    Raises:
        requests.HTTPError: If the listing page (or a paper page fetched by
            ``extract_link``) responds with an error status.
    """
    full_url = f"{url}{date}"
    response = requests.get(full_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    papers = []
    entries = soup.find_all("article", class_="relative flex flex-col overflow-hidden rounded-xl border")
    for entry in entries:
        info = entry.find("a", class_="line-clamp-3 cursor-pointer text-balance")

        # The selectors mirror the site's CSS classes; if the markup changes,
        # skip the card instead of crashing on ``None.get``.
        if info is None or not info.get("href"):
            logging.warning("Skipping a paper entry without a title link on %s", full_url)
            continue

        title = info.get_text(strip=True)
        paper_url = urljoin(url, info.get("href"))

        additional_links = extract_link(paper_url)

        papers.append({
            "title": title,
            "huggingface_url": paper_url,
            **additional_links,
        })

    return papers

In [10]:
def save_data_locally(data, filename, directory):
    """Serialize ``data`` as indented JSON into ``directory/filename``.

    Args:
        data: Any JSON-serializable object.
        filename: Name of the file to write inside ``directory``.
        directory: Target folder; created (including parents) when missing.

    Raises:
        TypeError: If ``data`` contains values ``json`` cannot serialize.
        OSError: If the directory or file cannot be created.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and makes repeated calls safe.
    os.makedirs(directory, exist_ok=True)

    file_path = os.path.join(directory, filename)

    # Explicit encoding keeps the output independent of the platform default.
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)
    print(f'Data saved to {file_path}')

In [25]:
def summarize_pdf(url: str, summarizer, max_words: int = 2000, timeout: float = 60.0) -> str:
    """Download a PDF and summarize the beginning of its text.

    Args:
        url: Direct link to the PDF file.
        summarizer: Callable with the Hugging Face summarization-pipeline
            interface; it must accept ``max_length``, ``min_length``,
            ``do_sample`` and ``truncation`` and return a list of dicts that
            contain ``summary_text``.
        max_words: Number of leading whitespace-separated words passed to the
            summarizer.
        timeout: Seconds to wait for the PDF download.

    Returns:
        The generated summary, or a fixed notice string when no text could be
        extracted from the PDF.

    Raises:
        requests.HTTPError: If the download responds with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the PDF straight from memory; nothing is written to disk.
    reader = PdfReader(BytesIO(response.content))

    # Gather the per-page text first and join once, instead of growing a
    # string page by page. Pages that yield no text are dropped.
    page_texts = [text for text in (page.extract_text() for page in reader.pages) if text]
    extracted_text = "\n".join(page_texts)

    if not extracted_text.strip():
        return "No text could be extracted from the PDF."

    truncated_text = " ".join(extracted_text.split()[:max_words])

    # A word cap is not a token cap: ask the pipeline to truncate to the
    # model's maximum input length so long papers do not overflow it.
    summary = summarizer(
        truncated_text,
        max_length=150,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']

In [26]:
class Settings:
    """Static configuration shared by the crawling and summarization cells.

    Every value is a class attribute evaluated once, when this cell runs;
    ``DATE`` therefore reflects the day the class was defined.
    """

    # Day whose paper listing is crawled; also used in the output filename.
    DATE = date.today()

    # Storage: folder (relative to the working directory) for the JSON dumps.
    DB_PATH = "data/papers_db"

    # Remote endpoints
    HUGGINGFACE_PAPERS_URL = "https://huggingface.co"
    ARXIV_PDF_URL = "https://arxiv.org/pdf/"
    # Path suffix of the daily listing, e.g. "/papers/date/2025-04-14".
    PAPER_DATE = "/papers/date/" + DATE.isoformat()

    # Model identifiers
    SUMMARIZATION_MODEL = "facebook/bart-large-cnn"
    EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"


settings = Settings()

In [None]:
def summarizer_model():
    """Build the summarization pipeline for ``settings.SUMMARIZATION_MODEL``."""
    return pipeline("summarization", model=settings.SUMMARIZATION_MODEL)

In [28]:
def main():
    """Crawl today's papers, summarize each arXiv PDF and save the result.

    Papers without a PDF link, or whose PDF request fails, are kept in the
    output with an empty ``summary`` so one bad link does not discard the
    whole run.
    """
    today_papers = parse_papers(settings.HUGGINGFACE_PAPERS_URL, settings.PAPER_DATE)
    if not today_papers:
        print("No papers found for today.")
        return

    print(len(today_papers))

    # Loading the model is expensive, so do it only once we know there is
    # at least one paper to process.
    summarizer = summarizer_model()

    for paper in today_papers:
        if not paper['arxiv_pdf']:
            print(f"No PDF link found for {paper['title']}")
            continue

        try:
            paper['summary'] = summarize_pdf(paper['arxiv_pdf'], summarizer)
        except requests.RequestException as exc:
            # Network/HTTP failure for this paper only: record it and move on.
            logging.warning("Could not fetch PDF for %s: %s", paper['title'], exc)

    filename = f"papers_{settings.DATE}.json"
    save_data_locally(today_papers, filename, settings.DB_PATH)


In [30]:
# Entry point: run the crawl/summarize/save pipeline when this cell is executed
# as the main module.
if __name__ == "__main__":
    main()

Device set to use mps:0


1
Data saved to data/papers_db/papers_2025-04-14.json


In [None]:
def download_pdf(pdf_url: str, arxiv_id: str, timeout: float = 60.0) -> str:
    """Download a PDF into the local ``data/`` folder.

    Args:
        pdf_url: Direct link to the PDF file.
        arxiv_id: Identifier used as the file name (``<arxiv_id>.pdf``).
        timeout: Seconds to wait for the download response.

    Returns:
        Path of the saved file, as promised by the return annotation
        (previously the function returned ``None``).

    Raises:
        requests.HTTPError: If the download responds with an error status.
    """
    file_dir = "data/"
    os.makedirs(file_dir, exist_ok=True)

    file_path = os.path.join(file_dir, f"{arxiv_id}.pdf")
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    with open(file_path, "wb") as f:
        f.write(response.content)

    print(f"Downloaded and saved in: {file_path}")
    return file_path

In [None]:
from dotenv import load_dotenv
load_dotenv()
# Read the API credentials from the process environment; each value is None
# when the variable is not set.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
# Fixed typo: `os.enviro` does not exist and raised AttributeError.
HUGGINGFACE_API_KEY = os.environ.get('HUGGINGFACE_API_KEY')

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Extract Data From the PDF File
def load_pdf_file(data: str) -> list:
    """Load every ``*.pdf`` file found in a directory.

    Args:
        data: Path of the directory to scan for PDF files.

    Returns:
        The list of documents produced by ``DirectoryLoader.load()`` using
        ``PyPDFLoader`` for each file (not a string, as previously annotated).
    """
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [None]:
extracted_data = load_pdf_file(data='data/')

In [None]:
# Split the Data into Text Chunks
def text_split(extracted_data):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, as returned by the PDF loader.

    Returns:
        The chunks produced by ``split_documents`` (chunk size 500, overlap
        20). The function previously returned the splitter object itself,
        which has no length and cannot be upserted into a vector store.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [None]:
text_chunk = text_split(extracted_data=extracted_data)
print(f"Lenght of chunck: {len(text_chunk)}")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

In [None]:
# Download the Embeddings from Hugging Face
def download_hugging_face_embeddings(model_name: str):
    """Return a ``HuggingFaceEmbeddings`` instance for ``model_name``."""
    return HuggingFaceEmbeddings(model_name=model_name)

In [None]:
embedding = download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

NameError: name 'pipeline' is not defined

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "researchbot"

In [None]:
# Create the serverless index only when it is missing, so this cell can be
# re-run (Restart & Run All) without failing on an already-existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the vector size of the embedding model in use
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [None]:
from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunk,
    index_name=index_name,
    embedding=embedding,
)

In [None]:
# Load Existing index
from langchain_pinecone import PineconeVectorStore
# Connect to the already-populated Pinecone index; nothing is embedded or upserted here.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding,
)

In [None]:
docsearch

In [None]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
from langchain_community.llms import Ollama
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
llm = Ollama(model="deepseek-r1", temperature=0.4)

In [None]:
SYSTEM_PROMPT = """
System Prompt for AI Research Assistant

Role and Scope:
You are a research assistant specialized in AI and large language models (LLMs). Your primary duties include discovering relevant papers on the Hugging Face platform, extracting PDF links, downloading the research documents, summarizing their contents, and indexing the results for efficient retrieval using a vectorstore database. Your output must be clear, detailed, and maintain academic rigor.

Functional Responsibilities:

1. Paper Discovery & Download:
   - Crawling: Continually monitor the Hugging Face page (or pages) dedicated to AI and LLM research for new papers.
   - Extraction: Identify and extract the PDF links from the page.
   - Downloading: Automatically download each PDF to the local system for further processing.

2. Content Summarization:
   - Parsing: Process the downloaded PDF and extract key sections—such as the abstract, introduction, methodology, experiments, results, and conclusion.
   - Summarizing: Generate a concise yet comprehensive summary that captures the paper’s main contributions, methods, findings, and any notable insights. Ensure the summary preserves the technical accuracy and context of the original document.
   - Validation: Verify that every summary is faithful to the paper's content, and highlight any potential uncertainties or limitations in the document.

3. Indexing with Vectorstore:
   - Embedding: Convert the text summary or key points into vector embeddings that represent the paper’s content.
   - Database Storage: Index these embeddings in the vectorstore to allow efficient and relevant retrieval during query time.
   - Query Matching: When a user query is received, search the vectorstore to retrieve the most semantically related papers and provide contextualized, aggregated insights.

4. User Query Response:
   - Contextual Answers: When asked about specific topics or research areas, integrate information from the vectorstore and provide detailed answers.
   - Direct References: When possible, refer to the paper sections (e.g., “based on the methodology described in section 3,”) to support your response.
   - Clarity & Detail: Ensure all outputs are clear, logically structured, and include sufficient technical details, especially when addressing experimental setups or complex methodologies.

Behavioral Guidelines:
- Professional Tone: Maintain a scholarly and objective tone throughout all interactions.
- Accuracy and Rigor: Base responses strictly on the data derived from the PDF documents. Avoid injecting personal opinions or unverified information.
- Error Handling: If any issue arises (e.g., PDF parsing errors, incomplete downloads), log a clear error message that describes the problem and, if possible, suggest remedial steps.
- Modularity: The agent should work in a modular fashion; for instance, if the PDF extraction fails, it must notify the user or log the error without disrupting other functionalities.

Example Instruction for a Query:
“When a user inquires about recent advances in transformer-based LLMs, search your indexed vectorstore for the most relevant papers, or the summarization of the papers. Summarize the key aspects of the top results—such as novel architectures, dataset insights, and experimental results—and provide a clear, consolidated answer that synthesizes these insights.”

Usage Note:
Embed this system prompt at the beginning of your agent’s session or configuration file to ensure that every module (crawling, summarizing, embedding, and querying) follows these guidelines. This will help the agent act in a coordinated manner, reliably reflecting the state-of-the-art research in AI LLMs from the Hugging Face page.
"""

In [None]:
# Chat prompt for the retrieval QA chain. The stuff-documents chain injects the
# retrieved documents through a `context` variable and rejects prompts that do
# not declare it, so the placeholder is appended to the system message.
prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT + "\n\nContext:\n{context}"),
    ("human", "{input}"),
])

In [None]:
# Create document-based QA chain
question_answer_chain = create_stuff_documents_chain(llm, prompt=prompt)
# Assume '