In [2]:
%pwd

'/Users/nguyenphungbaohuy/Desktop/python/Research-ChatBot/research'

In [3]:
import os
os.chdir('../')

In [4]:
%pwd

'/Users/nguyenphungbaohuy/Desktop/python/Research-ChatBot'

In [5]:
HUGGINGFACE_PAPERS_URL = "https://huggingface.co"
PDF_URL = "https://arxiv.org/pdf/"
DATE = "2025-04-10"
PAPER_DATE = f"/papers/date/{DATE}"

In [None]:
import os
import io
import logging
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import json

In [None]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
def fetch_page(url: str, date: str) -> str:
    """Fetch HTML content from a URL with error handling."""
    full_url = f"{url}{date}"
    try:
        response = requests.get(full_url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to fetch {full_url}: {e}")
        raise

In [None]:
def extract_link(url: str) -> dict:
    """Extract relevant links from a paper page with robust parsing."""
    links = {
        "arxiv_id": "",
        "arxiv_page": "",
        "arxiv_pdf": "",
        "project_page": "",
        "github_page": "",
        "summary": "",
    }
    
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.warning(f"Could not fetch {url}: {e}")
        return links

    soup = BeautifulSoup(response.text, "html.parser")
    
    # Find arXiv links
    arxiv_links = soup.find_all("a", href=re.compile(r'arxiv\.org/abs/\d+\.\d+'))
    if arxiv_links:
        href = arxiv_links[0]['href']
        links["arxiv_page"] = urljoin(url, href)
        links["arxiv_id"] = href.split("/")[-1]
        links["arxiv_pdf"] = f"{PDF_URL}{links['arxiv_id']}.pdf"
    
    # Find GitHub links
    github_links = soup.find_all("a", href=re.compile(r'github\.com'))
    if github_links:
        links["github_page"] = urljoin(url, github_links[0]['href'])
    
    # Find project page
    project_links = soup.find_all("a", string=re.compile(r'project page', re.I))
    if project_links:
        links["project_page"] = urljoin(url, project_links[0]['href'])
    
    return links

In [None]:
def parse_papers(base_url: str, html: str) -> list[dict]:
    """Parse paper entries from HTML with improved selectors."""
    soup = BeautifulSoup(html, "html.parser")
    papers = []
    
    entries = soup.find_all("article", class_=re.compile(r'flex flex-col overflow-hidden'))
    if not entries:
        logging.warning("No papers found - check HTML structure")
        return papers

    for entry in entries:
        try:
            info = entry.find("a", class_=re.compile(r'line-clamp-3'))
            if not info:
                continue

            paper_href = info.get("href")
            title = info.get_text(strip=True)
            paper_url = urljoin(base_url, paper_href)
            
            additional_links = extract_link(paper_url)
            
            papers.append({
                "title": title,
                "huggingface_url": paper_url,
                **additional_links
            })
        except Exception as e:
            logging.error(f"Error parsing paper entry: {e}")
    
    return papers

In [None]:
def get_today_papers(base_url: str, date: str) -> list[dict]:
    """Retrieve papers for a given date with error handling."""
    try:
        html = fetch_page(base_url, date)
        return parse_papers(base_url, html)
    except Exception as e:
        logging.error(f"Failed to get papers: {e}")
        return []

In [None]:
def extract_text_from_url(url: str) -> str:
    """Extract text from PDF with improved error handling."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.warning(f"Could not download PDF {url}: {e}")
        return ""

    try:
        pdf_bytes = io.BytesIO(response.content)
        reader = PdfReader(pdf_bytes)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        logging.error(f"PDF processing failed: {e}")
        return ""


In [None]:
def summarize_text(text: str, summarizer, tokenizer) -> str:
    """Generate summary using token-aware chunking."""
    if not text.strip():
        return ""
    
    # Tokenize and chunk properly
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    chunks = [tokenizer.decode(inputs['input_ids'][0][i:i+1024]) 
              for i in range(0, len(inputs['input_ids'][0]), 512)]  # 512 overlap
    
    summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            logging.error(f"Summarization failed: {e}")
    
    return " ".join(summaries)

In [None]:
def main():
    # Initialize models
    logging.info("Initializing models...")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    
    # Fetch papers
    logging.info(f"Fetching papers for {DATE}")
    today_papers = get_today_papers(HUGGINGFACE_PAPERS_URL, PAPER_DATE)
    
    # Process each paper
    for paper in today_papers:
        logging.info(f"Processing: {paper['title']}")
        
        if not paper.get("arxiv_pdf"):
            logging.warning("No arXiv PDF available, skipping summary")
            continue
            
        text = extract_text_from_url(paper['arxiv_pdf'])
        if not text:
            continue
            
        paper["summary"] = summarize_text(text, summarizer, tokenizer)
    
    # Save results
    with open("summaries.json", "w", encoding="utf-8") as f:
        json.dump(today_papers, f, ensure_ascii=False, indent=4)
    
    logging.info(f"Processed {len(today_papers)} papers")

if __name__ == "__main__":
    main()

In [12]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use mps:0


In [None]:
max_chunk_length=1000
today_papers = get_today_papers(HUGGINGFACE_PAPERS_URL, PAPER_DATE)

In [17]:
for paper in today_papers:
    text = extract_text_from_url(paper['arxiv_pdf'])
    chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
    summary = ""
    for chunk in chunks:
        max_length = int(len(chunk) * 0.5)
        summarized_chunk = summarizer(chunk, max_length=max_length, min_length=50, do_sample=False)
        summary += summarized_chunk[0]['summary_text'] + " "
    paper["smmary"] = summary.strip()

Your max_length is set to 500, but your input_length is only 305. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=152)
Your max_length is set to 500, but your input_length is only 246. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=123)
Your max_length is set to 500, but your input_length is only 239. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=119)
Your max_length is set to 500, but your input_length is only 237. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1

In [None]:
import json

In [None]:
with open("summaries.json", "w", encoding="utf-8") as f:
    json.dump(today_papers, f, ensure_ascii=False, indent=4)

In [None]:
def download_pdf(pdf_url: str, arxiv_id: str) -> str:
    file_dir = "data"
    os.makedirs(file_dir, exist_ok=True)

    file_path = os.path.join(file_dir, f"{arxiv_id}.pdf")
    response = requests.get(pdf_url)
    response.raise_for_status()

    with open(file_path, "wb") as f:
        f.write(response.content)

    print(f"Downloaded and saved in: {file_path}")

In [None]:
for paper in today_papers[0:1]:
    download_pdf(paper['arxiv_pdf'], paper['title'])

Downloaded and saved as data/OmniSVG: A Unified Scalable Vector Graphics Generation Model.pdf
Downloaded and saved as data/Hogwild! Inference: Parallel LLM Generation via Concurrent Attention.pdf


In [None]:
from dotenv import load_dotenv
load_dotenv()
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
HUGGINGFACE_API_KEY = os.enviro.get('HUGGINGFACE_API_KEY')

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Extract Data From the PDF File
def load_pdf_file(data: str) -> str:
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [None]:
extracted_data = load_pdf_file(data='data/')

In [None]:
# Split the Data into Text Chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_splitter

In [None]:
text_chunk = text_split(extracted_data=extracted_data)
print(f"Lenght of chunck: {len(text_chunk)}")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

In [None]:
# Download the Embeddings from Hugging Face
def download_hugging_face_embeddings(model_name: str):
    embeddings =  HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

In [None]:
embedding = download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

NameError: name 'pipeline' is not defined

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "researchbot"

In [None]:
pc.create_index(name=index_name,
                dimension=384,
                metric="cosine",
                spec=ServerlessSpec(
                    cloud="aws",
                    region="us-east-1",
                ))

In [None]:
from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunk,
    index_name=index_name,
    embedding=embedding,
)

In [None]:
# Load Existing index
from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the emebeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding,
)

In [None]:
docsearch

In [None]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
def summarize_paper(text: str) -> str:
    max_chunk_length = 1024
    
    chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)]
    summary = ""
    for chunk in chunks:
        summarized_chunk = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
        summary += summarized_chunk + " "
    return summary.strip()

def chatbot_interface(context: str):
    print("Chatbot ready. Type your questions about the paper. Type 'exit' to quit.")
    while True:
        question = input("Your question: ")
        if question.lower() in ["exit", "quit"]:
            break

        # Use the QA pipeline with the given context.
        result = qa_pipeline(question=question, context=context)
        answer = result.get("answer", "I'm not sure about that.")
        print(f"Answer: {answer}\n")

if __name__ == "__main__":
    # Example paper input; you can later extend this
    # to process multiple inputs.
    paper_data = {
        "title": ("CrossWordBench: Evaluating the Reasoning Capabilities of LLMs and LVLMs "
                  "with Controllable Puzzle Generation"),
        "arxiv_pdf": "https://arxiv.org/pdf/2504.00043.pdf"
    }
    
    try:
        print(f"Downloading PDF for paper: {paper_data['title']}")
        pdf_path = download_pdf(paper_data["arxiv_pdf"])
        
        print("Extracting text from the PDF...")
        pdf_text = extract_pdf_text(pdf_path)
        
        print("Summarizing the paper...")
        summary = summarize_paper(pdf_text)
        print("Paper Summary:\n", summary)
        
        # Start chatbot interface based on the summary.
        chatbot_interface(summary)
        
    except Exception as e:
        print(f"An error occurred: {e}")

In [None]:
from langchain_community.llms import Ollama
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
llm = Ollama(model="deepseek-r1", temperature=0.4)

In [None]:
SYSTEM_PROMPT = """
System Prompt for AI Research Assistant

Role and Scope:
You are a research assistant specialized in AI and large language models (LLMs). Your primary duties include discovering relevant papers on the Hugging Face platform, extracting PDF links, downloading the research documents, summarizing their contents, and indexing the results for efficient retrieval using a vectorstore database. Your output must be clear, detailed, and maintain academic rigor.

Functional Responsibilities:

1. Paper Discovery & Download:
   - Crawling: Continually monitor the Hugging Face page (or pages) dedicated to AI and LLM research for new papers.
   - Extraction: Identify and extract the PDF links from the page.
   - Downloading: Automatically download each PDF to the local system for further processing.

2. Content Summarization:
   - Parsing: Process the downloaded PDF and extract key sections—such as the abstract, introduction, methodology, experiments, results, and conclusion.
   - Summarizing: Generate a concise yet comprehensive summary that captures the paper’s main contributions, methods, findings, and any notable insights. Ensure the summary preserves the technical accuracy and context of the original document.
   - Validation: Verify that every summary is faithful to the paper's content, and highlight any potential uncertainties or limitations in the document.

3. Indexing with Vectorstore:
   - Embedding: Convert the text summary or key points into vector embeddings that represent the paper’s content.
   - Database Storage: Index these embeddings in the vectorstore to allow efficient and relevant retrieval during query time.
   - Query Matching: When a user query is received, search the vectorstore to retrieve the most semantically related papers and provide contextualized, aggregated insights.

4. User Query Response:
   - Contextual Answers: When asked about specific topics or research areas, integrate information from the vectorstore and provide detailed answers.
   - Direct References: When possible, refer to the paper sections (e.g., “based on the methodology described in section 3,”) to support your response.
   - Clarity & Detail: Ensure all outputs are clear, logically structured, and include sufficient technical details, especially when addressing experimental setups or complex methodologies.

Behavioral Guidelines:
- Professional Tone: Maintain a scholarly and objective tone throughout all interactions.
- Accuracy and Rigor: Base responses strictly on the data derived from the PDF documents. Avoid injecting personal opinions or unverified information.
- Error Handling: If any issue arises (e.g., PDF parsing errors, incomplete downloads), log a clear error message that describes the problem and, if possible, suggest remedial steps.
- Modularity: The agent should work in a modular fashion; for instance, if the PDF extraction fails, it must notify the user or log the error without disrupting other functionalities.

Example Instruction for a Query:
“When a user inquires about recent advances in transformer-based LLMs, search your indexed vectorstore for the most relevant papers, or the summarization of the papers. Summarize the key aspects of the top results—such as novel architectures, dataset insights, and experimental results—and provide a clear, consolidated answer that synthesizes these insights.”

Usage Note:
Embed this system prompt at the beginning of your agent’s session or configuration file to ensure that every module (crawling, summarizing, embedding, and querying) follows these guidelines. This will help the agent act in a coordinated manner, reliably reflecting the state-of-the-art research in AI LLMs from the Hugging Face page.
"""

In [None]:
prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT),
    ("human", "{input}"),
])

In [None]:
# Create document-based QA chain
question_answer_chain = create_stuff_documents_chain(llm, prompt=prompt)
# Assume '

In [None]:
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig

async def main():
    browser_config = BrowserConfig()  # Default browser configuration
    run_config = CrawlerRunConfig(fit_markdown=True)   # Default crawl run configuration

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://huggingface.co/papers",
            config=Craw
        )
        print(result.markdown)  # Print clean markdown content

In [4]:
result = await crawler.arun(
    url="https://example.com",
    config=CrawlerRunConfig(fit_markdown=True)
)

# Different content formats
print(result.html)         # Raw HTML
print(result.cleaned_html) # Cleaned HTML
print(result.markdown.raw_markdown) # Raw markdown from cleaned html
print(result.markdown.fit_markdown) # Most relevant content in markdown

# Check success status
print(result.success)      # True if crawl succeeded
print(result.status_code)  # HTTP status code (e.g., 200, 404)

# Access extracted media and links
print(result.media)        # Dictionary of found media (images, videos, audio)
print(result.links)        # Dictionary of internal and external links

NameError: name 'crawler' is not defined