In [1]:
import os
from dotenv import load_dotenv
from langchain_community.tools.tavily_search import TavilySearchResults

load_dotenv()

True

In [2]:
os.environ['TAVILY_API_KEY'] = os.getenv("TAVILY_API_KEY")

In [3]:
search_tool = TavilySearchResults(max_results=3)

In [4]:
search_tool.name, search_tool.description

('tavily_search_results_json',
 'A search engine optimized for comprehensive, accurate, and trusted results. Useful for when you need to answer questions about current events. Input should be a search query.')

In [5]:
search_tool.invoke("What is ICDD dataset?")

[{'title': 'The International Centre for Diffraction Data -',
  'url': 'https://www.icdd.com/',
  'content': 'The International Centre for Diffraction Data (ICDD®) is a non-profit scientific organization dedicated to collecting, editing, publishing, and distributing powder diffraction data for the identification of materials. The membership of the ICDD consists of worldwide representation from academe, government, and industry.\n\nDIFFRACTION DATABASES YOU CAN TRUST [...] ICDD database operations, based in Pennsylvania, USA, is the only crystallographic database organization in the world with its Quality Management System ISO 9001:2015 certified by DEKRA.\n\nContact ICDD\n\nMost Visited Pages [...] effectiveness in their applications of XRD, XRF, and Rietveld refinement in various scientific fields.',
  'score': 0.921591},
 {'title': 'International Centre for Diffraction Data ICDD Session I',
  'url': 'https://www.nationalacademies.org/event/03-28-2022/international-centre-for-diffract

### Extract abstract of a paper


In [6]:
from langchain_community.tools.tavily_search import TavilySearchResults

def get_paper_abstract_with_langchain(paper_title: str, api_key: str = None) -> str:
    """
    Uses LangChain's TavilySearchResults to get the abstract of a paper based on its title.
    
    Args:
        paper_title (str): Title of the paper to search.
        api_key (str, optional): Tavily API key (or set via env var TAVILY_API_KEY).
    
    Returns:
        str: Extracted abstract or message if not found.
    """
    # You can also set TAVILY_API_KEY as an env variable, and skip passing api_key
    search_tool = TavilySearchResults(api_key=api_key, search_depth="advanced") if api_key else TavilySearchResults()

    query = f"abstract of the academic paper titled '{paper_title}'"
    results = search_tool.invoke(query, )
    
    for result in results:
        content = result.get("content", "")
        if "abstract" in content.lower():
            return content.strip()
    
    return "Abstract not found. You may need to refine the paper title or check the source manually."


In [7]:
raw_content = get_paper_abstract_with_langchain("Attention Is All You Need")
print(raw_content)


Attention Is All You Need Ashish Vaswani∗ Google Brain avaswani@google.com Noam Shazeer∗ Google Brain noam@google.com Niki Parmar∗ Google Research nikip@google.com Jakob Uszkoreit∗ Google Research usz@google.com Llion Jones∗ Google Research llion@google.com Aidan N. Gomez∗† University of Toronto aidan@cs.toronto.edu Łukasz Kaiser∗ Google Brain lukaszkaiser@google.com Illia Polosukhin∗‡ illia.polosukhin@gmail.com Abstract The dominant sequence transduction models are based on complex recurrent or [...] 7 Conclusion In this work, we presented the Transformer, the ﬁrst sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention.


In [8]:
import re

def extract_abstract(text: str) -> str:
    """
    Extracts the abstract from a block of academic paper text using a more flexible regex
    that works even when sections aren't separated by newlines.
    """
    # Match "Abstract" followed by some text, stopping before common section keywords
    pattern = r"(?i)abstract[\s:]*([\s\S]{20,2000}?)\s(?:1\s|I\.?|introduction|background|methods|conclusion|keywords|results|we present|in this paper|the proposed)"

    match = re.search(pattern, text)
    if match:
        return match.group(1).strip()

    return "Abstract not found."


In [9]:
raw_content

'Attention Is All You Need Ashish Vaswani∗ Google Brain avaswani@google.com Noam Shazeer∗ Google Brain noam@google.com Niki Parmar∗ Google Research nikip@google.com Jakob Uszkoreit∗ Google Research usz@google.com Llion Jones∗ Google Research llion@google.com Aidan N. Gomez∗† University of Toronto aidan@cs.toronto.edu Łukasz Kaiser∗ Google Brain lukaszkaiser@google.com Illia Polosukhin∗‡ illia.polosukhin@gmail.com Abstract The dominant sequence transduction models are based on complex recurrent or [...] 7 Conclusion In this work, we presented the Transformer, the ﬁrst sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention.'

In [10]:
raw_content = get_paper_abstract_with_langchain("Attention Is All You Need")
abstract = extract_abstract(raw_content)
print(abstract)

The dominant sequence transduction models are based on complex recurrent or [...] 7


In [11]:
import requests
from bs4 import BeautifulSoup

def extract_abstract_from_html(url: str) -> str:
    """
    Downloads and parses an academic paper HTML page to extract the abstract.
    
    Args:
        url (str): URL of the paper page (e.g., arXiv or Springer).
    
    Returns:
        str: Extracted abstract, or message if not found.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Common pattern for arXiv
        abstract_div = soup.find("blockquote", class_="abstract")
        if abstract_div:
            return abstract_div.get_text(strip=True).replace("Abstract:", "").strip()
        
        # Other common patterns (e.g., Springer)
        abstract_section = soup.find("section", {"class": "Abstract"})
        if abstract_section:
            return abstract_section.get_text(strip=True)

        # Generic fallback
        possible_abstracts = soup.find_all(["p", "div"], string=lambda s: s and "abstract" in s.lower())
        for element in possible_abstracts:
            if "abstract" in element.get_text().lower():
                return element.get_text(strip=True).split("abstract", 1)[-1].strip()

        return "Abstract not found in HTML."

    except Exception as e:
        return f"Error fetching or parsing HTML: {e}"


In [12]:
from langchain_community.tools.tavily_search import TavilySearchResults

def get_paper_abstract_from_html(paper_title: str, api_key: str = None) -> str:
    tavily = TavilySearchResults(api_key=api_key) if api_key else TavilySearchResults()
    query = f"{paper_title} site:arxiv.org OR site:springer.com OR site:nature.com"
    results = tavily.invoke(query)

    for result in results:
        url = result.get("url")
        if url:
            abstract = extract_abstract_from_html(url)
            if "not found" not in abstract.lower():
                return abstract
    return "Abstract could not be extracted from HTML."


In [13]:
raw_content = get_paper_abstract_from_html("Attention Is All You Need")
print(raw_content)

The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transforme

In [14]:
from langchain_community.tools.tavily_search import TavilySearchResults

def get_paper_abstract_from_html(paper_title: str, api_key: str = None) -> str:
    tavily = TavilySearchResults(api_key=api_key) if api_key else TavilySearchResults()
    query = f"{paper_title} site:arxiv.org OR site:springer.com OR site:nature.com"
    results = tavily.invoke(query)

    for result in results:
        url = result.get("url")
        if url:
            abstract = extract_abstract_from_html(url)
            if "not found" not in abstract.lower():
                return abstract
    return "Abstract could not be extracted from HTML."


In [15]:
raw_content = get_paper_abstract_from_html("Attention Is All You Need")
print(raw_content)

The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transforme

In [16]:
# Test on my paper
paper_title = "Mechanochemical Association Reaction of Interfacial Molecules Driven by Shear"
abstract = get_paper_abstract_from_html(paper_title)
print(abstract)

Abstract could not be extracted from HTML.


⚠️ Problem with ACS and Other Journals
1. Abstract is often available, but:
It’s embedded inside complex HTML structures (JavaScript-heavy pages).

It may be hidden behind a cookie or paywall modal, even though the abstract is public.

Some require a browser-like session to load the full DOM.

2. Requests + BeautifulSoup may not see the abstract at all:
The page may return a limited version of the HTML without running JS.

You might get a skeleton page or a redirect message.

✅ Solutions by Journal Type
Journal	Abstract Availability	Scrape-Friendly?	Notes
arXiv	✅ Always available	✅ Easy	Standard HTML structure
Springer/Nature	✅ Public abstracts	✅ With parsing	Usually inside <section class="Abstract">
ACS Publications	✅ Abstract shown	⚠️ Sometimes	May need session headers or JS rendering
Elsevier (ScienceDirect)	✅	⚠️ Often JS	Use Selenium/Playwright or trafilatura
IEEE/ACM	✅	⚠️	Often HTML is wrapped or requires custom headers
Wiley/Taylor & Francis	✅	⚠️	Similar JS-heavy issues


In [17]:
raw_content = get_paper_abstract_from_html(paper_title)
print(raw_content)

Abstract could not be extracted from HTML.


In [19]:
import trafilatura

def extract_abstract_with_trafilatura(url: str) -> str:
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        return "Failed to download page."
    
    result = trafilatura.extract(downloaded, include_comments=False, include_tables=False, favor_recall=True)
    return result or "Abstract not found in content."


In [21]:
raw_content = extract_abstract_with_trafilatura("Attention Is All You Need")
print(raw_content)

Failed to download page.


In [34]:
# Tavily result
url = "https://pubs.acs.org/doi/full/10.1021/acs.langmuir.8b00315"

# Then use your own fetcher
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [35]:
soup

<!DOCTYPE html>
<html lang="en-US"><head><title>Just a moment...</title><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><meta content="IE=Edge" http-equiv="X-UA-Compatible"/><meta content="noindex,nofollow" name="robots"/><meta content="width=device-width,initial-scale=1" name="viewport"/><style>*{box-sizing:border-box;margin:0;padding:0}html{line-height:1.15;-webkit-text-size-adjust:100%;color:#313131;font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,Noto Sans,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol,Noto Color Emoji}body{display:flex;flex-direction:column;height:100vh;min-height:100vh}.main-content{margin:8rem auto;max-width:60rem;padding-left:1.5rem}@media (width <= 720px){.main-content{margin-top:4rem}}.h2{font-size:1.5rem;font-weight:500;line-height:2.25rem}@media (width <= 720px){.h2{font-size:1.25rem;line-height:1.5rem}}#challenge-error-text{background-image:url(data:image/svg+xml;base64,PHN2ZyB4bWx

In [39]:
def extract_abstract_from_html_soup(soup) -> str:
    # arXiv
    if (div := soup.find("blockquote", class_="abstract")):
        return div.get_text(strip=True).replace("Abstract:", "").strip()

    # Springer
    if (sec := soup.find("section", class_="Abstract")):
        return sec.get_text(strip=True)

    # ACS
    if (div := soup.find("div", class_="abstractSection abstractInFull")):
        return div.get_text(strip=True)

    # Generic fallback
    for tag in soup.find_all(["div", "section", "p"]):
        class_id_str = ' '.join(tag.get("class", [])) + ' ' + tag.get("id", "")
        if "abstract" in class_id_str.lower():
            return tag.get_text(strip=True)

    return "Abstract not found."


In [40]:
extract_abstract_from_html_soup(soup)

'Abstract not found.'

This doesn't work for ACS and other jornals!