In [6]:
import requests
import bibtexparser
import json
import io
import html2text
import urllib.request
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader


In [9]:
def extract_content_html(url="https://www.annualreviews.org/content/journals/10.1146/annurev-economics-080614-115430"):
  """
  Goal: Given a URL, extract the content of the page in plain text.
  """
  
  user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0"

  headers={
    'User-Agent':user_agent,
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive'
  }

  request = urllib.request.Request(url, None, headers)
  try:
      html = urllib.request.urlopen(request).read()
      html_str = html.decode('utf-8')
      text = html2text.html2text(html_str)
      print(text)
  except urllib.error.HTTPError as e:
      print(f"HTTP Error {e.code}: {e.reason}")
      
extract_content_html()


Menu

  * [Publications A-Z](/content/publications)
  *   [Journal Information](/journal-info)
  *   [About](/about)
  *   [Subscribe](/page/subscriptions/general-information)
  * [Give](/support-annual-reviews)
  *     * [ Personal Register/Sign-in ](/registration/signin-or-register.action?signInTarget=%2Fcontent%2Fjournals%2F10.1146%2Fannurev-economics-080614-115430)

    *     * [ 0 Cart ](/cart "Show shopping cart")

    * [Help](/help/main)

Journal Information

  * [Author Resource Center](/page/authors/general-information)
  * [Copyright & Permissions](/page/about/copyright-and-permissions)
  * [Add To Your Course Reader](/page/help/coursereader)
  * [Expected Publication Dates](/page/journal/pubdates)
  * [Impact Factor Rankings](/page/about/isi-rankings)
  * [Access Metadata](/page/about/metadata)
  * [RSS Feeds](/page/about/rssfeeds)

About

  * [What We Do](/about/what-we-do)
  * [Subscribe to Open](/S2O)
  * [Founder & History](/about/our-founder-and-early-history)
  * [_Kn

In [None]:
def get_paper_data(doi):
    url = f'https://api.openalex.org/works/https://doi.org/{doi}'
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to retrieve data for DOI: {doi}")
        return None

In [None]:
def extract_content(url):
    response = requests.get(url)
    if response.status_code == 200:
        if url.endswith('.pdf'):
            try:
                pdf = PdfReader(io.BytesIO(response.content))
                content = ""
                for page in pdf.pages:
                    content += page.extract_text()
                return content
            except:
                print(f"Failed to extract PDF content from {url}")
        else:
            return extract_content_html(url)
    return None

In [None]:
def process_paper(paper):
    doi = paper.get('doi')
    if not doi:
        return None

    data = get_paper_data(doi)
    if not data:
        return None

    result = {
        'doi': doi,
        'title': data.get('title'),
        'authors': [author['author']['display_name'] for author in data.get('authorships', [])],
        'abstract': data.get('abstract'),
        'references': data.get('referenced_works', []),
        'metadata': data
    }

    # Try to extract full content
    for location in data.get('locations', []):
        pdf_url = location.get('pdf_url')
        landing_page_url = location.get('landing_page_url')
        if landing_page_url:
            content = extract_content(landing_page_url)
            if content:
                result['full_content'] = content
                break
        elif pdf_url:
            content = extract_content(pdf_url)
            if content:
                result['full_content'] = content
                break
        

    if 'full_content' not in result:
        print(f"Could not extract full content for DOI: {doi}")

    return result

In [None]:
def main():
    with open('seedPapers.bib') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    papers = bib_database.entries

    all_papers_data = []

    for paper in papers:
        paper_data = process_paper(paper)
        if paper_data:
            all_papers_data.append(paper_data)

            # Process references
            for ref_doi in paper_data['references']:
                ref_data = process_paper({'doi': ref_doi})
                if ref_data:
                    all_papers_data.append(ref_data)

    with open('all_papers_data.json', 'w') as outfile:
        json.dump(all_papers_data, outfile, indent=2)

if __name__ == "__main__":
    main()

### Testing different extract_html_content methods:

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def extract_content_html(url="https://www.annualreviews.org/content/journals/10.1146/annurev-economics-080614-115430"):
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(options=chrome_options)
    
    try:
        driver.get(url)
        
        # Wait for the full text to load
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "hlFld-Fulltext"))
        )
        
        # Scroll to load all content
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait for page to load
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        
        # Extract the full text
        full_text_element = driver.find_element(By.CLASS_NAME, "hlFld-Fulltext")
        full_text = full_text_element.text
        
        print(full_text)
        return full_text
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    
    finally:
        driver.quit()

extract_content_html()

An error occurred: Message: 
Stacktrace:
0   chromedriver                        0x0000000102db2a0c chromedriver + 4385292
1   chromedriver                        0x0000000102dab318 chromedriver + 4354840
2   chromedriver                        0x00000001029c8b0c chromedriver + 281356
3   chromedriver                        0x0000000102a0b2f8 chromedriver + 553720
4   chromedriver                        0x0000000102a43d24 chromedriver + 785700
5   chromedriver                        0x00000001029ffeec chromedriver + 507628
6   chromedriver                        0x0000000102a008c4 chromedriver + 510148
7   chromedriver                        0x0000000102d7a3c8 chromedriver + 4154312
8   chromedriver                        0x0000000102d7ee2c chromedriver + 4173356
9   chromedriver                        0x0000000102d5ff84 chromedriver + 4046724
10  chromedriver                        0x0000000102d7f718 chromedriver + 4175640
11  chromedriver                        0x0000000102d52f44 chr

In [18]:
from requests_html import HTMLSession, AsyncHTMLSession
import time

def extract_content_html(url="https://www.annualreviews.org/content/journals/10.1146/annurev-economics-080614-115430"):
    session = AsyncHTMLSession()
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }
    
    try:
        response = session.get(url, headers=headers)
        response.html.render(sleep=5, keep_page=True, scrolldown=5, timeout=30)
        
        # Wait for the full text to load
        time.sleep(10)
        
        # Extract the full text
        full_text_element = response.html.find('.hlFld-Fulltext', first=True)
        
        if full_text_element:
            full_text = full_text_element.text
            print(full_text)
            return full_text
        else:
            print("Full text element not found.")
            return None
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    
    finally:
        session.close()

extract_content_html()

An error occurred: '_asyncio.Future' object has no attribute 'html'


  session.close()


In [19]:
import asyncio
from requests_html import AsyncHTMLSession
import time

async def extract_content_html(url="https://www.annualreviews.org/content/journals/10.1146/annurev-economics-080614-115430"):
    session = AsyncHTMLSession()
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }
    
    try:
        response = await session.get(url, headers=headers)
        await response.html.arender(sleep=5, keep_page=True, scrolldown=5, timeout=30)
        
        # Wait for the full text to load
        await asyncio.sleep(10)
        
        # Extract the full text
        full_text_element = response.html.find('.hlFld-Fulltext', first=True)
        
        if full_text_element:
            full_text = full_text_element.text
            print(full_text)
            return full_text
        else:
            print("Full text element not found.")
            return None
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    
    finally:
        await session.close()

# Run the asynchronous function
asyncio.run(extract_content_html())

RuntimeError: asyncio.run() cannot be called from a running event loop

In [23]:
import asyncio
from requests_html import AsyncHTMLSession
import time
import nest_asyncio

# Apply the nest_asyncio patch
nest_asyncio.apply()

async def extract_content_html(url="https://www.annualreviews.org/content/journals/10.1146/annurev-economics-080614-115430"):
    session = AsyncHTMLSession()
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }
    
    try:
        response = await session.get(url, headers=headers)
        await response.html.arender(sleep=5, keep_page=True, scrolldown=5, timeout=30)
        
        # Wait for the full text to load
        await asyncio.sleep(10)
        
        # Extract the full text
        full_text_element = response.html.find('.hlFld-Fulltext', first=True)
        
        if full_text_element:
            full_text = full_text_element.text
            print(full_text)
            return full_text
        else:
            print("Full text element not found.")
            return None
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    
    finally:
        await session.close()

# Run the asynchronous function
await extract_content_html()

Full text element not found.


In [25]:
import requests
from bs4 import BeautifulSoup
import PyPDF2
import re
from collections import deque
from io import BytesIO

def get_html_content(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    else:
        return None

def extract_text_from_pdf(pdf_content):
    pdf_text = ""
    pdf_file = BytesIO(pdf_content)
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    for page in range(pdf_reader.numPages):
        pdf_text += pdf_reader.getPage(page).extract_text()
    return pdf_text

def extract_content_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    text_content = soup.get_text(separator="\n")
    return text_content

def get_article_content(url):
    html_content = get_html_content(url)
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')
        pdf_link = soup.find('a', string='PDF')
        
        if pdf_link:
            pdf_url = pdf_link['href']
            pdf_content = requests.get(pdf_url).content
            return extract_text_from_pdf(pdf_content)
        else:
            return extract_content_from_html(html_content)
    return None

def process_paper(doi):
    openalex_url = f"https://api.openalex.org/works/{doi}"
    metadata = requests.get(openalex_url).json()
    landing_page_url = metadata.get('primary_location', {}).get('landing_page_url', None)
    pdf_url = metadata.get('primary_location', {}).get('pdf_url', None)
    
    if landing_page_url:
        main_content = get_article_content(landing_page_url)
    elif pdf_url:
        pdf_content = requests.get(pdf_url).content
        main_content = extract_text_from_pdf(pdf_content)
    else:
        main_content = None

    if main_content:
        abstract = metadata.get('abstract', 'Abstract not found')
        introduction = re.search(r'Introduction(.*?)\n\n', main_content, re.DOTALL)
        introduction = introduction.group(1) if introduction else "Introduction not found"
        
        paper_data = {
            "metadata": metadata,
            "abstract": abstract,
            "introduction": introduction,
            "main_content": main_content
        }
        return paper_data
    return None

def main():
    doi_list = [
        "https://10.1146/annurev-economics-080614-115430"
        # Add more DOIs as needed
    ]
    
    papers_data = []
    processed_dois = set(doi_list)  # Initialize with DOIs from the original list
    queue = deque([(doi, 0) for doi in doi_list])  # Queue for BFS with level tracking

    while queue:
        current_doi, level = queue.popleft()
        if level > 2:  # Stop after level 2
            continue
        
        paper_data = process_paper(current_doi)
        if paper_data:
            if level == 0:
                papers_data.append(paper_data)
            else:
                for parent in papers_data:
                    if current_doi in parent['metadata']['referenced_works']:
                        parent['references'].append(paper_data)
                        break
            
            if level < 2:
                references = get_references(current_doi, processed_dois)
                for ref_data in references:
                    queue.append((ref_data['metadata']['doi'], level + 1))

    with open("papers_data.json", "w") as f:
        json.dump(papers_data, f, indent=4)

if __name__ == "__main__":
    main()


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [26]:
import requests
import bibtexparser
from PyPDF2 import PdfReader
import json
import io
from bs4 import BeautifulSoup
from collections import deque

def get_paper_data(doi):
    url = f'https://api.openalex.org/works/https://doi.org/{doi}'
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve data for DOI: {doi} - {str(e)}")
        return None

def extract_pdf_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        pdf = PdfReader(io.BytesIO(response.content))
        content = ""
        for page in pdf.pages:
            content += page.extract_text()
        return content
    except Exception as e:
        print(f"Failed to extract PDF content from {url}: {str(e)}")
    return None

def extract_html_content(url):
    try:
        user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0"
        headers = {
            'User-Agent': user_agent,
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        article_content = soup.find('div', class_='article-content')
        if article_content:
            return article_content.get_text(separator='\n', strip=True)
        else:
            return soup.get_text(separator='\n', strip=True)
    except Exception as e:
        print(f"Failed to extract HTML content from {url}: {str(e)}")
    return None

def extract_content(locations):
    for location in locations:
        pdf_url = location.get('pdf_url')
        landing_page_url = location.get('landing_page_url')
        
        if landing_page_url:
            content = extract_html_content(landing_page_url)
            if content:
                return content
            
        elif pdf_url:
            content = extract_pdf_content(pdf_url)
            if content:
                return content
        
    return None

def process_paper(paper):
    doi = paper.get('doi')
    if not doi:
        return None

    data = get_paper_data(doi)
    if not data:
        return None

    result = {
        'doi': doi,
        'title': data.get('title'),
        'authors': [author['author']['display_name'] for author in data.get('authorships', [])],
        'abstract': data.get('abstract'),
        'references': data.get('referenced_works', []),
        'metadata': data,
        'level': paper.get('level', 0)
    }

    # Try to extract full content
    locations = data.get('locations', [])
    if locations:
        content = extract_content(locations)
        if content:
            result['full_content'] = content
        else:
            print(f"Could not extract full content for DOI: {doi}")
    else:
        print(f"No locations found for DOI: {doi}")

    return result

def breadth_first_search(seed_papers, max_level=2):
    all_papers_data = []
    processed_dois = set()
    queue = deque([(paper, 0) for paper in seed_papers])

    while queue:
        current_paper, level = queue.popleft()
        
        if level > max_level:
            continue

        doi = current_paper.get('doi')
        if doi and doi not in processed_dois:
            processed_dois.add(doi)
            current_paper['level'] = level
            paper_data = process_paper(current_paper)
            
            if paper_data:
                all_papers_data.append(paper_data)
                
                if level < max_level:
                    for ref_doi in paper_data['references']:
                        if ref_doi not in processed_dois:
                            queue.append(({'doi': ref_doi}, level + 1))

    return all_papers_data

def main():
    with open('seedPapers.bib') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    seed_papers = bib_database.entries

    all_papers_data = breadth_first_search(seed_papers, max_level=2)

    with open('all_papers_data.json', 'w') as outfile:
        json.dump(all_papers_data, outfile, indent=2)

if __name__ == "__main__":
    main()


  self.object_was_parsed(o)


Failed to extract HTML content from https://doi.org/10.1029/2020ef001532: 403 Client Error: Forbidden for url: https://agupubs.onlinelibrary.wiley.com/doi/10.1029/2020EF001532


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Failed to extract HTML content from https://doi.org/10.2307/20049302: 403 Client Error: Forbidden for url: https://www.jstor.org/stable/10.2307/20049302?origin=crossref
Could not extract full content for DOI: https://openalex.org/W1576727408
Failed to extract HTML content from https://doi.org/10.3982/ecta10233: 403 Client Error: Forbidden for url: https://onlinelibrary.wiley.com/resolve/doi?DOI=10.3982/ECTA10233
Could not extract full content for DOI: https://openalex.org/W1595896150
Failed to extract HTML content from https://doi.org/10.2993/0278-0771(2005)25[143:br]2.0.co;2: 403 Client Error: Forbidden for url: https://journals.sagepub.com/doi/full/10.2993/0278-0771_2005_25_143_br_2.0.co_2
Could not extract full content for DOI: https://openalex.org/W1858777606
Failed to extract HTML content from https://doi.org/10.1113/expphysiol.2007.041848: 403 Client Error: Forbidden for url: https://physoc.onlinelibrary.wiley.com/doi/10.1113/expphysiol.2007.041848
Failed to extract HTML content 

KeyboardInterrupt: 