In [None]:
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Setup Chrome options (shared, read-only, by every driver created below)
chrome_options = Options()
for _chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    # Chrome documents this switch as "width,height". The comma form is used
    # here because the "1920x1080" spelling is not guaranteed to be parsed by
    # every headless implementation.
    "--window-size=1920,1080",
    "--log-level=3",
    "--start-maximized",
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
):
    chrome_options.add_argument(_chrome_flag)
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])

# Path to ChromeDriver
chromedriver_path = r"C:\Users\yujit\OneDrive\Desktop\chromedriver-win64\chromedriver.exe"

# Create output folder (exist_ok avoids the check-then-create race)
output_folder = "IMF"
os.makedirs(output_folder, exist_ok=True)

# Base URL of IMF eLibrary; the page number is appended directly after "page="
base_url = "https://www.elibrary.imf.org/browse?page="

def init_driver():
    """Return a new Chrome WebDriver.

    The driver is started from the executable at ``chromedriver_path`` and
    configured with the module-level ``chrome_options``.
    """
    return webdriver.Chrome(
        service=Service(chromedriver_path),
        options=chrome_options,
    )

# CSS selector for the title node of each result on a browse page.
_TITLE_SELECTOR = ".typography-body.text-title.fw-4"


def _class_token_test(token):
    """Return an XPath predicate that is true when @class contains *token*."""
    return f"contains(concat(' ', normalize-space(@class), ' '), ' {token} ')"


# XPath predicate equivalent to _TITLE_SELECTOR.
_TITLE_XPATH_TEST = " and ".join(
    _class_token_test(token) for token in ("typography-body", "text-title", "fw-4")
)

# Evaluated relative to a title node: selects the outermost ancestor (or the
# node itself) whose subtree contains at most one title node, i.e. the largest
# part of the page that belongs to this result only.
# NOTE(review): assumes each result's metadata <dl> lists live in the same
# per-result container as its title -- confirm against the live page markup.
_RESULT_CONTAINER_XPATH = (
    f"./ancestor-or-self::*[count(.//*[{_TITLE_XPATH_TEST}]) <= 1][last()]"
)

# Output column -> candidate XPaths, tried in order, relative to the result
# container. Insertion order is the column order of the saved spreadsheet.
_METADATA_XPATHS = {
    "Type": (".//dl[@class='type c-List__items']/dd/span",),
    "Source Title": (".//dl[@class='sourcetitle c-List__items']/dd/span",),
    "Volume": (
        ".//dl[@class='volumeissue c-List__items']/dd/span/a",
        ".//dl[@class='volumeissue c-List__items']/dd/span",
    ),
    "Series": (".//dl[@class='editorialType c-List__items']/dd/span",),
    "Publisher": (".//dl[@class='publisher c-List__items']/dd/span",),
    "Publication Date": (".//dl[@class='printpubdate c-List__items']/dd/span",),
    "Language": (".//dl[@class='language c-List__items']/dd/span",),
    "DOI": (".//dl[@class='doi c-List__items']/dd/span/a",),
    "ISBN": (".//dl[@class='formatisbn c-List__items']/dd/span",),
}


def _text_or_none(scope, xpath):
    """Return the stripped text of the first *xpath* match under *scope*, or None."""
    try:
        return scope.find_element(By.XPATH, xpath).text.strip()
    except WebDriverException:
        # Best effort: a missing (or stale) element is not an error here.
        return None


def _first_text(scope, *xpaths):
    """Return the first non-empty text among *xpaths*, looked up under *scope*.

    Later XPaths are tried when an earlier one is missing or empty. If no
    candidate yields non-empty text, the last candidate decides the result:
    "N/A" when it is missing, "" when it exists but is empty.
    """
    text = None
    for xpath in xpaths:
        text = _text_or_none(scope, xpath)
        if text:
            return text
    return "N/A" if text is None else text


def _extract_record(title_node):
    """Build the metadata dict for the result that *title_node* belongs to."""
    try:
        title = title_node.find_element(By.TAG_NAME, "a").text.strip()
    except WebDriverException:
        title = "N/A"

    # Scope every metadata lookup to this result. Searching the whole document
    # would return the first result's values for every row.
    try:
        scope = title_node.find_element(By.XPATH, _RESULT_CONTAINER_XPATH)
    except WebDriverException:
        scope = title_node

    record = {"Title": title}
    for column, xpaths in _METADATA_XPATHS.items():
        record[column] = _first_text(scope, *xpaths)
    return record


# Function to scrape a page
def scrape_page(page_num):
    """Scrape one browse page and save its records to an Excel file.

    A fresh WebDriver is created for the call and always shut down, even when
    loading or parsing the page raises.

    Args:
        page_num: Page number appended to ``base_url``.

    Returns:
        A list with one dict per title node found on the page (empty when the
        page has none). Each dict has the keys "Title", "Type",
        "Source Title", "Volume", "Series", "Publisher", "Publication Date",
        "Language", "DOI" and "ISBN"; fields that cannot be found are "N/A".

    Side effects:
        Writes ``IMF_Page_<page_num>.xlsx`` into ``output_folder`` and prints
        a progress line.
    """
    driver = init_driver()
    try:
        driver.get(f"{base_url}{page_num}&pageSize=50&sort=datedescending")
        time.sleep(5)  # Allow page to load

        title_nodes = driver.find_elements(By.CSS_SELECTOR, _TITLE_SELECTOR)
        # Extract while the driver is still alive; elements are unusable after quit().
        local_data = [_extract_record(node) for node in title_nodes]
    finally:
        driver.quit()

    # Save data to a separate file for each page
    df = pd.DataFrame(local_data)
    output_file = os.path.join(output_folder, f"IMF_Page_{page_num}.xlsx")
    df.to_excel(output_file, index=False)
    print(f"Page {page_num} scraped and saved as {output_file}")

    return local_data

# Multi-threaded scraping: pages are scraped in batches of `max_workers`
# until a page comes back with no records.
page_number = 1  # Start page
max_workers = 2  # Number of pages to scrape in parallel
pages_scraped = 0  # Non-empty pages scraped since the last pause

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    reached_end = False
    while not reached_end:
        futures = {
            executor.submit(scrape_page, page_num): page_num
            for page_num in range(page_number, page_number + max_workers)
        }

        all_results = []
        # Drain the whole batch so no completed page is silently ignored.
        # An exception raised inside a worker propagates from result().
        for future in as_completed(futures):
            result = future.result()
            if not result:
                print(f"No more data found on page {futures[future]}. Stopping scraping.")
                # Remember to stop the outer loop; a plain `break` here would
                # only leave this inner loop.
                reached_end = True
                continue
            all_results.extend(result)
            pages_scraped += 1

        page_number += max_workers

        # Check for the end before pausing, so the script never sleeps right
        # before exiting.
        if reached_end:
            break

        # Pause for 2 minutes after every 12 pages scraped
        if pages_scraped >= 12:
            print("Pausing for 2 minutes to avoid detection...")
            time.sleep(120)
            pages_scraped = 0  # Reset counter

print("Scraping complete.")

Page 1987 scraped and saved as IMF\IMF_Page_1987.xlsxPage 1986 scraped and saved as IMF\IMF_Page_1986.xlsx

Page 1988 scraped and saved as IMF\IMF_Page_1988.xlsx
Page 1989 scraped and saved as IMF\IMF_Page_1989.xlsx
Page 1991 scraped and saved as IMF\IMF_Page_1991.xlsx
Page 1990 scraped and saved as IMF\IMF_Page_1990.xlsx
Page 1992 scraped and saved as IMF\IMF_Page_1992.xlsx
Page 1993 scraped and saved as IMF\IMF_Page_1993.xlsx
Page 1994 scraped and saved as IMF\IMF_Page_1994.xlsx
Page 1995 scraped and saved as IMF\IMF_Page_1995.xlsx
Page 1997 scraped and saved as IMF\IMF_Page_1997.xlsx
Page 1996 scraped and saved as IMF\IMF_Page_1996.xlsx
Pausing for 2 minutes to avoid detection...
Page 1998 scraped and saved as IMF\IMF_Page_1998.xlsx
Page 1999 scraped and saved as IMF\IMF_Page_1999.xlsx
Page 2000 scraped and saved as IMF\IMF_Page_2000.xlsx
Page 2001 scraped and saved as IMF\IMF_Page_2001.xlsx
Page 2002 scraped and saved as IMF\IMF_Page_2002.xlsx
Page 2003 scraped and saved as IMF\IMF