In [None]:
import json
import time
import os
import re
import glob
import random

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

In [None]:
# # List of proxies
# proxies = [
#     "103.116.82.135:8080",
#     "102.216.84.18:8080",
#     "102.68.128.211:8080",
#     "138.117.84.199:8080"
# ]

# # Choose a random proxy from the list
# proxy = random.choice(proxies)

In [None]:
def init_google_scholar_scrapping_driver():
    """Build the Chrome WebDriver that is used to browse Google Scholar.

    The browser is started headless with console logging suppressed, using
    the chromedriver binary stored under ``utils/chromedriver-win64``.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    options = Options()

    # Command-line switches: no GUI, automation-controlled blink feature
    # disabled, and only fatal messages logged (--log-level=3).
    for switch in (
        "--headless",
        "--disable-blink-features=AutomationControlled",
        "--log-level=3",
    ):
        options.add_argument(switch)
    options.add_experimental_option("excludeSwitches", ["enable-logging"])
    # Proxy support is currently switched off (the proxy list cell is commented out).

    chromedriver_location = os.path.join(
        "utils", "chromedriver-win64", "chromedriver-win64", "chromedriver.exe"
    )

    return webdriver.Chrome(service=Service(chromedriver_location), options=options)

In [None]:
def init_pdf_downloader_driver(downloads_directory: str):
    """Build the headless Chrome WebDriver that saves PDFs to disk.

    Args:
        downloads_directory: Folder in which Chrome should store downloaded
            files. It is converted to an absolute path and created if it
            does not exist yet.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    # Chrome expects an absolute path for "download.default_directory";
    # normalise it here so that a relative argument still works, and make
    # sure the folder exists before the first download is attempted.
    downloads_directory = os.path.abspath(downloads_directory)
    os.makedirs(downloads_directory, exist_ok=True)

    chrome_options = Options()

    chrome_options.add_argument("--headless") # Run in headless mode (no GUI)
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--log-level=3")   # Suppress console logs (INFO = 0, WARNING = 1, LOG_ERROR = 2, LOG_FATAL = 3)
    chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
    # Proxy support is currently switched off (the proxy list cell is commented out).

    driver_path = os.path.join("utils", "chromedriver-win64", "chromedriver-win64", "chromedriver.exe")

    service = Service(driver_path)

    preferences = {
        "download.default_directory": downloads_directory,  # Set the download directory
        "plugins.always_open_pdf_externally": True,  # Automatically download PDFs
        "download.prompt_for_download": False,       # Disable download prompt
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True                 # Safe Browsing stays enabled
    }
    chrome_options.add_experimental_option("prefs", preferences)

    # Initialize Chrome WebDriver
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # NOTE(review): an earlier revision also sent the CDP command
    # "Page.setDownloadBehavior" (behavior="allow", downloadPath=downloads_directory)
    # here; it is left disabled, as in the original.

    return driver

In [None]:
# Landing page of Google Scholar; the scraper opens it and types the topic into its search box.
google_scholar_main_page_link = "https://scholar.google.com"

In [None]:
def get_article_title_and_link(article):
    """Extract the title and the direct document link of one search result.

    Args:
        article: Element for a single result entry, exposing Selenium's
            ``find_element`` API.

    Returns:
        A ``(title, link)`` tuple. ``title`` is the text of the entry's
        ``<h3>`` element. ``link`` is the ``href`` of the first ``<a>`` inside
        the element with class ``gs_or_ggsm``, or ``None`` when that lookup
        fails (for example because the entry has no such element).
    """
    title = article.find_element(By.TAG_NAME, "h3").text

    try:
        link_container = article.find_element(By.CLASS_NAME, "gs_or_ggsm")
        link = link_container.find_element(By.TAG_NAME, "a").get_attribute("href")
    except Exception:
        # Best effort: an entry without a document link is simply reported
        # as None. Catching Exception (instead of a bare except) lets
        # KeyboardInterrupt / SystemExit through, so the cell can be stopped.
        link = None

    return title, link

In [None]:
def update_articles_links_for_topic(driver, topic: str, max_pages: int = 5):
    """Search Google Scholar for ``topic`` and store newly found document links.

    Previously saved entries are loaded from
    ``utils/<topic_with_underscores>_articles_list.json`` (if present), new
    results are appended, and the file is rewritten after every result page
    so that progress survives a failure on a later page.

    Args:
        driver: Selenium WebDriver used to browse Google Scholar.
        topic: Search query; spaces are replaced by underscores for the
            file name and for the stored ``"topic"`` field.
        max_pages: Maximum number of result pages to scan (default 5).
    """
    topic_slug = re.sub(' ', '_', topic)
    articles_list_path = os.path.join("utils", f"{topic_slug}_articles_list.json")

    if os.path.exists(articles_list_path):
        with open(articles_list_path, "r") as infile:
            articles_list = json.load(infile)
    else:
        articles_list = []
    # A set gives constant-time membership checks and is kept up to date below,
    # so a link that shows up twice in one run is stored only once.
    saved_links = {article["google_scholar_link"] for article in articles_list}

    driver.get(google_scholar_main_page_link)

    input_bar = driver.find_element(By.XPATH, "/html/body/div/div[7]/div[1]/div[2]/form/div/input")
    input_bar.send_keys(topic)
    input_bar.send_keys(Keys.ENTER)

    time.sleep(5)

    # The "next page" link is tried at two alternative positions in the page layout.
    next_page_button_xpaths = (
        "/html/body/div[1]/div[10]/div[2]/div[3]/div[3]/div[3]/center/table/tbody/tr/td[12]/a",
        "/html/body/div/div[10]/div[2]/div[3]/div[3]/div[2]/center/table/tbody/tr/td[12]/a",
    )

    for index in range(max_pages):
        print(f"Page {index+1}")

        results_list_object = driver.find_element(By.XPATH, "/html/body/div/div[10]/div[2]/div[3]/div[2]")

        for result in results_list_object.find_elements(By.CLASS_NAME, "gs_r"):

            # Skip the section with similar results for chosen topic
            if result.find_elements(By.TAG_NAME, "h2"):
                continue

            title, link = get_article_title_and_link(result)

            if link is not None and link not in saved_links:
                print(title, link)
                articles_list.append({
                    "topic": topic_slug,
                    "title": title,
                    "google_scholar_link": link,
                })
                saved_links.add(link)

        # Persist before navigating: a failure while moving to the next page
        # must not discard the links collected on this one.
        with open(articles_list_path, "w") as outfile:
            outfile.write(json.dumps(articles_list))

        # Nothing left to scan after the last requested page.
        if index == max_pages - 1:
            break

        time.sleep(5)

        # find_elements returns an empty list instead of raising, so a missing
        # button (e.g. fewer result pages than max_pages) ends the scan cleanly.
        next_page_button = None
        for xpath in next_page_button_xpaths:
            candidates = driver.find_elements(By.XPATH, xpath)
            if candidates:
                next_page_button = candidates[0]
                break

        if next_page_button is None:
            print("No next page button found, stopping pagination")
            break

        driver.execute_script("arguments[0].click();", next_page_button)
        time.sleep(5)

In [None]:
def get_last_downloaded_file(downloads_directory):
    """Return the most recently modified entry of ``downloads_directory``.

    Args:
        downloads_directory: Folder to inspect. Its name is taken literally,
            even if it contains glob metacharacters such as ``[`` or ``*``.

    Returns:
        The path of the entry with the latest modification time, or ``None``
        when the folder is empty or does not exist. Entries whose name starts
        with a dot are not considered (they are not matched by ``*``).
    """
    # Escape the directory part so that only the trailing '*' acts as a pattern.
    pattern = os.path.join(glob.escape(downloads_directory), '*')

    latest_file = None
    latest_mtime = None
    for path in glob.glob(pattern):
        try:
            mtime = os.path.getmtime(path)
        except OSError:
            # The entry vanished between listing and stat (e.g. the browser
            # renamed a temporary download file); ignore it.
            continue
        if latest_mtime is None or mtime > latest_mtime:
            latest_file, latest_mtime = path, mtime

    return latest_file

In [None]:
def download_pdfs_from_list(driver, topic:str, max_retries: int = 3, download_timeout: float = 60.0):
    """Download the document behind every not-yet-processed link of ``topic``.

    The article list is read from
    ``utils/<topic_with_underscores>_articles_list.json``. For each entry
    without a ``"downloaded_file_name"`` key the link is opened with
    ``driver`` and the newest file in ``output/<topic_with_underscores>`` is
    compared before and after: a new file is recorded by its base name, no
    new file is recorded as ``None``. The JSON file is rewritten after every
    processed entry.

    Entries whose page could not be opened, or whose download is still
    unfinished when the timeout expires, are left without the key so that a
    later run tries them again.

    Args:
        driver: Selenium WebDriver configured to save files into the
            topic's output folder.
        topic: Topic whose article list should be processed.
        max_retries: Number of attempts to open a link before giving up on it.
        download_timeout: Maximum number of seconds to wait for a partial
            ``.crdownload`` file to be finalised.
    """
    topic_slug = re.sub(' ', '_', topic)
    downloads_directory = os.path.join("output", topic_slug)
    articles_list_path = os.path.join("utils", f"{topic_slug}_articles_list.json")

    with open(articles_list_path, "r") as infile:
        articles_list = json.load(infile)

    for article in articles_list:
        if "downloaded_file_name" in article:
            continue

        print(f"Downloading file: {article['title']}")
        previous_downloaded_file_name = get_last_downloaded_file(downloads_directory)

        # Bounded retries: an unreachable link must not hang the run forever,
        # and catching Exception (not a bare except) keeps the cell interruptible.
        page_opened = False
        for attempt in range(1, max_retries + 1):
            try:
                driver.get(article["google_scholar_link"])
                page_opened = True
                break
            except Exception as error:
                print(f"Attempt {attempt}/{max_retries} failed: {error}")
                time.sleep(1)

        if not page_opened:
            print(f"Skipping file: {article['title']}")
            continue

        time.sleep(5)

        # Chrome writes an in-progress download under a ".crdownload" name;
        # wait (up to the timeout) until the newest file is a finished one.
        downloaded_file_name = get_last_downloaded_file(downloads_directory)
        deadline = time.monotonic() + download_timeout
        while (
            downloaded_file_name is not None
            and downloaded_file_name != previous_downloaded_file_name
            and downloaded_file_name.endswith(".crdownload")
            and time.monotonic() < deadline
        ):
            time.sleep(1)
            downloaded_file_name = get_last_downloaded_file(downloads_directory)

        is_new_file = (
            downloaded_file_name is not None
            and downloaded_file_name != previous_downloaded_file_name
        )

        if is_new_file and downloaded_file_name.endswith(".crdownload"):
            # Still incomplete: do not record a temporary name.
            print(f"Download did not finish in time: {article['title']}")
            continue

        if is_new_file:
            # os.path.basename works with the separator of the current platform.
            article["downloaded_file_name"] = os.path.basename(downloaded_file_name)
        else:
            article["downloaded_file_name"] = None

        # Persist after every processed entry so that progress survives a crash.
        with open(articles_list_path, "w") as outfile:
            outfile.write(json.dumps(articles_list))

In [None]:
def wrapper(google_scholar_scrapping_driver, pdf_downloader_driver, topic:str):
    """Run both pipeline stages for ``topic`` and report progress on stdout.

    Args:
        google_scholar_scrapping_driver: Driver handed to the link-collection stage.
        pdf_downloader_driver: Driver handed to the PDF-download stage.
        topic: Topic to collect links and download PDFs for.
    """
    # Stage 1: collect the article links.
    print(f"Started extracting links for topic: {topic}")
    update_articles_links_for_topic(driver=google_scholar_scrapping_driver, topic=topic)
    print(f"Ended extracting links for topic: {topic}")

    # Stage 2: download the documents behind the collected links.
    print("Started downloading the pdfs from the links list")
    download_pdfs_from_list(driver=pdf_downloader_driver, topic=topic)
    print("Ended downloading the pdfs from the links list")

In [None]:
# Example topic. NOTE(review): the following cell assigns `topic` again, so on a
# top-to-bottom run this value is overwritten before it is used.
topic = "Computer vision in medicine"

In [None]:
# Search query for Google Scholar; replace the placeholder before running.
# With spaces turned into underscores it also names the JSON article list and the output folder.
topic = "INSERT YOUR TOPIC HERE"

In [None]:
# Absolute download folder: <current working directory>/output/<topic with spaces replaced by underscores>.
downloads_directory = os.path.join(os.getcwd(), "output", f"{re.sub(' ', '_', topic)}")

In [None]:
# Headless Chrome session used for browsing the Google Scholar result pages.
google_scholar_scrapping_driver = init_google_scholar_scrapping_driver()

In [None]:
# Second headless Chrome session, configured to save downloads into `downloads_directory`.
pdf_downloader_driver = init_pdf_downloader_driver(downloads_directory)

In [None]:
# Run the whole pipeline, then always shut both browsers down: headless Chrome
# has no window, so sessions that are never quit keep running unnoticed.
# Re-run the two driver-initialisation cells before calling wrapper() again.
try:
    wrapper(google_scholar_scrapping_driver, pdf_downloader_driver, topic)
finally:
    for _driver in (google_scholar_scrapping_driver, pdf_downloader_driver):
        try:
            _driver.quit()
        except Exception as error:
            # A failure to close one browser must not prevent closing the other.
            print(f"Could not close a browser session: {error}")