In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

from bs4 import BeautifulSoup
import re
import time
import openpyxl

In [2]:
# Open the spreadsheet that lists the articles and work on its active sheet.
wb = openpyxl.load_workbook('articles.xlsx')
ws = wb.active

# Build a list holding the value of every cell in column A, in sheet order.
# The scraping loop further down maps a list position back to its sheet row
# (position + 1), so nothing is filtered out here.
article_titles = list(map(lambda title_cell: title_cell.value, ws['A']))
# article_titles

In [3]:
# Browser setup

# Chrome launch configuration. Only '--disable-gpu' is passed; the headless
# and incognito switches are kept below, commented out, so the browser
# window stays visible and uses a normal profile.
options = Options()
# options.add_argument('--headless')
gpu_off_flag = '--disable-gpu'
options.add_argument(gpu_off_flag)
# options.add_argument('--incognito')

# Start a Chrome session with the configuration above.
driver = webdriver.Chrome(options=options)

# Fixed pause (seconds) after launching the browser, before the first page load.
launch_pause_seconds = 2
time.sleep(launch_pause_seconds)

# Open the Google Scholar start page.
scholar_start_page = 'https://scholar.google.com/schhp?hl=en'
driver.get(scholar_start_page)

In [None]:
# Fixed pauses, in seconds, between browser actions.
RESULTS_LOAD_WAIT = 7        # after submitting the title search (previously 2 s, plus 5 s taken only after the page had been read)
CAPTCHA_WAIT = 20            # after opening the "Cited by" page, where a CAPTCHA may appear
FILTERED_RESULTS_WAIT = 2    # after submitting the search within citing articles
HOME_NAV_PAUSE = 1           # before going back to the Scholar start page


def _author_exclusion_query(byline_text):
    """Turn a result byline into a string of ``-author:<name>`` terms.

    Assumes the byline looks like ``'A One, B Two… - Venue, 2014 - site'``
    (authors first, then whitespace-dash-whitespace) — the same layout the
    previous comma/dash splitting relied on; TODO confirm against live pages.

    Only the text before the first whitespace-delimited dash is treated as the
    author list, so commas in the venue and hyphens inside a name (``J-P``) no
    longer corrupt the result, and a byline without any comma no longer raises.

    Returns an empty string when no author name can be extracted.
    """
    # `\s` also matches the non-breaking space, in case the page uses one
    # next to the dash.
    authors_part = re.split(r'\s+-\s+', byline_text, maxsplit=1)[0]
    names = [name.strip() for name in authors_part.split(',')]
    # NOTE(review): names are passed unquoted, exactly as before. If Scholar
    # only binds the first word to `-author:`, quoting each name may be what
    # is actually wanted — confirm before changing the query format.
    return ' '.join('-author:' + name for name in names if name)


def _parse_result_count(stats_text):
    """Return the first number in the result-statistics text as an int.

    Thousands separators (``1,230``) are accepted. Returns ``None`` when the
    text contains no digits at all.
    """
    match = re.search(r'\d[\d,]*', stats_text)
    if match is None:
        return None
    return int(match.group().replace(',', ''))


# For every title in column A: search it, store the author-exclusion query in
# column C, then count the citing articles that remain after excluding those
# authors and store that number in column B.
# enumerate() supplies the sheet row directly, so duplicate titles each get
# their own row instead of all resolving to the first occurrence.
for row_num, article_title in enumerate(article_titles, start=1):
    # Blank cells in column A cannot be searched; skip them before touching
    # the browser.
    if article_title is None or not str(article_title).strip():
        continue

    print(article_title)

    # find search box and enter article title
    search_box = driver.find_element(By.NAME, 'q')
    search_box.clear()
    search_box.send_keys(article_title)
    search_box.send_keys(Keys.RETURN)

    # wait for search result page to load *before* reading its HTML
    time.sleep(RESULTS_LOAD_WAIT)

    ## GET AUTHOR NAMES FOR SELF CITATIONS

    # parse the result page and take the first <div class="gs_a"> (the byline)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    div_tag = soup.find('div', class_='gs_a')

    # Reset per article so a missing byline can never reuse the previous
    # article's authors (or hit an undefined name on the first article).
    authors_str = ''
    if div_tag is not None:
        authors_str = _author_exclusion_query(div_tag.get_text())

    if authors_str:
        ws.cell(row=row_num, column=3, value=authors_str)
    else:
        print('Could not find author names')

    # check if there is a "Cited by" link; only its absence means "0 citations"
    try:
        cited_by = driver.find_element(By.PARTIAL_LINK_TEXT, 'Cited by')
    except NoSuchElementException:
        cited_by = None

    if cited_by is None:
        # record 0 if there is nothing citing this article
        num_results = 0
        print(num_results)
        ws.cell(row=row_num, column=2, value=num_results)
    elif not authors_str:
        # Without author names there is nothing to filter by; leave column B
        # untouched rather than recording a misleading number.
        print('Skipping citation count: no author names to filter by')
    else:
        cited_by.click()

        # wait for Captcha to load
        time.sleep(CAPTCHA_WAIT)

        try:
            # check "Search within citing articles" option
            driver.find_element(By.ID, 'gs_scipsc').click()

            # input the author names in search box
            search_box = driver.find_element(By.NAME, 'q')
            search_box.clear()
            search_box.send_keys(authors_str)
            search_box.send_keys(Keys.RETURN)

            # Give the filtered result page time to replace the current one;
            # otherwise the statistics could be read from the unfiltered page.
            time.sleep(FILTERED_RESULTS_WAIT)

            text_results = driver.find_element(By.ID, 'gs_ab_md').text
        except NoSuchElementException:
            # The "Cited by" link existed, so a missing element here is a page
            # problem, not a zero count; do not write anything to column B.
            print('Could not run the search within citing articles; count not recorded')
        else:
            num_results = _parse_result_count(text_results)
            if num_results is None:
                print('No search results found')
            else:
                print(num_results)
                # Stored as an int, matching the 0 written in the branch above.
                ws.cell(row=row_num, column=2, value=num_results)

    # short pause, then go back to the Google Scholar start page
    time.sleep(HOME_NAV_PAUSE)
    driver.get('https://scholar.google.com/schhp?hl=en')

Centrifuge modelling of energy piles subjected to heating and cooling cycles in clay
39
Effects of root geometry and transpiration on pull-out resistance
11
Cyclic behaviour of an unsaturated silt at various suctions and temperatures
50
Three-dimensional centrifuge modelling of pile group responses to side-by-side twin tunnelling
32
A modified analytical solution of soil stress distribution for XCC pile foundations
3
A novel root system for simulating transpiration-induced soil suction in centrifuge
10
The dilatant behaviour of sand-pile interface subjected to loading and stress relief
29
A new theoretical method for analyzing confined granular flows
11
Observed ground and pile group responses due to tunneling in Bangkok stiff clay
19
Flume Investigation of Landslide Debris Resisting Baffles
31
Effects of the tip depth of a pre-existing fracture on surface fault ruptures in cemented clay
6
Effects of stiffness nonlinearity on E’ standard penetration test N correlations for analysing wa

In [None]:
# Write the collected results back to the workbook, then shut the browser down.
# The quit sits in `finally` so that a failed save (e.g. the file cannot be
# written) does not leave the browser session running.
try:
    wb.save('articles.xlsx')
finally:
    # close the browser window and end the WebDriver session
    driver.quit()

In [None]:
# # Wait until the CAPTCHA is solved by a human user
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC

#         # Wait for the CAPTCHA to appear
#         captcha_present = False
#         while not captcha_present:
#             try:
#                 # Select the CAPTCHA element
#                 captcha_element = WebDriverWait(driver, 30).until(
#                     EC.presence_of_element_located((By.CSS_SELECTOR, "#recaptcha-anchor"))
#                 )
#                 captcha_present = True
#             except:
#                 # Wait for a while before trying again
#                 time.sleep(5)

#         # Wait for the user to solve the CAPTCHA and click submit
#         while True:
#             try:
#                 # Check if the submit button is enabled
#                 submit_button = driver.find_element(By.CSS_SELECTOR, "#recaptcha-verify-button")
#                 if submit_button.is_enabled():
#                     # Wait for the user to click the submit button
#                     submit_button.click()
#                     break
#             except:
#                 # Wait for a while before trying again
#                 time.sleep(10)