In [212]:
import csv
import os
import re
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.common.by import By


# Location of the geckodriver binary. Set the GECKODRIVER_PATH environment
# variable to override it; the previously hard-coded path stays the fallback so
# existing setups keep working unchanged.
GECKODRIVER_PATH = os.environ.get('GECKODRIVER_PATH', '/Users/sebastian/Downloads/geckodriver')

# Single Firefox session shared by the scraping functions in this notebook.
driver = webdriver.Firefox(executable_path=GECKODRIVER_PATH)


In [156]:

# Shared scraper state -- initial values, leave untouched.
# Pagination: `done` ends the result loop, `page_counter` is compared with `max_pages`.
done, page_counter, max_pages = False, 1, 1
# Running identifiers handed out to papers and to author-profile links.
id_start, id_author = 1, 1
# Collected author-link records (ID_author, ID_paper, link).
author_links = []
#------------------------


def print_results(results):
    """Print every scraped search result as a labelled block on stdout.

    Args:
        results: iterable of dicts with the keys 'ID_paper', 'Title',
            'Subtitle', 'Authors', 'Year', 'Publisher', 'Referenced' and
            'Versions' (the shape built by scrape_google_scholar_results).
            A legacy 'ID' key is accepted in place of 'ID_paper'.

    Raises:
        KeyError: if a record lacks one of the expected keys.
    """
    for result in results:
        # Records are keyed by 'ID_paper'; the former result['ID'] lookup
        # raised KeyError on every record produced by the scraper.
        paper_id = result['ID_paper'] if 'ID_paper' in result else result['ID']
        print("ID_paper:", paper_id)
        print("Title:", result['Title'])
        print("Subtitle:", result['Subtitle'])
        print("Authors:", result['Authors'])
        print("Year:", result['Year'])
        print("Publisher:", result['Publisher'])
        print("Referenced:", result['Referenced'])
        print("Versions:", result['Versions'])
        print("--------------------")

def export_to_csv_results(results, filename):
    """Write the scraped paper records to `filename` as a UTF-8 CSV file.

    A header row comes first, followed by one row per dict in `results`.
    An existing file at `filename` is overwritten.
    """
    columns = ['ID_paper', 'Title', 'Subtitle', 'Authors', 'Year', 'Publisher', 'Referenced', 'Versions']
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(results)
            
            
def export_to_csv_links(results, filename):
    """Dump the author-link records in `results` to `filename` (UTF-8 CSV).

    The file is created or overwritten and starts with the header
    ID_author, ID_paper, link; every dict in `results` becomes one row.
    """
    link_columns = ['ID_author', 'ID_paper', 'link']
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        link_writer = csv.DictWriter(handle, fieldnames=link_columns)
        link_writer.writeheader()
        link_writer.writerows(results)
            
def navigate_to_next_page(driver):
    """Click through to the next result page, or flag the scrape as finished.

    Works on the module-level pagination state: sets `done` to True when the
    page limit `max_pages` is reached or the "next" control cannot be used,
    otherwise clicks the control and increments `page_counter`.

    Args:
        driver: Selenium WebDriver currently showing a result page.
    """
    global done
    global page_counter
    global max_pages
    try:
        # The "next" control is a <b> element whose text contains "Weiter".
        # Located through find_element(By.XPATH, ...) like the By-based lookups
        # used elsewhere in this notebook; the find_element_by_* helpers are
        # deprecated and no longer exist in newer Selenium releases.
        next_button = driver.find_element(By.XPATH, "//b[contains(text(), 'Weiter')]")

        if page_counter >= max_pages:
            # Page limit reached: stop without clicking.
            done = True

            print("          ")
            print("Done")
        else:
            next_button.click()

            print("Page moved: " + str(page_counter))
            print("          ")
            page_counter = page_counter + 1

    except NoSuchElementException:
        done = True
        print("No more next page button found. End of search results.")
    except ElementNotInteractableException:
        done = True
        print("Next page button is not interactable. End of search results.")
        
def extract_number(s):
    """Return the first run of decimal digits in `s` as an int; 1 if there is none."""
    found = re.search(r'\d+', s)
    return 1 if found is None else int(found.group())

def scrape_google_scholar_results(url):
    """Scrape search results starting at `url`, following pagination.

    Uses the module-level `driver`. On every page one record is built per
    `.gs_ri` element, and one entry is appended to the module-level
    `author_links` for every link inside the result's `.gs_a` metadata line.
    navigate_to_next_page() is called after each page until the module-level
    `done` flag is set.

    Args:
        url: address of the first result page to load.

    Returns:
        list of dicts with the keys 'ID_paper', 'Title', 'Subtitle',
        'Authors', 'Year', 'Publisher', 'Referenced' and 'Versions'.

    Side effects:
        Advances the module-level counters `id_start` (per result) and
        `id_author` (per link) and appends to `author_links`.
    """
    global id_start
    global id_author

    # Runs inside the page: collects the href of every <a> element below the
    # element passed as arguments[0]. Defined once, it does not change per result.
    href_script = """
    var hrefs = [];
    var elements = arguments[0].querySelectorAll('a');
    elements.forEach(function(element) {
        hrefs.push(element.href);
    });
    return hrefs;
    """

    driver.get(url)

    # Element lookups on this driver now retry for up to 10 seconds.
    driver.implicitly_wait(10)

    results = []

    while not done:
        # Fixed pause before reading the (newly loaded) page.
        time.sleep(3)

        search_results = driver.find_elements(By.CSS_SELECTOR, '.gs_ri')

        for result in search_results:
            title = result.find_element(By.CSS_SELECTOR, 'h3').text
            subtitle = ''
            authors = ''
            year = ''
            publisher = ''
            referenced = ''
            versions = ''

            try:
                metadata = result.find_element(By.CSS_SELECTOR, '.gs_a')

                # NOTE(review): splitting on every '-' also splits hyphenated
                # names or venues; confirm whether ' - ' is the real separator.
                metadata_parts = metadata.text.split('-')
                authors = metadata_parts[0].strip()

                if len(metadata_parts) >= 2:
                    year = metadata_parts[-2].strip()
                    publisher = metadata_parts[-1].strip()

                    if year == authors:
                        # Only two segments: there is no separate year segment.
                        year = None
                    else:
                        # "<subtitle parts>, <year>": the last comma-separated
                        # piece is the year, everything before it the subtitle.
                        parts = year.split(",")
                        if len(parts) > 1:
                            year = parts[-1].strip()
                            subtitle = ", ".join(parts[:-1]).strip()
                else:
                    # No '-' at all: indexing [-2] used to raise IndexError and
                    # abort the whole scrape. Keep the text as authors only.
                    year = None

                # Collect the links of the metadata line for the current result.
                hrefs = driver.execute_script(href_script, metadata)

                for href in hrefs or []:
                    author_links.append({
                        'ID_author': id_author,
                        'ID_paper': id_start,
                        'link': href
                    })

                    id_author = id_author + 1

            except NoSuchElementException:
                # Result without a metadata line: keep the empty defaults.
                pass

            try:
                link_words = result.find_element(By.CSS_SELECTOR, '.gs_fl a:nth-of-type(3)').text.split()
                # An empty link text used to raise IndexError on [-1].
                if link_words:
                    referenced = link_words[-1]
                    # presumably the third link is then not a citation link
                    # (its last word is "Artikel") -- stored as 0; confirm.
                    if referenced == "Artikel":
                        referenced = 0
            except NoSuchElementException:
                pass

            try:
                # Fifth footer link, if present; its first number is stored.
                versions = result.find_element(By.CSS_SELECTOR, '.gs_fl a:nth-of-type(5)').text
                versions = extract_number(versions)
            except NoSuchElementException:
                pass

            results.append({
                'ID_paper': id_start,
                'Title': title,
                'Subtitle': subtitle,
                'Authors': authors,
                'Year': year,
                'Publisher': publisher,
                'Referenced': referenced,
                'Versions': versions
            })

            id_start = id_start + 1

        # The driver stays open; it is shared with the rest of the notebook.
        navigate_to_next_page(driver)

    return results



In [157]:
# Modify
max_pages = 98

# Test url 16 pages
#url = 'https://scholar.google.com/scholar?start=0&q=blockhaus+1700+gr%C3%BCn+kuh&hl=de&as_sdt=0,5'


url = 'https://scholar.google.com/scholar?as_vis=1&q=large+language+model&hl=de&as_sdt=0,5'
#------------------------

# NOTE: `done`, `page_counter`, `id_start`, `id_author` and `author_links` are
# module-level state mutated by the scraper; re-run the setup cell before
# running this cell a second time.

# Create the export directory before the long scrape starts, so the run cannot
# fail at the very end when the CSV files are opened for writing.
os.makedirs('Output_Scraping_GS_Authors', exist_ok=True)

# scrape_google_scholar_results() loads `url` itself; the separate
# driver.get(url) that used to precede it only requested the page twice.
results = scrape_google_scholar_results(url)


#print(author_links)
#print_results(results)


export_to_csv_results(results, 'Output_Scraping_GS_Authors/gs_llm.csv')
export_to_csv_links(author_links, 'Output_Scraping_GS_Authors/gs_llm_links.csv')

Page moved: 1
          
Page moved: 2
          
Page moved: 3
          
Page moved: 4
          
Page moved: 5
          
Page moved: 6
          
Page moved: 7
          
Page moved: 8
          
Page moved: 9
          
Page moved: 10
          
Page moved: 11
          
Page moved: 12
          
Page moved: 13
          
Page moved: 14
          
Page moved: 15
          
Page moved: 16
          
Page moved: 17
          
Page moved: 18
          
Page moved: 19
          
Page moved: 20
          
Page moved: 21
          
Page moved: 22
          
Page moved: 23
          
Page moved: 24
          
Page moved: 25
          
Page moved: 26
          
Page moved: 27
          
Page moved: 28
          
Page moved: 29
          
Page moved: 30
          
Page moved: 31
          
Page moved: 32
          
Page moved: 33
          
Page moved: 34
          
Page moved: 35
          
Page moved: 36
          
Page moved: 37
          
Page moved: 38
          
Page moved: 39
      

list

In [158]:
import csv

# Load the exported author links as (ID_author, ID_paper, link) string tuples.
# The file is written as UTF-8 by export_to_csv_links(), so it is read back
# with the same encoding instead of the platform default.
with open('Output_Scraping_GS_Authors/gs_llm_links.csv', newline='', encoding='utf-8') as csvfile:

    csv_reader = csv.reader(csvfile)

    # Skip the header row; the default avoids StopIteration on an empty file.
    next(csv_reader, None)

    authors = [tuple(row) for row in csv_reader]


print(authors)

[('1', '1', 'https://scholar.google.com/citations?user=3qb1AYwAAAAJ&hl=de&oi=sra'), ('2', '1', 'https://scholar.google.com/citations?user=KbrpC8cAAAAJ&hl=de&oi=sra'), ('3', '1', 'https://scholar.google.com/citations?user=BE_lVTQAAAAJ&hl=de&oi=sra'), ('4', '2', 'https://scholar.google.com/citations?user=JUsooa0AAAAJ&hl=de&oi=sra'), ('5', '2', 'https://scholar.google.com/citations?user=4iG4IC4AAAAJ&hl=de&oi=sra'), ('6', '3', 'https://scholar.google.com/citations?user=Q7Ieos8AAAAJ&hl=de&oi=sra'), ('7', '3', 'https://scholar.google.com/citations?user=hBZ_tKsAAAAJ&hl=de&oi=sra'), ('8', '3', 'https://scholar.google.com/citations?user=KVeRu2QAAAAJ&hl=de&oi=sra'), ('9', '3', 'https://scholar.google.com/citations?user=go3sFxcAAAAJ&hl=de&oi=sra'), ('10', '4', 'https://scholar.google.com/citations?user=48GJrbsAAAAJ&hl=de&oi=sra'), ('11', '4', 'https://scholar.google.com/citations?user=206vNCEAAAAJ&hl=de&oi=sra'), ('12', '4', 'https://scholar.google.com/citations?user=oUYfjg0AAAAJ&hl=de&oi=sra'), 

In [214]:
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Display name of the author profile scraped most recently: assigned by
# scrape_google_scholar_author() and printed by show_more_results() and
# iterate_scrape().
name = ''

def show_more_results(driver):
    """Click the element with id 'gsc_bpf_more' until it reports being disabled.

    Waits up to 5 seconds for the button to become clickable, then clicks it
    repeatedly, pausing 0.5 seconds after every click. The three handled
    Selenium errors are reported on stdout, followed by the module-level
    `name`. Returns nothing.
    """
    try:
        clickable = EC.element_to_be_clickable((By.ID, 'gsc_bpf_more'))
        more_button = WebDriverWait(driver, 5).until(clickable)

        while True:
            if not more_button.is_enabled():
                break
            more_button.click()
            time.sleep(0.5)

    except (NoSuchElementException, TimeoutException, ElementClickInterceptedException) as error:
        # Same message per error type, checked in the original handler order.
        if isinstance(error, NoSuchElementException):
            print("No more next page button found. End of search results.")
        elif isinstance(error, TimeoutException):
            print("Timed out waiting for the next page button to be clickable.")
        else:
            print("Next page button is not clickable. End of search results.")
        print(name)

def scrape_google_scholar_author(url):
    """Scrape one author profile page and return the papers listed on it.

    Loads `url` in the module-level `driver`, stores the profile's name and
    organization in the module-level `name` / `organization`, expands the
    publication list through show_more_results() and reads every row with
    class 'gsc_a_tr'.

    Args:
        url: address of the author profile page.

    Returns:
        list of dicts with the keys 'Title', 'Authors', 'part_of', 'Year' and
        'Zitiert_von', holding the text of the corresponding row cells.
        'Authors' / 'part_of' are '' when a row has fewer than two
        'gs_gray' lines.

    Raises:
        NoSuchElementException: if the name element 'gsc_prf_in' or a row's
            title, year or citation cell is missing.
    """
    global name
    global organization

    driver.get(url)

    # Short fixed pause after loading the profile page.
    time.sleep(1)

    # By-based locators throughout, matching the row lookups below (the
    # find_element_by_* helpers are deprecated).
    name = driver.find_element(By.ID, 'gsc_prf_in').text.strip()

    # Scrape the organization; profiles without one get a placeholder.
    try:
        organization = driver.find_element(By.CLASS_NAME, 'gsc_prf_ila').text.strip()
    except NoSuchElementException:
        organization = "Organization not found"

    show_more_results(driver)

    result = []

    for row in driver.find_elements(By.CLASS_NAME, 'gsc_a_tr'):
        # One lookup per row: the first gray line holds the authors, the
        # second the venue. Indexing [1] directly raised IndexError for rows
        # with a single gray line.
        gray_cells = row.find_elements(By.CLASS_NAME, 'gs_gray')
        authors_text = gray_cells[0].text if len(gray_cells) > 0 else ''
        part_of_text = gray_cells[1].text if len(gray_cells) > 1 else ''

        result.append({
            'Title': row.find_element(By.CLASS_NAME, 'gsc_a_at').text,
            'Authors': authors_text,
            'part_of': part_of_text,
            'Year': row.find_element(By.CLASS_NAME, 'gsc_a_y').text,
            'Zitiert_von': row.find_element(By.CLASS_NAME, 'gsc_a_ac').text
        })

    return result




def export_new_authors(results, filename):
    """Save the enriched author records in `results` to `filename` (UTF-8 CSV).

    Columns: ID_author, ID_paper, link, name, organization. The header row is
    written first; an existing file is replaced.
    """
    author_columns = ['ID_author', 'ID_paper', 'link', 'name', 'organization']
    with open(filename, 'w', newline='', encoding='utf-8') as target:
        author_writer = csv.DictWriter(target, fieldnames=author_columns)
        author_writer.writeheader()
        author_writer.writerows(results)
            
            
def export_author_papers(results, filename):
    """Write the per-author paper records in `results` to `filename` (UTF-8 CSV).

    Columns: ID_author, Title, Authors, Part_of, Year, Zitiert_von. A header
    row precedes the data; the file is overwritten if present.
    """
    paper_columns = ['ID_author', 'Title', 'Authors', 'Part_of', 'Year', 'Zitiert_von']
    with open(filename, 'w', newline='', encoding='utf-8') as sink:
        paper_writer = csv.DictWriter(sink, fieldnames=paper_columns)
        paper_writer.writeheader()
        paper_writer.writerows(results)
    
    
# Accumulators filled by iterate_scrape(): one record per author link, and one
# record per paper found on an author profile.
new_authors, author_articles = [], []


    
def iterate_scrape(input_authors):
    """Scrape every author profile in `input_authors`, then export the results.

    For each row the profile at row[2] is scraped; one record is appended to
    the module-level `new_authors` and one record per paper to the
    module-level `author_articles`. After the last profile both lists are
    written to CSV files under 'Output_Scraping_GS_Authors/'.

    Args:
        input_authors: iterable of rows indexable as
            (ID_author, ID_paper, link).
    """
    # Take the total from the argument itself. The previous len(authors) read
    # an unrelated module-level list and reported a wrong total (or failed)
    # whenever a different collection was passed in.
    input_authors = list(input_authors)
    total_amount_links = len(input_authors)

    for progress_counter, element in enumerate(input_authors, start=1):

        papers = scrape_google_scholar_author(element[2])
        # scrape_google_scholar_author() publishes the profile's name and
        # organization through the module-level `name` / `organization`.
        new_authors.append({
                'ID_author': element[0],
                'ID_paper': element[1],
                'link': element[2],
                'name': name,
                'organization': organization
            })

        author_articles.extend(
            {
                'ID_author': element[0],
                'Title': paper['Title'],
                'Authors': paper['Authors'],
                'Part_of': paper['part_of'],
                'Year': paper['Year'],
                'Zitiert_von': paper['Zitiert_von']
            }
            for paper in papers
        )

        # Every paper returned for the profile is stored, so both numbers match.
        total_amount_found = len(papers)
        paper_counter = total_amount_found

        print("--------------------------------------------")
        print(name)
        print("Papers scraped: " + str(paper_counter) + " out of " + str(total_amount_found))
        print("Progress Authors: " + str(progress_counter) + " out of " + str(total_amount_links))
        print("--------------------------------------------")

    print("start export")
    export_new_authors(new_authors, 'Output_Scraping_GS_Authors/gs_enhanced_authors.csv')
    export_author_papers(author_articles, 'Output_Scraping_GS_Authors/gs_author_articles.csv')
            
        
        
        
        
        
        

In [215]:
# Single-profile smoke test (kept disabled):
#   scrape_google_scholar_author(authors[0][2])

import time

# Scrape every loaded author link and report the elapsed wall-clock seconds.
start = time.time()
iterate_scrape(authors)
end = time.time()

print(
    "--------------------------------------------",
    " ",
    "--------------------------------------------",
    sep="\n",
)
print(f"Total time needed: {end - start}")





--------------------------------------------
Arun Thirunavukarasu
Papers scraped: 20 out of 20
Progress Authors: 1 out of 200
--------------------------------------------
--------------------------------------------
Darren S J Ting
Papers scraped: 154 out of 154
Progress Authors: 2 out of 200
--------------------------------------------
Timed out waiting for the next page button to be clickable.
Kabilan Elangovan
--------------------------------------------
Kabilan Elangovan
Papers scraped: 13 out of 13
Progress Authors: 3 out of 200
--------------------------------------------
--------------------------------------------
abeba birhane
Papers scraped: 70 out of 70
Progress Authors: 4 out of 200
--------------------------------------------
--------------------------------------------
Atoosa Kasirzadeh
Papers scraped: 37 out of 37
Progress Authors: 5 out of 200
--------------------------------------------
Timed out waiting for the next page button to be clickable.
Xu Wang
---------------

In [180]:
import pandas as pd

# Read the exported author links (columns: ID_author, ID_paper, link).
links_raw = pd.read_csv('Output_Scraping_GS_Authors/gs_llm_links.csv')

# Deduplicate on the profile id carried in the `user=` query parameter.
# The former key -- the last 20 characters of the link -- consisted of the
# fixed '&hl=de&oi=sra' suffix plus only the last 7 characters of the id, so
# different authors whose ids share that tail were dropped as duplicates.
# Links without a `user=` parameter fall back to the full link as their key.
author_key = (
    links_raw['link']
    .str.extract(r'[?&]user=([^&#]+)', expand=False)
    .fillna(links_raw['link'])
)

# Keep the first row seen for every author; columns and row order are unchanged.
df = links_raw.loc[~author_key.duplicated(keep='first')]

# Write the de-duplicated links to a new CSV file.
df.to_csv('cleaned_author_links.csv', index=False)

In [213]:
# Load one chunk of the cleaned author links as [ID_author, ID_paper, link] rows.
# NOTE: this rebinds `authors`, replacing whatever an earlier cell stored in it.
# newline='' is what the csv module expects for files it reads; the encoding is
# presumably UTF-8 like the other exports of this notebook -- confirm for the
# split files.
with open('Output_Scraping_GS_Authors/cleaned_author_links_splitted/cleaned_author_links_1.csv', 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader, None)  # Skip header (no error on an empty file)
    authors = list(csvreader)

# Print the list of rows (optional)
print(authors)

[['1', '1', 'https://scholar.google.com/citations?user=3qb1AYwAAAAJ&hl=de&oi=sra'], ['2', '1', 'https://scholar.google.com/citations?user=KbrpC8cAAAAJ&hl=de&oi=sra'], ['3', '1', 'https://scholar.google.com/citations?user=BE_lVTQAAAAJ&hl=de&oi=sra'], ['4', '2', 'https://scholar.google.com/citations?user=JUsooa0AAAAJ&hl=de&oi=sra'], ['5', '2', 'https://scholar.google.com/citations?user=4iG4IC4AAAAJ&hl=de&oi=sra'], ['6', '3', 'https://scholar.google.com/citations?user=Q7Ieos8AAAAJ&hl=de&oi=sra'], ['7', '3', 'https://scholar.google.com/citations?user=hBZ_tKsAAAAJ&hl=de&oi=sra'], ['8', '3', 'https://scholar.google.com/citations?user=KVeRu2QAAAAJ&hl=de&oi=sra'], ['9', '3', 'https://scholar.google.com/citations?user=go3sFxcAAAAJ&hl=de&oi=sra'], ['10', '4', 'https://scholar.google.com/citations?user=48GJrbsAAAAJ&hl=de&oi=sra'], ['11', '4', 'https://scholar.google.com/citations?user=206vNCEAAAAJ&hl=de&oi=sra'], ['12', '4', 'https://scholar.google.com/citations?user=oUYfjg0AAAAJ&hl=de&oi=sra'], 