In [47]:
import csv
import os
import re
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.common.by import By


# Location of the geckodriver binary. Set the GECKODRIVER_PATH environment
# variable to point at a different binary; the fallback is the original
# machine-specific path so existing setups keep working unchanged.
GECKODRIVER_PATH = os.environ.get('GECKODRIVER_PATH', '/Users/sebastian/Downloads/geckodriver')

# One shared Firefox session, used as a module-level global by the scraper below.
# NOTE(review): `executable_path` is the Selenium 3 style of passing the driver
# binary; newer Selenium 4 releases expect a Service object instead - confirm
# against the installed Selenium version.
driver = webdriver.Firefox(executable_path=GECKODRIVER_PATH)


In [63]:

# Initial setup: module-level state shared by the scraping functions.
# These are the starting values - do not change them here.

# Pagination state.
page_counter = 1   # compared against max_pages; incremented after each "next page" click
max_pages = 1      # page limit; overridden in the run cell
done = False       # the scrape loop keeps running while this is False

# Running ID counters, incremented while scraping.
id_start = 1       # ID assigned to the next paper record
id_author = 1      # ID assigned to the next author link

# Collected author-link records (dicts with ID_author / ID_paper / link).
author_links = []
#------------------------


def print_results(results):
    """Print every scraped paper record in a readable, line-per-field layout.

    Args:
        results: iterable of dicts with the keys 'ID_paper', 'Title',
            'Subtitle', 'Authors', 'Year', 'Publisher', 'Referenced' and
            'Versions' (the shape produced by the scraper in this file).

    Raises:
        KeyError: if a record lacks one of the printed fields.
    """
    for result in results:
        # The scraper stores the identifier under 'ID_paper'. The 'ID' key this
        # function used to read is still accepted as a fallback.
        paper_id = result['ID_paper'] if 'ID_paper' in result else result['ID']
        print("ID_paper:", paper_id)
        print("Title:", result['Title'])
        print("Subtitle:", result['Subtitle'])
        print("Authors:", result['Authors'])
        print("Year:", result['Year'])
        print("Publisher:", result['Publisher'])
        print("Referenced:", result['Referenced'])
        print("Versions:", result['Versions'])
        print("--------------------")

def export_to_csv_results(results, filename):
    """Write the scraped paper records to ``filename`` as a UTF-8 CSV file.

    Args:
        results: iterable of dicts keyed by the column names listed below.
        filename: path of the CSV file; an existing file is overwritten.
    """
    columns = ['ID_paper', 'Title', 'Subtitle', 'Authors', 'Year', 'Publisher', 'Referenced', 'Versions']
    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        table = csv.DictWriter(out_file, fieldnames=columns)
        table.writeheader()
        table.writerows(results)
            
            
def export_to_csv_links(results, filename):
    """Write the collected author-link records to ``filename`` as a UTF-8 CSV file.

    Args:
        results: iterable of dicts with the keys 'ID_author', 'ID_paper' and 'link'.
        filename: path of the CSV file; an existing file is overwritten.
    """
    columns = ['ID_author', 'ID_paper', 'link']
    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        table = csv.DictWriter(out_file, fieldnames=columns)
        table.writeheader()
        table.writerows(results)
            
def navigate_to_next_page(driver):
    """Advance the browser to the next result page, or flag the scrape as finished.

    Looks for the bold "Weiter" label of the pager. If the page limit
    (``max_pages``) has been reached, or the label cannot be found or
    clicked, the module-level ``done`` flag is set to True. Otherwise the
    label is clicked and the module-level ``page_counter`` is incremented.

    Args:
        driver: the Selenium WebDriver showing the current result page.
    """
    global done
    global page_counter
    try:
        # Find the <b> tag containing the text "Weiter".
        # find_element(By.XPATH, ...) exists in Selenium 3 and 4, whereas the
        # find_element_by_xpath shortcut is gone from current Selenium 4.
        next_button = driver.find_element(By.XPATH, "//b[contains(text(), 'Weiter')]")

        if page_counter >= max_pages:
            done = True

            print("          ")
            print("Done")
        else:
            next_button.click()

            print("Page moved: " + str(page_counter))
            print("          ")
            page_counter = page_counter + 1

    except NoSuchElementException:
        # No pager label on this page: treat it as the last page.
        done = True
        print("No more next page button found. End of search results.")
    except ElementNotInteractableException:
        # Label is present but cannot be clicked: stop as well.
        done = True
        print("Next page button is not interactable. End of search results.")
        
def extract_number(s):
    """Return the first run of digits found in ``s`` as an int.

    Falls back to 1 when ``s`` contains no digit at all.
    """
    first_digits = re.search(r'\d+', s)
    return int(first_digits.group()) if first_digits else 1

def _parse_scholar_byline(text):
    """Split the text of a result's `.gs_a` line into its fields.

    The line is treated as fields separated by a hyphen that has whitespace
    on both sides: the first field is the author list, the last one the
    publisher, and the one before the last holds "<subtitle>, <year>".

    Args:
        text: visible text of the `.gs_a` element.

    Returns:
        Tuple ``(authors, subtitle, year, publisher)``. ``year`` is None when
        the line has fewer than three fields; when the middle field contains
        no comma it is returned unchanged as ``year`` and ``subtitle`` is ''.
    """
    # Splitting on a bare '-' would cut hyphenated author names and domains in
    # half. \s also matches a non-breaking space in front of the hyphen.
    fields = [field.strip() for field in re.split(r'\s+-\s+', text)]

    authors = fields[0]
    publisher = fields[-1] if len(fields) > 1 else ''
    subtitle = ''

    if len(fields) < 3:
        # No middle field, hence nothing to read a year from.
        return authors, subtitle, None, publisher

    year = fields[-2]
    pieces = year.split(",")
    if len(pieces) > 1:
        year = pieces[-1].strip()
        subtitle = ", ".join(pieces[:-1]).strip()

    return authors, subtitle, year, publisher


def scrape_google_scholar_results(url):
    """Scrape Google Scholar result pages starting at ``url``.

    Uses the module-level ``driver`` and walks through result pages via
    ``navigate_to_next_page`` until that function sets ``done``.

    Side effects on module-level state:
        * ``done`` and ``page_counter`` are reset at the start, so the function
          can be called again without re-running the setup cell.
        * ``id_start`` / ``id_author`` keep counting upwards across calls.
        * one record per link found in a result's `.gs_a` line is appended to
          ``author_links``.

    Args:
        url: address of the first result page.

    Returns:
        List of dicts with the keys 'ID_paper', 'Title', 'Subtitle',
        'Authors', 'Year', 'Publisher', 'Referenced' and 'Versions'.
    """
    global id_start
    global id_author
    global done
    global page_counter

    # Start from a fresh pagination state; a finished earlier run leaves
    # done == True, which would otherwise skip the loop and return [].
    done = False
    page_counter = 1

    driver.get(url)

    # Let element lookups wait up to 10 seconds before giving up.
    driver.implicitly_wait(10)

    results = []

    while not done:
        # Fixed pause before reading each page.
        time.sleep(3)

        # Find all the search result elements.
        search_results = driver.find_elements(By.CSS_SELECTOR, '.gs_ri')

        for result in search_results:
            title = result.find_element(By.CSS_SELECTOR, 'h3').text
            subtitle = ''
            authors = ''
            year = ''
            publisher = ''
            referenced = ''
            versions = ''

            try:
                metadata = result.find_element(By.CSS_SELECTOR, '.gs_a')
                authors, subtitle, year, publisher = _parse_scholar_byline(metadata.text)

                # Execute JavaScript to extract href attributes for each result
                href_script = """
                var hrefs = [];
                var elements = arguments[0].querySelectorAll('a');
                elements.forEach(function(element) {
                    hrefs.push(element.href);
                });
                return hrefs;
                """

                # Execute the script and retrieve href attributes for the current result
                hrefs = driver.execute_script(href_script, metadata)

                for href in hrefs or []:
                    author_links.append({
                        'ID_author': id_author,
                        'ID_paper': id_start,
                        'link': href
                    })

                    id_author = id_author + 1

            except NoSuchElementException:
                # Result without a `.gs_a` line: keep the empty defaults.
                pass

            try:
                link_text = result.find_element(By.CSS_SELECTOR, '.gs_fl a:nth-of-type(3)').text
                words = link_text.split()
                # Guard against a link without visible text (would be an IndexError).
                if words:
                    referenced = words[-1]
                    if referenced == "Artikel":
                        referenced = 0
            except NoSuchElementException:
                # Third footer link is missing: leave `referenced` empty.
                pass

            try:
                # Check for the presence of the versions element, if it exists
                versions = result.find_element(By.CSS_SELECTOR, '.gs_fl a:nth-of-type(5)').text
                versions = extract_number(versions)
            except NoSuchElementException:
                # Fifth footer link is missing: leave `versions` empty.
                pass

            results.append({
                'ID_paper': id_start,
                'Title': title,
                'Subtitle': subtitle,
                'Authors': authors,
                'Year': year,
                'Publisher': publisher,
                'Referenced': referenced,
                'Versions': versions
            })

            id_start = id_start + 1

            print("Result appended: " + title[:10])

        # The browser stays open; only move on (or stop) here.
        navigate_to_next_page(driver)

    return results



In [64]:
# Modify: settings for this run
max_pages = 1

# Test url 16 pages
#url = 'https://scholar.google.com/scholar?start=0&q=blockhaus+1700+gr%C3%BCn+kuh&hl=de&as_sdt=0,5'


url = 'https://scholar.google.com/scholar?hl=de&as_sdt=0%2C5&q=test&btnG='
#------------------------


# scrape_google_scholar_results() opens `url` itself, so the page is not
# loaded a second time here.
results = scrape_google_scholar_results(url)


#print(author_links)
#print_results(results)


# Create the output folder up front so the export cannot fail with a
# FileNotFoundError after the scrape has already finished.
output_dir = 'Output_Scraping_GS_Authors'
os.makedirs(output_dir, exist_ok=True)

export_to_csv_results(results, os.path.join(output_dir, 'gs_test_intial.csv'))
export_to_csv_links(author_links, os.path.join(output_dir, 'gs_test_intial_links.csv'))

Result appended: [BUCH] Ein
Result appended: Qualitätsa
Result appended: Test revie
Result appended: The logran
Result appended: [BUCH] TRO
Result appended: The tuberc
Result appended: Chamber te
Result appended: Classical 
Result appended: Überlebens
Result appended: The HTP te
          
Done
