In [1]:
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import time 

In [2]:
def extract_author_metrics(driver, author_id):
    # Access the author's URL
    driver.get(f"https://www.scopus.com/authid/detail.uri?authorId={author_id}#tab=metrics")

    # Dictionary to store the metrics
    author_data = {
        "Nom_Complet": "N/A",
        "Affiliation": "N/A",
        "Citations": 0,
        "Documents": 0,
        "h-index": 0,
        "FWCI": 0.0
    }

    wait = WebDriverWait(driver, 30) 

    # Wait for the author's name to be visible
    try:
        author_data["Nom_Complet"] = wait.until(EC.visibility_of_element_located(
            (By.CLASS_NAME, "Typography-module__lVnit.Typography-module__oFCaL")
        )).text
    except Exception as e:
        print(f"Nom de l'auteur indisponible: {e}")

    # Scrape the affiliation (country or institution)
    try:
        elem = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "AuthorHeader-module__DRxsE")))
        raw_affiliation = elem.find_elements(By.CLASS_NAME, "Typography-module__lVnit.Typography-module__Nfgvc")[-1].text
        author_data["Affiliation"] = raw_affiliation.lstrip(', ').replace(', ', ' - ')
    except Exception as e:
        print(f"Affiliation indisponible: {e}")

    # Wait for the metrics section to be visible
    try:
        metrics = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "MetricSection-module__s8lWB")))
        metrics_text = metrics.text.lower()

        # Extract Citations
        if "citations" in metrics_text:
            citations_section = metrics_text.split("citations")[0]
            author_data["Citations"] = int(citations_section.split()[-1].replace(',', ''))

        # Extract Documents
        if "documents" in metrics_text:
            documents_section = metrics_text.split("documents")[1]
            author_data["Documents"] = int(documents_section.split()[-1].replace(',', ''))

        # Extract h-index
        if "h-index" in metrics_text:
            h_index_section = metrics_text.split("h-index")[0]
            author_data["h-index"] = int(h_index_section.split()[-1].replace(',', ''))

    except Exception as e:
        print(f"Erreur lors de l'extraction des métriques: {e}")

    # Wait for the FWCI to be visible
    try:
        fwci_element = wait.until(EC.visibility_of_element_located((By.ID, 'metrics-panel')))
        fwci_text = fwci_element.text.lower()

        # Extract the FWCI value using a similar method
        if "field-weighted citation impact" in fwci_text:
            fwci_value = fwci_text.split("field-weighted citation impact")[-1].strip().split()[0]
            author_data["FWCI"] = float(fwci_value)

    except Exception as e:
        print(f"FWCI indisponible ou erreur lors de l'extraction: {e}")

    return author_data


In [3]:
def get_doc_infos(driver, link):
    driver.get(link)

    title, year_pub, citation_count, publisher, issn, doi, doc_type, source_type = "N/A", "N/A", 0, "N/A", "N/A", "N/A", "N/A", "N/A"
    abstract,authors='',[]

    # Extract title
    title_element = WebDriverWait(driver, 60).until(EC.visibility_of_element_located(
        (By.CLASS_NAME, "Typography-module__lVnit.Typography-module__o9yMJ.Typography-module__JqXS9.Typography-module__ETlt8")
    ))
    title = title_element.text.strip() if title_element else "N/A"

    # Extract publication year
    year_pub_element = WebDriverWait(driver, 60).until(EC.visibility_of_element_located(
        (By.XPATH, "//span[contains(text(), '2024')]")
    ))
    year_pub = year_pub_element.text.split()[-1] if year_pub_element else "N/A"

    # Extract number of citations
    citation_element = WebDriverWait(driver, 60).until(EC.visibility_of_element_located(
        (By.XPATH, "//span[contains(text(), 'Citations')]")
    ))
    citation_count = ''.join(filter(str.isdigit, citation_element.text)) if citation_element else 0

    #Locate all elements 
    elements = WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "Box-module__DQ5q3")))

    if len(elements) > 0:
        target_div = elements[2]  
        print(f"Found target_div with content: {target_div.get_attribute('innerHTML')}")

        # Extract Document Type
        doc_type_elements = target_div.find_elements(By.XPATH, ".//dl[@data-testid='source-info-entry-document-type']/dd")
        doc_type = doc_type_elements[0].text.strip() if doc_type_elements else "N/A"

        # Extract Source Type
        source_type_elements = target_div.find_elements(By.XPATH, ".//dl[@data-testid='source-info-entry-source-type']/dd")
        source_type = source_type_elements[0].text.strip() if source_type_elements else "N/A"

        # Extract ISSN
        issn_elements = target_div.find_elements(By.XPATH, ".//dl[@data-testid='source-info-entry-issn']/dd")
        issn = issn_elements[0].text.strip() if issn_elements else "N/A"

        # Extract DOI
        doi_elements = target_div.find_elements(By.XPATH, ".//dl[@data-testid='source-info-entry-doi']/dd")
        doi = doi_elements[0].text.strip() if doi_elements else "N/A"

        # Extract Publisher
        publisher_elements = target_div.find_elements(By.XPATH, ".//dl[@data-testid='source-info-entry-publisher']/dd")
        publisher = publisher_elements[0].text.strip() if publisher_elements else "N/A"  
    # #abstract
  
    abstract_element = WebDriverWait(driver, 60).until(EC.visibility_of_element_located(
            (By.CLASS_NAME, "Typography-module__lVnit Typography-module__ETlt8 Typography-module__GK8Sg")
        ))
    abstract = abstract_element.text.strip() if abstract_element else "N/A"
    #key words of authors
    try:
        authors_sections = driver.find_elements(By.CLASS_NAME, "DocumentHeader-module__LpsWx")
        if len(authors_sections) > 1:  
            authors_section = authors_sections[1]  
            author_elements = authors_section.find_elements(By.TAG_NAME, "li")
            for author in author_elements:
                author_name_span = author.find_element(By.TAG_NAME, "span")
                if author_name_span:
                    author_name = author_name_span.text.strip()
                    authors.append(author_name)
    except Exception as e:
        print(f"Could not extract authors: {e}")
    
    # reve infos
    sjr_link=f"https://www.scimagojr.com/journalsearch.php?q={issn}"
    driver.get(sjr_link)
    journal_info = {}
    try:
        search_results = WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.CLASS_NAME, "search_results")))
        journal_links = search_results.find_elements(By.TAG_NAME, "a")
        for journal_link in journal_links:
            journal_name = journal_link.find_element(By.CLASS_NAME, "jrnlname").text.strip() if journal_link.find_element(By.CLASS_NAME, "jrnlname") else "N/A"
            journal_href = journal_link.get_attribute("href")
            journal_info['Nom de la revue'] = journal_name
            #extract additional info from the journal href
            driver.get(journal_href)
            # H-hindex 
            h_index_element = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.CLASS_NAME, "hindexnumber")))
            h_index = h_index_element.text.strip() if h_index_element else "N/A"
            journal_info['H-index'] = h_index
            #editeur
            publisher_element = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//h2[contains(text(), 'Publisher')]")))
            publisher = publisher_element.find_element(By.XPATH, "./following-sibling::p/a").text.strip()
            journal_info['Editeur'] = publisher
            #issn
            journal_info['issn']=issn
            #index
            journal_info['index']='Scopus'
            #portée thématique
            scope_element = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//h2[contains(text(), 'Scope')]")))
            scope = scope_element.find_element(By.XPATH, "./following-sibling::p").text.strip()
            journal_info['portée thématique'] = scope
            #Quartille
            quartile_element = WebDriverWait(driver, 60).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "cellside")))[1]
            quartile = quartile_element.find_elements(By.TAG_NAME, "td")[-1].text.strip()
            journal_info['Quartile'] = quartile
            #score sjr
            sjr_element = WebDriverWait(driver, 60).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "cellside")))[3]
            sjr = sjr_element.find_elements(By.TAG_NAME, "td")[-1].text.strip()
            journal_info['Score SJR'] = sjr
        
    except Exception as e:
        print(f"Could not extract journal information: {e}")
            
    except Exception as e:
        print(f"Could not extract journal information: {e}")
    return {
        "title": title,
        "pub year": year_pub,
        "citations": citation_count,
        "publisher": publisher,
        "issn": issn,
        "DOI": doi,
        "document type": doc_type,
        "source type": source_type,
        "abstract":abstract,
        "authors":authors,
        "journal_info": journal_info
    }

In [3]:
driver = webdriver.Chrome()  # Assurez-vous que ChromeDriver est installé et configuré correctement
author_id ="7006835644"  # Remplacez par l'ID de l'auteur réel
metrics = extract_author_metrics(driver, author_id)
print(metrics)
# N'oubliez pas de fermer le driver après utilisation
driver.quit()


FWCI indisponible ou erreur lors de l'extraction: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6E34F3AB5+28005]
	(No symbol) [0x00007FF6E34583B0]
	(No symbol) [0x00007FF6E32F580A]
	(No symbol) [0x00007FF6E3345A3E]
	(No symbol) [0x00007FF6E3345D2C]
	(No symbol) [0x00007FF6E338EA97]
	(No symbol) [0x00007FF6E336BA7F]
	(No symbol) [0x00007FF6E338B8B3]
	(No symbol) [0x00007FF6E336B7E3]
	(No symbol) [0x00007FF6E33375C8]
	(No symbol) [0x00007FF6E3338731]
	GetHandleVerifier [0x00007FF6E37E643D+3118829]
	GetHandleVerifier [0x00007FF6E3836C90+3448640]
	GetHandleVerifier [0x00007FF6E382CF0D+3408317]
	GetHandleVerifier [0x00007FF6E35BA40B+841403]
	(No symbol) [0x00007FF6E346340F]
	(No symbol) [0x00007FF6E345F484]
	(No symbol) [0x00007FF6E345F61D]
	(No symbol) [0x00007FF6E344EB79]
	BaseThreadInitThunk [0x00007FFC30467374+20]
	RtlUserThreadStart [0x00007FFC3061CC91+33]

{'Nom_Complet': 'Broekaert, Joseé Alfons Clement', 'Affiliation': 'Hamburg - Germany', 'Citations': 7232, 'Documents': 277, '

In [4]:
def co_authors(driver, author_id, num_co_authors=5):
    url = f"https://www.scopus.com/search/submit/coAuthorSearch.uri?authorId={author_id}&origin=AuthorProfile&sot=al&sdt=coaut&zone=coAuthorsTab"
    driver.get(url)
    driver.get(url)
    # Initialize the list to store co-author IDs
    co_author_ids = []

    # Wait for the co-author table or relevant section to load
    wait = WebDriverWait(driver, 30)
    try:
        # Locate all rows that might contain co-author information
        rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.searchArea")))

        # Limit the number of co-authors to retrieve based on the specified number
        for row in rows[:num_co_authors]:
            # Find the input element with the co-author ID in each row
            input_element = row.find_element(By.CSS_SELECTOR, "input[type='checkbox'][id^='auid_']")
            co_author_id = input_element.get_attribute("value")
            co_author_ids.append(co_author_id)

    except Exception as e:
        print(f"Erreur lors de l'extraction des IDs des co-auteurs: {e}")

    return {author_id:co_author_ids}


In [5]:
driver = webdriver.Chrome() 
co_author_list = co_authors(driver, author_id='7103251673', num_co_authors=5)
#quit the drirver
driver.quit()
print("Co-author IDs:", co_author_list)


Erreur lors de l'extraction des IDs des co-auteurs: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=130.0.6723.69)
Stacktrace:
	GetHandleVerifier [0x00007FF6E34F3AB5+28005]
	(No symbol) [0x00007FF6E34583B0]
	(No symbol) [0x00007FF6E32F580A]
	(No symbol) [0x00007FF6E32CFA85]
	(No symbol) [0x00007FF6E3372AD7]
	(No symbol) [0x00007FF6E338B1B1]
	(No symbol) [0x00007FF6E336B7E3]
	(No symbol) [0x00007FF6E33375C8]
	(No symbol) [0x00007FF6E3338731]
	GetHandleVerifier [0x00007FF6E37E643D+3118829]
	GetHandleVerifier [0x00007FF6E3836C90+3448640]
	GetHandleVerifier [0x00007FF6E382CF0D+3408317]
	GetHandleVerifier [0x00007FF6E35BA40B+841403]
	(No symbol) [0x00007FF6E346340F]
	(No symbol) [0x00007FF6E345F484]
	(No symbol) [0x00007FF6E345F61D]
	(No symbol) [0x00007FF6E344EB79]
	BaseThreadInitThunk [0x00007FFC30467374+20]
	RtlUserThreadStart [0x00007FFC3061CC91+33]

Co-author IDs: {'7103251673': []}


In [5]:
def get_co_authors_of_co_authors(driver, author_id, num_co_authors=5):
    # Get the metrics for the main author
    author_metrics = extract_author_metrics(driver, author_id)

    # Get the co-authors of the main author
    co_author_data = co_authors(driver, author_id, num_co_authors)

    # Initialize the dictionary to hold all author data
    author_dict = {
        author_id: {
            "author_info": author_metrics,
            "co_authors": {}
        }
    }

    # Loop through each co-author's ID
    for co_author_id in co_author_data[author_id]:
        # Extract metrics for the co-author
        co_author_metrics = extract_author_metrics(driver, co_author_id)

        # Initialize the structure for co-authors of the current co-author
        author_dict[author_id]["co_authors"][co_author_id] = {
            "co_author_info": co_author_metrics,
            "co_authors": {}
        }

        # Get co-authors of the current co-author
        co_authors_of_co_author = co_authors(driver, co_author_id, num_co_authors)

        # Loop through co-authors and add their metrics
        for inner_co_author_id in co_authors_of_co_author[co_author_id]:
            # Extract metrics for each inner co-author
            inner_co_author_metrics = extract_author_metrics(driver, inner_co_author_id)

            # Add inner co-author info
            author_dict[author_id]["co_authors"][co_author_id]["co_authors"][inner_co_author_id] = inner_co_author_metrics

    return author_dict


In [7]:
#use co_authors_co_authors function
driver = webdriver.Chrome()
author_data = get_co_authors_of_co_authors(driver, author_id='7103251673', num_co_authors=2)
driver.quit()
print(author_data)

{'7103251673': {'author_info': {'Nom_Complet': 'Popp, Jürgen R.', 'Affiliation': 'Jena - Germany', 'Citations': 41972, 'Documents': 1287, 'h-index': 87, 'FWCI': 1.25}, 'co_authors': {'55154772500': {'co_author_info': {'Nom_Complet': 'Schubert, Ulrich S.', 'Affiliation': 'Jena - Germany', 'Citations': 95225, 'Documents': 1966, 'h-index': 128, 'FWCI': 1.25}, 'co_authors': {'7103251673': {'Nom_Complet': 'Popp, Jürgen R.', 'Affiliation': 'Jena - Germany', 'Citations': 41972, 'Documents': 1287, 'h-index': 87, 'FWCI': 1.25}, '36050113100': {'Nom_Complet': 'Lehn, Jean Maríe Pierre', 'Affiliation': 'Strasbourg - France', 'Citations': 91511, 'Documents': 979, 'h-index': 141, 'FWCI': 1.39}}}, '7005817505': {'co_author_info': {'Nom_Complet': 'Tünnermann, Andreas', 'Affiliation': 'Jena - Germany', 'Citations': 44325, 'Documents': 1518, 'h-index': 103, 'FWCI': 1.4}, 'co_authors': {'36038963000': {'Nom_Complet': 'Kivshar, Yuri S.', 'Affiliation': 'Canberra - Australia', 'Citations': 87072, 'Document

In [8]:
import json
print(json.dumps(author_data, indent=4))


{
    "7103251673": {
        "author_info": {
            "Nom_Complet": "Popp, J\u00fcrgen R.",
            "Affiliation": "Jena - Germany",
            "Citations": 41972,
            "Documents": 1287,
            "h-index": 87,
            "FWCI": 1.25
        },
        "co_authors": {
            "55154772500": {
                "co_author_info": {
                    "Nom_Complet": "Schubert, Ulrich S.",
                    "Affiliation": "Jena - Germany",
                    "Citations": 95225,
                    "Documents": 1966,
                    "h-index": 128,
                    "FWCI": 1.25
                },
                "co_authors": {
                    "7103251673": {
                        "Nom_Complet": "Popp, J\u00fcrgen R.",
                        "Affiliation": "Jena - Germany",
                        "Citations": 41972,
                        "Documents": 1287,
                        "h-index": 87,
                        "FWCI": 1.25
             

In [6]:
def get_author_document_links(driver, author_id):
    # Construct the URL for the author's page
    url = f"https://www.scopus.com/authid/detail.uri?authorId={author_id}"
    driver.get(url)

    # Scroll down the page by 500px increments to load all content
    last_position = driver.execute_script("return window.pageYOffset;")
    
    while True:
        # Scroll down by 500 pixels
        driver.execute_script("window.scrollBy(0, 800);")
        
        # Wait to allow content to load
        time.sleep(1)  # Adjust sleep time if needed
        
        # Get the new position after scrolling
        new_position = driver.execute_script("return window.pageYOffset;")
        
        # If the position hasn't changed, we've reached the bottom or no new content is loading
        if new_position == last_position:
            break
        last_position = new_position

    # Initialize a list to store document links
    document_links = []

    try:
        # Wait for the page to load completely and the specific elements to be visible
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'li[data-testid="results-list-item"]'))
        )

        # Locate all list items that contain the results
        li_elements = driver.find_elements(By.CSS_SELECTOR, 'li[data-testid="results-list-item"]')

        # Debugging: Print the number of list elements found
        print(f"Number of list elements found: {len(li_elements)}")

        # Loop through each <li> and extract the href from the <a> tag within it
        for li in li_elements:
            try:
                # Find the <a> element within the <li> and get its href attribute
                link_element = li.find_element(By.CSS_SELECTOR, 'a[href^="/record/display.uri"]')
                href = link_element.get_attribute('href')
                if href:
                    # If a valid link is found, append it to the list
                    document_links.append(href)
            except Exception as inner_e:
                print(f"Error retrieving link in one of the elements: {inner_e}")
                continue  

    except Exception as e:
        print(f"An error occurred while fetching document links: {e}")
      
    return document_links


In [7]:
driver=webdriver.Chrome()
document_links = get_author_document_links(driver, author_id='7103251673')
driver.quit()
print(document_links)

Number of list elements found: 10
Error retrieving link in one of the elements: Message: no such element: Unable to locate element: {"method":"css selector","selector":"a[href^="/record/display.uri"]"}
  (Session info: chrome=130.0.6723.69); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6E34F3AB5+28005]
	(No symbol) [0x00007FF6E34583B0]
	(No symbol) [0x00007FF6E32F580A]
	(No symbol) [0x00007FF6E3345A3E]
	(No symbol) [0x00007FF6E3345D2C]
	(No symbol) [0x00007FF6E333937C]
	(No symbol) [0x00007FF6E336BA7F]
	(No symbol) [0x00007FF6E3339246]
	(No symbol) [0x00007FF6E336BC50]
	(No symbol) [0x00007FF6E338B8B3]
	(No symbol) [0x00007FF6E336B7E3]
	(No symbol) [0x00007FF6E33375C8]
	(No symbol) [0x00007FF6E3338731]
	GetHandleVerifier [0x00007FF6E37E643D+3118829]
	GetHandleVerifier [0x00007FF6E3836C90+3448640]
	GetHandleVerifier [0x00007FF6E382CF0D+3408317]
	Ge