In [7]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd
from bs4 import BeautifulSoup
import re
import csv

# Configure le service EdgeDriver
service = Service(executable_path=r"C:\Users\Electro Fatal\Desktop\file\edgedriver_win64\msedgedriver.exe")

# Lance une instance de Microsoft Edge
driver = webdriver.Edge(service=service)


In [8]:
driver.get("https://www.webofscience.com.eressources.imist.ma/")

In [9]:
email_field = driver.find_element(By.ID, "email")
password_field = driver.find_element(By.ID, "password")

email_field.send_keys('youness.elmeki@usms.ac.ma')
password_field.send_keys('Youness@@2002')

password_field.send_keys(Keys.RETURN)

In [10]:
def get_author_information(id):
    # Charge la page de l'auteur et vérifie si elle est prête
    if wait_for_page_to_load(id):
        infos = {}
        infos['ID de l\'Auteur'] = id
        for key, item in author_information.items():
            # Si la clé est "Co-auteurs", extrait les noms des co-auteurs
            if key == 'Co-auteurs':
                co_auteurs = driver.find_elements(By.CLASS_NAME, 'authors-list-link')
                list_co_auteurs = [auteur.text for auteur in co_auteurs]
                infos['co_auteurs'] = list_co_auteurs
            # Si l'item est un dictionnaire, récupère des métriques spécifiques
            elif isinstance(item, dict):
                metric_descriptor = driver.find_elements(By.CLASS_NAME, key)
                for metric in metric_descriptor:
                    if metric.text in item.values():
                        value = metric.find_element(By.XPATH, './preceding-sibling::div').text
                        infos[metric.text] = value
            # Récupère les autres informations
            else:
                value = driver.find_element(By.CLASS_NAME, f'{item}').text
                infos[key] = value
    return infos

def extract_article_details(driver, article_link):
    # Ouvre le lien de l'article
    driver.get(article_link)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'title'))
    )
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Extrait l'ISSN de l'article
    try:
        match = re.search(r"KeyISSN=\d{4}-\d{3}[\dxX]", str(soup.find_all("a")))
        issn = match.group().split('=')[1]
    except:
        print('ISSN error')

    infos = {}
    try:
        issn
        for info in infos_article:
            try:
                # Récupère le nombre de citations
                if info == 'citation-count':
                    infoElem = driver.find_element(By.CLASS_NAME, info).text.split('\n')
                    infos[info] = 0 if infoElem[1] == 'Cited References' else infoElem[0]
                # Récupère le titre, les mots-clés, ...
                elif info in ['title', 'keywordsPlusLink', 'summary-source-title-link']:
                    if info == 'keywordsPlusLink':
                        infos[info] = ' , '.join([key.text for key in driver.find_elements(By.CLASS_NAME, 'keywordsPlusLink')])
                    else:
                        infos[info] = driver.find_element(By.CLASS_NAME, info).text
                elif info == 'SumAuthTa-DisplayName-author-en-':
                    infos[info] = ' ; '.join([element.text for element in driver.find_elements(By.XPATH, "//a[starts-with(@id,'SumAuthTa-DisplayName-author-en-')]")])
                else:
                    infos[info] = driver.find_element(By.ID, info).text
            except:
                print(info, 'not exist')
    except:
        print("Variable 'issn' does not exist.")
    return infos

def search_journal_by_issn(issn):
    # Recherche la revue par ISSN sur Scimago
    driver.get('https://www.scimagojr.com/')
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'searchinput'))
    )
    try:
        # Remplit et soumet le champ de recherche avec l'ISSN
        search_box = driver.find_element(By.ID, "searchinput")
        search_box.send_keys(issn)
        search_box.submit()

        # Accède au premier résultat de la recherche
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[@class='search_results']/a[@href]")))
        journal_link = driver.find_element(By.XPATH, "//div[@class='search_results']/a[@href]")
        journal_link.click()

        # Récupère les informations de la revue
        journal_info = {}
        journal_info['Nom'] = driver.find_element(By.XPATH, "//h1").text

        # Récupère l'éditeur, l'ISSN, l'indexation, H-index, quartile, SJR, impact factor, et portée thématique
        try:
            journal_info['Editeur'] = driver.find_element(By.XPATH, "//div[h2[text()='Publisher']]/p/a").text
        except NoSuchElementException:
            journal_info['Editeur'] = 'Non disponible'
        try:
            issn_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[h2[text()='ISSN']]/p"))
            )
            journal_info['ISSN'] = issn_element.text
        except NoSuchElementException:
            journal_info['ISNN'] = 'Non disponible'
        try:
            journal_info['Index'] = driver.find_element(By.XPATH, "//div[h2[text()='Coverage']]/p").text
        except NoSuchElementException:
            journal_info['Index'] = 'Non disponible'
        try:
            journal_info['H-index'] = driver.find_element(By.XPATH, "//div[h2[text()='H-Index']]/p").text
        except NoSuchElementException:
            journal_info['H-index'] = 'Non disponible'
        try:
            journal_info['Quartile'] = driver.find_element(By.XPATH, "(//div[@class='cellcontent']//table/tbody/tr[last()]/td[3])[1]").text
        except NoSuchElementException:
            journal_info['Quartile'] = 'Non disponible'
        try:
            journal_info['SJR'] = driver.find_element(By.XPATH, "(//div[@class='cellcontent']//table/tbody/tr[last()]/td[3])[2]").text
        except NoSuchElementException:
            journal_info['SJR'] = 'Non disponible'
        try:
            journal_info['Impact factor'] = driver.find_element(By.XPATH, "(//div[@class='cellcontent']//table/tbody/tr[last()]/td[3])[4]").text
        except NoSuchElementException:
            journal_info['Impact factor'] = 'Non disponible'
        try:
            journal_info['Portee thematique'] = driver.find_element(By.CLASS_NAME, 'fullwidth').text.split('\n', 1)[-1].strip()
        except NoSuchElementException:
            journal_info['Portee thematique'] = 'Non disponible'

        return journal_info

    except NoSuchElementException:
        print(f"Revue avec ISSN {issn} non trouvée.")
        return None

def wait_for_page_to_load(user_id):
    # Charge la page de l'auteur et vérifie qu'elle est prête
    driver.get(f"https://www.webofscience.com.eressources.imist.ma/wos/author/record/{user_id}")
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'onetrust-close-btn-handler'))
        )
        btn_cookies(driver)
        scroll_slowly(driver, scroll_pause_time=0.2, scroll_increment=100)
        return True
    except Exception as e:
        print(f"Erreur lors de l'attente du chargement de la page : {e}")
        return False

def btn_cookies(driver):

    try:
        cookies_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'onetrust-close-btn-handler'))
        )
        if cookies_button.is_displayed():
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'onetrust-close-btn-handler'))
            ).click()
    except Exception as e:
        print(f"Erreur lors de la gestion des cookies : {e}")

def get_article_titles():
    titles = []

    while True:
        # Récupère les liens des articles présents sur la page
        articles = driver.find_elements(By.CSS_SELECTOR, '.title')
        for article in articles:
            titles.append(article.get_attribute('href'))

        # Passe à la page suivante si elle existe
        try:
            next_button = driver.find_element(By.XPATH, '//button[@data-ta="next-page-button"]')
            if 'mat-button-disabled' in next_button.get_attribute('class'):
                break
            else:
                next_button.click()
                scroll_slowly(driver, scroll_pause_time=0.2, scroll_increment=100)
        except NoSuchElementException:
            break

    return titles

def scroll_slowly(driver, scroll_pause_time=0.1, scroll_increment=100):
    # Scrolle lentement
    last_height = driver.execute_script("return document.body.scrollHeight")
    current_scroll_position = 0

    while current_scroll_position < last_height:
        current_scroll_position += scroll_increment
        driver.execute_script(f"window.scrollTo(0, {current_scroll_position});")
        time.sleep(scroll_pause_time)
        last_height = driver.execute_script("return document.body.scrollHeight")


In [11]:
#informations auteurs /articles
author_information = {
    'nom_complet' : 'wat-author-name',
    'pays_affiliation' : 'more-details',
    'Co-auteurs' : 'authors-list-link',
    'wat-author-metric-descriptor' : {
        'H-index' : 'H-Index',
        'Citations_totales' : 'Sum of Times Cited'
        }
}
infos_article = ['title','SumAuthTa-DisplayName-author-en-','FullRTa-pubdate' , 'summary-source-title-link' ,'keywordsPlusLink', 'citation-count','FullRTa-DOI','FullRTa-abstract-basic','FullRTa-doctype-0']

In [12]:
#ID de l'auteur
author_data = get_author_information(1410815)

Erreur lors de l'attente du chargement de la page : Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF66DCACEC5+12997]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF66DF2BCA4+1897908]
	(No symbol) [0x00007FF66DA5F1CC]
	(No symbol) [0x00007FF66DAA672E]
	(No symbol) [0x00007FF66DAA67B5]
	(No symbol) [0x00007FF66DAE4ED7]
	(No symbol) [0x00007FF66DAC7F8F]
	(No symbol) [0x00007FF66DA9C09D]
	(No symbol) [0x00007FF66DAE2887]
	(No symbol) [0x00007FF66DAC7BB3]
	(No symbol) [0x00007FF66DA9B5AC]
	(No symbol) [0x00007FF66DA9AA8D]
	(No symbol) [0x00007FF66DA9B171]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF66DE59DD4+1038052]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF66DBD08BF+56655]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF66DBC3063+1267]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF66DE58B5D+1033325]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF

UnboundLocalError: cannot access local variable 'infos' where it is not associated with a value

In [91]:
file_path = 'author_data.csv'

# Writing to the CSV file
with open(file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write the headers
    writer.writerow(author_data.keys())

    # Write the data
    writer.writerow([author_data["ID de l'Auteur"],
                     author_data["nom_complet"],
                     author_data["pays_affiliation"],
                     ', '.join(author_data["co_auteurs"]),
                     author_data["H-Index"],
                     author_data["Sum of Times Cited"]])

articles = get_article_titles()

print(f"Data saved successfully to {file_path}")

KeyError: 'H-Index'

In [56]:
len(articles)

10

In [57]:
import json
from selenium.common.exceptions import TimeoutException

all_articles_data = []  # Liste pour stocker data de tous les articles

for article_link in articles:
    try:
        article_infos = extract_article_details(driver, article_link)
        all_articles_data.append(article_infos)
    except TimeoutException:
        print(f"Timeout occurred while loading {article_link}. Skipping to the next article.")

data_to_save = {
    "Auteur": author_data,
    "Articles publiés": all_articles_data
}

# Enregistrer data dans un fichier JSON
with open("author_articles_data.json", "w", encoding="utf-8") as json_file:
    json.dump(data_to_save, json_file, ensure_ascii=False, indent=4)

print("Les informations ont été enregistrées dans 'author_articles_data.json'.")


ISSN error
Variable 'issn' does not exist.
ISSN error
Variable 'issn' does not exist.
ISSN error
Variable 'issn' does not exist.
ISSN error
Variable 'issn' does not exist.
ISSN error
Variable 'issn' does not exist.
citation-count not exist
FullRTa-DOI not exist
ISSN error
Variable 'issn' does not exist.
ISSN error
Variable 'issn' does not exist.
ISSN error
Variable 'issn' does not exist.
Les informations ont été enregistrées dans 'author_articles_data.json'.


In [58]:
# Exemple d'utilisation
issn = "1077-3118"
journal_data = search_journal_by_issn(issn)

if journal_data:
    print("Informations sur la revue:")
    for key, value in journal_data.items():
        print(f"{key}: {value}")
else:
    print("Aucune information trouvée pour cet ISSN.")


Informations sur la revue:
Nom: Applied Physics Letters
Editeur: American Institute of Physics
ISSN: 00036951, 10773118
Index: 1962-2023
H-index: 478
Quartile: Q1
SJR: 3.462
Impact factor: 3.604
Portee thematique: Applied Physics Letters (APL) features concise, up-to-date reports on significant new findings in applied physics. Emphasizing rapid dissemination of key data and new physical insights, APL offers prompt publication of new experimental and theoretical papers reporting applications of physics phenomena to all branches of  science, engineering, and modern technology. In addition to regular articles, the journal also publishes invited Fast Track, Perspectives, and in-depth Editorials which report on cutting-edge areas in applied physics. APL Perspectives are forward-looking invited letters which highlight recent developments or discoveries. Emphasis is placed on very recent developments, potentially disruptive technologies, open questions and possible solutions. They also includ