# Selenium ve BeautifulSoup Kütüphaneleri İle Makale Bilgilerini Çekme

## Kütüphaneleri import etme 

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from bs4 import BeautifulSoup
import pandas as pd
import re

## Fonksiyon 1: Arama yapma ve bağlantıları toplama

In [2]:
def perform_search(driver, query):
    # Initialize the browser to go to Pubmed
    driver.get("https://pubmed.ncbi.nlm.nih.gov")

    # Click the "Advanced" search button
    advanced_search_button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "adv-search-link"))
    )
    advanced_search_button.click()

    # Select the Field Selector
    field_selector = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "field-selector"))
    )
    field_selector_select = Select(field_selector)
    field_selector_select.select_by_value("Title/Abstract")

    # Find the query box
    query_box = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "id_term"))
    )

    # Send the search query
    query_box.send_keys(query)

    # Click the "Add" button
    add_button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "add-button"))
    )
    add_button.click()

    # Click the "Search" button
    search_button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "search-btn"))
    )
    search_button.click()

    # Collect links from the search results
    link_list = []
    while True:
        try:
            # Find all titles on the page
            titles = driver.find_elements(By.CLASS_NAME, 'docsum-title')

            # Collect links for each title
            for title in titles:
                try:
                    # Use XPath to find the direct link within the title element
                    link = title.get_attribute('href')
                    link_list.append(link)
                    print(link)
                except NoSuchElementException:
                    print("Link not found in the title.")
                    continue
        except StaleElementReferenceException:
            print("Window closed. Exiting the loop.")
            break
        except Exception as e:
            print(f"Error collecting links: {e}")

        # Find and click the "Next" button
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "next-page-btn"))
            )
            if "disabled" in next_button.get_attribute("class"):
                print("No more pages. Exiting the loop.")
                break  # Exit the loop if the "Next" button is disabled
            else:
                next_button.click()
                # Optionally, wait for the page to load (can be adjusted)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "docsum-title"))
                )
        except StaleElementReferenceException:
            print("Window closed. Exiting the loop.")
            break
        except Exception as e:
            print(f"Error clicking Next button: {e}")
            print("Exiting the loop due to the error.")
            break

    return link_list


## Verilen bağlantıdaki bilgileri çekme

In [3]:
def extract_information(driver, link):
    # Visit the given link
    driver.get(link)

    # Get the page source
    page_source = driver.page_source

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find the article type (if available)
    article_type = soup.find('div', class_='publication-type').text.strip() if soup.find('div', class_='publication-type') else "N/A"

    # Find the journal name (if available)
    journal_name = soup.find('button', id='full-view-journal-trigger').text.strip() if soup.find('button', id='full-view-journal-trigger') else "N/A"

    # Find the year
    year_match = re.search(r'\b(\d{4})\b', soup.find('span', class_='cit').text)
    year = year_match.group(1) if year_match else "N/A"

    # Find the article title
    title = soup.find('h1', class_='heading-title').text.strip() if soup.find('h1', class_='heading-title') else "N/A"

    # Find the affiliation information
    affiliation_text = soup.find('li', {'data-affiliation-id': 'full-view-affiliation-1'}).text.strip() if soup.find('li', {'data-affiliation-id': 'full-view-affiliation-1'}) else "N/A"
    city_state_match = re.search(r'([^\d,]+),\s*([^\d,]+)$', affiliation_text)
    country = f"{city_state_match.group(1)}, {city_state_match.group(2)}" if city_state_match else "N/A"

    # Find the DOI number
    doi_text = soup.find('span', class_='identifier doi').text.strip() if soup.find('span', class_='identifier doi') else "N/A"
    doi_match = re.search(r'10\.\d+\/[^\s]+', doi_text)
    doi = doi_match.group() if doi_match else "N/A"

    # Find the abstract section
    abstract = soup.find('div', class_='abstract-content').find('p').text.strip() if soup.find('div', class_='abstract-content') else "N/A"

    # Find keywords
    keywords = [button.text.strip() for button in soup.select('.keywords-list .keyword-actions-trigger')]

    # Return the information as a tuple
    return article_type, journal_name, year, title, country, doi, abstract, ', '.join(keywords), link


## Verileri DataFrame'e ekleme ve CSV'ye kaydetme

In [4]:
# Main function: Entry point of the program
def main():
    # Initialize the browser
    driver = webdriver.Chrome()

    # Initialize the data dictionary to store article information
    data = {
        'Article Name': [],
        'Link': [],
        'DOI Number': [],
        'Country': [],
        'Year': [],
        'Journal Name': [],
        'Abstract': [],
        'Keywords': [],
        'Article Type': []
    }

    try:
        # Define search queries
        queries = [
            "anesthesiology AND artificial intelligence",
            "anesthesiology AND machine learning",
            "anesthesiology AND Deep learning"
        ]

        # Perform search for each query
        for query in queries:
            link_list = perform_search(driver, query)

            # Process each link
            for link in link_list:
                # Extract article information using the data retrieval function
                article_type, journal_name, year, title, country, doi, abstract, keywords, link = extract_information(driver, link)

                # Add information to the DataFrame
                data['Article Name'].append(title)
                data['Link'].append(link)
                data['DOI Number'].append(doi)
                data['Country'].append(country)
                data['Year'].append(year)
                data['Journal Name'].append(journal_name)
                data['Abstract'].append(abstract)
                data['Keywords'].append(keywords)
                data['Article Type'].append(article_type)

    finally:
        # Close the browser
        driver.quit()

        # Create a DataFrame containing the data
        df = pd.DataFrame(data)

        # Save the DataFrame to a CSV file
        df.to_csv('article_information.csv', index=False)

# Main process: The part where the program is executed
main()


https://pubmed.ncbi.nlm.nih.gov/30973516/
https://pubmed.ncbi.nlm.nih.gov/31939856/
https://pubmed.ncbi.nlm.nih.gov/35164492/
https://pubmed.ncbi.nlm.nih.gov/37864754/
https://pubmed.ncbi.nlm.nih.gov/32287116/
https://pubmed.ncbi.nlm.nih.gov/35928743/
https://pubmed.ncbi.nlm.nih.gov/37194240/
https://pubmed.ncbi.nlm.nih.gov/29686578/
https://pubmed.ncbi.nlm.nih.gov/31845543/
https://pubmed.ncbi.nlm.nih.gov/37865848/
https://pubmed.ncbi.nlm.nih.gov/37307038/
https://pubmed.ncbi.nlm.nih.gov/38106626/
https://pubmed.ncbi.nlm.nih.gov/33785475/
https://pubmed.ncbi.nlm.nih.gov/32962932/
https://pubmed.ncbi.nlm.nih.gov/3057121/
https://pubmed.ncbi.nlm.nih.gov/37697415/
https://pubmed.ncbi.nlm.nih.gov/33713933/
https://pubmed.ncbi.nlm.nih.gov/32494358/
https://pubmed.ncbi.nlm.nih.gov/37336139/
https://pubmed.ncbi.nlm.nih.gov/32318789/
https://pubmed.ncbi.nlm.nih.gov/32287130/
https://pubmed.ncbi.nlm.nih.gov/38124357/
https://pubmed.ncbi.nlm.nih.gov/37572764/
https://pubmed.ncbi.nlm.nih.gov/361

In [5]:
df=pd.read_csv('article_information.csv') # read the csv file

In [6]:
df # show the dataframe

Unnamed: 0,Article Name,Link,DOI Number,Country,Year,Journal Name,Abstract,Keywords,Article Type
0,Artificial Intelligence and Machine Learning i...,https://pubmed.ncbi.nlm.nih.gov/30973516/,10.1097/ALN.0000000000002694,"Boston, Massachusetts.",2019,Anesthesiology,Commercial applications of artificial intellig...,"Review, Algorithms*, Anesthesiology / methods,...",Review
1,Artificial Intelligence in Anesthesiology: Cur...,https://pubmed.ncbi.nlm.nih.gov/31939856/,10.1097/ALN.0000000000002960,"Cambridge, Massachusetts (G.R.).",2020,Anesthesiology,Artificial intelligence has been advancing in ...,"Review, Anesthesiology / methods*, Anesthesiol...",Review
2,Artificial intelligence and telemedicine in an...,https://pubmed.ncbi.nlm.nih.gov/35164492/,10.23736/S0375-9393.21.16241-8,"Parma, Italy.",2022,Minerva Anestesiol,The application of novel technologies like art...,"Review, Research Support, Non-U.S. Gov't, Anes...",Review
3,Artificial intelligence and its clinical appli...,https://pubmed.ncbi.nlm.nih.gov/37864754/,10.1007/s10877-023-01088-0,,2023,J Clin Monit Comput,Purpose:\n \n \n Application ...,Review,Review
4,Artificial Intelligence in Anesthesiology: Hyp...,https://pubmed.ncbi.nlm.nih.gov/32287116/,10.1213/ANE.0000000000004751,,2020,Anesth Analg,,"Editorial, Anesthesiology / methods, Anesthesi...",Editorial
...,...,...,...,...,...,...,...,...,...
126,On the Horizon: Specific Applications of Autom...,https://pubmed.ncbi.nlm.nih.gov/38106626/,10.1007/s40140-023-00558-0,"MA, USA.",2023,Curr Anesthesiol Rep,Purpose of review:\n \n \n Th...,,
127,Patent and Bibliometric Analysis of the Scient...,https://pubmed.ncbi.nlm.nih.gov/37998496/,10.3390/healthcare11223003,"Kharkiv, Ukraine.",2023,Healthcare (Basel),This study conducted a comprehensive patent an...,,
128,Comparative analysis of popular predictors for...,https://pubmed.ncbi.nlm.nih.gov/36451753/,10.1016/j.heliyon.2022.e11761,"Limerick, Ireland.",2022,Heliyon,Difficult laryngoscopy is associated with airw...,,
129,Study on miR-384-5p activates TGF-β signaling ...,https://pubmed.ncbi.nlm.nih.gov/31813493/,10.1016/j.artmed.2019.101740,"Henan Province, China.",2019,Artif Intell Med,Background:\n \n \n Any ailme...,"Animals, Basal Ganglia / cytology, Basal Gangl...",
