# Scraping Retraction Database

Link: http://retractiondatabase.org/RetractionSearch.aspx

The first block contains:
- the necessary imports
- the helper functions used by the scraping steps below

In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time


"""
Function: insert_and_submit(value)

Handles the search form of the website.
In our case we need to select a journal from the list and submit.
"""
def insert_and_submit(value):
    """Fill in the journal field of the search form and submit the search.

    Types ``value`` into the journal input, waits for the autocomplete
    dropdown to show up, clicks its first suggestion and finally clicks
    the search button.

    Uses the module-level ``driver``; it is expected to already have the
    search page open.

    Args:
        value: Text typed into the journal field (e.g. a journal name).

    Raises:
        selenium.common.exceptions.TimeoutException: If no autocomplete
            suggestion becomes visible within 10 seconds.
    """
    # The ``find_element_by_*`` helpers were removed in Selenium 4.3;
    # ``find_element(By.<strategy>, ...)`` works on Selenium 3 and 4 alike.
    driver.find_element(By.ID, "txtSrchJournal").send_keys(value)

    # Wait for the dropdown menu to appear. The wait returns the first
    # matching element, so it can be clicked without a second lookup.
    first_suggestion = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, ".ui-menu-item"))
    )
    first_suggestion.click()

    # Short pause before submitting so the selected value is applied.
    time.sleep(1)

    # Submit the form.
    driver.find_element(By.ID, "btnSearch").click()
    

"""
Function extract_row_data(row)

Extracting data in one row; this function is called from within another function.
The purpose is to avoid overly convoluted functions.

TODO: rewrite to save from which column the data is
"""
def extract_row_data(row):
    """Return the text of every ``td`` cell found in one table row.

    Args:
        row: A Selenium element (a result-table row) whose ``td``
            descendants are read.

    Returns:
        A list of strings, one per ``td`` element, in the order Selenium
        returns them. The list is empty if the row has no ``td`` elements.

    TODO: rewrite to save from which column the data is
    """
    # ``find_elements_by_tag_name`` was removed in Selenium 4.3;
    # ``find_elements(By.TAG_NAME, ...)`` works on Selenium 3 and 4 alike.
    return [cell.text for cell in row.find_elements(By.TAG_NAME, "td")]


""" 
Function print_row_data(all_data_rows)

Prints a 'Row: N' header for each row, followed by each of the row's elements on its own line.
"""
def print_row_data(all_data_rows):
    """Print every row as a numbered header followed by its elements.

    For each row in ``all_data_rows`` a ``Row: N`` line is printed, where N
    counts up from 1, and then each element of that row is printed on a
    line of its own.
    """
    for number, cells in enumerate(all_data_rows, start=1):
        print(f"Row: {number}")
        for cell in cells:
            print(cell)
        
    


# Input - Block

The next block defines which journal should be scraped. <br>
Support for a list of journals, as well as other search parameters, can be added later.

In [None]:
# Other journal names; assign one of these to `journal` to scrape it instead.
#journal = "Digital Scholarship in the Humanities"
#journal = "Artificial Intelligence and Computational Intelligence (AICI 2009)"
#journal = "Artificial Intelligence in China (Proceedings of the 2nd International Conference on Artificial Intelligence in China)"

# ... 

# Journal name that insert_and_submit() types into the search form's journal
# field; the first autocomplete suggestion shown for it is the one selected.
journal = "Applied Artificial Intelligence"

In [8]:

# Step 1:
# Start Firefox through geckodriver and open the search page.
url = "http://retractiondatabase.org/RetractionSearch.aspx"
driver = webdriver.Firefox(
    executable_path='/Users/sebastian/Downloads/geckodriver'
)
driver.get(url)

# The page takes a while to load. This pause happens only once per run,
# so a generous fixed delay of 10 seconds is acceptable here.
time.sleep(10)

# Step 2:
# Enter the journal into the search form and submit it.
insert_and_submit(journal)

# Step 3:
# Wait up to 10 seconds for the result rows (tr elements with class "mainrow").
rows = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.mainrow"))
)

# Step 4:
# Collect the cell texts of every result row into one list of lists.
all_rows_data = [extract_row_data(result_row) for result_row in rows]

# Step 5:
# Print the collected data (exporting could be added here instead).
print_row_data(all_rows_data)

Row 1: ['', 'Wind Farm Layout Design Using Cuckoo Search Algorithms\n(B/T) Computer Science; (ENV) Environmental Sciences; (PHY) Energy;\nApplied Artificial Intelligence ---Taylor and Francis\nResearch Institute, King Fahd University of Petroleum & Minerals, Dhahran, Saudi Arabia Center of Intelligent Signal & Imaging Research, Universiti Teknologi PETRONAS, Bandar Seri Iskandar, Tronoh, Malaysia Computer Science Department, University of Pretoria, Pretoria, South Africa', '+Duplicate Publication through Error by Journal/Publisher', 'Shafiqur Rehman\nS S Ali\nS A Kahn', '10/11/2018\n00000000\n10.1080/08839514.2018.1525521', '06/30/2021\n00000000\n10.1080/08839514.2021.1940459', 'Research Article\nRetraction', 'Malaysia\nSaudi Arabia\nSouth Africa\nNo']
Row 2: ['', 'Automated Text Summarization: An Overview\n(B/T) Data Science;\nApplied Artificial Intelligence ---Taylor and Francis\nComputer Department , University College of Nabi Akram , Tabriz , Iran Computer Department , University o

In [13]:
print_row_data(all_rows_data)

Row: 1

Wind Farm Layout Design Using Cuckoo Search Algorithms
(B/T) Computer Science; (ENV) Environmental Sciences; (PHY) Energy;
Applied Artificial Intelligence ---Taylor and Francis
Research Institute, King Fahd University of Petroleum & Minerals, Dhahran, Saudi Arabia Center of Intelligent Signal & Imaging Research, Universiti Teknologi PETRONAS, Bandar Seri Iskandar, Tronoh, Malaysia Computer Science Department, University of Pretoria, Pretoria, South Africa
+Duplicate Publication through Error by Journal/Publisher
Shafiqur Rehman
S S Ali
S A Kahn
10/11/2018
00000000
10.1080/08839514.2018.1525521
06/30/2021
00000000
10.1080/08839514.2021.1940459
Research Article
Retraction
Malaysia
Saudi Arabia
South Africa
No
Row: 1

Automated Text Summarization: An Overview
(B/T) Data Science;
Applied Artificial Intelligence ---Taylor and Francis
Computer Department , University College of Nabi Akram , Tabriz , Iran Computer Department , University of Tabriz , Tabriz , Iran
+Euphemisms for Plagi