# Collecting the records of InCites Journal Citation Reports (Web of Science)

In [None]:
# Importing the required libraries.
import re, csv, pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options

## 1. Getting the data from its URL

In [None]:
def init_webdriver(url, is_firefox=True,
                   executable_path="/home/breno/geckodriver/geckodriver"):
    """Start a browser session and load ``url`` in it.

    Args:
        url: Address of the page to open once the browser is running.
        is_firefox: When true (the default) a headless Firefox is started;
            otherwise the PhantomJS driver is used.
        executable_path: Location of the geckodriver binary handed to the
            Firefox driver. Ignored when ``is_firefox`` is false.

    Returns:
        The webdriver instance with ``url`` already loaded. The caller is
        responsible for calling ``quit()`` on it.
    """
    if is_firefox:
        # Defining the options of the Firefox webdriver (no visible window).
        options = Options()
        options.headless = True

        # Running the Firefox webdriver.
        driver = webdriver.Firefox(
            executable_path=executable_path, options=options)

        # The window sizing lives inside this branch because `options` only
        # exists here; referencing it on the PhantomJS path raised
        # UnboundLocalError.
        driver.set_window_size(1120, 550)
        driver.maximize_window()
    else:
        # Running the PhantomJS webdriver.
        driver = webdriver.PhantomJS()
        driver.set_window_size(1120, 550)

    # Getting the web page. If loading fails, close the browser before
    # propagating the error so no orphan browser process is left behind.
    try:
        driver.get(url)
    except Exception:
        driver.quit()
        raise

    return driver

In [None]:
def authenticate(driver, username=">>> VALID USER/E-MAIL <<<",
                 password=">>> YOUR PASSWORD <<<"):
    """Log in through the page's sign-in form and open the journal list.

    Args:
        driver: A webdriver whose current page contains elements with the
            ids ``username`` and ``password``.
        username: Account name or e-mail typed into the ``username`` field.
            The default is a placeholder and must be replaced.
        password: Password typed into the ``password`` field. The default
            is a placeholder; avoid committing real credentials.

    Raises:
        NoSuchElementException: If a login field is not found within the
            implicit-wait timeout.
        TimeoutException: If the "Browse journals" link does not become
            clickable within 120 seconds.
    """
    # Element lookups will poll for up to 10 seconds before failing. This is
    # a driver-wide timeout setting, not a fixed sleep.
    driver.implicitly_wait(10)

    # Authenticating with user's account data. `find_element(By.ID, ...)`
    # replaces the deprecated `find_element_by_id` helper.
    username_field = driver.find_element(By.ID, "username")
    password_field = driver.find_element(By.ID, "password")
    username_field.send_keys(username)
    password_field.send_keys(password)
    password_field.send_keys(Keys.RETURN)

    # Redirecting to the list of journals once the link is usable.
    browse_link = WebDriverWait(driver, 120).until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "a[title='Browse journals']")))
    browse_link.click()

In [None]:
# CSS path of the individual entries inside a cell that holds several values.
_MULTI_VALUE_SELECTOR = "span.multiple > mat-expansion-panel > div > div > span"


def _normalize_text(node):
    """Return the text of a parsed HTML node with whitespace runs collapsed."""
    # get_text() always yields a str, whereas `.string` is None for nodes
    # with several children and would make re.sub raise TypeError.
    return re.sub(r"\s+", " ", node.get_text(" ")).strip()


def _cell_value(cell):
    """Return a list of texts for a multi-value cell, else the cell's text."""
    items = cell.select(_MULTI_VALUE_SELECTOR)
    if items:
        return [_normalize_text(item) for item in items]
    return _normalize_text(cell)


def _parse_row(row):
    """Build one record dict from a ``mat-row`` element of the results table."""
    cells = row.select("mat-cell > span")
    return {
        "journal_name": _normalize_text(cells[0]),
        "issn": _normalize_text(cells[1]),
        "e_issn": _normalize_text(cells[2]),
        "category": _cell_value(cells[3]),
        "total_citations": _normalize_text(cells[4]),
        "impact_factor_2020": _normalize_text(cells[5]),
        "jif_quartile": _cell_value(cells[6]),
        "2020_jci": _normalize_text(cells[7]),
        "percent_oa_gold": _normalize_text(cells[8]),
    }


def get_data(url):
    """Scrape every page of the journal table reachable from ``url``.

    Opens a browser, logs in, then repeatedly parses the ``mat-row``
    elements of the current page and clicks the paginator's "next" button
    until that button is disabled or a Selenium wait/lookup fails.

    Args:
        url: Address passed to ``init_webdriver``.

    Returns:
        A list of dicts, one per table row, with the keys ``journal_name``,
        ``issn``, ``e_issn``, ``category``, ``total_citations``,
        ``impact_factor_2020``, ``jif_quartile``, ``2020_jci`` and
        ``percent_oa_gold``. ``category`` and ``jif_quartile`` are lists
        when the cell holds several entries, otherwise strings.

    Raises:
        Exception: Any error raised while parsing a row is re-raised after
            the row's index within its page has been printed.
    """
    next_button = (By.CSS_SELECTOR, "button.mat-paginator-navigation-next")

    # Getting webdriver.
    driver = init_webdriver(url)

    data = []
    try:
        # Authenticating the valid user.
        authenticate(driver)

        while True:
            try:
                # Waiting to load the records.
                WebDriverWait(driver, 120).until(EC.invisibility_of_element(
                    (By.CSS_SELECTOR, "div.backdrop")))

                # Defining the scraper.
                html_soup = BeautifulSoup(driver.page_source, "html.parser")

                # Getting the rows.
                for idx, row in enumerate(html_soup.find_all("mat-row")):
                    try:
                        data.append(_parse_row(row))
                    except Exception:
                        # Report which row of the page failed, then let the
                        # original error (with its traceback) propagate.
                        print(idx)
                        raise

                # Waiting only for the button to exist: a disabled button
                # never counts as "clickable", so waiting for clickability
                # on the last page would stall until the timeout expires.
                button = WebDriverWait(driver, 120).until(
                    EC.presence_of_element_located(next_button))
                driver.execute_script("arguments[0].scrollIntoView();", button)

                # A disabled "next" button marks the last page.
                if button.get_attribute("disabled"):
                    break

                # Clicking the button once it is actually usable.
                WebDriverWait(driver, 120).until(
                    EC.element_to_be_clickable(next_button)).click()
            except (NoSuchElementException, TimeoutException,
                    StaleElementReferenceException) as exc:
                # Best effort: keep what was collected so far, but say why
                # the pagination stopped early.
                print("Stopping pagination after {} records: {}.".format(
                    len(data), type(exc).__name__))
                break
    finally:
        # Closing the webdriver even when authentication or parsing fails.
        driver.quit()

    return data

In [None]:
# Address of the target page; it is handed to get_data() as the page to open.
url = "https://jcr.clarivate.com/jcr/home"

In [None]:
# Collecting the data: get_data() returns the scraped records for `url`.
data = get_data(url)

In [None]:
# Printing the number of records collected (the length of `data`).
print("Number of records collected: {}.".format(len(data)))

## 2. Saving the data collected

In [None]:
# Creating the dataframe object from the collected records.
df_data = pd.DataFrame(data)

In [None]:
# Preprocessing the data.
# Turn the textual "not available" markers into missing values.
df_data.replace({"n/a": None, "N/A": None}, inplace=True)

# Lists are unhashable, so drop_duplicates() cannot compare them; convert the
# list-valued cells to tuples column by column and leave other values as is.
for column in ("category", "jif_quartile"):
    df_data[column] = df_data[column].map(
        lambda value: tuple(value) if isinstance(value, list) else value)

# Keep the first occurrence of each fully identical row.
df_data.drop_duplicates(keep="first", inplace=True)

In [None]:
# Checking the information about the dataset by printing the dataframe's
# summary via DataFrame.info().
df_data.info()

In [None]:
# Exporting the data to the CSV file "jcr_2020_wos.csv" without the index
# column; csv.QUOTE_ALL is passed as the quoting mode.
df_data.to_csv("jcr_2020_wos.csv", index=False, quoting=csv.QUOTE_ALL)