# Collecting the records of InCites Journal Citation Reports (Web of Science)

In [1]:
# Importing the required libraries.
import asyncio
import csv
import os
import re
import time
import traceback

import pandas as pd
import playwright._impl._errors as errors
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from twisted.internet.error import TCPTimedOutError, TimeoutError

## 1. Defining the crawler class

In [2]:
class InCitesSpider:
    """Crawler that signs in to InCites Journal Citation Reports and scrapes the journal table.

    Typical usage (inside a running event loop, e.g. a notebook cell):

        spider = InCitesSpider(url, login, password)
        await spider.authenticate()
        await spider.parse_items()
        records = spider.get_data

    The authentication outcome is published through the environment variable
    ``USER_AUTHENTICATED`` as the text ``"True"`` or ``"False"``.
    """

    def __init__(self, url, login, password):
        """Store the target URL and the credentials; no browser is started here.

        Args:
            url: Address of the page opened by `authenticate`.
            login: User name / e-mail typed into the sign-in form.
            password: Password typed into the sign-in form.
        """
        self.__url_base = url
        self.__username = login
        self.__password = password
        # A failed call is retried while `num_attempt <= __max_attempts`, so with the value 1
        # and the default `num_attempt=0` a method runs at most three times.
        self.__max_attempts = 1
        self.__user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/122.0.0.0 Safari/537.36 OPR/108.0.0.0")
        self.__data = None
        self.__playwright = None
        self.__browser = None
        self.__page = None
        os.environ["USER_AUTHENTICATED"] = "False"

    @property
    def get_data(self):
        """Records collected by `parse_items` (list of dicts), or None before the first crawl."""
        return self.__data

    @staticmethod
    def _clean_text(text):
        """Collapse every whitespace run into one space and trim the ends; None becomes ""."""
        if text is None:
            return ""
        return re.sub(r"\s+", " ", text).strip()

    @staticmethod
    def _raw_text(node):
        """Return the text of a parsed HTML node.

        `node.string` is None when the node has more than one child, which used to make the
        whole crawl fail with a TypeError; in that case the concatenated text is used instead.
        """
        text = node.string
        return text if text is not None else node.get_text(" ")

    @classmethod
    def _cell_text(cls, node):
        """Return the whitespace-normalized text of a parsed HTML node."""
        return cls._clean_text(cls._raw_text(node))

    @classmethod
    def _parse_row(cls, row):
        """Convert one `mat-row` element of the journal table into a record dict.

        Raises:
            IndexError: If the row has fewer cells than the ten columns read here.
        """
        # Getting the columns/cells of data.
        cells = row.select("mat-cell > span")

        record = {
            "journal_name": cls._cell_text(cells[0]),
            "issn": cls._cell_text(cells[1]),
            "e_issn": cls._cell_text(cells[2]),
        }

        # Category and Edition: a journal with several categories has them in an expansion panel.
        categories = cells[3].select("span.multiple > mat-expansion-panel > div > div > div > span")
        if categories:
            record["category"] = tuple(cls._cell_text(item) for item in categories)
            editions = cells[4].select("span.table-cell-edition > mat-expansion-panel > div > div > span")
            record["edition"] = tuple(cls._cell_text(item) for item in editions)
        else:
            record["category"] = cls._cell_text(cells[3])
            record["edition"] = tuple(cls._clean_text(item) for item in cls._raw_text(cells[4]).split(","))

        # Total citations and 2023 JIF.
        record["total_citations"] = cls._cell_text(cells[5])
        record["impact_factor_2023"] = cls._cell_text(cells[6])

        # JIF Quartile (one value, or several inside an expansion panel).
        quartiles = cells[7].select("span.multiple > mat-expansion-panel > div > div > span")
        if quartiles:
            record["jif_quartile"] = tuple(cls._cell_text(item) for item in quartiles)
        else:
            record["jif_quartile"] = cls._cell_text(cells[7])

        # 2023 JCI and % of OA Gold.
        record["jci_2023"] = cls._cell_text(cells[8])
        record["percent_oa_gold"] = cls._cell_text(cells[9])
        return record

    async def __close(self):
        """Close the browser, stop Playwright and forget the handles.

        The handles are reset first so that a later `__get_html` call starts a fresh
        browser instead of reusing a stopped driver.
        """
        browser, playwright = self.__browser, self.__playwright
        self.__page = None
        self.__browser = None
        self.__playwright = None
        try:
            if browser is not None:
                await browser.close()
        finally:
            # The driver is stopped even when closing the browser fails.
            if playwright is not None:
                await playwright.stop()

    async def __get_html(self, url, css_selector=None, to_close=True):
        """Open `url` in the (lazily created) headless Chromium page and return its HTML.

        Args:
            url: Address to navigate to.
            css_selector: Optional selector that must appear before the HTML is read.
            to_close: When True, the browser and Playwright are shut down afterwards.
        """
        if self.__playwright is None:
            self.__playwright = await async_playwright().start()
        if self.__browser is None or not self.__browser.is_connected():
            self.__browser = await self.__playwright.chromium.launch(headless=True, args=["--start-maximized"])
        if self.__page is None or self.__page.is_closed():
            self.__page = await self.__browser.new_page(user_agent=self.__user_agent)
        await self.__page.goto(url)
        if css_selector is not None:
            await self.__page.wait_for_selector(css_selector)
        html = await self.__page.content()
        if to_close:
            await self.__close()
        return html

    async def __sign_in(self):
        """Run one sign-in attempt: open the page, log in, accept cookies, open the journal list."""
        # Initializing the webdriver.
        await self.__get_html(self.__url_base, to_close=False)
        page = self.__page

        # Clicking in the button "Sign In".
        button = page.locator("button[aria-label='Sign In']")
        if await button.is_visible() and await button.is_enabled():
            await button.click()

        # Authenticating with user's account data.
        username_field = page.locator("input#mat-input-0")
        await username_field.wait_for(state="visible")
        await username_field.fill(self.__username)
        password_field = page.locator("input#mat-input-1")
        await password_field.wait_for(state="visible")
        await password_field.fill(self.__password)
        await password_field.press("Enter")

        # Enabling the cookies.
        button = page.locator("button#onetrust-accept-btn-handler")
        await button.wait_for(state="visible", timeout=120000)
        if await button.is_visible() and await button.is_enabled():
            await button.click()

        # Redirecting the list of journals.
        button = page.locator("div[aria-label='Journals']")
        await button.scroll_into_view_if_needed()
        await button.wait_for(state="visible", timeout=120000)
        if await button.is_visible() and await button.is_enabled():
            await button.click()

    async def authenticate(self, num_attempt=0):
        """Sign in and navigate to the journal list, retrying on failure.

        Sets ``os.environ["USER_AUTHENTICATED"]`` to ``"True"`` on success and to ``"False"``
        otherwise. Errors are printed, never raised.

        Args:
            num_attempt: Number of attempts already consumed (callers normally omit it).

        Returns:
            True when the sign-in sequence completed, False when every attempt failed.
        """
        # A stale "True" from an earlier successful call must not survive a failed one.
        os.environ["USER_AUTHENTICATED"] = "False"
        while True:
            try:
                await self.__sign_in()
            except Exception as e:
                print(f"[ERROR-DEBUG] {e}: {self.__url_base}")
                print("".join(traceback.format_tb(e.__traceback__)))
                if num_attempt > self.__max_attempts:
                    return False
                num_attempt += 1
                print(f"Number of attempting in 'authenticate': {num_attempt}")
            else:
                os.environ["USER_AUTHENTICATED"] = "True"
                return True

    async def __collect_records(self):
        """Walk through every page of the journal table and append its rows to `self.__data`."""
        page = self.__page

        # Getting the number of journals.
        element = page.locator("div#liveRegion > p")
        await element.wait_for(state="visible")
        num_journals = await element.text_content()
        num_journals = int(num_journals.replace(",", "").split(" ")[0])

        # Setting the number of items per page.
        element = page.locator("mat-select[aria-label='Items per page:']")
        await element.wait_for(state="visible")
        await element.scroll_into_view_if_needed()
        await element.click()
        await page.locator("div[aria-label='Items per page:'] > mat-option:nth-child(5)").click()
        element = page.locator("mat-select[aria-label='Items per page:'] > div > div > span > span")
        num_item_per_page = int(await element.text_content())
        print("Number of Items per Page:", num_item_per_page)

        has_next_page = True
        while has_next_page:
            # Showing the progress (guarded so that an empty listing cannot divide by zero).
            collected = len(self.__data)
            percent = (collected / num_journals) * 100 if num_journals else 0.0
            print(f"Collected: {collected} of {num_journals} ({percent:.2f}%)")
            # Non-blocking pause: `time.sleep` here would freeze the whole event loop.
            await asyncio.sleep(10)

            # Waiting to load the records.
            await page.locator("div.backdrop").wait_for(state="hidden", timeout=120000)

            # Waiting to load the table of records.
            table = page.locator("section.table-section > mat-table[class*='mat-table']")
            await table.wait_for(state="visible", timeout=120000)
            if not await table.is_visible():
                break

            # Parsing a snapshot of the page and getting the rows.
            html_soup = BeautifulSoup(await page.content(), "html.parser")
            for idx, row in enumerate(html_soup.find_all("mat-row")):
                try:
                    self.__data.append(self._parse_row(row))
                except Exception:
                    # The index of the offending row is printed before the error propagates.
                    print(idx)
                    raise

            # Moving to the next page while the paginator allows it.
            button = page.locator("button.mat-paginator-navigation-next")
            await button.scroll_into_view_if_needed()
            has_next_page = await button.is_enabled()
            if has_next_page:
                await button.click()

    async def parse_items(self, num_attempt=0):
        """Scrape all pages of the journal table into `get_data`, retrying on failure.

        Must be called after a successful `authenticate`. Every attempt starts from an empty
        record list. Errors are printed, never raised. When the crawl succeeds the browser is
        closed; when every attempt fails the browser is left as it is.

        Args:
            num_attempt: Number of attempts already consumed (callers normally omit it).
        """
        while True:
            self.__data = []
            try:
                await self.__collect_records()
                break
            except Exception as e:
                print(f"[ERROR-DEBUG] {e}: {self.__url_base}")
                print("".join(traceback.format_tb(e.__traceback__)))
                if num_attempt > self.__max_attempts:
                    return
                num_attempt += 1
                print(f"Number of attempting in 'parse_items': {num_attempt}")

        # Closing the webdriver. This is kept out of the retry loop: a failure while closing
        # must not restart the crawl and throw away the records that were just collected.
        try:
            await self.__close()
        except Exception as e:
            print(f"[ERROR-DEBUG] {e}: {self.__url_base}")
            print("".join(traceback.format_tb(e.__traceback__)))

## 2. Getting the data from its URL

In [None]:
# Defining the credentials.
username = ">>> VALID USER/E-MAIL <<<"
password = ">>> YOUR PASSWORD <<<"

# Determining the URL of target page.
url = "https://jcr.clarivate.com/jcr/home"

# Creating the spider.
spider = InCitesSpider(url, username, password)

# Authenticating the valid user.
await spider.authenticate()
is_authenticated = bool(os.environ["USER_AUTHENTICATED"])

# Crawling the data.
if is_authenticated:
    await spider.parse_items()

In [6]:
# Getting the collected data (`get_data` is a property, so it is read without parentheses).
data = spider.get_data

In [None]:
# Printing the number of records collected (`data` is still None when no crawl has run).
num_records = len(data) if data is not None else 0
print(f"Number of records collected: {num_records}.")

## 3. Saving the collected data

In [10]:
# Creating the Pandas' DataFrame object from the collected records.
df_data = pd.DataFrame(data)

In [11]:
# Preprocessing the data.
# Placeholders for missing values become real nulls.
df_data.replace({"n/a": None, "N/A": None}, inplace=True)

# Lists are unhashable and would make `drop_duplicates` fail, so they are turned into tuples.
# The columns are processed one by one and skipped when absent (e.g. no record was collected).
for column in ("category", "edition", "jif_quartile"):
    if column in df_data.columns:
        df_data[column] = df_data[column].map(lambda value: tuple(value) if isinstance(value, list) else value)

df_data.drop_duplicates(keep="first", inplace=True)

In [None]:
# Checking the information about the dataset (`info()` prints its summary by itself).
df_data.info()

In [13]:
# Exporting the data to CSV file: the index is left out and every field is quoted.
df_data.to_csv("jcr_2023_wos.csv", index=False, quoting=csv.QUOTE_ALL)