# Collecting Scopus publications related to Tiny GenAI from a search string

To collect the Scopus publications related to Tiny GenAI, we used the "pybliometrics" library. It is available at https://pypi.org/project/pybliometrics/.

In [None]:
# Uncomment to install the library.
# %pip install pybliometrics

In [None]:
# Importing the required libraries.
import csv, re, pandas as pd, numpy as np
from datetime import datetime
from pybliometrics.scopus import config, AbstractRetrieval, ScopusSearch
from pybliometrics.scopus.utils import create_config
from pybliometrics.scopus.exception import Scopus404Error, Scopus429Error, Scopus500Error
from urllib.error import HTTPError
from urllib3.exceptions import ConnectionError, NewConnectionError

## 1. Getting the data with the "pybliometrics" library

In [None]:
# Uncomment to create the pybliometrics configuration file on first use.
# create_config()

In [None]:
# Setting the Scopus API Key.
# NOTE(review): the fallback keys below are stored in plain text in this notebook,
# so anyone with access to the file can use them. Prefer exporting SCOPUS_API_KEYS
# (comma-separated list of keys) and rotating/removing the embedded ones.
import os

_env_keys = [k.strip() for k in os.environ.get("SCOPUS_API_KEYS", "").split(",") if k.strip()]
_keys = _env_keys or [
    "92000a2d452f9ea0c044912dbf77da5c", "af387293c71c3a147da20311592ba45d", "a4dbae0606ee36026dea368046c1c256",
    "c6259162f73a52eb3727e8e926a5c966", "cdfde14b2456b74716586a3581602686", "1d721d4f3bdb0ebcad44aafd34b1068c",
    "49302964a02a015b80cd00a8a9db57f0", "3ad76d12b8f5eaf801978c83b8b442ee", "c76b3904d998b86eae87bad221110569",
    "eb278f648acbc07067e819fe18d53fa0",
]
# pop() removes the key it returns (the last one), so whatever stays in _keys
# is the pool of spare keys used by the collection step when a key is rejected
# with a Scopus429Error.
config["Authentication"]["APIKey"] = _keys.pop()

### 1.1. Getting the list of publications' EIDs using *ScopusSearch*

In [None]:
# Defining the query: documents whose title, abstract or keywords mention
# (lightweight OR tiny) together with a generative-AI / LLM term.
query = (
    'TITLE-ABS-KEY('
    '(lightweight OR tiny) AND '
    '("generative artificial intelligence" OR "large language model" OR llm)'
    ')'
)

In [None]:
# Running the search on Scopus. refresh=True is passed so the results are
# downloaded again on every run; the "STANDARD" view is requested.
scopus = ScopusSearch(
    query,
    refresh=True,
    view="STANDARD",
)

In [None]:
# Reporting how many records the search returned.
print(
    "Number of records collected: {}.".format(
        scopus.get_results_size()
    )
)

In [None]:
# Getting the list of manuscripts' EIDs returned by the search.
# This list is the input of collect_data_manuscripts() in section 1.2.
list_eids_documents = scopus.get_eids()

### 1.2. Getting the publications' data from the list of EIDs

The features extracted from the Scopus articles are:
* id (identifier): the Scopus' identifier key of a manuscript.
* doi: the DOI of a manuscript.
* eid: the EID identifier of a manuscript.
* pii: the PII (Publisher Item Identifier) of a manuscript.
* pubmed_id: the PubMed/MEDLINE identifier of a manuscript.
* title: the title of a manuscript.
* abstract: the abstract of a manuscript.
* description: the description of a manuscript as returned by Scopus (typically the abstract text; useful when "abstract" is empty).
* publication_date (coverDate): the date of publication of a manuscript.
* citation_num (citedby_count): the number of citations of a manuscript.
* language: the language/idiom of a manuscript.
* production_type (aggregationType): the category/type/classification of source of a manuscript.
* source_type (srctype): the category/type/classification of source of a manuscript. It is a short version of feature "production_type".
* auth_keywords (authkeywords): the list of keywords defined by the authors of a manuscript.
* index_terms (idxterms): the list of indexed terms defined by Scopus.
* issn: the ISSN/E-ISSN of a manuscript.
* isbn: the ISBNs of a manuscript.
* conf_location (conflocation): the place where a conference took place.
* conference_name (confname): the name of a conference of a manuscript.
* vehicle_name (publicationName): the name of source where a manuscript was published.
* vehicle_address (publisheraddress): the address of source where a manuscript was published.
* title_edition (issuetitle): the name of edition/issue of a journal where a manuscript was published.
* publisher: the name of publisher that published a manuscript. Requires the view "FULL".
* affiliations (affiliation): the list of affiliations (Scopus ID, country and name of affiliation) contained in a manuscript.
* subject_areas: the list of subject/study fields of a manuscript. Requires the view "FULL".
* authors: the list of authors (Scopus ID and name) contained in a manuscript.
* author_affil (authorgroup): the list of authors organized with their affiliations. The combination of the features "authors" and "affiliations".
* ref_count (refcount): the number of references contained in a manuscript. Requires the view "FULL".
* references: the list of references data (authors, title, DOI and Scopus ID). Requires the view "FULL".

In [None]:
# Helpers and function to collect manuscripts' data.
def _text_or_none(value):
    """Return ``str(value)``, or None when *value* is None or renders as ''.

    Guards against ``str(None)`` producing the truthy literal "None", which
    would otherwise be stored as if it were a real identifier.
    """
    if value is None:
        return None
    return str(value) or None


def _attr(item, name, as_text=False):
    """Return ``item.<name>`` or None when *item* or the attribute is empty.

    With ``as_text=True`` the value is converted to a string (None stays None).
    """
    if not item:
        return None
    value = getattr(item, name)
    return _text_or_none(value) if as_text else (value or None)


def _full_name(person):
    """Return "given surname", whichever part exists, or None if neither does."""
    if not person:
        return None
    parts = [part for part in (person.given_name, person.surname) if part]
    return " ".join("{}".format(part) for part in parts) or None


def _fill_record(record, paper):
    """Copy the fields of interest from a retrieved document into *record*.

    The dict is filled in place so that, if reading a property fails half-way,
    the caller still keeps the fields gathered up to that point.
    """
    # Basic Attributes.
    record["id"] = str(paper.identifier)
    record["doi"] = paper.doi
    record["eid"] = str(paper.eid)
    record["pii"] = paper.pii
    record["pubmed_id"] = paper.pubmed_id
    record["title"] = paper.title
    record["abstract"] = paper.abstract
    record["description"] = paper.description

    try:
        record["publication_date"] = (
            datetime.strptime(paper.coverDate, "%Y-%m-%d").date()
            if paper.coverDate else None
        )
    except ValueError:
        # The cover date is not in YYYY-MM-DD format: keep the raw text and report it.
        record["publication_date"] = str(paper.coverDate)
        print(record["id"], "-", record["publication_date"])

    record["citation_num"] = paper.citedby_count
    record["language"] = paper.language
    record["production_type"] = paper.aggregationType
    record["source_type"] = paper.srctype
    record["auth_keywords"] = tuple(paper.authkeywords) if paper.authkeywords else None
    record["index_terms"] = tuple(paper.idxterms) if paper.idxterms else None
    record["issn"] = paper.issn

    try:
        # Several ISBNs come as a tuple; they are stored as one space-separated string.
        record["isbn"] = " ".join(paper.isbn) if isinstance(paper.isbn, tuple) else paper.isbn
    except TypeError:
        record["isbn"] = None

    # Conference and/or Journals data.
    record["conf_location"] = paper.conflocation

    try:
        record["conference_name"] = paper.confname
    except AttributeError:
        record["conference_name"] = None

    record["vehicle_name"] = paper.publicationName
    record["vehicle_address"] = paper.publisheraddress
    record["title_edition"] = paper.issuetitle
    record["publisher"] = paper.publisher

    # Affiliation.
    affiliations = paper.affiliation
    record["affiliations"] = tuple(
        {"id": _attr(affil, "id", as_text=True),
         "affiliation": _attr(affil, "name"),
         "country": _attr(affil, "country")}
        for affil in affiliations) if affiliations else None

    # Subject Areas.
    subject_areas = paper.subject_areas
    record["subject_areas"] = tuple(
        {"area": area.area, "code": str(area.code), "abbrev": area.abbreviation}
        for area in subject_areas) if subject_areas else None

    # Authors.
    authors = paper.authors
    record["authors"] = tuple(
        {"id": _attr(author, "auid", as_text=True),
         "name": _full_name(author)}
        for author in authors) if authors else None

    # Authors together with their affiliations.
    try:
        author_groups = paper.authorgroup
        record["author_affil"] = tuple(
            {"id": _attr(author, "auid", as_text=True),
             "name": _full_name(author),
             "affil_id": _attr(author, "affiliation_id", as_text=True),
             "affiliation": _attr(author, "organization"),
             "country": _attr(author, "country")}
            for author in author_groups) if author_groups else None
    except AttributeError:
        print("Error: ", paper.authorgroup)
        record["author_affil"] = None

    # References.
    record["ref_count"] = paper.refcount or None
    references = paper.references
    record["references"] = tuple(
        {"id": _attr(ref, "id", as_text=True),
         "title": _attr(ref, "title"),
         "doi": _attr(ref, "doi"),
         "authors": _attr(ref, "authors")}
        for ref in references) if references else None


def collect_data_manuscripts(list_eids_documents):
    """Retrieve the Scopus data of every EID and return one dict per document.

    Each EID is fetched with ``AbstractRetrieval`` (view "FULL", refresh=True).
    When the request is rejected with ``Scopus429Error`` the API key in
    ``config`` is replaced by the next spare key from the module-level
    ``_keys`` list and the same EID is requested again. Any other failure is
    reported on stdout and the document is kept as a record that holds the
    fields read so far, with "id" set to the EID.

    Args:
        list_eids_documents: iterable of Scopus EIDs.

    Returns:
        list of dicts, in the same order as the input.

    Raises:
        Scopus429Error: when a request is rejected and no spare key is left.
    """
    data = []
    for key in list_eids_documents:
        record = {}
        while True:
            try:
                paper = AbstractRetrieval(key, id_type="eid", view="FULL", refresh=True)
                _fill_record(record, paper)
            except Scopus429Error:
                # Check the pool *before* popping: the last spare key must still
                # get its retry, and an empty pool must surface the 429 error
                # itself rather than an IndexError from pop().
                if not _keys:
                    raise
                config["Authentication"]["APIKey"] = _keys.pop()
                continue
            except Exception as exc:
                # Best effort: covers the expected failures (Scopus404Error,
                # Scopus500Error, HTTPError, KeyError, connection errors) and
                # anything unexpected, so one bad document does not abort the run.
                record["id"] = str(key)
                print(key, "-", repr(exc))
            break
        data.append(record)
    return data

In [None]:
# Retrieving the full record of every EID found by the search.
data = collect_data_manuscripts(list_eids_documents=list_eids_documents)

## 2. Saving the data collected

In [None]:
# Writing the collected records to a CSV file: every field is quoted and the
# DataFrame index is left out.
pd.DataFrame(data).to_csv(
    "../../data/raw/scopus_raw.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
)