# Cleaning and Preprocessing the Scopus publications related to COVID-19

For collecting the Scopus publications related to COVID-19, we used the "pybliometrics" library. It is avaliable on [https://pypi.org/project/pybliometrics/].

In [1]:
# Importing the required libraries.
import re, csv, pandas as pd, numpy as np

## 1. Generating the dataframe from the raw data

In [2]:
# Creating a dataframe from the raw data.
df_data = pd.read_csv("../../data/raw/scopus_raw.csv", header=0,
    dtype={"id": "str", "eid": "str", "pii": "str", "pubmed_id": "str"})

In [3]:
# Checking the dataframe.
df_data.head()

Unnamed: 0,id,doi,eid,pii,pubmed_id,title,abstract,description,publication_date,citation_num,...,vehicle_name,vehicle_address,title_edition,publisher,affiliations,subject_areas,authors,author_affil,ref_count,references
0,85084625311,10.1080/16549716.2020.1760490,2-s2.0-85084625311,,32404043.0,Eco-epidemiological assessment of the COVID-19...,"© 2020, © 2020 The Author(s). Published by Inf...","© 2020, © 2020 The Author(s). Published by Inf...",2020-12-31,0.0,...,Global Health Action,,,Taylor and Francis Ltd.,"({'id': '60031040', 'affiliation': 'Umeå Unive...","('Health Policy', 'Public Health, Environmenta...","({'id': '7006527895', 'name': 'Peter Byass'},)","({'id': '7006527895', 'name': 'Peter Byass', '...",19.0,"({'id': '85056407012', 'title': 'The calendar ..."
1,85084466053,10.1080/20008198.2020.1752504,2-s2.0-85084466053,,,Screening for consequences of trauma–an update...,"© 2020, © 2020 The Author(s). Published by Inf...","© 2020, © 2020 The Author(s). Published by Inf...",2020-12-31,0.0,...,European Journal of Psychotraumatology,,,Taylor and Francis Ltd.,"({'id': '60030118', 'affiliation': ""The Childr...","('Psychiatry and Mental Health',)","({'id': '56230641000', 'name': 'Miranda Olff'}...","({'id': '56230641000', 'name': 'Miranda Olff',...",31.0,"({'id': '85075168775', 'title': 'Trauma-relate..."
2,85084248154,10.1080/13814788.2020.1757312,2-s2.0-85084248154,,32349550.0,Family medicine in times of ‘COVID-19’: A gene...,,,2020-12-16,0.0,...,European Journal of General Practice,,,Taylor and Francis Ltd,"({'id': '60018869', 'affiliation': 'Maastricht...","('Family Practice',)","({'id': '6507333381', 'name': 'An de Sutter'},...","({'id': '6507333381', 'name': 'An de Sutter', ...",,
3,85082549986,10.1080/20964129.2020.1741325,2-s2.0-85082549986,,,Regulating wildlife conservation and food safe...,"© 2020, © 2020 The Author(s). Published by Tay...","© 2020, © 2020 The Author(s). Published by Tay...",2020-12-16,4.0,...,Ecosystem Health and Sustainability,,,Taylor and Francis Ltd.,"({'id': '60027363', 'affiliation': 'University...","('Ecology, Evolution, Behavior and Systematics...","({'id': '57216083373', 'name': 'Jingjing Yuan'...","({'id': '57216083373', 'name': 'Jingjing Yuan'...",17.0,"({'id': '85074624653', 'title': 'A New Twenty-..."
4,85086878414,10.1016/j.ces.2020.115918,2-s2.0-85086878414,S0009250920304504,,Analogies between SARS-CoV-2 infection dynamic...,© 2020 Elsevier LtdThe pandemic infection of S...,© 2020 Elsevier LtdThe pandemic infection of S...,2020-12-14,0.0,...,Chemical Engineering Science,,,Elsevier Ltd,"({'id': '60027245', 'affiliation': 'Universite...","('Chemistry (all)', 'Chemical Engineering (all...","({'id': '23470382500', 'name': 'F. Manenti'}, ...","({'id': '23470382500', 'name': 'F. Manenti', '...",36.0,"({'id': '85071770137', 'title': 'A dynamic neu..."


In [4]:
# Visualizing the information of dataset.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25030 entries, 0 to 25029
Data columns (total 29 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                25030 non-null  object 
 1   doi               24462 non-null  object 
 2   eid               25029 non-null  object 
 3   pii               8484 non-null   object 
 4   pubmed_id         17416 non-null  object 
 5   title             25029 non-null  object 
 6   abstract          12810 non-null  object 
 7   description       12810 non-null  object 
 8   publication_date  25029 non-null  object 
 9   citation_num      25029 non-null  float64
 10  language          24886 non-null  object 
 11  production_type   25029 non-null  object 
 12  source_type       25029 non-null  object 
 13  auth_keywords     11935 non-null  object 
 14  index_terms       7548 non-null   object 
 15  issn              25006 non-null  object 
 16  isbn              26 non-null     object

## 2. Cleaning and preprocessing the dataframe

In [5]:
# Removing the invalid articles.
df_data = df_data.loc[df_data.id.notnull() & df_data.eid.notnull()]

In [6]:
# Defining the "None" value for the "NaN" values.
df_data.replace({np.nan: None}, inplace=True)

In [7]:
# Defining the "zero" value for the articles without numbers of citation and references.
df_data.citation_num.loc[df_data.citation_num.isnull()] = 0
df_data.ref_count.loc[df_data.ref_count.isnull()] = 0

In [8]:
# Normalizing the feature "abstract".
df_data.abstract.loc[df_data.abstract.isnull() & df_data.description.notnull()] = df_data.description.loc[
    df_data.abstract.isnull() & df_data.description.notnull()]

In [9]:
# Normalizing the feature "vehicle_name".
df_data.vehicle_name.loc[df_data.conference_name.notnull() & df_data.vehicle_name.notnull()] = df_data.conference_name.loc[df_data.conference_name.notnull() & df_data.vehicle_name.notnull()]

In [10]:
# Removing unnecessary columns.
columns_drop = ["eid", "pii", "description", "isbn", "conf_location", "conference_name",
    "vehicle_address", "title_edition"]
df_data.drop(axis=1, columns=columns_drop, inplace=True)

In [11]:
# Changing the type of features.
df_data.loc[:, ["citation_num", "ref_count"]] = df_data.loc[
    :, ["citation_num", "ref_count"]].astype("int")
df_data.auth_keywords.loc[df_data.auth_keywords.notnull()] = df_data.auth_keywords.loc[
    df_data.auth_keywords.notnull()].apply(eval)
df_data.index_terms.loc[df_data.index_terms.notnull()] = df_data.index_terms.loc[
    df_data.index_terms.notnull()].apply(eval)
df_data.affiliations.loc[df_data.affiliations.notnull()] = df_data.affiliations.loc[
    df_data.affiliations.notnull()].apply(eval)
df_data.subject_areas.loc[df_data.subject_areas.notnull()] = df_data.subject_areas.loc[
    df_data.subject_areas.notnull()].apply(eval)
df_data.authors.loc[df_data.authors.notnull()] = df_data.authors.loc[df_data.authors.notnull()].apply(eval)
df_data.author_affil.loc[df_data.author_affil.notnull()] = df_data.author_affil.loc[
    df_data.author_affil.notnull()].apply(eval)
df_data.references.loc[df_data.references.notnull()] = df_data.references.loc[
    df_data.references.notnull()].apply(eval)
df_data.publication_date = pd.to_datetime(df_data.publication_date)

In [12]:
# Normalizing the feature "abstract".
df_data.abstract.loc[df_data.abstract.notnull()] = df_data.abstract.loc[df_data.abstract.notnull()].apply(
    lambda x: x.replace("\\u0019", "").replace("\\%", "%").replace("\\s", "s").strip())

In [13]:
# Normalizing the itens contained in the features "auth_keywords" and "index_terms".
df_data.auth_keywords.loc[df_data.auth_keywords.notnull()] = df_data.auth_keywords.loc[
    df_data.auth_keywords.notnull()].apply(
        lambda x: tuple([item.replace("\ufeff", "").strip() for item in x]))
df_data.index_terms.loc[df_data.index_terms.notnull()] = df_data.index_terms.loc[
    df_data.index_terms.notnull()].apply(
        lambda x: tuple([item.replace("\ufeff", "").strip() for item in x]))

In [14]:
# Normalizing the affiliations contained in the features "affiliations" and "author_affil".
df_data.affiliations.loc[df_data.affiliations.notnull()] = df_data.affiliations.loc[
    df_data.affiliations.notnull()].apply(lambda x: tuple(
        [{"id": affil["id"],
          "affiliation": affil["affiliation"].replace("\u200b", "").replace("\u202f", "").strip(),
          "country": affil["country"]}
         for affil in x]))
df_data.author_affil.loc[df_data.author_affil.notnull()] = df_data.author_affil.loc[
    df_data.author_affil.notnull()].apply(lambda x: tuple(
        [{"id": item["id"], "name": item["name"], "affil_id": item["affil_id"],
          "affiliation": item["affiliation"].replace("\u200b", "").replace("\u202f", "").strip() \
              if item["affiliation"] else None,
          "country": item["country"]}
         for item in x]))

In [15]:
# Checking the result.
df_data.head()

Unnamed: 0,id,doi,pubmed_id,title,abstract,publication_date,citation_num,language,production_type,source_type,...,index_terms,issn,vehicle_name,publisher,affiliations,subject_areas,authors,author_affil,ref_count,references
0,85084625311,10.1080/16549716.2020.1760490,32404043.0,Eco-epidemiological assessment of the COVID-19...,"© 2020, © 2020 The Author(s). Published by Inf...",2020-12-31,0,eng,Journal,j,...,"(Betacoronavirus, China, Coronavirus, Coronavi...",16549880,Global Health Action,Taylor and Francis Ltd.,"({'id': '60031040', 'affiliation': 'Umeå Unive...","(Health Policy, Public Health, Environmental a...","({'id': '7006527895', 'name': 'Peter Byass'},)","({'id': '7006527895', 'name': 'Peter Byass', '...",19,"({'id': '85056407012', 'title': 'The calendar ..."
1,85084466053,10.1080/20008198.2020.1752504,,Screening for consequences of trauma–an update...,"© 2020, © 2020 The Author(s). Published by Inf...",2020-12-31,0,eng,Journal,j,...,,20008066 20008198,European Journal of Psychotraumatology,Taylor and Francis Ltd.,"({'id': '60030118', 'affiliation': 'The Childr...","(Psychiatry and Mental Health,)","({'id': '56230641000', 'name': 'Miranda Olff'}...","({'id': '56230641000', 'name': 'Miranda Olff',...",31,"({'id': '85075168775', 'title': 'Trauma-relate..."
2,85084248154,10.1080/13814788.2020.1757312,32349550.0,Family medicine in times of ‘COVID-19’: A gene...,,2020-12-16,0,eng,Journal,j,...,"(Adaptation, Psychological, Coronavirus Infect...",17511402 13814788,European Journal of General Practice,Taylor and Francis Ltd,"({'id': '60018869', 'affiliation': 'Maastricht...","(Family Practice,)","({'id': '6507333381', 'name': 'An de Sutter'},...","({'id': '6507333381', 'name': 'An de Sutter', ...",0,
3,85082549986,10.1080/20964129.2020.1741325,,Regulating wildlife conservation and food safe...,"© 2020, © 2020 The Author(s). Published by Tay...",2020-12-16,4,eng,Journal,j,...,,23328878 20964129,Ecosystem Health and Sustainability,Taylor and Francis Ltd.,"({'id': '60027363', 'affiliation': 'University...","(Ecology, Evolution, Behavior and Systematics,...","({'id': '57216083373', 'name': 'Jingjing Yuan'...","({'id': '57216083373', 'name': 'Jingjing Yuan'...",17,"({'id': '85074624653', 'title': 'A New Twenty-..."
4,85086878414,10.1016/j.ces.2020.115918,,Analogies between SARS-CoV-2 infection dynamic...,© 2020 Elsevier LtdThe pandemic infection of S...,2020-12-14,0,eng,Journal,j,...,,00092509,Chemical Engineering Science,Elsevier Ltd,"({'id': '60027245', 'affiliation': 'Universite...","(Chemistry (all), Chemical Engineering (all), ...","({'id': '23470382500', 'name': 'F. Manenti'}, ...","({'id': '23470382500', 'name': 'F. Manenti', '...",36,"({'id': '85071770137', 'title': 'A dynamic neu..."


In [16]:
# Visualizing the information of dataset.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25029 entries, 0 to 25029
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                25029 non-null  object        
 1   doi               24462 non-null  object        
 2   pubmed_id         17416 non-null  object        
 3   title             25029 non-null  object        
 4   abstract          12810 non-null  object        
 5   publication_date  25029 non-null  datetime64[ns]
 6   citation_num      25029 non-null  object        
 7   language          24886 non-null  object        
 8   production_type   25029 non-null  object        
 9   source_type       25029 non-null  object        
 10  auth_keywords     11935 non-null  object        
 11  index_terms       7548 non-null   object        
 12  issn              25006 non-null  object        
 13  vehicle_name      25029 non-null  object        
 14  publisher         2502

## 3. Saving the dataframe

In [17]:
# Exporting the data to CSV file.
df_data.to_csv("../../data/prepared/scopus_covid_19.csv", index=False, quoting=csv.QUOTE_ALL)