This notebook

- consolidates the CORD-19 dataset with external metadata from Altmetric, Scimago Journal Rank and Crossref
- generates CovidBERT embeddings from the titles and excerpts

In [None]:
%config Completer.use_jedi=False
%matplotlib inline

In [None]:
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

import os
import csv
import glob
import json
import re
import pickle
from multiprocessing import Pool
from IPython.display import display, Latex, HTML, FileLink
import joblib
import requests
import urllib

from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
tqdm_notebook().pandas()

import semanticscholar as sch
from langdetect import detect
from crossref.restful import Works, Journals
from altmetric import Altmetric

from sklearn.metrics.pairwise import cosine_similarity
import torch
from sentence_transformers import SentenceTransformer, models
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModel

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation 
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabasz_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

- Put the unzipped data from Kaggle inside the `root_path` folder.
- Intermediate and final results will be saved in the `export_path` folder.

In [None]:
# Folder that must contain the unzipped Kaggle data (read below as
# metadata.csv and document_parses/).
root_path = "../data/kaggle_data_v2/"
# Folder receiving every checkpoint and exported artefact of this notebook.
export_path = "../data/exports_v3/"

# exist_ok=True keeps the cell idempotent on re-runs and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(export_path, exist_ok=True)

Get paths of all papers (in json format)

In [None]:
# glob yields paths in arbitrary (filesystem-dependent) order. Sorting makes the
# run reproducible: the later title de-duplication keeps the first occurrence,
# so the kept paper would otherwise depend on directory listing order.
json_pattern = os.path.join(root_path, "document_parses/**/*.json")
all_json = sorted(glob.glob(json_pattern, recursive=True))

Load metadata of each paper:

In [None]:
# Identifier columns are read as strings so pandas does not infer a numeric
# dtype for them.
metadata_csv = os.path.join(root_path, 'metadata.csv')
identifier_dtypes = {
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str,
}
meta_df = pd.read_csv(metadata_csv, low_memory=False, dtype=identifier_dtypes)

Define a FileReader class to parse each paper


In [None]:
class FileReader:
    """Parse one full-text paper stored as JSON into flat text fields.

    Attributes:
        paper_id: value of the ``paper_id`` key of the document.
        abstract: texts of the ``abstract`` entries joined with ``'. '``; the
            placeholder ``"No abstract available"`` is appended when the
            section is missing or malformed.
        body_text: texts of the ``body_text`` entries joined with ``'. '``.

    Raises:
        KeyError: if the document has no ``paper_id`` or ``body_text`` key.
    """

    def __init__(self, file_path):
        # JSON is UTF-8 by specification; do not rely on the platform default
        # encoding, which differs between operating systems.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)

        self.paper_id = content['paper_id']

        abstract_parts = []
        try:
            for entry in content['abstract']:
                abstract_parts.append(entry['text'])
        except (KeyError, TypeError):
            # Missing key, null section or malformed entry: fall back to the
            # placeholder instead of hiding every possible error (a bare
            # except would also swallow KeyboardInterrupt).
            abstract_parts.append("No abstract available")

        body_parts = [entry['text'] for entry in content["body_text"]]

        self.abstract = '. '.join(abstract_parts)
        self.body_text = '. '.join(body_parts)

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

Loop over the papers and extract information:

In [None]:
# Column-oriented accumulator: one list per output column, one element per
# paper that has both a parsed full text and a metadata row.
dict_ = {'paper_id': [], 
         'abstract': [], 
         'body_text': [], 
         'authors': [], 
         'title': [], 
         'journal': [], 
         'publish_time': [], 
         # NOTE(review): never populated in this cell; kept only so the keys of
         # dict_ stay unchanged for any code that inspects them.
         'abstract_summary': [],
         'doi': [],
         'url': [],
         'source_x': []
        }

# Build the sha -> metadata lookup once. Filtering meta_df inside the loop
# scans the whole metadata table for every paper (papers x metadata rows).
# keep='first' reproduces the previous ".values[0]" choice when a sha
# appears on several metadata rows.
meta_fields = ['authors', 'title', 'journal', 'publish_time', 'doi', 'source_x', 'url']
meta_by_sha = (
    meta_df
    .dropna(subset=['sha'])
    .drop_duplicates(subset='sha', keep='first')
    .set_index('sha')[meta_fields]
    .to_dict('index')
)

for entry in tqdm_notebook(all_json, total=len(all_json)):
    content = FileReader(entry)

    # get metadata information; no metadata, skip this paper
    meta_data = meta_by_sha.get(content.paper_id)
    if meta_data is None:
        continue

    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)

    # authors are ';'-separated in the metadata; a missing value (NaN) is
    # stored unchanged
    authors = meta_data['authors']
    if isinstance(authors, str):
        authors = ". ".join(authors.split(';'))
    dict_['authors'].append(authors)

    # remaining metadata columns are copied verbatim
    for field in ('title', 'journal', 'publish_time', 'doi', 'source_x', 'url'):
        dict_[field].append(meta_data[field])

Convert to dataframe

In [None]:
# Output columns, in the order they appear in the consolidated table.
columns = [
    'paper_id',
    'abstract',
    'body_text',
    'authors',
    'title',
    'journal',
    'publish_time',
    'doi',
    'source_x',
    'url',
]

df_covid = pd.DataFrame(data=dict_, columns=columns)

Remove duplicates in titles and null values within the body text: we keep full-text papers only.

In [None]:
# One row per title, with a body text and a non-null title; index renumbered.
df_covid = (
    df_covid
    .drop_duplicates(subset=['title'])
    .dropna(subset=['body_text'])
    .loc[lambda frame: ~frame['title'].isnull()]
    .reset_index(drop=True)
)

Create an `is_covid` flag for each paper based on a predefined keyword list and the publication date

In [None]:
# Keywords whose presence in the (lowercased) body text marks a paper as
# COVID-19 related. They are lowercased and OR-ed into one compiled pattern.
_covid_keywords = (
    'covid', 'coronavirus disease 19', 'sars cov 2', '2019 ncov', '2019ncov',
    '2019 n cov', '2019n cov', 'ncov 2019', 'n cov 2019', 'coronavirus 2019',
    'wuhan pneumonia', 'wuhan virus', 'wuhan coronavirus', 'coronavirus 2',
    'covid-19', 'SARS-CoV-2', '2019-nCov',
)
covid_terms = re.compile('|'.join(keyword.lower() for keyword in _covid_keywords))

def checkYear(date):
    """Return the year encoded in the first four characters of ``date``.

    Args:
        date: publication date as a string starting with a four-digit year
            (e.g. ``'2020-03-15'`` or ``'2020'``).

    Returns:
        The year as an int, or 0 when ``date`` is missing (e.g. NaN read by
        pandas) or does not start with a number. 0 fails both the
        ``> 2019`` and the ``>= 2010`` comparisons applied in this notebook,
        so undated papers are treated as old instead of crashing the apply.
    """
    try:
        return int(date[0:4])
    except (TypeError, ValueError):
        # TypeError: non-subscriptable value such as float NaN;
        # ValueError: string whose first four characters are not an integer.
        return 0

def checkCovid(row, covid_terms):
    """Tell whether a paper is COVID-19 related.

    A paper qualifies when its lowercased ``body_text`` matches the compiled
    ``covid_terms`` pattern and its ``publish_time`` year is after 2019.
    """
    mentions_covid = bool(covid_terms.search(row['body_text'].lower()))
    if not mentions_covid:
        return False
    return checkYear(row['publish_time']) > 2019


df_covid['is_covid'] = df_covid.progress_apply(
    checkCovid,
    axis=1,
    covid_terms=covid_terms,
)

Restrict to articles published in 2010 or later

In [None]:
# Keep papers whose publication year is 2010 or later, then renumber the rows.
publication_years = df_covid['publish_time'].map(checkYear)
df_covid = df_covid[publication_years >= 2010].reset_index(drop=True)

Clean the body text

In [None]:
def preprocessing(text):
    """Clean raw paper text and return it lowercased.

    The rules run in a fixed order (later rules rely on earlier ones):
    e-mail addresses, DOI links and other URLs are removed; letter-spaced
    words and citation markers are dropped; decimal points become commas and
    full stops not followed by an uppercase letter are removed, so that only
    sentence-ending full stops remain; finally whitespace and repeated dots
    are collapsed.

    Args:
        text: raw text of a paper section.

    Returns:
        The cleaned, lowercased text.
    """
    # remove mail (case-insensitive: the text is only lowercased at the very end)
    text = re.sub(r'[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}', ' ', text, flags=re.IGNORECASE)
    # replace doi links with a placeholder
    text = re.sub(r'https\:\/\/doi\.org[^\s]+', 'DOI', text)
    # remove URLs, optionally wrapped in parentheses; the URL stops at the first
    # whitespace or ')' so a bare URL no longer swallows the text that follows it
    text = re.sub(r'(\()?\s?http(s)?\:\/\/[^\s\)]+(\))?', ' ', text)
    # remove single characters repeated at least 3 times for spacing error
    # (e.g. s u m m a r y); \b keeps the last letter of the preceding word
    text = re.sub(r'(\b\w\s+){3,}', ' ', text)
    # replace tags (e.g. [3] [4] [5]) with whitespace
    text = re.sub(r'(\[\d+\]\,?\s?){3,}(\.|\,)?', ' ', text)
    # replace tags (e.g. [3, 4, 5]) with whitespace
    text = re.sub(r'\[[\d\,\s]+\]', ' ', text)
    # replace tags (e.g. (NUM1) repeated at least 3 times with whitespace
    text = re.sub(r'(\(\d+\)\s){3,}', ' ', text)
    # replace '1.3' with '1,3' so the decimal point is not mistaken for a full stop
    text = re.sub(r'(\d+)\.(\d+)', r'\g<1>,\g<2>', text)
    # remove all full stops as abbreviations (e.g. i.e. cit. and so on)
    text = re.sub(r'\.(\s)?([^A-Z\s])', r' \g<1>\g<2>', text)
    # correctly spacing the tokens
    text = re.sub(r' {2,}', ' ', text)
    text = re.sub(r'\.{2,}', '.', text)
    # return lowercase text
    return text.lower()

# Cleaned, lowercased version of every body text (progress bar via tqdm).
df_covid = df_covid.assign(
    preproc_body_text=df_covid['body_text'].progress_apply(preprocessing)
)

Detect the language in each paper and restrict to english papers only

In [None]:
def detect_language(row):
    """Return the language code detected for one paper.

    The cleaned body text is tried first and the title second; ``'NC'`` is
    returned when detection fails on both.

    NOTE(review): langdetect is reported to be non-deterministic unless its
    DetectorFactory seed is fixed — confirm and seed it if reproducible
    language tags are required.
    """
    # "except Exception" (not a bare except) so that KeyboardInterrupt can still
    # stop this long-running apply.
    try:
        return detect(row['preproc_body_text'])
    except Exception:
        try:
            return detect(row['title'])
        except Exception:
            return 'NC'

# Tag every paper with its detected language and keep the English ones only.
detected_languages = df_covid.progress_apply(detect_language, axis=1)
df_covid['language'] = detected_languages
df_covid = df_covid.loc[df_covid['language'] == 'en'].reset_index(drop=True)

Checkpoint

In [None]:
# Checkpoint: write the current consolidated table to the export folder.
checkpoint_file = os.path.join(export_path, 'metadata.csv')
df_covid.to_csv(checkpoint_file, index=False)

### Crossref data

- Crossref is an API that, given a DOI, returns additional metadata for each paper.
- We use it to check whether the article is a preprint or not.

In [None]:
# Distinct, non-null DOIs to look up in the external services.
doi_list = df_covid['doi'].dropna().unique().tolist()

In [None]:
# True: query Crossref for every DOI and cache the result in crossref.joblib.
# False: reuse the cached result of a previous run.
scrape_crossref = True

if scrape_crossref:
    
    works = Works(request_params={'timeout': 4})

    def get_crossref_data(doi):
        """Return the Crossref record of ``doi``, or None when the lookup fails."""
        try:
            return works.doi(doi)
        except Exception:
            # Best effort per DOI: one failed request (for instance the 4 s
            # timeout) must not abort the whole pool. None results are
            # filtered out and counted below.
            return None
    
    with Pool(processes=12) as pool, tqdm_notebook(total=len(doi_list)) as pbar:
        crossref_data = []
        for info in pool.imap_unordered(get_crossref_data, doi_list):
            crossref_data.append(info)
            pbar.update()

    crossref_data = [d for d in crossref_data if d is not None]
    # Make skipped lookups visible instead of dropping them silently.
    print(f"Crossref: {len(crossref_data)} records retrieved for {len(doi_list)} DOIs")
    df_crossref = pd.DataFrame(crossref_data)
    joblib.dump(df_crossref, os.path.join(export_path, 'crossref.joblib'))    
    
else:
    df_crossref = joblib.load(os.path.join(export_path, 'crossref.joblib'))

# reindex creates the columns as NaN if no record carried them (avoids a
# KeyError); one row per DOI so the left merge cannot duplicate papers.
crossref_subtype = (
    df_crossref
    .reindex(columns=['DOI', 'subtype'])
    .drop_duplicates(subset='DOI')
)
df_covid = df_covid.merge(crossref_subtype, how='left', left_on='doi', right_on='DOI')
df_covid = df_covid.drop(columns=['DOI'])
# A paper is a preprint when Crossref reports subtype == "preprint".
df_covid['preprint'] = df_covid['subtype'].map(lambda st: st == "preprint")
df_covid = df_covid.drop(columns=['subtype'])

Checkpoint

In [None]:
# Checkpoint: write the table enriched with the preprint flag.
checkpoint_file = os.path.join(export_path, 'metadata.csv')
df_covid.to_csv(checkpoint_file, index=False)

### Altmetric data

We use altmetric API to fetch social metadata on each article:

- readers count
- citations in posts
- retweets
- citations in facebook walls
- citations in Wikipedia

In [None]:
# True: query Altmetric for every DOI and cache the result in altmetrics.joblib.
# False: reuse the cached result of a previous run.
scrape_altmetrics = True

if scrape_altmetrics:
    # The client is created before the helper that uses it.
    a = Altmetric()

    def get_altmetric_data(doi):
        """Return the Altmetric record of ``doi``, or None when the lookup fails."""
        try:
            return a.doi(doi)
        except Exception:
            # Best effort per DOI: one failed request must not discard the
            # records already fetched. None results are filtered out and
            # counted below.
            return None

    altmetrics_data = [get_altmetric_data(doi) for doi in tqdm_notebook(doi_list)]
    altmetrics_data = [d for d in altmetrics_data if d is not None]
    # Make skipped lookups visible instead of dropping them silently.
    print(f"Altmetric: {len(altmetrics_data)} records retrieved for {len(doi_list)} DOIs")
    
    altmetrics_columns = [
        'doi',
        'score',
        'readers_count',
        'cited_by_posts_count',
        'cited_by_tweeters_count',
        'cited_by_fbwalls_count',
        'cited_by_wikipedia_count',
        'subjects',
    ]

    df_altmetrics = pd.DataFrame(altmetrics_data, columns=altmetrics_columns)
    joblib.dump(df_altmetrics, os.path.join(export_path, 'altmetrics.joblib'))

else:
    df_altmetrics = joblib.load(os.path.join(export_path, 'altmetrics.joblib'))

# One row per DOI so the left merge cannot duplicate papers.
df_covid = df_covid.merge(df_altmetrics.drop_duplicates(subset='doi'), how='left', on='doi')

Checkpoint

In [None]:
# Checkpoint: write the table enriched with the Altmetric columns.
checkpoint_file = os.path.join(export_path, 'metadata.csv')
df_covid.to_csv(checkpoint_file, index=False)

### Scraping H-Index

Scrape the H-index of the journal of each paper from Scimago

In [None]:
# True: scrape scimagojr.com for every journal and cache the result in
# scimago.joblib. False: reuse the cached result of a previous run.
scrape_scimago = True

if scrape_scimago:

    def parse_page(url):
        """Download ``url`` and return it parsed by BeautifulSoup (lxml parser)."""
        # The timeout prevents a stalled connection from blocking a worker forever.
        return BeautifulSoup(requests.get(url, timeout=30).content, 'lxml')
    
    journals = df_covid.journal.dropna().unique().tolist()    

    def get_h_index(url):
        """Return the text of the 'hindexnumber' div of a journal page, or None."""
        soup = parse_page(url)
        h_index_div = soup.find('div', {'class': 'hindexnumber'})
        if h_index_div is None:
            # The previous code returned an unbound local here (UnboundLocalError).
            return None
        return h_index_div.text

    def extract_info(query):
        """Search Scimago for a journal title and collect its first hit.

        Returns a dict with the original title, the URL-encoded query, the
        title of the first search result and its H-index; the last two are
        None when nothing is found or the request fails.
        """
        original_title = query
        query = urllib.parse.quote_plus(query)
        url = f"https://www.scimagojr.com/journalsearch.php?q={query}"

        full_title = None
        h_index = None
        try:
            soup = parse_page(url)
            results_div = soup.find('div', {'class': 'search_results'})
            # The results container may be absent from the page.
            search_results = results_div.find_all('a') if results_div is not None else []
            if search_results:
                first_result = search_results[0]
                title_span = first_result.find('span')
                if title_span is not None:
                    full_title = title_span.text
                url_journal = 'https://www.scimagojr.com/' + first_result['href']
                h_index = get_h_index(url_journal)
        except requests.exceptions.RequestException:
            # Network failure for this journal: record it as "not found" so one
            # bad request does not abort the whole pool.
            pass

        info = {
            'original_title': original_title,
            'query': query,
            'full_title': full_title,
            'h_index': h_index
        }
        return info

    with Pool(processes=12) as pool, tqdm_notebook(total=len(journals)) as pbar:
        scimago_data = []
        for info in pool.imap_unordered(extract_info, journals):
            scimago_data.append(info)
            pbar.update()
            
    df_scimago = pd.DataFrame(scimago_data)
    joblib.dump(df_scimago, os.path.join(export_path, 'scimago.joblib'))

else:
    df_scimago = joblib.load(os.path.join(export_path, 'scimago.joblib'))
    

mapping_title_hindex = dict(zip(df_scimago['original_title'], df_scimago['h_index']))
# Journals absent from the mapping get a missing value; the previous fallback
# copied the journal name into the h_index column.
df_covid['h_index'] = df_covid.journal.map(mapping_title_hindex)
# Keep the first ';'-separated source; non-string values (NaN) are left as is.
df_covid['source_x'] = df_covid.source_x.map(
    lambda s: s.split(';')[0] if isinstance(s, str) else s
)

Add a peer-reviewed tag (a paper is tagged as peer-reviewed when it has a journal)

In [None]:
# A paper is tagged as peer-reviewed when its journal is not missing.
# notna() tests for missing values directly; the previous type(j) == float
# check also treated None and numpy float NaN as "has a journal".
df_covid['peer_reviewed'] = df_covid['journal'].notna()

Checkpoint

In [None]:
# Checkpoint: write the table enriched with h_index and peer_reviewed.
checkpoint_file = os.path.join(export_path, 'metadata.csv')
df_covid.to_csv(checkpoint_file, index=False)

### Generate embeddings using covid-bert

extract an excerpt from each paper

In [None]:
def get_excerpt(row, max_chars=500):
    """Build the short text that represents a paper for embedding.

    The excerpt is the title followed by up to ``max_chars`` characters of
    content: the cleaned abstract first, topped up with the cleaned body text
    when the abstract is shorter than the budget.

    Args:
        row: mapping with the keys ``title``, ``abstract`` and
            ``preproc_body_text``.
        max_chars: character budget for the content after the title.

    Returns:
        The excerpt string.
    """
    title = row['title']
    abstract = row['abstract']
    body = row['preproc_body_text']

    # FileReader stores the placeholder "No abstract available" when a paper
    # has no abstract section; treat it like an empty abstract so the
    # placeholder text does not end up in the excerpt.
    if abstract in ('', 'No abstract available'):
        return title + ' . ' + body[:max_chars]

    len_abstract = len(abstract)
    if len_abstract > max_chars:
        return title + ' . ' + preprocessing(abstract[:max_chars])

    # Short abstract: spend the remaining budget on the body text.
    return (title + ' . '
            + preprocessing(abstract) + ' . '
            + body[:max_chars - len_abstract])

# Compute one excerpt per paper, then checkpoint the table.
excerpts = df_covid.progress_apply(get_excerpt, axis=1)
df_covid['excerpt'] = excerpts
checkpoint_file = os.path.join(export_path, 'metadata.csv')
df_covid.to_csv(checkpoint_file, index=False)

In [None]:
# Sentence-embedding model loaded from a local folder.
covidbert_path = "./src/models/covidbert/"
model = SentenceTransformer(covidbert_path)

embed excerpts using CovidBert

In [None]:
# Encode every excerpt and store the resulting array next to the other exports.
excerpt_texts = df_covid.excerpt.tolist()
excerpt_embeddings = np.array(
    model.encode(excerpt_texts, show_progress_bar=True, batch_size=32)
)
excerpt_embeddings_file = os.path.join(export_path, 'embeddings_excerpts.npy')
np.save(excerpt_embeddings_file, excerpt_embeddings)

embed titles using CovidBert

In [None]:
# Encode every title and store the resulting array next to the other exports.
title_texts = df_covid.title.tolist()
titles_embeddings = np.array(
    model.encode(title_texts, show_progress_bar=True, batch_size=32)
)
titles_embeddings_file = os.path.join(export_path, 'embeddings_titles.npy')
np.save(titles_embeddings_file, titles_embeddings)