# Importing the metadata.csv file

In [1]:
import numpy as np
import pandas as pd 

# Read the metadata table from the working directory and preview its first rows.
metadata_path = 'metadata.csv'
metadata_df = pd.read_csv(metadata_path)
metadata_df.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998.0,unk,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,,,True,True,custom_license,http://europepmc.org/articles/pmc125340?pdf=re...
1,ymceytj3,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",10.1093/emboj/21.9.2076,PMC125375,11980704.0,unk,CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,,,True,True,custom_license,http://europepmc.org/articles/pmc125375?pdf=re...
2,wzj2glte,00b1d99e70f779eb4ede50059db469c65e8c1469,PMC,Synthesis of a novel hepatitis C virus protein...,10.1093/emboj/20.14.3840,PMC125543,11447125.0,no-cc,Hepatitis C virus (HCV) is an important human ...,2001-07-16,"Xu, Zhenming; Choi, Jinah; Yen, T.S.Benedict; ...",EMBO J,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,2sfqsfm1,cf584e00f637cbd8f1bb35f3f09f5ed07b71aeb0,PMC,Structure of coronavirus main proteinase revea...,10.1093/emboj/cdf327,PMC126080,12093723.0,unk,The key enzyme in coronavirus polyprotein proc...,2002-07-01,"Anand, Kanchan; Palm, Gottfried J.; Mesters, J...",The EMBO Journal,,,True,True,custom_license,http://europepmc.org/articles/pmc126080?pdf=re...
4,i0zym7iq,dde02f11923815e6a16a31dd6298c46b109c5dfa,PMC,Discontinuous and non-discontinuous subgenomic...,10.1093/emboj/cdf635,PMC136939,12456663.0,unk,"Arteri-, corona-, toro- and roniviruses are ev...",2002-12-01,"van Vliet, A.L.W.; Smits, S.L.; Rottier, P.J.M...",The EMBO Journal,,,True,True,custom_license,http://europepmc.org/articles/pmc136939?pdf=re...


# Finding the number of records with missing abstract or doi

In [None]:
# Number of null entries in every column of the metadata table.
missing_values_count_per_col = metadata_df.isnull().sum()

# Report the two columns that the later clean-up step filters on.
for column in ('abstract', 'doi'):
    print('Entries without {} BEFORE: {}'.format(column, missing_values_count_per_col[column]))

# Visualizing the percentage of missing values in each column in the metadata.csv file

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Share of missing values per column, scaled to 0-100 so the numbers really
# are percentages, as the column / axis label 'Percentage of Missing Values' says.
missing_values_count_per_col = metadata_df.isnull().mean() * 100
missing_values_count_per_col = missing_values_count_per_col.to_frame(name='Percentage of Missing Values')
missing_values_count_per_col.index.name = 'Column Name'
# seaborn is given the column names as a regular column to use for the x axis.
missing_values_count_per_col['Column Name'] = missing_values_count_per_col.index

sns.set(style="whitegrid", color_codes=True)
ax = sns.barplot(x='Column Name', y='Percentage of Missing Values', data=missing_values_count_per_col)
ax.set_title('Missing values per column in metadata.csv')
# Column names are long; rotate them so the tick labels do not overlap.
plt.xticks(rotation=90)
plt.show()

# Dropping records with missing abstract or doi

In [None]:
# Discard every record that lacks an abstract or a doi (or both).
required_columns = ['abstract', 'doi']
metadata_df = metadata_df.dropna(subset=required_columns, how='any')

# Recount the nulls to confirm the required columns are now complete.
missing_values_count_per_col = metadata_df.isnull().sum()
for column in required_columns:
    print('Entries without {} AFTER: {}'.format(column, missing_values_count_per_col[column]))

In [None]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

nlp = English()


def remove_stopwords(text):
    """Tokenise ``text`` with ``nlp`` and drop the stop-word tokens.

    Returns the ``str()`` of the list of remaining token texts, i.e. the same
    representation the abstracts were stored in before this rewrite.
    """
    return str([token.text for token in nlp(text) if not token.is_stop])


# Refine the abstracts by removing stopwords.
# The previous loop read row i by position (iloc) but wrote it back by label
# (.at[i, ...]). Once rows have been dropped from the frame the labels are no
# longer 0..n-1, so results landed on the wrong rows or created new ones.
# Series.map transforms each value in place on its own index label instead.
metadata_df = metadata_df.assign(abstract=metadata_df['abstract'].map(remove_stopwords))
print(metadata_df.iloc[0]['abstract'])

In [None]:
import nltk
from nltk.corpus import wordnet

keywords = [
    'treatment', 'incubation', 'contagious', 'transmission', 'persistence',
    'infection', 'diagnosis', 'symptoms', 'asymptomatic', 'immune',
]

# For each keyword, gather the lemma names of all of its WordNet synsets.
# dict.fromkeys removes repeated names while keeping first-seen order.
total_synonyms = [
    list(dict.fromkeys(
        lemma.name()
        for synset in wordnet.synsets(keyword)
        for lemma in synset.lemmas()
    ))
    for keyword in keywords
]

# Dictionary with the keywords as keys and, for each keyword, its list of
# synonyms as the value.
synonyms_dict = dict(zip(keywords, total_synonyms))

print(synonyms_dict.keys())

In [None]:
# Work on an independent copy restricted to the columns listed below, so that
# later changes to covid19_df do not write through to metadata_df.
covid19_columns = ['title', 'doi', 'abstract', 'publish_time', 'authors', 'journal', 'url']
covid19_df = metadata_df.loc[:, covid19_columns].copy()
covid19_df.head()