# Overview

Perform NLP on COVID-19 research papers to extract useful information.

1. Exploratory Data Analysis (EDA)
    - Find most common words and bigrams in title
    - Topic modelling using Latent Dirichlet Allocation (LDA) and gensim, visualize with pyLDAvis
    
2. Find similar papers
    - Get embeddings using Universal Sentence Encoder (USE) and find similar titles using cosine similarity
    
3. Find papers matching query
    - Using cosine similarity & similarity matrix of embeddings

4. Keyword extraction
    - Extract keywords from abstracts using Rake

5. Knowledge graphs
    - Entity detection, dependency parsing, and knowledge graphs from paper abstracts

In [None]:
!pip install rake-nltk

### Imports

In [None]:
# Core
import pandas as pd
import numpy as np
import os
import gc
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('omw-1.4')
import spacy
from spacy.matcher import Matcher 
import tensorflow_hub as hub
from rake_nltk import Rake
import gensim
import pyLDAvis, pyLDAvis.gensim
import networkx as nx

## Exploratory Data Analysis

In [None]:
# Load the CORD-19 metadata CSV into a DataFrame and preview its first rows
df_cord=pd.read_csv("../input/CORD-19-research-challenge/metadata.csv")
df_cord.head()

In [None]:
# Print column names, non-null counts and dtypes of the metadata table
df_cord.info()

In [None]:
# Create corpus from tokenizing paper titles
def get_titles_corpus(df, max_titles=5000):
    """Tokenize paper titles into lists of lemmatized words.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'title' column; rows with a missing title are dropped.
    max_titles : int or None, default 5000
        Maximum number of non-null titles to process. ``None`` processes all.

    Returns
    -------
    list of list of str
        One token list per processed title, with English stopwords removed,
        tokens of one or two characters dropped and the rest lemmatized.
    """
    lem = WordNetLemmatizer()
    stop = set(stopwords.words('english'))
    corpus = []
    for title in df['title'].dropna()[:max_titles]:
        # The NLTK stopword list is lowercase, so compare case-insensitively;
        # otherwise capitalized title words such as "The" slip through.
        words = [w for w in word_tokenize(title) if w.lower() not in stop]
        # Lemmatize, skipping very short tokens (punctuation, stray letters)
        corpus.append([lem.lemmatize(w) for w in words if len(w) > 2])
    return corpus

# Tokenized, lemmatized titles; reused below to build the gensim dictionary
corpus = get_titles_corpus(df_cord)

In [None]:
# Get most common ngrams from corpus
def get_top_ngram(corpus, n=1, top_k=10):
    """Return the most frequent n-grams of a collection of raw texts.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents (e.g. a Series of titles); tokenization and English
        stopword removal are done by ``CountVectorizer``.
    n : int, default 1
        Size of the n-grams to count (1 = single words, 2 = bigrams, ...).
        The previous default of ``None`` produced an invalid
        ``ngram_range=(None, None)``.
    top_k : int, default 10
        Number of n-grams to return.

    Returns
    -------
    list of (str, int)
        ``(ngram, total_count)`` pairs sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(n, n), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Column sums give the total occurrences of each n-gram over all documents
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, int(sum_words[0, idx]))
                  for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_k]

### Most common words in title

In [None]:
# Ten most frequent single words across all non-null titles
top_n_words=get_top_ngram(df_cord['title'].dropna(),1)
# x holds the words, y their counts
x,y=map(list,zip(*top_n_words))
plt.figure(figsize=(8, 6))
# Pass x/y by keyword: recent seaborn releases reject positional data vectors
sns.barplot(x=x, y=y)

### Most common bigrams in title

In [None]:
# Ten most frequent bigrams across all non-null titles
top_n_bigrams=get_top_ngram(df_cord['title'].dropna(),2)
# x holds the bigrams, y their counts
x,y=map(list,zip(*top_n_bigrams))
plt.figure(figsize=(10, 6))
# Pass x/y by keyword: recent seaborn releases reject positional data vectors
sns.barplot(x=x, y=y)

In [None]:
# Clear memory
# Drop the plotting intermediates, then ask the garbage collector to reclaim them
del x, y, top_n_words, top_n_bigrams
gc.collect()

### Topic modeling from titles

In [None]:
# Create bag-of-words (BoW) vector from corpus
# Build the token <-> integer id mapping from the tokenized titles
dic=gensim.corpora.Dictionary(corpus)
# Convert each tokenized title into its bag-of-words representation
bow_corpus = [dic.doc2bow(doc) for doc in corpus]

In [None]:
# Build LDA model
# 4 topics, 10 passes over the BoW corpus, 2 worker processes
lda_model =  gensim.models.LdaMulticore(bow_corpus, num_topics = 4, id2word = dic, passes = 10, workers = 2)

In [None]:
# Display the learned topics
lda_model.show_topics()

In [None]:
# Render the interactive topic visualisation inline in the notebook.
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dic)
vis

In [None]:
# Clear memory
# Only delete names that still exist: top_n_bigrams was already deleted after
# the bigram plot and top_tri_grams is never defined, so listing them here
# raised NameError and left the large objects alive.
del corpus,lda_model,bow_corpus
gc.collect()

## Find Similar Papers

In [None]:
# Helper function to get similar sentences from similarity matrix
def get_top_similar(sentence, sentence_list, similarity_matrix, topN):
    """Return the ``topN`` entries of ``sentence_list`` most similar to ``sentence``.

    Parameters
    ----------
    sentence : str
        Query; must be an element of ``sentence_list``.
    sentence_list : list of str
        Sentences in the same order as the rows/columns of ``similarity_matrix``.
    similarity_matrix : numpy.ndarray
        2-D pairwise similarity matrix.
    topN : int
        Number of results to return; values <= 0 yield an empty list.

    Returns
    -------
    list of (int, str)
        ``(position, sentence)`` pairs ordered from most to least similar.
        The query itself is not excluded from the results.

    Raises
    ------
    ValueError
        If ``sentence`` is not present in ``sentence_list``.
    """
    if topN <= 0:
        # A slice of [-0:] would select the whole row rather than nothing
        return []
    index = sentence_list.index(sentence)
    similarity_row = np.array(similarity_matrix[index, :])
    # argsort is ascending: keep the last topN positions, then reverse them
    indices = similarity_row.argsort()[-topN:][::-1]
    return [(i, sentence_list[i]) for i in indices]

### Using Universal Sentence Encoder and cosine similarity

In [None]:
# Import the Universal Sentence Encoder's TF Hub module
# The loaded object is called below as embed(texts)['outputs'] to get embeddings
embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder-large/4')

In [None]:
# Create embeddings and similarity matrix
titles=df_cord['title'].fillna("Unknown")
# Only the first 100 titles are embedded, so restrict sentence_list to the same
# 100 titles: positions in sentence_list must line up with the rows of the
# similarity matrix, otherwise a query found past row 99 raises IndexError.
sentence_list=titles[:100].values.tolist()
embed_vectors=embed(titles[:100].values)['outputs'].numpy()
sentence=titles.iloc[5]
print("Find similar research papers for :")
print(sentence)

similarity_matrix=cosine_similarity(embed_vectors)
similar=get_top_similar(sentence,sentence_list,similarity_matrix,6)

In [None]:
# Each item is an (index, title) tuple returned by get_top_similar.
# Note: the loop variable rebinds the global `sentence` used for the query above.
for sentence in similar:
    print(sentence)
    print("\n")

In [None]:
# Free the title embeddings and similarity matrix before the next section
del embed_vectors,sentence_list,similarity_matrix
gc.collect()

## Finding related papers from query

### What is known about transmission, incubation, and environmental stability?

- Seasonality of transmission.
- Persistence of virus on surfaces of different materials (e.g., copper, stainless steel, plastic).
- Natural history of the virus and shedding of it from an infected person
- Implementation of diagnostics and products to improve clinical processes
- Disease models, including animal models for infection, disease and transmission
- Tools and studies to monitor phenotypic change and potential adaptation of the virus
- Immune response and immunity
- Role of the environment in transmission

In [None]:
# Load data
# Only the first 5000 rows of clean_comm_use.csv are read (nrows=5000).
# Alternative inputs, currently disabled:
# clean_comm=pd.read_csv("../input/cord-19-eda-parse-json-and-generate-clean-csv/clean_comm_use.csv",nrows=5000)
# biox = pd.read_csv("../input/cord-19-eda-parse-json-and-generate-clean-csv/biorxiv_clean.csv")
df_papers = pd.read_csv("../input/cord-19-eda-parse-json-and-generate-clean-csv/clean_comm_use.csv",nrows=5000)
df_papers.head()

In [None]:
# Print column names, non-null counts and dtypes of the papers table
df_papers.info()

In [None]:
# Define tasks
# Free-text research questions; they are embedded alongside the paper titles
# below so that papers can be ranked by similarity to each question.
tasks=["What is known about transmission, incubation, and environmental stability",
      "What do we know about COVID-19 risk factors",
      "What do we know about virus genetics, origin, and evolution",
      "What do we know about vaccines and therapeutics",
      "What do we know about non-pharmaceutical interventions",
      "What do we know about diagnostics and surveillance",
      "What has been published about ethical and social science considerations",
      "Role of the environment in transmission",
      "Range of incubation periods for the disease in humans",
      "Prevalence of asymptomatic shedding and transmission",
      "Seasonality of transmission",
      "Persistence of virus on surfaces of different materials (e,g., copper, stainless steel, plastic)",
      "Susceptibility of populations",
      "Public health mitigation measures that could be effective for control",
      "Transmission dynamics of the virus",
      "Evidence that livestock could be infected",
      "Socioeconomic and behavioral risk factors for this spill-over",
      "Sustainable risk reduction strategies",
      "Resources to support skilled nursing facilities and long term care facilities",
      "Mobilization of surge medical staff to address shortages in overwhelmed communities"]

In [None]:
# Put the task strings in a 'title' column so they can be appended to the
# papers table; 'source' is set to 'task' on every row to mark them as queries.
df_tasks=pd.DataFrame({'title':tasks,'source':'task'})
df_tasks.head()

In [None]:
# Append the task rows below the papers. ignore_index gives the combined frame a
# unique 0..n-1 index instead of repeating the labels 0..19 coming from df_tasks.
df_papers=pd.concat([df_papers,df_tasks],ignore_index=True)
# Columns absent on one side (e.g. 'abstract' for the task rows) become "Unknown"
df_papers.fillna("Unknown",inplace=True)

### Find related research papers using USE 

In [None]:
# Embed every title (papers and tasks) and compute all pairwise cosine
# similarities; row/column i of the matrix corresponds to sentence_list[i].
sentence_list=df_papers.title.values.tolist()
embed_vectors=embed(sentence_list)['outputs'].numpy()
similarity_matrix=cosine_similarity(embed_vectors)

In [None]:
# Rank all titles against one task string and print the 10 closest titles.
# The query is itself a row of df_papers, so it can appear among the results.
sentence= "Role of the environment in transmission"
similar=get_top_similar(sentence,sentence_list,similarity_matrix,10)
for sent in similar:
    print(sent[1])

- Clean and store abstracts from related articles.

In [None]:
# Split the (position, title) pairs returned by get_top_similar
ind, title = (list(column) for column in zip(*similar))
# Look up the title and abstract of every match by row position
titles = [df_papers.iloc[i]['title'] for i in ind]
texts = [df_papers.iloc[i]['abstract'] for i in ind]

In [None]:
import re
def clean(txt):
    """Strip noise from an abstract and return the cleaned string.

    Applied in order, each match being replaced by the empty string:
    1. newline characters,
    2. innermost parenthesised spans (no nested parentheses inside),
    3. an ``http:``/``https:`` token followed by one whitespace and ``doi``.
    """
    patterns = (
        r'\n',
        r'\([^()]*\)',
        r'https?:\S+\sdoi',
    )
    for pattern in patterns:
        txt = re.sub(pattern, '', txt)
    return txt

In [None]:
# Clean every abstract, then join them into one space-separated string.
# Despite its name, text_list is a single str, not a list.
texts=list(map(clean,texts))
text_list=' '.join(texts)
#text_list=word_tokenize(text_list)

## Keyword Extraction

Keyword extraction (also known as keyword detection or keyword analysis) is a text analysis technique that automatically extracts the most relevant words and expressions from a text. It helps summarize the content of a text and recognize the main topics being discussed.

In [None]:
# Run RAKE over the joined abstracts and display the ranked key phrases
r = Rake()
r.extract_keywords_from_text(text_list)
r.get_ranked_phrases()

## Knowledge Graph

In [None]:
# spaCy pipeline shared by get_entities, get_relation and prepare_df below
nlp=spacy.load('en_core_web_sm')

In [None]:
def get_entities(sent):
    """Extract a subject/object entity pair from one sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and its
    tokens are scanned left to right. Compound words and modifiers seen on the
    way are remembered and prepended to the next subject/object token.

    Parameters
    ----------
    sent : str
        A single sentence.

    Returns
    -------
    list of str
        ``[subject, object]``; either entry is ``""`` when no token with a
        matching dependency label was found. If several tokens qualify, the
        last one wins.
    """
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of previous (non-punctuation) token
    prv_tok_text = ""   # text of previous (non-punctuation) token

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        # Punctuation is skipped entirely, as the original comment intended;
        # previously only the compound check was guarded, so punctuation still
        # overwrote prv_tok_* and broke up runs of compound tokens.
        if tok.dep_ == "punct":
            continue

        # Compound word: keep it as prefix, merged with a preceding compound
        if tok.dep_ == "compound":
            prefix = tok.text
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # Modifier (any dependency label ending in "mod")
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            # if the previous word was a 'compound', prepend it
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Subject. `"subj" in dep` replaces `dep.find("subj") == True`, which
        # was only true when the substring happened to start at index 1.
        if "subj" in tok.dep_:
            # Join only the non-empty parts to avoid doubled spaces
            ent1 = " ".join(p for p in (modifier, prefix, tok.text) if p)
            prefix = ""
            modifier = ""

        # Object (same substring test as for the subject)
        if "obj" in tok.dep_:
            ent2 = " ".join(p for p in (modifier, prefix, tok.text) if p)

        # Remember this token for the compound checks on the next one
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

In [None]:
def get_relation(sent):
    """Return the relation (predicate) phrase of a sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    matched against a pattern made of the ROOT token optionally followed by a
    preposition, an agent and an adjective. The text of the last match
    returned by the matcher is used.

    Parameters
    ----------
    sent : str
        A single sentence.

    Returns
    -------
    str
        Text of the matched span, or ``""`` when the matcher finds nothing
        (previously this case raised IndexError).
    """
    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # ROOT, then optional preposition / agent / adjective
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]

    matcher.add("matching_1", [pattern], on_match=None)

    matches = matcher(doc)
    if not matches:
        return ""

    # Each match is (match_id, start, end); take the last one reported
    _, start, end = matches[-1]
    return doc[start:end].text

#### Question : What is the Role of the environment in transmission?

- We will prepare a dataframe that contains the subject, relation, and object extracted from these abstracts to plot the knowledge graph.


In [None]:
def prepare_df(text_list):
    """Build a subject/relation/object table from a block of text.

    Parameters
    ----------
    text_list : str
        Text to analyse. Despite the name this is passed straight to the spaCy
        pipeline ``nlp``, i.e. it is one string, which is then split into
        sentences.

    Returns
    -------
    pandas.DataFrame
        Columns 'subject', 'relation' and 'object', one row per sentence for
        which all three strings are longer than two characters. The columns
        are present even when no sentence qualifies.
    """
    doc = nlp(text_list)
    rows = []
    for sent in doc.sents:
        sentence = str(sent)
        sub, obj = get_entities(sentence)
        relation = get_relation(sentence)

        # Keep only triples where every part is longer than two characters
        if len(relation) > 2 and len(sub) > 2 and len(obj) > 2:
            rows.append({'subject': sub, 'relation': relation, 'object': obj})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(rows, columns=['subject', 'relation', 'object'])

In [None]:
# Build the (subject, relation, object) table from the joined abstracts.
# NOTE(review): text_list is a single string, so [24:] drops its first 24
# characters, not 24 documents — presumably to skip a leading placeholder; confirm.
df = prepare_df(text_list[24:])
df.head()

In [None]:
def draw_kg(pairs,c1='red',c2='blue',c3='orange'):
    """Plot subject -> object pairs as a directed knowledge graph.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Needs 'subject', 'object' and 'relation' columns; every row becomes an
        edge from subject to object, labelled with the relation text.
    c1 : edge colour.
    c2 : node outline colour.
    c3 : node fill colour.

    Node size is proportional to node degree. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    graph = nx.from_pandas_edgelist(
        pairs, 'subject', 'object', create_using=nx.MultiDiGraph())

    degrees = nx.degree(graph)
    positions = nx.spring_layout(graph, k=0.15, iterations=20)

    plt.figure(num=None, figsize=(50, 40), dpi=80)

    # Scale each node by its degree so well-connected entities stand out
    sizes = [int(degree) * 500 for _, degree in degrees]
    nx.draw_networkx(
        graph,
        pos=positions,
        node_size=sizes,
        arrowsize=20,
        linewidths=1.5,
        edge_color=c1,
        edgecolors=c2,
        node_color=c3,
    )

    # One label per (subject, object) pair; later rows overwrite earlier ones
    relations = pairs['relation'].tolist()
    edge_text = {
        (subj, obj): rel
        for subj, obj, rel in zip(pairs.subject, pairs.object, relations)
    }
    nx.draw_networkx_edge_labels(
        graph, pos=positions, edge_labels=edge_text, font_color='red')

    plt.axis('off')
    plt.show()

In [None]:
# Plot the knowledge graph for the triples extracted above
draw_kg(df)

#### Question: What is known about transmission, incubation, and environmental stability?

In [None]:
# Rank all titles against a second task string and keep the 15 closest matches
sentence= "What is known about transmission, incubation, and environmental stability"
similar=get_top_similar(sentence,sentence_list,similarity_matrix,15)

In [None]:
# Split the (position, title) pairs returned by get_top_similar
ind,title=list(map(list,zip(*similar)))
titles=[]
texts=[]
for i in ind:
    # Positions refer to rows of df_papers, the frame sentence_list was built
    # from; the previously used name `all_articles` is not defined anywhere.
    titles.append(df_papers.iloc[i]['title'])
    texts.append(df_papers.iloc[i]['abstract'])

In [None]:
# Clean every abstract, then join them into one space-separated string
texts=list(map(clean,texts))
text_list=' '.join(texts)

In [None]:
# Extract (subject, relation, object) triples from the joined text and plot them
df = prepare_df(text_list)
draw_kg(df)