# Data prep

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
# Load the metadata CSV. The three identifier columns are forced to str so
# pandas does not infer a numeric dtype for them.
root_path = '2020-03-13'
metadata_path = f'{root_path}/all_sources_metadata_2020-03-13.csv'
id_column_types = dict.fromkeys(['pubmed_id', 'Microsoft Academic Paper ID', 'doi'], str)
meta_df = pd.read_csv(metadata_path, dtype=id_column_types)
meta_df.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,CZI,Angiotensin-converting enzyme 2 (ACE2) as a SA...,10.1007/s00134-020-05985-9,,32125455.0,cc-by-nc,,2020,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; ...",Intensive Care Med,2002765492,#3252,True
1,53eccda7977a31e3d0f565c884da036b1e85438e,CZI,Comparative genetic analysis of the novel coro...,10.1038/s41421-020-0147-1,,,cc-by,,2020,"Cao, Yanan; Li, Lin; Feng, Zhimin; Wan, Shengq...",Cell Discovery,3003430844,#1861,True
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3006065484,#1043,True
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,177663115,#1999,True
4,92c2c9839304b4f2bc1276d41b1aa885d8b364fd,CZI,Imaging changes in severe COVID-19 pneumonia,10.1007/s00134-020-05976-w,,32125453.0,cc-by-nc,,2020,"Zhang, Wei",Intensive Care Med,3006643024,#3242,False


In [3]:
meta_df["doi_modified"] = meta_df["doi"].str.replace('/','%2F')
meta_df["URL"] = "https://www.google.com/search?q=doi+"+ meta_df["doi_modified"]

In [4]:
all_json = glob.glob(root_path+"/**/*.json", recursive=True)
len(all_json)

13202

In [5]:
class FileReader:
    """Load one paper JSON file and flatten its text sections into strings.

    Attributes:
        paper_id: value stored under the file's 'paper_id' key.
        abstract: the 'text' field of every entry under 'abstract', joined
            with newlines; '' when the key is absent or the list is empty.
        body_text: the 'text' field of every entry under 'body_text', joined
            with newlines; '' when the key is absent or the list is empty.

    Raises:
        KeyError: if the file has no 'paper_id', or an entry has no 'text'.
    """

    def __init__(self, file_path):
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default text encoding.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)
        # The file handle is no longer needed once the JSON is parsed.
        self.paper_id = content['paper_id']
        # .get(..., []) lets files without an abstract/body section load as ''.
        self.abstract = '\n'.join(
            entry['text'] for entry in content.get('abstract', []))
        self.body_text = '\n'.join(
            entry['text'] for entry in content.get('body_text', []))

    def __repr__(self):
        # Preview only: first 200 characters of each section.
        return (f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...')
    
first_row = FileReader(all_json[0])
print(first_row)

0015023cc06b5362d332b3baf348d11567ca2fbb: word count: 194 22 Text word count: 5168 23 24 25 author/funder. All rights reserved. No reuse allowed without permission. Abstract 27 The positive stranded RNA genomes of picornaviruses comprise a si... VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structura...


In [6]:
def get_breaks(content, length):
    """Re-join the space-separated words of `content`, inserting "<br>" breaks.

    A running count of word lengths (separators are not counted) is kept.
    When adding a word pushes the count above `length`, that word is prefixed
    with "<br>" and the count restarts at zero; otherwise the word is prefixed
    with a single space. Every word receives a prefix, so the returned string
    always begins with " " or "<br>".
    """
    pieces = []
    running = 0
    for word in content.split(' '):
        running += len(word)
        if running > length:
            pieces.append("<br>" + word)
            running = 0
        else:
            pieces.append(" " + word)
    return "".join(pieces)

In [7]:
from ast import literal_eval

# One list per output column; every loop iteration appends exactly one value
# to each list so the columns stay aligned.
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}

# Print progress about every 10% of the files. max(1, ...) keeps the modulo
# below from dividing by zero when there are fewer than 10 files.
progress_step = max(1, len(all_json) // 10)

for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)

    # also create a column for the summary of abstract to be used in a plot
    abstract_words = content.abstract.split(' ')
    if len(content.abstract) == 0:
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    elif len(abstract_words) > 100:
        # abstract is too long for the plot: keep the first 100 words and append "..."
        summary = get_breaks(' '.join(abstract_words[:100]), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)

    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    if meta_data.empty:
        # No metadata row matches this paper. Record missing values instead of
        # failing with IndexError on `.values[0]`, keeping all columns the same length.
        dict_['authors'].append(np.nan)
        dict_['title'].append(np.nan)
        dict_['journal'].append(np.nan)
        continue

    raw_authors = meta_data['authors'].values[0]
    try:
        # Succeeds only when the authors field is a Python literal (e.g. a list).
        authors = literal_eval(raw_authors)
        if len(authors) > 2:
            # more than 2 authors may be a problem when plotting, so take the first 2 and append "..."
            dict_['authors'].append(". ".join(authors[:2]) + "...")
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    except Exception:
        # Not a parseable literal (plain author string or a null value): keep the raw value.
        dict_['authors'].append(raw_authors)

    # add the title information, add breaks when needed
    raw_title = meta_data['title'].values[0]
    try:
        dict_['title'].append(get_breaks(raw_title, 40))
    except Exception:
        # title was not provided (non-string value): keep it as is
        dict_['title'].append(raw_title)

    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])

df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
df_covid.head()

Processing index: 0 of 13202
Processing index: 1320 of 13202
Processing index: 2640 of 13202
Processing index: 3960 of 13202
Processing index: 5280 of 13202
Processing index: 6600 of 13202
Processing index: 7920 of 13202
Processing index: 9240 of 13202
Processing index: 10560 of 13202
Processing index: 11880 of 13202
Processing index: 13200 of 13202


Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary
0,0015023cc06b5362d332b3baf348d11567ca2fbb,word count: 194 22 Text word count: 5168 23 24...,"VP3, and VP0 (which is further processed to VP...","Ward, J. C. J.; Lasecka-Dykes, L.; Neil, C.; A...",The RNA pseudoknots in foot-and-mouth disease...,,word count: 194 22 Text word count: 5168 23 2...
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,,The 2019-nCoV epidemic has spread across China...,,,,Not provided.
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,Infectious bronchitis (IB) causes significant ...,"Infectious bronchitis (IB), which is caused by...","Butt, S. L.; Erwood, E. C.; Zhang, J.; Sellers...","Real-time, MinION-based, amplicon<br>sequenci...",,Infectious bronchitis (IB) causes<br>signific...
3,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Background: A novel coronavirus (2019-nCoV) em...,"In December 2019, a cluster of patients with p...",,,,Background: A novel coronavirus (2019-nCoV)<b...
4,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,Faced with the current large-scale public heal...,The sudden outbreak of the new coronavirus (SA...,,,,Faced with the current large-scale public<br>...


In [8]:
#df_covid=pd.merge(df_covid, meta_df, left_on='paper_id', right_on='sha')
df_covid=df_covid.dropna()

In [9]:
df_covid.shape

(12248, 7)

# Topic Model

In [19]:
import NewsTrends
import importlib
importlib.reload(NewsTrends)
import locale
import matplotlib.pyplot as plt
plt.style.use('ggplot')

%matplotlib inline

In [11]:
t = NewsTrends.topicModel(df_covid,
                          key_idx=list(df_covid.columns).index("paper_id"),
                          text_idx=list(df_covid.columns).index("body_text"),
                          lang="en_core_web_sm",random_state = 1,
                          bigram = False)

In [12]:
corpus = t.tokenize_docs(rmv_tokens=["'s","=","datum", "r","t","ifn","+","c","ml"])

12247 done ]


In [14]:
corpus.head()

Unnamed: 0,paper_id,tokenized_text
0,000b7d1517ceebb34e1e3e817695b6de03e2fa78,"[figure, s1, phylogeny, sequence, belong, umrv..."
1,00142f93c18b07350be89e96372d240372437ed9,"[introduction, human, constantly, expose, myri..."
2,0022796bb2112abd2e6423ba2d57751db06049fb,"[pathogens, vector, transport, rapidly, world,..."
3,00326efcca0852dc6e39dc6b7786267e1bc4f194,"[addition, preventative, care, nutritional, su..."
4,00352a58c8766861effed18a4b079d1683fec2ec,"[ubiquitination, widely, posttranslational, mo..."


In [15]:
corpus = t.get_bow(below = 0.05,above = 0.9)

In [None]:
#k,results = t.LDA_tune_k(max_k=20,iterations=800)

In [None]:
# NOTE(review): in the notebook as shown, `k` and `results` are only assigned by
# the LDA_tune_k call that is commented out in the previous cell, so this line
# raises NameError on a fresh kernel unless that call is re-enabled.
plt.plot(k,results)

In [16]:
#Train topic model
LDA = t.fit_LDA(6,iterations=2000)

INFO:lda:n_documents: 12248
INFO:lda:vocab_size: 2872
INFO:lda:n_words: 21791024
INFO:lda:n_topics: 6
INFO:lda:n_iter: 2000
INFO:lda:<0> log likelihood: -197519831
INFO:lda:<10> log likelihood: -179657158
INFO:lda:<20> log likelihood: -166363977
INFO:lda:<30> log likelihood: -165271640
INFO:lda:<40> log likelihood: -164850304
INFO:lda:<50> log likelihood: -164641975
INFO:lda:<60> log likelihood: -164540632
INFO:lda:<70> log likelihood: -164492215
INFO:lda:<80> log likelihood: -164458398
INFO:lda:<90> log likelihood: -164443245
INFO:lda:<100> log likelihood: -164437267
INFO:lda:<110> log likelihood: -164416598
INFO:lda:<120> log likelihood: -164390493
INFO:lda:<130> log likelihood: -164376525
INFO:lda:<140> log likelihood: -164363087
INFO:lda:<150> log likelihood: -164332217
INFO:lda:<160> log likelihood: -164324621
INFO:lda:<170> log likelihood: -164318127
INFO:lda:<180> log likelihood: -164304700
INFO:lda:<190> log likelihood: -164292675
INFO:lda:<200> log likelihood: -164275552
INFO:

INFO:lda:<1890> log likelihood: -164094605
INFO:lda:<1900> log likelihood: -164093789
INFO:lda:<1910> log likelihood: -164103929
INFO:lda:<1920> log likelihood: -164098450
INFO:lda:<1930> log likelihood: -164094877
INFO:lda:<1940> log likelihood: -164101093
INFO:lda:<1950> log likelihood: -164093637
INFO:lda:<1960> log likelihood: -164100761
INFO:lda:<1970> log likelihood: -164100392
INFO:lda:<1980> log likelihood: -164096236
INFO:lda:<1990> log likelihood: -164086900
INFO:lda:<1999> log likelihood: -164090953


In [18]:
#Inspect topic results
topics = t.inspect_topics(n_top_words=25)

res = ''
for i, topic_words in enumerate(topics):
    res += '* **Topic {}:** {}\n'.format(i, ' '.join(topic_words))
NewsTrends.MD(res)

In [48]:
topics = [["cell, protein, antibody, control, assay"],
          ["cases, model, outbreak, public, risk, transmission"],
          ["patient, infection, group, age, children, test, symptom"],
          ["protein, sequence, gene, rna, structure, genome, bind"],
          ["pcr, test, detect, sars, infection"],
          ["infection, mouse, immune, vaccine"],
          
         ]
len(topics)

6

In [78]:
import networkx as nx

In [127]:
# Correlations below this threshold are set to 0 further down.
similarity_cutoff = 0.992
# Column of t.df1 used to relabel the graph nodes.
node_label="title"
# Columns of t.df1 attached to each node as attributes.
node_attr=[ "topic",u'authors', u'journal',"abstract"]

# Left-join the topic model's per-text topic table onto its input data, keyed on t.key_name.
t.df1 = t.data.merge(t.text_topic,how='left',on = t.key_name,copy=False)

# Pairwise correlation between the rows of t.doc_topic (corr() works on columns,
# hence the transpose) -- presumably one row per document; verify against NewsTrends.
doc_similarity  = t.doc_topic.transpose().corr()
# Zero out every correlation below the cutoff.
doc_similarity[doc_similarity  < similarity_cutoff] = 0
# Zero the diagonal (each row's correlation with itself).
# NOTE(review): this writes through `.values`, which only changes the frame if
# `.values` is a writable view -- confirm with the installed pandas version.
np.fill_diagonal(doc_similarity.values, 0)

# Build the graph from the thresholded matrix, then relabel and annotate its nodes.
mapping = t.df1[node_label]
doc_graph = nx.Graph(doc_similarity.values)
# Relabel nodes through the `mapping` Series (index label -> title).
# Assumes t.df1's index lines up with the matrix positions -- TODO confirm.
doc_graph = nx.relabel_nodes(doc_graph, mapping)

# Attach the node_attr columns to the nodes via a {title: {attribute: value}} dict.
nx.set_node_attributes(doc_graph,t.df1[node_attr+[node_label]].set_index(node_label).T.to_dict())

In [128]:
nx.write_gml(doc_graph,'docs14.gml')

In [120]:
#doctopics = t.get_doc_topic(topics)
#networkgraph = t.generate_network(node_attr=[ "topic",u'authors', u'journal',"abstract"],
 #                 node_label="title", similarity_cutoff = 0.75,
  #               similarity_measure = "correlation")

In [None]:
#nx.write_edgelist(networkgraph,"g.csv")

In [85]:
t.doc_topic.to_csv("doc_topic.csv",index=False)

In [86]:
t.df.to_csv("dataset.csv",index=False)

# COVID-19 research exploration

In [16]:
import pandas as pd
%matplotlib inline
tdf = pd.read_csv("dataset.csv")

By using intelligent text processing and learning directly from the textual context, the following analysis finds meaningful connections and relationships in a large corpus of unstructured data. Below, I invite you to interactively explore the large body of research around COVID-19. This analysis aims to help readers better understand the disease.

# Data summary

In [10]:
analysiscode="COVID"

In [4]:
import pprint
# Headline figures: number of rows and number of distinct values in the "authors" column.
paper_total = tdf.shape[0]
author_total = tdf["authors"].nunique()
print("The cleaned data set contains {} research papers from {} authors. The data is sourced from https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge".format(paper_total, author_total))
print()
print("The top journals in the data set are:")
print()

# Count paper_id entries per journal, largest first, and list the first five.
papers_per_journal = tdf.groupby("journal")["paper_id"].agg("count").sort_values(ascending=False)
for rank, journal_name in enumerate(papers_per_journal[0:5].keys(), 1):
    print('{} {}'.format(rank, journal_name))

The cleaned data set contains 12248 research papers from 11422 authors. The data is sourced from https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge

The top journals in the data set are:

1 PLoS One
2 Emerg Infect Dis
3 Viruses
4 Sci Rep
5 PLoS Pathog


# Document Analysis

*Note: If the following plots are not displayed nicely or missing the legend on the right-hand side, hit refresh and wait until fully loaded.*

In [23]:
import os
from shutil import copyfile

# Copy the network-viewer template files into the output folder.
# NOTE(review): the iframe cell further down loads "graphics1/network/index.html"
# while this copies into "graphics/network/" -- confirm which path is intended.
source = "01_Vorlage/"
dst = "graphics/network/"

for f in ["index.html","css/style.css","js/main.js","js/sigma/sigma.parseJson.js"]:
    target = os.path.join(dst, f)
    # copyfile does not create missing folders, so make sure the destination
    # sub-directory (e.g. css/, js/sigma/) exists first.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    copyfile(os.path.join(source, f), target)


### Research is mainly focused on the 1) epidemiology and 2) molecular biology of the virus ([How to read the network](http://nbviewer.jupyter.org/github/bockjo/Udacity_portfolio/blob/master/networks_how_to.png)).

In [24]:
from IPython.core.display import HTML

HTML('<iframe src="graphics1/network/index.html" width="100%" height="480" seamless>Graph not rendered.</iframe>')

# Plotting code

In [21]:
import plotly.io as pio
import warnings
pio.templates.default = 'simple_white'
warnings.filterwarnings('ignore')

In [22]:
#Plotly
import numpy as np
import plotly
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
from ipywidgets import widgets
from chart_studio.widgets import GraphWidget

init_notebook_mode(connected=True)

# generate a function to handle changes in the widget
# add top_n functionality!!!
def timeline(t,y,date_col,colorby,aggregate="nunique",timeinterval=None):
    """Plot a stacked bar chart of `y` aggregated per week number and `colorby` value.

    Parameters
    ----------
    t : pandas.DataFrame
        Input data. It is copied, so the caller's frame is not modified.
    y : str
        Column to aggregate.
    date_col : str
        Column parsed with pd.to_datetime and reduced to its week number.
    colorby : str
        Column whose distinct values each become one bar trace.
    aggregate : str or callable, default "nunique"
        Passed to groupby(...).agg.
    timeinterval : any, optional
        Accepted for interface compatibility; not used.

    Returns
    -------
    The return value of iplot.
    """
    plot_dat = t.copy()
    dates = pd.to_datetime(plot_dat[date_col])
    try:
        # Series.dt.week no longer exists in pandas 2.x; isocalendar() replaces it.
        weeks = dates.dt.isocalendar().week
        # Match the dtypes .dt.week produced: int64, or float64 when dates are missing.
        weeks = weeks.astype("float64") if weeks.isna().any() else weeks.astype("int64")
    except AttributeError:
        # Older pandas without isocalendar().
        weeks = dates.dt.week
    plot_dat[date_col] = weeks

    dat = plot_dat.groupby([date_col,colorby])[y].agg(aggregate).reset_index()

    dataset = []
    for gr in dat[colorby].unique():
        rows = dat[dat[colorby]==gr]  # filter once per group
        dataset.append(go.Bar(
            x=rows[date_col],
            y=rows[y],
            name=gr
        ))

    layout = go.Layout(
        autosize=True,
        barmode='stack',
        showlegend = True,
        legend=dict(orientation="h",x=0.1,y=1.5),
        yaxis = dict(title = "Publish-Count"),
        xaxis = dict(title="Week number")
    )

    fig = go.Figure(data=dataset, layout=layout)
    # `analysiscode` is a notebook-level variable used as a filename prefix.
    return iplot(fig, filename=analysiscode+'stacked-bar')

# generate a function to handle changes in the widget
def hist(t,x,colorby="topic_terms",logx = False):
    """Plot a stacked histogram of column `x`, one trace per distinct `colorby` value.

    Parameters
    ----------
    t : pandas.DataFrame
        Input data. It is copied, so the caller's frame is not modified.
    x : str
        Column whose values are binned.
    colorby : str, default "topic_terms"
        Column whose distinct values each become one histogram trace.
    logx : bool, default False
        When True, `x` is replaced by its natural logarithm (np.log) before
        plotting and tick labels use exponent format "e".

    Returns
    -------
    The return value of iplot.
    """
    plot_dat = t.copy()

    if logx:
        # The data itself is log-transformed; the axis stays linear.
        plot_dat[x]=np.log(plot_dat[x])

    data = [
        go.Histogram(x=plot_dat.loc[plot_dat[colorby]==gr,x], name=gr)
        for gr in plot_dat[colorby].unique()
    ]

    layout = go.Layout(barmode='stack',legend=dict(orientation="h",x=0.1,y=1.5),
                       autosize=True,
                       # "normal" is not a valid Plotly axis type; "linear" is
                       # the plain numeric axis.
                       xaxis=dict(title=x,type="linear",autorange=True,
                                  exponentformat= "e" if logx else "none"),
                       yaxis = dict(title="count"))
    fig = go.Figure(data=data, layout=layout)

    # `analysiscode` is a notebook-level variable used as a filename prefix.
    return iplot(fig, filename=analysiscode+'stacked histogram')

# add top_n functionality!!!
def scatter(t,x,y,groupby,colorby = None,sizeby = "fixed", aggregate = {"x":"sum","y":"sum","colorby":"sum"},
            axistype = {"x":"linear","y":"linear"},
            xtitle="", ytitle=""):
    """Plot an aggregated scatter chart with one marker per `groupby` value.

    Parameters
    ----------
    t : pandas.DataFrame
        Input data. It is copied, so the caller's frame is not modified.
    x, y : str
        Columns aggregated per group to give the marker coordinates.
    groupby : str
        Column whose values define the markers.
    colorby : str, optional
        Column used for colouring; defaults to `groupby`. An object-dtype
        column produces one trace per distinct value; any other dtype is
        aggregated and shown on a Viridis colour scale.
    sizeby : str, default "fixed"
        Only "fixed" (marker size 15) is implemented.
    aggregate : dict
        Aggregations for "x", "y" and "colorby". The default dict is shared
        between calls but only read here, never modified.
    axistype : dict
        Plotly axis types for "x" and "y"; "log" switches tick labels to
        exponent format "e".
    xtitle, ytitle : str
        Axis titles.

    Returns
    -------
    The return value of iplot.

    Raises
    ------
    NotImplementedError
        If `sizeby` is anything other than "fixed". Previously this failed
        later with an unbound-variable error.
    """
    if sizeby != "fixed":
        raise NotImplementedError(
            'scatter only supports sizeby="fixed"; got {!r}'.format(sizeby))
    s = 15

    plot_dat = t.copy()
    if colorby is None:
        colorby = groupby

    # Both branches use the same per-axis settings.
    xaxis = {'title': xtitle,"type":axistype["x"],"exponentformat": "e" if axistype["x"]=="log" else "none"}
    yaxis = {'title': ytitle,"type":axistype["y"],"exponentformat": "e" if axistype["y"]=="log" else "none"}

    # `np.object` was only an alias of the builtin `object` and has been removed from NumPy.
    if plot_dat[colorby].dtype == object:
        plot_datx = plot_dat.groupby([colorby,groupby])[x].agg(aggregate["x"])
        plot_daty = plot_dat.groupby([colorby,groupby])[y].agg(aggregate["y"])

        dat = []
        for cat in plot_dat[colorby].unique():
            # Indexing by `cat` selects the first index level (the colour category).
            dat.append({"x":plot_datx[cat].values,
                        "y":plot_daty[cat].values,
                        "text":plot_daty[cat].index,
                        "name":cat,
                        "mode":"markers",
                        "marker":dict(size=s)})
        fig = {
            'data': dat,
            'layout': {"legend":dict(orientation="h",x=0.1,y=1.5),"showlegend": True,
                'xaxis': xaxis,
                'yaxis': yaxis,
                "autosize":True
            }
        }
    else:
        plot_datx = plot_dat.groupby(groupby)[x].agg(aggregate["x"])
        plot_daty = plot_dat.groupby(groupby)[y].agg(aggregate["y"])
        plot_datcol = plot_dat.groupby(groupby)[colorby].agg(aggregate["colorby"])

        trace1 = go.Scatter(
            y = plot_daty.values,
            x = plot_datx.values,
            mode='markers',
            marker=dict(
                size=s,
                color = plot_datcol.values, #set color equal to a variable
                colorscale='Viridis',
                showscale=True
            )
        )
        # The original called an undefined `dic(...)` here (NameError).
        layout = dict(xaxis=xaxis, yaxis=yaxis)
        fig = go.Figure(data=[trace1],layout = layout)

    # `analysiscode` is a notebook-level variable used as a filename prefix.
    return iplot(fig, filename=analysiscode+'bar')

# add top_n functionality!!!
def bar_continuous(t, x=None, groupby=None, aggregate="sum"):
    """Plot a horizontal bar chart of `x` aggregated per `groupby` value.

    The original body read `groupby`, `x` and `aggregate` without defining
    them; they are now explicit parameters.

    Parameters
    ----------
    t : pandas.DataFrame
        Input data.
    x : str
        Column to aggregate (required).
    groupby : str
        Column whose values become the bars (required).
    aggregate : str or callable, default "sum"
        Passed to groupby(...).agg.

    Returns
    -------
    The return value of iplot.

    Raises
    ------
    ValueError
        If `x` or `groupby` is not given.
    """
    if x is None or groupby is None:
        raise ValueError("bar_continuous requires both `x` and `groupby` column names")

    plot_dat = t.groupby(groupby)[x].agg(aggregate)
    data =[
        go.Bar(
        y=plot_dat.index.values,
        x=plot_dat.values,
        orientation = 'h')
    ]
    fig = go.Figure(data=data)

    return iplot(fig, filename='bar_cont')
        
def bar(t,x,groupby,xtitle="",colorby = "topic_terms", aggregate = "sum"):
    """Plot a horizontal stacked bar chart of `x` per `groupby` value, split by `colorby`.

    Parameters
    ----------
    t : pandas.DataFrame
        Input data; it is only read.
    x : str
        Column to aggregate.
    groupby : str
        Column whose values become the bars (y axis).
    xtitle : str, default ""
        Title of the x axis.
    colorby : str, default "topic_terms"
        Column whose values each become one stacked trace.
    aggregate : str or callable, default "sum"
        Passed to groupby(...).agg.

    Returns
    -------
    The return value of iplot.
    """
    grouped = t.groupby([colorby,groupby])[x].agg(aggregate)

    data = []
    # One trace per value of the first index level (the colour category).
    for gr in grouped.index.levels[0]:
        per_group = grouped[gr]
        data.append(go.Bar(
            y=per_group.index.values,
            x=per_group.values,
            name=gr,
            orientation = 'h'
        ))

    layout = go.Layout(
        autosize=False,
        width=800,
        height=550,
        barmode='stack',
        showlegend = True,
        xaxis = dict(title=xtitle),
        yaxis = dict(),
        legend=dict(orientation="h",x=0.1,y=1.2),
        # Plain dict instead of the deprecated go.Margin; the wide left margin
        # leaves room for long bar labels.
        margin = dict(l=350,r=10)
    )

    fig = go.Figure(data=data, layout=layout)

    # `analysiscode` is a notebook-level variable used as a filename prefix.
    return iplot(fig, filename=analysiscode+'bar_cat')

# Exploratory analysis

### The top 10 journals cover most of the topics around the coronavirus, but PLoS One gives the most balanced and comprehensive view.

In [17]:
topjour = list(tdf.groupby("journal")["paper_id"].agg("count").sort_values(ascending=False)[0:10].index)

In [20]:
bar(tdf[tdf.journal.isin(topjour)],"paper_id","journal",xtitle="Publish Count",colorby = "topic", aggregate = "nunique")

### The top 10 authors tend to focus on their field of expertise; however, Bande et al. cover a wide range of topics.

In [13]:
topau = list(tdf.groupby("authors")["paper_id"].agg("count").sort_values(ascending=False)[0:10].index)

In [14]:
bar(tdf[tdf.authors.isin(topau)],"paper_id","authors",xtitle="Publish Count",colorby = "topic", aggregate = "nunique")