# Loading In The Data

Zachary Z.

30 March 2020

In [42]:
# dependencies
import pandas as pd
import numpy as np

import json
from dictor import dictor

from datetime import datetime
import os

In [2]:
# navigate to the folder with the files
# NOTE(review): the target is relative to the kernel's current working directory and
# uses a Windows-style ".\" prefix. The later cells read their inputs through relative
# paths (the sub-folders and "metadata.csv"), so they depend on this cell having run.
%cd ".\CORD-19-research-challenge"

D:\My Computer\My Documents\UNH MS Analytics\COVID19 Kaggle Comp\CORD-19-research-challenge


Load in the data and create a DataFrame

In [3]:
# list the entries of the current working directory (no path argument is given)
os.listdir()

['biorxiv_medrxiv',
 'comm_use_subset',
 'COVID.DATA.LIC.AGMT.pdf',
 'custom_license',
 'FormattedWithNLP.txt',
 'json_schema.txt',
 'metadata.csv',
 'metadata.readme',
 'noncomm_use_subset']

In [4]:
# sub-directories that are scanned for paper files, in processing order
folders = [
    "biorxiv_medrxiv",
    "comm_use_subset",
    "custom_license",
    "noncomm_use_subset",
]

In [5]:
# create the list that will hold one record per paper (for creating a dataframe)
files_list = []

# get the start time
start = datetime.now()

# loop through the 4 folders
for folder in folders:

    # loop through each file in the folder
    for file_name in os.listdir(folder):

        # os.path.join builds the path with the separator of the current OS
        # (the previous hard-coded "\\" only worked on Windows)
        file_path = os.path.join(folder, file_name)

        # the context manager closes the file on exit, so no explicit close() is needed;
        # the encoding is stated explicitly so decoding does not depend on the platform default
        with open(file_path, "r", encoding="utf-8") as paper_file:
            data = json.load(paper_file)

        ### The following code utilizes the dictor package to navigate the nested lists/dicts ###

        # get the paper id
        paper_id = dictor(data, "paper_id")

        # get the paper title
        paper_title = dictor(data, "metadata.title")

        # get the abstract (the "0" in the path selects only the first entry of the abstract list)
        paper_abstract = dictor(data, "abstract.0.text")

        # get the body text (likewise only the first entry of the body_text list)
        paper_body = dictor(data, "body_text.0.text")

        # Appending Main Keys to list
        files_list.append({'paper_id': paper_id,
                           'title': paper_title,
                           'abstract': paper_abstract,
                           'body': paper_body})

# get the total run time
print("Total Runtime:", datetime.now()-start)

Total Runtime: 0:00:30.501005


In [6]:
# one row per record in files_list (keys: paper_id, title, abstract, body)
df = pd.DataFrame(files_list)

In [7]:
# display the frame built from the paper files
df

Unnamed: 0,paper_id,title,abstract,body
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,word count: 194 22 Text word count: 5168 23 24...,"VP3, and VP0 (which is further processed to VP..."
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,,The 2019-nCoV epidemic has spread across China...
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...",Infectious bronchitis (IB) causes significant ...,"Infectious bronchitis (IB), which is caused by..."
3,0139ea4ca580af99b602c6435368e7fdbefacb03,A Combined Evidence Approach to Prioritize Nip...,Nipah Virus (NiV) came into limelight recently...,Nipah is an infectious negative-sense single-s...
4,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,Background: A novel coronavirus (2019-nCoV) em...,"In December 2019, a cluster of patients with p..."
...,...,...,...,...
29310,ff5a79ed22ea416e6d89caad1cf0d83dbc741a4b,Understanding Human Coronavirus HCoV-NL63,Even though coronavirus infection of humans is...,"Regardless of geographic location, respiratory..."
29311,ff6d57f2aad99be129432058665b361dc18747e8,Brief Definitive Report MACROPHAGES GENETICALL...,There is extensive evidence that cultured macr...,Experiments were designed to test whether subs...
29312,ff83907653a4c4500e8c509ca28169e924742b40,Identification of a Subdomain of CENPB That Is...,We have combined in vivo and in vitro approach...,"can function in an autonomous fashion, relocat..."
29313,ffe718db1820f27bf274e3fc519ab78e450de288,Replication enhancer elements within the open ...,We provide experimental evidence of a replicat...,Tick-borne encephalitis virus (TBEV) is a huma...


Load in the metadata file

In [8]:
# load in the file
# the path is relative, so metadata.csv is read from the current working directory
meta = pd.read_csv("metadata.csv")
meta

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,custom_license
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,custom_license
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701.0,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,custom_license
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285.0,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,custom_license
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44215,d4f00f66c732c292fcfc28b19f44daa2fa620901,PMC,Epidemiology and clinical profile of pathogens...,10.1371/journal.pone.0188325,PMC5693464,29149199.0,cc-by,This study aimed to identify a broad spectrum ...,2017 Nov 17,"Brini, Ines; Guerrero, Aida; Hannachi, Naila; ...",PLoS One,,,True,comm_use_subset
44216,ec575d33c0d3b34af7644fcfed64af045a75ab63,Elsevier,Functional Analysis of the Transmembrane Domai...,10.1016/j.jmb.2008.12.029,PMC2750892,19121325.0,els-covid,"Abstract To enter cells, enveloped viruses use...",2009-02-13,"Bissonnette, Mei Lin Z.; Donald, Jason E.; DeG...",Journal of Molecular Biology,,,True,custom_license
44217,7f8715a818bfd325bf4413d3c07003d7ce7b6f7e,PMC,Viral Entry Properties Required for Fitness in...,10.1128/mBio.00898-18,PMC6030562,29970463.0,cc-by,Human parainfluenza viruses cause a large burd...,2018 Jul 3,"Iketani, Sho; Shean, Ryan C.; Ferren, Marion; ...",mBio,,,True,comm_use_subset
44218,07e78e218a159c35e9599e3751a99551a271597b,Elsevier,Arenavirus reverse genetics: New approaches fo...,10.1016/j.virol.2011.01.013,PMC3057228,21324503.0,els-covid,"Abstract Several arenaviruses, chiefly Lassa v...",2011-03-15,"Emonet, Sebastien E.; Urata, Shuzo; de la Torr...",Virology,,,True,custom_license


In [9]:
# keep only the metadata rows whose has_full_text flag equals True
has_text_mask = meta['has_full_text'].eq(True)
meta_full = meta[has_text_mask]
meta_full

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license
5,212e990b378e8d267042753d5f9d4a64ea5e9869,Elsevier,Infectious diarrhea: Pathogenesis and risk fac...,10.1016/0002-9343(85)90367-5,,2861742.0,els-covid,Abstract Our understanding of the pathogenesis...,1985-06-28,"Cantey, J.Robert",The American Journal of Medicine,,,True,custom_license
6,bf5d344243153d58be692ceb26f52c08e2bd2d2f,Elsevier,New perspectives on the pathogenesis of rheuma...,10.1016/0002-9343(88)90356-7,,3052052.0,els-covid,Abstract In the pathogenesis of rheumatoid art...,1988-10-14,"Zvaifler, Nathan J.",The American Journal of Medicine,,,True,custom_license
7,ddd2ecf42ec86ad66072962081e1ce4594431f9c,Elsevier,Management of acute and chronic respiratory tr...,10.1016/0002-9343(88)90456-1,,3048091.0,els-covid,"Abstract Pharyngitis, bronchitis, and pneumoni...",1988-09-16,"Ellner, Jerrold J.",The American Journal of Medicine,,,True,custom_license
8,a55cb4e724091ced46b5e55b982a14525eea1c7e,Elsevier,Acute bronchitis: Results of U.S. and European...,10.1016/0002-9343(92)90608-E,,1621745.0,els-covid,"Abstract Acute bronchitis, an illness frequent...",1992-06-22,"Dere, Willard H.",The American Journal of Medicine,,,True,custom_license
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44215,d4f00f66c732c292fcfc28b19f44daa2fa620901,PMC,Epidemiology and clinical profile of pathogens...,10.1371/journal.pone.0188325,PMC5693464,29149199.0,cc-by,This study aimed to identify a broad spectrum ...,2017 Nov 17,"Brini, Ines; Guerrero, Aida; Hannachi, Naila; ...",PLoS One,,,True,comm_use_subset
44216,ec575d33c0d3b34af7644fcfed64af045a75ab63,Elsevier,Functional Analysis of the Transmembrane Domai...,10.1016/j.jmb.2008.12.029,PMC2750892,19121325.0,els-covid,"Abstract To enter cells, enveloped viruses use...",2009-02-13,"Bissonnette, Mei Lin Z.; Donald, Jason E.; DeG...",Journal of Molecular Biology,,,True,custom_license
44217,7f8715a818bfd325bf4413d3c07003d7ce7b6f7e,PMC,Viral Entry Properties Required for Fitness in...,10.1128/mBio.00898-18,PMC6030562,29970463.0,cc-by,Human parainfluenza viruses cause a large burd...,2018 Jul 3,"Iketani, Sho; Shean, Ryan C.; Ferren, Marion; ...",mBio,,,True,comm_use_subset
44218,07e78e218a159c35e9599e3751a99551a271597b,Elsevier,Arenavirus reverse genetics: New approaches fo...,10.1016/j.virol.2011.01.013,PMC3057228,21324503.0,els-covid,"Abstract Several arenaviruses, chiefly Lassa v...",2011-03-15,"Emonet, Sebastien E.; Urata, Shuzo; de la Torr...",Virology,,,True,custom_license


In [10]:
# merge the metadata with the already created df (containing raw papers).
# The inner join keeps only papers whose paper_id equals a metadata sha.
# After the merge:
#   - the index is reset,
#   - abstract_x / title_x (the df-side duplicates) and source_x are dropped,
#   - the metadata-side abstract_y / title_y are renamed back to abstract / title.
# Everything is chained (no inplace=True) so re-running the cell always rebuilds
# combo_df from df and meta_full.
combo_df = (
    pd.merge(df, meta_full, how='inner', left_on="paper_id", right_on='sha')
    .reset_index(drop=True)
    .drop(columns=['abstract_x', 'title_x', 'source_x'])
    .rename(columns={"abstract_y": "abstract", "title_y": "title"})
)

# replace certain column missing values with blank strings.
# Assigning the filled columns back to the frame replaces the former
# `combo_df[col].fillna('', inplace=True)`, which is chained in-place assignment and
# is not guaranteed to update combo_df under pandas copy-on-write.
text_columns = ['abstract', 'title', 'body']
combo_df[text_columns] = combo_df[text_columns].fillna('')

combo_df

Unnamed: 0,paper_id,body,sha,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,0015023cc06b5362d332b3baf348d11567ca2fbb,"VP3, and VP0 (which is further processed to VP...",0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,10.1101/2020.01.10.901801,,,biorxiv,The positive stranded RNA genomes of picornavi...,2020-01-11,"Ward, J. C. J.; Lasecka-Dykes, L.; Neil, C.; A...",,,,True,biorxiv_medrxiv
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,The 2019-nCoV epidemic has spread across China...,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,10.1101/2020.02.11.20022111,,,medrvix,We integrate the human movement and healthcare...,2020-02-12,Hanchu Zhou; Jianan Yang; Kaichen Tang; Qingpe...,,,,True,biorxiv_medrxiv
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Infectious bronchitis (IB), which is caused by...",00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...",10.1101/634600,,,biorxiv,Infectious bronchitis (IB) causes significant ...,2019-05-10,"Butt, S. L.; Erwood, E. C.; Zhang, J.; Sellers...",,,,True,biorxiv_medrxiv
3,0139ea4ca580af99b602c6435368e7fdbefacb03,Nipah is an infectious negative-sense single-s...,0139ea4ca580af99b602c6435368e7fdbefacb03,A Combined Evidence Approach to Prioritize Nip...,10.1101/2020.03.12.977918,,,biorxiv,AbstractBackgroundNipah Virus (NiV) came into ...,2020-03-12,Nishi Kumari; Ayush Upadhyay; Kishan Kalia; Ra...,,,,True,biorxiv_medrxiv
4,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,"In December 2019, a cluster of patients with p...",013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,10.1101/2020.02.04.20020479,,,medrvix,Background: A novel coronavirus (2019-nCoV) em...,2020-02-05,Shengjie Lai; Isaac Bogoch; Nick Ruktanonchai;...,,,,True,biorxiv_medrxiv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27685,ff3db609dbf2669d8d528279ef32280c2646ba85,"Over the past few decades, the world has witne...",ff3db609dbf2669d8d528279ef32280c2646ba85,More than efficacy revealed by single-cell ana...,10.1126/sciadv.aax4761,PMC6821460,31692968.0,cc-by-nc,Because many aspects of viral infection dynami...,2019 Oct 30,"Liu, Wu; Caglar, Mehmet U.; Mao, Zhangming; Wo...",Sci Adv,,,True,noncomm_use_subset
27686,ff5a79ed22ea416e6d89caad1cf0d83dbc741a4b,"Regardless of geographic location, respiratory...",ff5a79ed22ea416e6d89caad1cf0d83dbc741a4b,Understanding Human Coronavirus HCoV-NL63,10.2174/1874357901004010076,PMC2918871,20700397.0,cc-by-nc,Even though coronavirus infection of humans is...,2010 May 25,"Abdul-Rasool, Sahar; Fielding, Burtram C",Open Virol J,,,True,noncomm_use_subset
27687,ff6d57f2aad99be129432058665b361dc18747e8,Experiments were designed to test whether subs...,ff6d57f2aad99be129432058665b361dc18747e8,Macrophages genetically resistant to mouse hep...,,PMC2190139,175127.0,cc-by-nc-sa,"Genetic resistance to mouse hepatitis, which r...",1976 Mar 1,,J Exp Med,,,True,noncomm_use_subset
27688,ff83907653a4c4500e8c509ca28169e924742b40,"can function in an autonomous fashion, relocat...",ff83907653a4c4500e8c509ca28169e924742b40,Identification of a subdomain of CENP-B that i...,,PMC2289363,1740467.0,cc-by-nc-sa,We have combined in vivo and in vitro approach...,1992 Mar 1,,J Cell Biol,,,True,noncomm_use_subset


In [11]:
# inspect a single merged record: the row at position 422
combo_df.iloc[422,]

paper_id                                7b40dfdf9a60cfe4a199cb967ee458b7dfaabe0c
body                           In December 2019, a cluster of severe pneumoni...
sha                                     7b40dfdf9a60cfe4a199cb967ee458b7dfaabe0c
title                          The incubation period of 2019-nCoV from public...
doi                                                  10.1101/2020.02.02.20020016
pmcid                                                                        NaN
pubmed_id                                                                    NaN
license                                                                  medrvix
abstract                                                                        
publish_time                                                                 NaN
authors                                                                      NaN
journal                                                                      NaN
Microsoft Academic Paper ID 

# Create a Search Tool (Basic Version)

03 April 2020

In [None]:
# load the dependencies
import re

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# tqdm.tqdm_notebook is deprecated (tqdm itself warns "Please use `tqdm.notebook.tqdm`"),
# so the notebook progress bar is imported from its current location
from tqdm.notebook import tqdm

In [None]:
# load the English spaCy pipeline 'en_core_web_md'; `nlp` is used below to tokenise
# text and to read each token's stop-word flag, punctuation flag and lemma
nlp = spacy.load('en_core_web_md')

tokenize/format text

In [None]:
look_at = "body"

# Build the lemmatised form of every document in the `look_at` column.
# The column is iterated directly instead of using the index labels as .iloc positions,
# and the new column is assigned once instead of cell by cell.
body_nlp = []
for raw_text in tqdm(combo_df[look_at]):
    # lower-case first, then run the spaCy pipeline
    parsed = nlp(raw_text.lower())

    # keep the lemma of every token that is neither a stop word nor punctuation
    lemmas = [token.lemma_ for token in parsed
              if not token.is_stop and not token.is_punct]

    # the list is stored in its string form, e.g. "['positive', 'strand', ...]"
    body_nlp.append(str(lemmas))

combo_df["body_nlp"] = body_nlp

In [None]:
# display the frame
combo_df

# (disabled) optional checkpoint: write combo_df to disk as CSV text
# # save combo_df checkpoint
# combo_df.to_csv("FormattedWithNLP.txt", index=False)

run a search

In [None]:
# create a filter to decide which part of the df to look at

# the valid areas, in lower case (built once, outside the loop)
options_list = ["title", "abstract", "body"]

while True:
    print("Please enter which area to search")
    print("The options are 'Title', 'Abstract', or 'Body'")
    look_at_input = input()

    # normalise the input: drop surrounding whitespace and lower-case it
    area_tolook = look_at_input.strip().lower()

    # validate the NORMALISED value. Checking the raw input rejected the advertised
    # spellings 'Title', 'Abstract' and 'Body' because the options are lower case.
    if area_tolook in options_list:
        break

    # otherwise report the problem and ask again
    print("!")
    print("Invalid area. Please try again.")
    print("-----------------------------------------")

In [None]:
# ask for a search term, lower-case it and run it through the nlp pipeline
search = nlp(input("Please input a search term:").lower())

# drop stop words and punctuation from the parsed search
search2 = [token for token in search if not token.is_stop and not token.is_punct]

# lemmatized form of every remaining token
search3 = [token.lemma_ for token in search2]

In [None]:
# only use the parts of the df we need (the paper id and the part that is being searched)
temp = combo_df[["paper_id", area_tolook]]

# create a list to hold the ids of the matching papers (each paper is added at most once)
ids_list = []

# loop through every paper
for paper_id, text in tqdm(zip(temp["paper_id"], temp[area_tolook]), total=len(temp)):

    # The search terms are lower-cased lemmas, so the text is lower-cased as well.
    # Without this, capitalised occurrences in the text were never matched.
    text = text.lower()

    # A term matches when it appears with a space directly before it or directly
    # after it. This is the same condition as the former three checks
    # (" term ", " term", "term "): the first one is implied by either of the others.
    for j in search3:
        if (" " + j) in text or (j + " ") in text:
            ids_list.append(paper_id)
            # one matching term is enough to select the paper
            break

# return the dataframe with only the ids selected; reset_index returns a new frame,
# so nothing is modified in place on a filtered slice of combo_df
search_df = combo_df[combo_df["paper_id"].isin(ids_list)].reset_index(drop=True)

In [None]:
# display the resulting df

# (disabled) alternative output: print the abstract of every hit, separated by a rule
# for i in search_df.index:
#     print(search_df.iloc[i, search_df.columns.get_loc("abstract")])
#     print("########################################################################################################")

search_df

# Search Tool (Doc2Vec)

11 April 2020

In [12]:
# load the dependencies
import re
import random

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# tqdm.tqdm_notebook is deprecated (tqdm itself warns "Please use `tqdm.notebook.tqdm`"),
# so the notebook progress bar is imported from its current location
from tqdm.notebook import tqdm

from sklearn.cluster import KMeans

unable to import 'smart_open.gcs', disabling that module


In [13]:
# load the English spaCy pipeline 'en_core_web_md' (the same call also appears in an
# earlier section, so this rebinds `nlp` to a freshly loaded pipeline)
nlp = spacy.load('en_core_web_md')

In [None]:
# part to look at: the combo_df column that the tokenize cell below reads
look_at = "abstract"

tokenize

In [14]:
# one list of lemmas per document, in the row order of combo_df
doc_list = []

# Create tokens for every doc in the `look_at` column (`look_at` is set in the cell above).
# The column is iterated directly instead of using the index labels as .iloc positions.
for raw_text in tqdm(combo_df[look_at]):
    # lower-case first, then run the spaCy pipeline
    parsed = nlp(raw_text.lower())

    # keep the lemma of every token that is neither a stop word nor punctuation
    doc_list.append([token.lemma_ for token in parsed
                     if not token.is_stop and not token.is_punct])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=27690.0), HTML(value='')))




In [None]:
# NOTE: this displays the complete token list of every document
doc_list

generate the tagged documents for each document

In [15]:
# Wrap every token list in a TaggedDocument. The tag is the document's position in
# doc_list (an integer), not the paper_id string; later cells use that position to
# look up rows of combo_df.
documents = [TaggedDocument(words, [position])
             for position, words in enumerate(doc_list)]

In [17]:
# inspect one tagged document (position 4) as a sanity check
documents[4]

TaggedDocument(words=['background', 'novel', 'coronavirus', '2019-ncov', 'emerge', 'wuhan', 'city', 'china', 'end', '2019', 'cause', 'outbreak', 'human', 'human', 'transmission', 'public', 'health', 'emergency', 'international', 'concern', 'declare', 'world', 'health', 'organization', 'january', '30', '2020', 'aim', 'aim', 'estimate', 'potential', 'risk', 'geographic', 'range', 'wuhan', 'novel', 'coronavirus', '2019-ncov', 'spread', 'china', 'january', 'april', '2020', 'method', 'series', 'domestic', 'international', 'travel', 'network', 'base', 'connectivity', 'risk', 'analysis', 'perform', 'de', 'identify', 'aggregated', 'mobile', 'phone', 'datum', 'air', 'passenger', 'itinerary', 'datum', 'case', 'report', 'result', 'cordon', 'sanitaire', 'wuhan', 'likely', 'occur', 'stage', 'peak', 'population', 'number', 'leave', 'city', 'lunar', 'new', 'year', 'lny', 'traveller', 'depart', 'neighbouring', 'city', 'megacitie', 'china', 'estimate', '59,912', 'air', 'passenger', '834', '95', 'ui', '

run the model

In [16]:
# run the model for doc2vec
# NOTE(review): epochs=1 requests a single training pass and no seed is passed, so the
# learned vectors may differ between runs — confirm both choices are intentional.
model = Doc2Vec(documents, vector_size=50, window=3, min_count=1, workers=4, epochs=1)

find the most similar papers

In [79]:
# generate a random doc number to get a paper id.
# randrange(n) returns 0 .. n-1. The former randint(0, len(documents)) includes its
# upper bound and could therefore return a position one past the last document.
doc_id = random.randrange(len(documents))
print(doc_id)
print(combo_df.paper_id[doc_id])
del doc_id

15727
63fbcac09bf2a0133b41cb99e93dd2ce3aeeab05


In [87]:
# ask how many similar papers should be returned
while True:
    print("Please Enter the Number of Similar Papers to Find:")
    try:
        # turn the input into an int
        paper_num = int(input())
    except ValueError:
        # only a non-numeric entry is handled; other exceptions (e.g. an interrupt) propagate
        print("That Is Not a Valid Number. Please Try Again\n")
        continue

    # accept 1 .. number of documents. The former upper bound, len(combo_df) + 1,
    # allowed one more result than there are documents to rank.
    if 0 < paper_num <= len(documents):
        break
    print("That Is Not a Valid Number. Please Try Again\n")

# create a list of (label, rank) tuples for the n highest similarity locations
loc_list = [("Most Similar " + str(rank + 1), rank) for rank in range(paper_num)]

# ask for a paper id
known_ids = list(combo_df.paper_id)
while True:
    print("Please Enter a Paper ID:")
    # surrounding whitespace (e.g. from copy/paste) is ignored
    paper_id = input().strip()
    print("\n")

    try:
        # position of the paper in combo_df; list.index raises ValueError for an unknown id
        doc_index = known_ids.index(paper_id)
    except ValueError:
        print("That ID is Not Valid. Please Try Again\n")
        continue

    doc_title = combo_df.title[doc_index]
    break

print("Searching for similar documents to <{}> (ID:{})".format(doc_title, paper_id))
print("-----------------------------------------------------------------------------\n")

# calculate the vector of the document
inferred_vector = model.infer_vector(documents[doc_index].words)

# get the most similar documents; only the number requested by the user is ranked
sims = model.docvecs.most_similar([inferred_vector], topn=paper_num)

# print the most similar documents (the limit is set by the user)
for label, index in loc_list:

    # sims[index][0] is the tag of the matched document; tags are positions in `documents`
    matched_document = documents[sims[index][0]]
    location = matched_document.tags[0]

    # print the title
    print('%s: <%s> (ID:%s)\n' % (label, combo_df.title[location], combo_df.paper_id[location]))

    # print the text
    print('Text:',' '.join(matched_document.words))

    print("-----------------------------------------------------------------------------\n")

Please Enter the Number of Similar Papers to Find:
5
Please Enter a Paper ID:
63fbcac09bf2a0133b41cb99e93dd2ce3aeeab05


Searching for similar documents to <Live-attenuated bivalent measles virus-derived vaccines targeting Middle East respiratory syndrome coronavirus induce robust and multifunctional T cell responses against both viruses in an appropriate mouse model> (ID:63fbcac09bf2a0133b41cb99e93dd2ce3aeeab05)
-----------------------------------------------------------------------------

Most Similar 1: <The Adjuvanticity of an O. volvulus-Derived rOv-ASP-1 Protein in Mice Using Sequential Vaccinations and in Non-Human Primates> (ID:eff3310317521aed7abe06ef1fa9963ca9d6caf3)

Text: adjuvant potentiate antigen specific protective immune response key element promote vaccine effectiveness previously report onchocerca volvulus recombinant protein rov asp-1 induce activation maturation naïve human dcs innate adjuvant promote balance th1 th2 response bystander vaccine antigen mouse vaccine

# Topic Analysis (LDiA)

11 April 2020

In [91]:
# load the dependencies
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

# tqdm.tqdm_notebook is deprecated (tqdm itself warns "Please use `tqdm.notebook.tqdm`"),
# so the notebook progress bar is imported from its current location
from tqdm.notebook import tqdm

In [None]:
# load the English spaCy pipeline 'en_core_web_md' (the same call also appears in
# earlier sections, so this rebinds `nlp` to a freshly loaded pipeline)
nlp = spacy.load('en_core_web_md')

tokenize

In [88]:
look_at = "abstract"

# Build the lemmatised form of every document in the `look_at` column.
# The column is iterated directly instead of using the index labels as .iloc positions,
# and the new column is assigned once instead of cell by cell.
abstract_nlp = []
for raw_text in tqdm(combo_df[look_at]):
    # lower-case first, then run the spaCy pipeline
    parsed = nlp(raw_text.lower())

    # keep the lemma of every token that is neither a stop word nor punctuation
    lemmas = [token.lemma_ for token in parsed
              if not token.is_stop and not token.is_punct]

    # the list is stored in its string form, e.g. "['positive', 'strand', ...]"
    abstract_nlp.append(str(lemmas))

combo_df["abstract_nlp"] = abstract_nlp

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=27690.0), HTML(value='')))




In [89]:
# display the frame
combo_df

Unnamed: 0,paper_id,body,sha,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file,abstract_nlp
0,0015023cc06b5362d332b3baf348d11567ca2fbb,"VP3, and VP0 (which is further processed to VP...",0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,10.1101/2020.01.10.901801,,,biorxiv,The positive stranded RNA genomes of picornavi...,2020-01-11,"Ward, J. C. J.; Lasecka-Dykes, L.; Neil, C.; A...",,,,True,biorxiv_medrxiv,"['positive', 'strand', 'rna', 'genome', 'picor..."
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,The 2019-nCoV epidemic has spread across China...,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,10.1101/2020.02.11.20022111,,,medrvix,We integrate the human movement and healthcare...,2020-02-12,Hanchu Zhou; Jianan Yang; Kaichen Tang; Qingpe...,,,,True,biorxiv_medrxiv,"['integrate', 'human', 'movement', 'healthcare..."
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Infectious bronchitis (IB), which is caused by...",00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...",10.1101/634600,,,biorxiv,Infectious bronchitis (IB) causes significant ...,2019-05-10,"Butt, S. L.; Erwood, E. C.; Zhang, J.; Sellers...",,,,True,biorxiv_medrxiv,"['infectious', 'bronchitis', 'ib', 'cause', 's..."
3,0139ea4ca580af99b602c6435368e7fdbefacb03,Nipah is an infectious negative-sense single-s...,0139ea4ca580af99b602c6435368e7fdbefacb03,A Combined Evidence Approach to Prioritize Nip...,10.1101/2020.03.12.977918,,,biorxiv,AbstractBackgroundNipah Virus (NiV) came into ...,2020-03-12,Nishi Kumari; Ayush Upadhyay; Kishan Kalia; Ra...,,,,True,biorxiv_medrxiv,"['abstractbackgroundnipah', 'virus', 'niv', 'c..."
4,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,"In December 2019, a cluster of patients with p...",013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,10.1101/2020.02.04.20020479,,,medrvix,Background: A novel coronavirus (2019-nCoV) em...,2020-02-05,Shengjie Lai; Isaac Bogoch; Nick Ruktanonchai;...,,,,True,biorxiv_medrxiv,"['background', 'novel', 'coronavirus', '2019-n..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27685,ff3db609dbf2669d8d528279ef32280c2646ba85,"Over the past few decades, the world has witne...",ff3db609dbf2669d8d528279ef32280c2646ba85,More than efficacy revealed by single-cell ana...,10.1126/sciadv.aax4761,PMC6821460,31692968.0,cc-by-nc,Because many aspects of viral infection dynami...,2019 Oct 30,"Liu, Wu; Caglar, Mehmet U.; Mao, Zhangming; Wo...",Sci Adv,,,True,noncomm_use_subset,"['aspect', 'viral', 'infection', 'dynamic', 'i..."
27686,ff5a79ed22ea416e6d89caad1cf0d83dbc741a4b,"Regardless of geographic location, respiratory...",ff5a79ed22ea416e6d89caad1cf0d83dbc741a4b,Understanding Human Coronavirus HCoV-NL63,10.2174/1874357901004010076,PMC2918871,20700397.0,cc-by-nc,Even though coronavirus infection of humans is...,2010 May 25,"Abdul-Rasool, Sahar; Fielding, Burtram C",Open Virol J,,,True,noncomm_use_subset,"['coronavirus', 'infection', 'human', 'normall..."
27687,ff6d57f2aad99be129432058665b361dc18747e8,Experiments were designed to test whether subs...,ff6d57f2aad99be129432058665b361dc18747e8,Macrophages genetically resistant to mouse hep...,,PMC2190139,175127.0,cc-by-nc-sa,"Genetic resistance to mouse hepatitis, which r...",1976 Mar 1,,J Exp Med,,,True,noncomm_use_subset,"['genetic', 'resistance', 'mouse', 'hepatitis'..."
27688,ff83907653a4c4500e8c509ca28169e924742b40,"can function in an autonomous fashion, relocat...",ff83907653a4c4500e8c509ca28169e924742b40,Identification of a subdomain of CENP-B that i...,,PMC2289363,1740467.0,cc-by-nc-sa,We have combined in vivo and in vitro approach...,1992 Mar 1,,J Cell Biol,,,True,noncomm_use_subset,"['combine', 'vivo', 'vitro', 'approach', 'inve..."


In [92]:
# create and fit the count vectorizer
# abstract_nlp holds the string form of each lemma list (e.g. "['positive', 'strand', ...]"),
# and CountVectorizer is created without arguments, so that text is tokenised again
# with the vectorizer's default settings
vectorizer = CountVectorizer()
df_vectorized = vectorizer.fit_transform(combo_df["abstract_nlp"])

In [95]:
# A fixed random_state is passed to every estimator so that the topics are the same
# each time the notebook is re-run (previously no seed was set).
# Note: each *_model variable holds the output of fit_transform (the transformed data),
# while the fitted estimators themselves are ldia, nmf and lsa.

# create a ldia model and fit it to the count vector data
ldia = LatentDirichletAllocation(n_components=10, max_iter=10, verbose=True, random_state=42)
ldia_model = ldia.fit_transform(df_vectorized)

# create a nmf model and fit it to the count vector data
nmf = NMF(n_components=10, random_state=42)
nmf_model = nmf.fit_transform(df_vectorized)

# create a lsa model and fit it to the count vector data
lsa = TruncatedSVD(n_components=10, random_state=42)
lsa_model = lsa.fit_transform(df_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [101]:
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic, with their weights.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``components_``; each row is iterated as one topic
        and must support ``argsort()`` and integer indexing.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn versions, ``get_feature_names()``.
    top_n : int, default 10
        Number of terms printed per topic.

    Returns
    -------
    None
        The result is printed, one list of ``(term, weight)`` tuples per topic.
    """
    # Resolve the vocabulary once, not once per printed term. Prefer the current
    # accessor and fall back to the older one when it does not exist.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = [str(name) for name in get_names()]

    for i, topic in enumerate(model.components_):
        print("Topic %d:" % (i))
        # argsort is ascending, so the reversed tail slice gives the top_n largest first
        top_indices = topic.argsort()[:-top_n - 1:-1]
        # float() keeps the printed weights plain numbers regardless of the numpy version
        print([(feature_names[x], float(topic[x])) for x in top_indices])
        print("-------------------------------------------------------------------------")

def selected_topics_names(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic (names only).

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``components_``; each row is iterated as one topic
        and must support ``argsort()``.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn versions, ``get_feature_names()``.
    top_n : int, default 10
        Number of terms printed per topic.

    Returns
    -------
    None
        The result is printed, one list of terms per topic.
    """
    # Resolve the vocabulary once, not once per printed term. Prefer the current
    # accessor and fall back to the older one when it does not exist.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = [str(name) for name in get_names()]

    for i, topic in enumerate(model.components_):
        print("Topic %d:" % (i))
        # argsort is ascending, so the reversed tail slice gives the top_n largest first
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([feature_names[x] for x in top_indices])
        print("-------------------------------------------------------------------------")

In [103]:
# print the top terms (with weights) of each fitted model, one section per model
for heading, fitted_model in (("LDiA Model:", ldia),
                              ("NMF Model:", nmf),
                              ("LSA Model:", lsa)):
    print(heading)
    selected_topics(fitted_model, vectorizer)
    print("#########################################################################")

LDiA Model:
Topic 0:
[('infection', 6083.323410144752), ('cell', 4783.677664086467), ('disease', 4453.770217147687), ('immune', 2708.0975777045514), ('response', 2606.03546950463), ('mouse', 2541.743159013706), ('lung', 2420.5627408501846), ('il', 2305.423439215534), ('treatment', 2033.7303812849852), ('cat', 2002.1828067600495)]
-------------------------------------------------------------------------
Topic 1:
[('cell', 13766.032661883424), ('virus', 8584.415543410843), ('protein', 8553.289941976767), ('viral', 5321.67419547165), ('infection', 4703.5285885616095), ('rna', 3970.993621451902), ('replication', 3717.689787475229), ('activity', 3663.1464973125057), ('expression', 3538.285704510566), ('antiviral', 2764.4675687582626)]
-------------------------------------------------------------------------
Topic 2:
[('antibody', 5452.150326902909), ('vaccine', 5080.947295208966), ('virus', 4901.84262045198), ('strain', 3795.697560807346), ('cell', 3683.016057646475), ('protein', 2808.89396

[('kim', 0.22592951606788353), ('lee', 0.1390971373009507), ('young', 0.10111870213534442), ('park', 0.0924673612693221), ('jung', 0.061674739314507364), ('hee', 0.05848045169380682), ('allergic', 0.058293078094930735), ('asthma', 0.0578341129560553), ('hyun', 0.05391166640522813), ('soo', 0.04477409582807084)]
-------------------------------------------------------------------------
Topic 2:
[('patient', 0.5037004437380638), ('care', 0.17291067509548266), ('respiratory', 0.11581229054093528), ('case', 0.11426521530629184), ('de', 0.11387367336354653), ('disease', 0.10721312066863015), ('health', 0.09887116186183299), ('study', 0.09823398424189564), ('hospital', 0.0981278807709415), ('intensive', 0.08313561595337053)]
-------------------------------------------------------------------------
Topic 3:
[('cell', 0.46137680601288494), ('protein', 0.22185990590157684), ('patient', 0.1547549755397599), ('sars', 0.10160049933373011), ('expression', 0.09475218700708612), ('response', 0.0882218

In [102]:
# print the top terms (names only) of each fitted model, one section per model
for heading, fitted_model in (("LDiA Model:", ldia),
                              ("NMF Model:", nmf),
                              ("LSA Model:", lsa)):
    print(heading)
    selected_topics_names(fitted_model, vectorizer)
    print("#########################################################################")

LDiA Model:
Topic 0:
['infection', 'cell', 'disease', 'immune', 'response', 'mouse', 'lung', 'il', 'treatment', 'cat']
-------------------------------------------------------------------------
Topic 1:
['cell', 'virus', 'protein', 'viral', 'infection', 'rna', 'replication', 'activity', 'expression', 'antiviral']
-------------------------------------------------------------------------
Topic 2:
['antibody', 'vaccine', 'virus', 'strain', 'cell', 'protein', 'pedv', 'response', 'gene', 'pig']
-------------------------------------------------------------------------
Topic 3:
['health', 'model', 'disease', 'study', 'result', 'datum', 'care', 'outbreak', 'public', 'epidemic']
-------------------------------------------------------------------------
Topic 4:
['disease', 'influenza', 'virus', 'health', 'human', 'infection', 'infectious', 'vaccine', 'review', 'pandemic']
-------------------------------------------------------------------------
Topic 5:
['protein', 'sequence', 'virus', 'gene', 'b

# Clustering

April 2020

In [None]:
# NOTE(review): this whole cell is disabled (commented out) exploratory code.
# If it is re-enabled as written: `label1` lists only 4 colours while KMeans is
# configured with n_clusters=25, so `label1[i]` would fail for any cluster label >= 4.

# from sklearn.cluster import KMeans
# from sklearn.decomposition import PCA

# kmeans_model = KMeans(n_clusters=25, max_iter=100)

# X = kmeans_model.fit(model.docvecs.vectors_docs)

# labels=kmeans_model.labels_.tolist()

# l = kmeans_model.fit_predict(model.docvecs.vectors_docs)

# pca = PCA(n_components=2).fit(model.docvecs.vectors_docs)

# datapoint = pca.transform(model.docvecs.vectors_docs)

# import matplotlib.pyplot as plt

# label1 = ['#FFFF00', '#008000', '#0000FF', '#800080']
# color = [label1[i] for i in labels]

# plt.scatter(datapoint[:, 0], datapoint[:, 1], c=color)

# centroids = kmeans_model.cluster_centers_

# centroidpoint = pca.transform(centroids)

# plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s=150, c='#000000')

# plt.show()

# plt.show()