In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns

from bert_serving.client import BertClient

from rank_bm25 import BM25Okapi

# self defined
from models import Bert_article, Bert_paragraph, Bm25_article

  from pandas import Panel


In [2]:
# Read the preprocessed metadata table from disk and preview its first rows.
df = pd.read_hdf(path_or_buf='metadata/preprocessed.h5')
df.head(n=5)

Unnamed: 0,paper_id,body_text,methods,results,source,title,doi,abstract,publish_time,authors,journal,arxiv_id,url,publish_year,is_covid19,study_design
0,3cdc48bb9e40afd30a59463b7872761a726998c8,NDV (Roakin strain) was obtained from Dr. D. J...,NDV (Roakin strain) was obtained from Dr. D. J...,Adult house flies harbored Newcastle Disease v...,PMC,Experimental Evaluation of Musca domestica (Di...,10.1093/jmedent/44.4.666,"House flies, Musca domestica L. (Diptera: Musc...",2007-07-01,"Watson, D. Wes; Niño, Elina L.; Rochon, Katery...",J Med Entomol,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,2007,False,[]
1,d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated viruses have been developed an...,RSV A2 strain was obtained from ATCC (Manassas...,The reverse genetics system for measles Edmons...,PMC,Evaluation of Measles Vaccine Virus as a Vecto...,10.2174/1874357901206010012,Live attenuated recombinant measles vaccine vi...,2012-02-16,"Mok, Hoyin; Cheng, Xing; Xu, Qi; Zengel, James...",Open Virol J,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,2012,False,"['truncated', 'gamma', 'protocol']"
2,748d4c57fe1acc8d9d97cf574f7dea5296f9386c,Ebola virus (EBOV) and other members of the fa...,U2OS human osteosarcoma cells were cultured in...,For evaluating EBOV GP triggering under biosaf...,PMC,Direct Visualization of Ebola Virus Fusion Tri...,10.1128/mbio.01857-15,Ebola virus (EBOV) makes extensive and intrica...,2016-02-09,"Spence, Jennifer S.; Krause, Tyler B.; Mittler...",mBio,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,2016,False,"['truncated', 'heterogeneity']"
3,b891efc6e1419713b05ff7d89b26d260478c28df,To the Editor:\nChina has the world's second l...,,,PMC,Tuberculosis prevention in healthcare workers ...,10.1183/23120541.00015-2015,BSL3 and respiratory isolation wards protect h...,2015-08-21,"Deng, Yunfeng; Li, Yan; Wang, Fengtian; Gao, D...",ERJ Open Res,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,2015,False,[]
4,353852971069ad5794445e5c1ab6077ce23da75d,Coronavirus disease 2019 (COVID-19) has spread...,,,,,,,NaT,,,,,-1,True,[]


# Models

In [3]:
# Path prefix handed to Bert_article as `saved_prefix`.
# NOTE(review): presumably points at cached abstract embeddings — confirm in models.py.
prefix = 'metadata/uncased_L-12_H-768_A-12-abstract-'

# Both BERT wrappers are given the address '0.0.0.0'.
# NOTE(review): presumably a bert-serving server must already be listening there — confirm.
bert_article = Bert_article(ip='0.0.0.0', saved_prefix=prefix)
bert_paragraph = Bert_paragraph(ip='0.0.0.0')

# BM25 ranker built from the abstract column of the full metadata frame.
bm25_article = Bm25_article(df['abstract'])

# Sanity check: show the shape of the BERT article vectors next to the BM25 corpus size
# so the two can be compared by eye.
bert_article.article_vectors.shape, bm25_article.bm25.corpus_size

got 47110 articles, real corpus size for bm25 model: 41062


((41062, 768), 41062)

## Extract papers

In [59]:
def find_article(query, df, top_n=5):
    """
    Retrieve the best-matching articles for `query` from every article ranker.

    The module-level rankers `bm25_article` and `bert_article` are each asked for
    their top article indices. Those indices are used as *positions* within the
    rows of `df` whose `abstract` is not null.

    NOTE(review): the rankers are built elsewhere in the notebook; the positions
    they return only line up if `df` is the frame they were built from — confirm
    before passing a different frame.

    Parameters
    ----------
    query : str
        Free-text query forwarded to each ranker.
    df : pandas.DataFrame
        Article metadata; must contain an `abstract` column. It is not modified.
    top_n : int, default 5
        Number of articles requested from each ranker.

    Returns
    -------
    pandas.DataFrame
        The selected rows of `df` (original index labels kept), stacked ranker by
        ranker in the order bm25, bert, with two extra columns:
        `article_from` (ranker name) and `article_rank` (1-based position in that
        ranker's returned order). An article picked by both rankers appears twice,
        so index labels may repeat.
    """
    rankers = {
        'bm25': bm25_article,
        'bert': bert_article,
    }

    # The null-abstract filter does not depend on the ranker, so compute it once.
    candidates = df[df['abstract'].notna()]

    frames = []
    for name, ranker in rankers.items():
        indices = ranker.get_top_notna_article_indices(query, top_n=top_n)

        # Copy before adding columns so we never write into a view of `df`.
        hits = candidates.iloc[indices].copy()
        hits['article_from'] = name
        # Rank by what was actually returned: a ranker may yield fewer than
        # `top_n` hits, and a fixed range(1, top_n + 1) would then fail.
        hits['article_rank'] = range(1, len(hits) + 1)

        frames.append(hits)

    return pd.concat(frames)

def find_paragraphs(query, df, top_n=5):
    """
    Expand each article in `df` into one row per top-ranked paragraph.

    Every `body_text` is split into paragraphs on newlines, the module-level
    `bert_paragraph` ranker picks paragraph indices for each article, and the
    article's fields are repeated once per selected paragraph.

    Parameters
    ----------
    query : str
        Free-text query forwarded to the paragraph ranker.
    df : pandas.DataFrame
        Articles to expand; must contain a `body_text` column. It is not
        modified, and its index labels are ignored (they may repeat).
    top_n : int, default 5
        Upper bound on the paragraphs kept per article. The ranker is called
        without a count, so fewer may come back.

    Returns
    -------
    pandas.DataFrame
        One row per (article, selected paragraph) with a fresh 0..n-1 index. It
        holds the article's columns plus `paragraph` (text), `para_rank`
        (1-based position in the ranker's returned order) and `para_from`
        (ranker name).
    """

    def split_str_to_paragraph_list(paper_content):
        """Split text on newlines, dropping empty lines.

        The value is passed through str() first, so non-string bodies are
        stringified rather than rejected.
        """
        return [paragraph for paragraph in str(paper_content).split('\n') if paragraph]

    def splitting_top_paragraphs(row, para_indices):
        """Build one article's frame: its fields repeated per selected paragraph."""
        article_fields = {key: row[key] for key in row.index.difference(['paragraphs'])}
        # Honour `top_n`: keep at most that many of the returned indices.
        chosen = list(para_indices.loc[row.name])[:top_n]
        paragraphs = [row['paragraphs'][i] for i in chosen]

        return pd.DataFrame({
            **article_fields,
            'paragraph': paragraphs,
            'para_rank': list(range(1, len(paragraphs) + 1)),
        })

    # Work on a re-indexed copy:
    #  * the caller's frame does not silently gain a `paragraphs` column;
    #  * labels become unique, so `.loc[row.name]` below always yields a single
    #    entry even if the input repeated an index label.
    articles = df.reset_index(drop=True)

    if len(articles) == 0:
        # Nothing to rank; return an empty frame with the usual columns.
        columns = [*articles.columns.difference(['paragraphs']), 'paragraph', 'para_rank', 'para_from']
        return pd.DataFrame(columns=columns)

    # Result is a Series whose entries are lists of strings.
    articles['paragraphs'] = articles['body_text'].progress_apply(split_str_to_paragraph_list)

    rankers = {
        'bert': bert_paragraph,
    }

    ranked_frames = []
    for name, ranker in rankers.items():
        # Expected: a pd.Series of paragraph-index lists, looked up by article label.
        para_indices = ranker.get_top_notna_paragraph_indices(
            query=query, paragraphs_series=articles['paragraphs'])

        per_article = [splitting_top_paragraphs(row, para_indices) for _, row in articles.iterrows()]
        ranked = pd.concat(per_article)
        ranked['para_from'] = name
        ranked_frames.append(ranked)

    return pd.concat(ranked_frames).reset_index(drop=True)

In [59]:
# Two-stage retrieval for one query: shortlist articles, then expand each
# shortlisted article into its top paragraphs.
query = 'What Should Gastroenterologists and Patients Know About COVID-19?'
result = find_paragraphs(query, find_article(query, df))

In [64]:
# Columns shown in the preview, in display order.
display_cols = [
    'paper_id', 'title', 'authors', 'publish_time', 'abstract', 'body_text', 'source', 'is_covid19',
    'article_from', 'article_rank', 'para_from', 'para_rank', 'paragraph', 'study_design',
]

result.loc[:, display_cols].head()

Unnamed: 0,paper_id,title,authors,publish_time,abstract,body_text,source,is_covid19,article_from,article_rank,para_from,para_rank,paragraph,study_design
0,b30770ae30b35cdfaf0a173863e74e93edbb0329,36th International Symposium on Intensive Care...,"Bateman, R. M.; Sharpe, M. D.; Jagger, J. E.; ...",2016-04-20,P001 - Sepsis impairs the capillary response w...,\nIntroduction: A hallmark of sepsis is early ...,PMC,True,bm25,1,bert,1,Conclusions: The incidence of ESBL and carbape...,"['meta-analysis', 'β', 'systematic review', 'a..."
1,b30770ae30b35cdfaf0a173863e74e93edbb0329,36th International Symposium on Intensive Care...,"Bateman, R. M.; Sharpe, M. D.; Jagger, J. E.; ...",2016-04-20,P001 - Sepsis impairs the capillary response w...,\nIntroduction: A hallmark of sepsis is early ...,PMC,True,bm25,1,bert,2,"Conclusions: In our clinical series, dPCO2, Ca...","['meta-analysis', 'β', 'systematic review', 'a..."
2,b30770ae30b35cdfaf0a173863e74e93edbb0329,36th International Symposium on Intensive Care...,"Bateman, R. M.; Sharpe, M. D.; Jagger, J. E.; ...",2016-04-20,P001 - Sepsis impairs the capillary response w...,\nIntroduction: A hallmark of sepsis is early ...,PMC,True,bm25,1,bert,3,Introduction: Endotoxin > 0.6 is associated wi...,"['meta-analysis', 'β', 'systematic review', 'a..."
3,a5293bb4f17ad25a72133cdd9eee8748dd6a4b8d,"XXIV World Allergy Congress 2015: Seoul, Korea...","Lee, Heung-Man; Park, Il-Ho; Shin, Jae-Min; Yo...",2016-04-19,A1 Pirfenidone inhibits TGF-b1-induced extrace...,Purpose: Pirfenidone has been shown to have an...,PMC,False,bm25,2,bert,1,Conclusions: Many patients who experience oxal...,"['meta-analysis', 'β', 'systematic review', 'a..."
4,a5293bb4f17ad25a72133cdd9eee8748dd6a4b8d,"XXIV World Allergy Congress 2015: Seoul, Korea...","Lee, Heung-Man; Park, Il-Ho; Shin, Jae-Min; Yo...",2016-04-19,A1 Pirfenidone inhibits TGF-b1-induced extrace...,Purpose: Pirfenidone has been shown to have an...,PMC,False,bm25,2,bert,2,Objective: This case report presents cases of ...,"['meta-analysis', 'β', 'systematic review', 'a..."


In [67]:
# Save the ranked paragraphs for this query under results/.
# `to_csv` does not create missing directories, so make sure the target exists.
results_dir = Path('results')
results_dir.mkdir(parents=True, exist_ok=True)

# A path separator inside the query would be read as a sub-directory, so swap
# it for '_'. Every other character is kept, so queries without separators
# still map to the same file name as before.
safe_query = query.replace('/', '_').replace('\\', '_')
result.to_csv(results_dir / f'{safe_query}.csv')

In [None]:
# Optional plain-text dump of the retrieval results; both switches are off, so
# nothing in this cell runs as written.
#
# NOTE(review): `target_df`, `paragraphs_df` and `paragraphs_top_indices` are not
# defined in any visible cell of this notebook — enabling either switch on a
# fresh kernel would presumably raise NameError. Confirm where they come from.
to_print = 0
# `write` doubles as a switch and a path prefix: a falsy value disables the
# file dump, otherwise it is concatenated in front of the file name below, so
# it must be a string (e.g. a directory prefix), not 1/True.
write = 0

if to_print:
    print(query)
    print('---------------------------------')
    print()

    # Walk the three sequences in lockstep: one (label, row) pair from
    # target_df, its paragraphs, and the indices selected for it.
    for (target, paragraphs, indices) in zip(target_df.iterrows(), paragraphs_df, paragraphs_top_indices):
        # target is the (index label, row Series) tuple produced by iterrows().
        print(target[0],
              target[1],#[['source', 'title', 'publish_time', 'methods', 'results', 'study_design', 'is_covid19']], 
              sep='\n')
        print('paragraphs:', indices)
        print()
        # Index the paragraphs through a NumPy array using `indices`.
        print(np.array(paragraphs)[indices])
        print()
        print('---------------------------------')
        print()

if write:
    # File name is <write prefix> + <query> + '.txt'.
    with open(write+query+'.txt', 'w') as f:
        f.write(query+'\n')
        f.write('---------------------------------\n\n')

        # Same traversal as the print branch, written to the file instead.
        for (target, paragraphs, indices) in zip(target_df.iterrows(), paragraphs_df, paragraphs_top_indices):
            f.write(str(target[0])+'\n'+str(target[1]))
            f.write('paragraphs:'+str(indices)+'\n\n')
            f.write(str(np.array(paragraphs)[indices])+'\n')
            f.write('\n---------------------------------\n\n\n')