In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from functools import reduce
import matplotlib.pyplot as plt
import nltk
import os
from nltk.stem.porter import PorterStemmer

In [2]:
def clean(df):
    df = df.drop_duplicates()
    df = df.fillna('unable to retrieve data')
    
    # make sure it was published in 2020
    df = df[df['publish_time'].str.contains('2020')]
    
    # make abstract lower case
    df["abstract"] = df["abstract"].str.lower()+df["title"].str.lower()
    
    # make sure abstract discusses covid-19
    df = df[df['abstract'].str.contains('covid') | df['abstract'].str.contains('-cov-2') | df['abstract'].str.contains('cov2') | df['abstract'].str.contains('ncov')]
    
    df = df.drop_duplicates(subset='title', keep="first")
    return df

def stem(words):
    p_stemmer = PorterStemmer()
    stem_word = []
    for word in words:
        stem_word.append(p_stemmer.stem(word))
    return stem_word

def search(df, search_tokens):
    search_words = stem(search_tokens)
    df = df[reduce(lambda x, y: x&y, (df['abstract'].str.contains(word) for word in search_words))]
    return df


df = pd.read_csv('metadata.csv', usecols = ['title', 'journal', 'abstract', 'authors', 'publish_time'] )

df = clean(df)
df

Unnamed: 0,title,abstract,publish_time,authors,journal
12234,Metformin Inhibits Proliferation of Human Thyr...,objective: to uncover the potential effect of ...,2020-01-06,"He, Yang; Cao, Lingling; Wang, Li; Liu, Lingpi...",Onco Targets Ther
12320,Note from the editors: novel coronavirus (2019...,unable to retrieve datanote from the editors: ...,2020-01-23,unable to retrieve data,Euro Surveill
12526,Topological dynamics of the 2015 South Korea M...,network analysis to examine infectious contact...,2020-03-09,"Yang, Chang Hoon; Jung, Hyejin",Sci Rep
12552,Complete Genome Sequence of a 2019 Novel Coron...,a complete genome sequence was obtained for a ...,2020-03-12,"Sah, Ranjit; Rodriguez-Morales, Alfonso J.; Jh...",Microbiol Resour Announc
12553,First cases of coronavirus disease 2019 (COVID...,"in the who european region, covid-19 surveilla...",2020-03-05,"Spiteri, Gianfranco; Fielding, James; Diercke,...",Euro Surveill
...,...,...,...,...,...
59825,Nonstructural proteins NS7b and NS8 are likely...,the seventh novel human infecting betacoronavi...,2020,"Fahmi, Muhamad; Kubota, Yukihiko; Ito, Masahiro","Infection, Genetics and Evolution"
59836,Potential Factors Influencing Repeated SARS Ou...,within last 17 years two widespread epidemics ...,2020-03-03,"Sun, Zhong; Thilakavathy, Karuppiah; Kumar, S....",Int J Environ Res Public Health
59839,A precision medicine approach to managing 2019...,"in december 2019, several patients with pneumo...",2020-02-04,"Wang, Minjin; Zhou, Yanbing; Zong, Zhiyong; Li...",Precis Clin Med
59876,SARS-CoV-2 Viral Load in Upper Respiratory Spe...,unable to retrieve datasars-cov-2 viral load i...,2020-03-19,"Zou, Lirong; Ruan, Feng; Huang, Mingxing; Lian...",N Engl J Med


In [3]:
questions = ['What do we know about seasonality of transmission?', 
            'What do we know about stability of the virus in environmental conditions?',
            'Is population movement control effective in stopping transmission (spread)?']

In [19]:
def sentences(df, search_tokens):
    col_names =  ['pub_date', 'title', 'author', 'sentence']
    new_df  = pd.DataFrame(columns = col_names)
    search_words=stem(search_tokens)
    for i, row in df.iterrows():
        sentence = ''
        all_sentences = row['abstract'].split('. ')
        for s in all_sentences:
            missing = 0
            for word in search_words:
                if word not in s:
                    missing = 1
            if missing == 0 and s != '':
                s = s.capitalize()
                if s[-1] != '.':
                    s = s + '.'
                sentence = sentence + s
        if sentence != '':
            author = row['authors']
            if len(author.split()) > 3:
                author = author.split()[0] + ' et. al'
            title = row['title']
            new_df.loc[len(new_df)] = [row['publish_time'], title, author, sentence]
    return new_df

In [20]:
search_words = ['seasonal','transmission']
df1 = search(df, search_words)
df1 = sentences(df1, search_words)
df1.to_csv(r'topics\topics.csv')

In [21]:
search_word2 = ['environment', 'transmission']
df2 = search(df, search_word2)
df2 = sentences(df2, search_word2)
df2.to_csv(r'topics\topics2.csv')

In [22]:
search_word2 = ['temperature', 'humidity', 'transmission']
df3 = search(df, search_word2)
df3 = sentences(df3, search_word2)
df3.to_csv(r'topics\topics3.csv')

In [23]:
search_word2 = ['death', 'male']
df3 = search(df, search_word2)
df3 = sentences(df3, search_word2)
df3.to_csv(r'topics\topics3.csv')