# Load Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json

import matplotlib.pyplot as plt
plt.style.use('ggplot')

## File check

In [None]:
!ls /kaggle/input/CORD-19-research-challenge/

# Explore meta data

In [None]:
root_path = '/kaggle/input/CORD-19-research-challenge/'
metadata_path = f'{root_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
meta_df.head()

In [None]:
meta_df.shape

In [None]:
sum(meta_df.abstract.isna())

In [None]:
sum(meta_df.abstract.str.contains(r'hum[ae]n | m[ae]n |treatment')[meta_df.abstract.str.contains(r'hum[ae]n | m[ae]n |treatment').notnull()])

In [None]:
meta_df.apply(lambda x:sum(x.isna()))

### Check how many papers or articles (json files) are there?

In [None]:
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)
len(all_json)

### Extract main content from the json files 

In [None]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            # Abstract
            try:
                if content['abstract']:
                
                    for entry in content['abstract']:
                        self.abstract.append(entry['text'])
            except:
                self.abstract.append('NA')
                
            
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
first_row = FileReader(all_json[10000])
print(first_row)

##### Abtracts have breaks which need to be extracted too to get full text.

In [None]:
def get_breaks(content, length):
    data = ""
    words = content.split(' ')
    total_chars = 0

    # add break every length characters
    for i in range(len(words)):
        total_chars += len(words[i])
        if total_chars > length:
            data = data + "<br>" + words[i]
            total_chars = 0
        else:
            data = data + " " + words[i]
    return data

### Attempt to extract files and create a dataset(table)

In [None]:
#dict_ = {'paper_id': [], 'abstract': [], 'body_text': []}
#for idx, entry in enumerate(all_json):
#    if idx % (len(all_json) // 10) == 0:
#        print(f'Processing index: {idx} of {len(all_json)}')
#    content = FileReader(entry)
#    dict_['paper_id'].append(content.paper_id)
#    dict_['abstract'].append(content.abstract)
#    dict_['body_text'].append(content.body_text)
#df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text'])
#df_covid.head()

#### MOdified attempt to also include metadata information 

In [None]:
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
for idx, entry in enumerate(all_json):
    if idx % (len(all_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue
    
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    
    # also create a column for the summary of abstract to be used in a plot
    if len(content.abstract) == 'NA': 
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    elif len(content.abstract.split(' ')) > 100:
        # abstract provided is too long for plot, take first 300 words append with ...
        info = content.abstract.split(' ')[:100]
        summary = get_breaks(' '.join(info), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)
        
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    
    try:
        # if more than one author
        authors = meta_data['authors'].values[0].split(';')
        if len(authors) > 2:
            # more than 2 authors, may be problem when plotting, so take first 2 append with ...
            dict_['authors'].append(". ".join(authors[:2]) + "...")
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    except Exception as e:
        # if only one author - or Null valie
        dict_['authors'].append(meta_data['authors'].values[0])
    
    # add the title information, add breaks when needed
    try:
        title = get_breaks(meta_data['title'].values[0], 40)
        dict_['title'].append(title)
    # if title was not provided
    except Exception as e:
        dict_['title'].append(meta_data['title'].values[0])
    
    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])
    
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
df_covid.head()

In [None]:
df_covid.to_csv('initial_coviddata.csv',index=False,index_label='paper_id')

In [None]:
sum(df_covid.body_text.str.contains(r'hum[ae]n | m[ae]n |treatment')[df_covid.body_text.str.contains(r'hum[ae]n | m[ae]n |treatment').notnull()])

In [None]:
df_covid.shape

# Feature extraction



### CREATING TRIVIAL FEATURES:

* Word Count
* Character Count
* Average word length
* Number of stop words
* Number of special characters
* Number of numerics
* Number of uppercase words


#### WORD COUNTS

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json

import matplotlib.pyplot as plt
plt.style.use('ggplot')
df_covid=pd.read_csv('/kaggle/working/initial_coviddata.csv')

In [5]:
df_covid1=df_covid.copy()
df_covid1=df_covid1.set_index('paper_id')
df_covid1['word_count']=df_covid1['body_text'].apply(lambda x: len(str(x).split(" ")))
df_covid1[['body_text','word_count']].head()

Unnamed: 0_level_0,body_text,word_count
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3cdc48bb9e40afd30a59463b7872761a726998c8,Newcastle disease (ND) is an emerging disease ...,3649
d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated viruses have been developed an...,5845
748d4c57fe1acc8d9d97cf574f7dea5296f9386c,occurs primarily through a macropinocytosis-li...,5285
b891efc6e1419713b05ff7d89b26d260478c28df,The goal of the present study was to investiga...,788
76d2990a2663635e195b8a9818f9664872b6d3af,; T-cell leukemia virus type 1; and human T-ce...,6182


### Character count

In [6]:
df_covid1['char_count']=df_covid1['body_text'].str.len()
df_covid1[['body_text','char_count']].head()

Unnamed: 0_level_0,body_text,char_count
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3cdc48bb9e40afd30a59463b7872761a726998c8,Newcastle disease (ND) is an emerging disease ...,22199
d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated viruses have been developed an...,36115
748d4c57fe1acc8d9d97cf574f7dea5296f9386c,occurs primarily through a macropinocytosis-li...,34906
b891efc6e1419713b05ff7d89b26d260478c28df,The goal of the present study was to investiga...,5200
76d2990a2663635e195b8a9818f9664872b6d3af,; T-cell leukemia virus type 1; and human T-ce...,40835


### Avg word length

In [7]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

df_covid1['avg_word'] = df_covid1['body_text'].apply(lambda x: avg_word(x))
df_covid1[['body_text','avg_word']].head()

Unnamed: 0_level_0,body_text,avg_word
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3cdc48bb9e40afd30a59463b7872761a726998c8,Newcastle disease (ND) is an emerging disease ...,5.032065
d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated viruses have been developed an...,5.146894
748d4c57fe1acc8d9d97cf574f7dea5296f9386c,occurs primarily through a macropinocytosis-li...,5.563746
b891efc6e1419713b05ff7d89b26d260478c28df,The goal of the present study was to investiga...,5.573957
76d2990a2663635e195b8a9818f9664872b6d3af,; T-cell leukemia virus type 1; and human T-ce...,5.568281


### Number of Stopwords

In [8]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

df_covid1['stopwords'] = df_covid1['body_text'].apply(lambda x: len([x for x in x.split() if x in stop]))
df_covid1[['body_text','stopwords']].head()

Unnamed: 0_level_0,body_text,stopwords
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3cdc48bb9e40afd30a59463b7872761a726998c8,Newcastle disease (ND) is an emerging disease ...,1127
d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated viruses have been developed an...,1722
748d4c57fe1acc8d9d97cf574f7dea5296f9386c,occurs primarily through a macropinocytosis-li...,1603
b891efc6e1419713b05ff7d89b26d260478c28df,The goal of the present study was to investiga...,228
76d2990a2663635e195b8a9818f9664872b6d3af,; T-cell leukemia virus type 1; and human T-ce...,2018


### Special chars

In [9]:
import re
def strip_character(series):
    r = re.compile(r'[^a-zA-Z !@#$%&*_+-=|\:";<>,./()[\]{}\']')
    return len(re.findall(r,series))


df_covid1['special_chars'] = df_covid1['body_text'].apply(lambda x:strip_character(x))

In [10]:
df_covid1[['body_text','special_chars']].head()

Unnamed: 0_level_0,body_text,special_chars
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3cdc48bb9e40afd30a59463b7872761a726998c8,Newcastle disease (ND) is an emerging disease ...,247
d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated viruses have been developed an...,49
748d4c57fe1acc8d9d97cf574f7dea5296f9386c,occurs primarily through a macropinocytosis-li...,86
b891efc6e1419713b05ff7d89b26d260478c28df,The goal of the present study was to investiga...,7
76d2990a2663635e195b8a9818f9664872b6d3af,; T-cell leukemia virus type 1; and human T-ce...,66


### Number of uppercase words

In [11]:
df_covid1['upper'] = df_covid1['body_text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
df_covid1[['body_text','upper']].head()

Unnamed: 0_level_0,body_text,upper
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3cdc48bb9e40afd30a59463b7872761a726998c8,Newcastle disease (ND) is an emerging disease ...,171
d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated viruses have been developed an...,513
748d4c57fe1acc8d9d97cf574f7dea5296f9386c,occurs primarily through a macropinocytosis-li...,374
b891efc6e1419713b05ff7d89b26d260478c28df,The goal of the present study was to investiga...,78
76d2990a2663635e195b8a9818f9664872b6d3af,; T-cell leukemia virus type 1; and human T-ce...,350


### Number of numerics

In [12]:
df_covid1['numerics'] = df_covid1['body_text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df_covid1[['body_text','numerics']].head()

Unnamed: 0_level_0,body_text,numerics
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3cdc48bb9e40afd30a59463b7872761a726998c8,Newcastle disease (ND) is an emerging disease ...,143
d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated viruses have been developed an...,144
748d4c57fe1acc8d9d97cf574f7dea5296f9386c,occurs primarily through a macropinocytosis-li...,70
b891efc6e1419713b05ff7d89b26d260478c28df,The goal of the present study was to investiga...,18
76d2990a2663635e195b8a9818f9664872b6d3af,; T-cell leukemia virus type 1; and human T-ce...,82


## Cleaning text

In [13]:
from IPython.display import display
#make words lowercase
df_covid1['body_text'] = df_covid1['body_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
display(df_covid1['body_text'].head())

##remove punctuations
df_covid1['body_text'] = df_covid1['body_text'].str.replace('[^\w\s]',' ')
display(df_covid1['body_text'].head())

paper_id
3cdc48bb9e40afd30a59463b7872761a726998c8    newcastle disease (nd) is an emerging disease ...
d99acb4e99be7852aa61a688c9fbd38d44b5a252    live attenuated viruses have been developed an...
748d4c57fe1acc8d9d97cf574f7dea5296f9386c    occurs primarily through a macropinocytosis-li...
b891efc6e1419713b05ff7d89b26d260478c28df    the goal of the present study was to investiga...
76d2990a2663635e195b8a9818f9664872b6d3af    ; t-cell leukemia virus type 1; and human t-ce...
Name: body_text, dtype: object

paper_id
3cdc48bb9e40afd30a59463b7872761a726998c8    newcastle disease  nd  is an emerging disease ...
d99acb4e99be7852aa61a688c9fbd38d44b5a252    live attenuated viruses have been developed an...
748d4c57fe1acc8d9d97cf574f7dea5296f9386c    occurs primarily through a macropinocytosis li...
b891efc6e1419713b05ff7d89b26d260478c28df    the goal of the present study was to investiga...
76d2990a2663635e195b8a9818f9664872b6d3af      t cell leukemia virus type 1  and human t ce...
Name: body_text, dtype: object

In [14]:
# remove stopwords
df_covid1['body_text'] = df_covid1['body_text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df_covid1['body_text'].head()

paper_id
3cdc48bb9e40afd30a59463b7872761a726998c8    newcastle disease nd emerging disease affectin...
d99acb4e99be7852aa61a688c9fbd38d44b5a252    live attenuated viruses developed used help pr...
748d4c57fe1acc8d9d97cf574f7dea5296f9386c    occurs primarily macropinocytosis like process...
b891efc6e1419713b05ff7d89b26d260478c28df    goal present study investigate describe latent...
76d2990a2663635e195b8a9818f9664872b6d3af    cell leukemia virus type 1 human cell leukemia...
Name: body_text, dtype: object

In [15]:
#common words
freq = pd.Series(' '.join(df_covid1['body_text']).split()).value_counts()[:10]
freq

1            806691
2            649700
et           634849
al           608193
cells        557555
virus        477027
3            447208
5            410421
0            402643
infection    386266
dtype: int64

In [16]:
#rare words
rare = pd.Series(' '.join(df_covid1['body_text']).split()).value_counts()[-10:]
rare

perfectpro             1
plasmidspecific        1
infoweb                1
dermati                1
trp501                 1
coriobacteriacaee      1
riscos                 1
cgtaatggtgacagccgat    1
multidimens            1
standblocking          1
dtype: int64

In [None]:
#spelling correction
#from textblob import TextBlob
#df_covid1['body_text'][:5].apply(lambda x: str(TextBlob(x).correct()))

In [32]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk

### Tokenize

In [19]:
from textblob import TextBlob
TextBlob(df_covid1['body_text'][1]).words

WordList(['live', 'attenuated', 'viruses', 'developed', 'used', 'help', 'protect', 'humans', 'various', 'infectious', 'diseases', 'live', 'attenuated', 'vaccine', 'viruses', 'simulate', 'natural', 'infection', 'highly', 'immunogenic', 'effective', 'inducing', 'humoral', 'cellular', 'immune', 'responses', 'demonstrated', 'safety', 'effectiveness', 'infectious', 'diseases', 'different', 'live', 'attenuated', 'vaccine', 'viruses', 'explored', 'vectors', 'express', 'heterologous', 'antigens', '1', 'measles', 'vaccine', 'mv', 'strain', 'long', 'history', 'demonstrated', 'safety', 'humans', 'induces', 'long', 'lasting', 'immunity', 'including', 'antibody', 'cell', 'responses', '2', '3', '4', 'aerosol', 'administration', 'measles', 'vaccine', 'also', 'developed', 'shown', 'induce', 'similar', 'better', 'immunogenicity', 'compared', 'conventional', 'subcutaneous', 'intramuscular', 'vaccination', '5', '6', 'aerosol', 'administration', 'route', 'may', 'offer', 'advantages', 'helping', 'protect',

In [24]:
pd.Series(df_covid1['body_text'][1].split()).value_counts()

rsv              104
measles           73
gp350             72
f                 69
virus             61
                ... 
incorporation      1
possess            1
transfecting       1
every              1
glycoprotein       1
Length: 1148, dtype: int64

### Lemmatize

In [27]:
df_covid2=df_covid1.copy()

In [28]:
from textblob import Word
df_covid1['body_text'] = df_covid1['body_text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df_covid1['body_text'].head()

paper_id
3cdc48bb9e40afd30a59463b7872761a726998c8    newcastle disease nd emerging disease affectin...
d99acb4e99be7852aa61a688c9fbd38d44b5a252    live attenuated virus developed used help prot...
748d4c57fe1acc8d9d97cf574f7dea5296f9386c    occurs primarily macropinocytosis like process...
b891efc6e1419713b05ff7d89b26d260478c28df    goal present study investigate describe latent...
76d2990a2663635e195b8a9818f9664872b6d3af    cell leukemia virus type 1 human cell leukemia...
Name: body_text, dtype: object

In [29]:
df_covid1.to_csv('covid_cleaned.csv',index=False,index_label='paper_id')

### Lemmatize and preprocess

In [36]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [35]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [44]:
from nltk.tokenize import word_tokenize
tokenized_series=df_covid1['body_text'].apply(word_tokenize)

In [45]:
tokenized_series[:5]

paper_id
3cdc48bb9e40afd30a59463b7872761a726998c8    [newcastle, disease, nd, emerging, disease, af...
d99acb4e99be7852aa61a688c9fbd38d44b5a252    [live, attenuated, virus, developed, used, hel...
748d4c57fe1acc8d9d97cf574f7dea5296f9386c    [occurs, primarily, macropinocytosis, like, pr...
b891efc6e1419713b05ff7d89b26d260478c28df    [goal, present, study, investigate, describe, ...
76d2990a2663635e195b8a9818f9664872b6d3af    [cell, leukemia, virus, type, 1, human, cell, ...
Name: body_text, dtype: object

In [46]:
dictionary = gensim.corpora.Dictionary(tokenized_series)

In [49]:
count=0
for k,v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
    

0 0
1 000
2 0187
3 03
4 1
5 10
6 100
7 12
8 15
9 1500
10 18


In [54]:
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_series]
bow_corpus[4310]

[(0, 5),
 (4, 23),
 (5, 6),
 (6, 3),
 (7, 7),
 (8, 1),
 (10, 1),
 (26, 6),
 (27, 2),
 (32, 1),
 (34, 1),
 (38, 3),
 (39, 4),
 (45, 4),
 (50, 9),
 (51, 4),
 (52, 1),
 (53, 4),
 (55, 1),
 (57, 2),
 (59, 3),
 (62, 1),
 (69, 2),
 (76, 1),
 (79, 1),
 (85, 1),
 (87, 6),
 (89, 1),
 (92, 16),
 (94, 2),
 (98, 1),
 (107, 1),
 (122, 5),
 (130, 16),
 (138, 2),
 (141, 4),
 (145, 1),
 (146, 1),
 (149, 14),
 (151, 1),
 (158, 1),
 (160, 1),
 (164, 1),
 (170, 1),
 (173, 1),
 (179, 1),
 (182, 3),
 (190, 1),
 (201, 1),
 (202, 1),
 (217, 2),
 (225, 1),
 (231, 1),
 (235, 4),
 (241, 2),
 (248, 2),
 (265, 2),
 (286, 1),
 (304, 1),
 (315, 1),
 (328, 4),
 (333, 1),
 (336, 2),
 (337, 2),
 (354, 1),
 (363, 1),
 (365, 9),
 (379, 8),
 (381, 1),
 (388, 2),
 (392, 4),
 (406, 1),
 (408, 3),
 (412, 2),
 (414, 1),
 (416, 1),
 (419, 2),
 (420, 1),
 (426, 1),
 (432, 1),
 (438, 1),
 (458, 3),
 (461, 1),
 (465, 1),
 (480, 2),
 (481, 1),
 (488, 3),
 (490, 1),
 (502, 6),
 (507, 3),
 (513, 1),
 (516, 1),
 (521, 1),
 (544, 3),

In [55]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.006188778148051378),
 (1, 0.0020611942490422986),
 (2, 0.009392503282463372),
 (3, 0.00255033475097569),
 (4, 0.00031161903312566786),
 (5, 0.002576415688146359),
 (6, 0.004259439228441555),
 (7, 0.0006302486784868583),
 (8, 0.00023920300936749488),
 (9, 0.00379695289824089),
 (10, 0.0003817257250037918),
 (11, 0.005212585220879278),
 (12, 0.005135991832696033),
 (13, 0.003549476851873537),
 (14, 0.0073377128148703445),
 (15, 0.0036329671524821475),
 (16, 0.006735437165102545),
 (17, 0.009609669915874364),
 (18, 0.002918801141687375),
 (19, 0.0026830254770643627),
 (20, 0.002585587359848431),
 (21, 0.0023596737769333983),
 (22, 0.014736619719482057),
 (23, 0.004695354726163395),
 (24, 0.00412294607242761),
 (25, 0.0019472764083417894),
 (26, 0.0004257928442795155),
 (27, 0.0002215189594336216),
 (28, 0.0016269080793182961),
 (29, 0.0065794719899783345),
 (30, 0.001604502541556863),
 (31, 0.006764072170609073),
 (32, 0.00136892646822458),
 (33, 0.009917285410387356),
 (34, 0.0051

### LDA using BoW model

In [None]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)