In [12]:
import numpy as np
import pandas as pd
import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import urllib.request
import shutil
import os
import random
from copy import deepcopy
import pprint as pp

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader
import nltk.data
from nltk import sent_tokenize, pos_tag, wordpunct_tokenize
import en_core_web_lg
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
# https://github.com/buriy/python-readability
from readability.readability import Unparseable
from readability.readability import Document as Paper

# https://docs.python.org/3/library/time.html
import time

# https://beautiful-soup-4.readthedocs.io/en/latest/
import bs4

# https://docs.python.org/3/library/codecs.html
import codecs

# https://docs.python.org/3/library/json.html
import json

import os

from time import time

import re

import multiprocessing
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from time import time  # To time our operations

Load the 2020 and 2021 Corpora

In [6]:
# Load the spaCy pipeline used below for part-of-speech tagging of the
# corpus paragraphs; the parser and NER components are disabled.
nlp = en_core_web_lg.load( disable=['parser','ner'])

# NLTK English stop-word list.
# NOTE(review): not referenced anywhere else in the visible notebook.
stop_words = stopwords.words('english')

# Regular expression handed to the corpus reader as `cat_pattern` whenever
# the caller supplies no `cat_*` keyword; the capture group is the run of
# digits in front of `_html.json` in a file id.
# CAT_PATTERN = r'([0-9]+\.htm$)'
CAT_PATTERN =r'([\d]+)_html\.json'

# HTML tags whose text is extracted from each article; also the default
# value of the reader's `tags` argument.
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']
#TAGS = ['h1']


In [15]:
# Default file id(s) for JOURNALCorpusReader. The value is captured as the
# constructor default when the class statement executes, so changing DOC_ID
# afterwards has no effect until the class cell is run again.
DOC_ID = ['2021_html.json']

In [16]:

class JOURNALCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Corpus reader for CDC journal articles stored as JSON files of HTML.

    Every file id names a JSON file below ``root``. The entries obtained by
    iterating the decoded JSON are treated as HTML documents: they are
    reduced to the article body with readability, split into paragraphs
    with BeautifulSoup and tokenized with NLTK.

    NOTE(review): when the top level of a JSON file is an object, iterating
    it yields its keys, not its values -- confirm the on-disk layout.
    """

    def __init__(self, root, tags=TAGS, fileids=DOC_ID, encoding='utf8', **kwargs):
        """
        Parameters
        ----------
        root : str
            Directory that contains the corpus files.
        tags : list of str
            HTML tag names whose text is yielded by ``paras()``.
        fileids : list of str or str
            File ids (or a regular expression) selecting the corpus files.
            The default is bound when the class is defined, so pass this
            explicitly whenever a specific file is intended.
        encoding : str
            Encoding used to decode the files.
        **kwargs
            ``cat_pattern`` / ``cat_map`` / ``cat_file`` options for
            ``CategorizedCorpusReader``; ``cat_pattern`` falls back to
            ``CAT_PATTERN`` when no ``cat_*`` option is supplied.
        """
        # Fall back to the module-level pattern when the caller gave no
        # category specification at all.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # CategorizedCorpusReader expects the keyword dictionary itself
        # (not unpacked) and removes the cat_* entries it consumes.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        # Do NOT assign ``self.fileids`` here: that would replace the
        # inherited ``fileids()`` method with a list and break
        # ``resolve(categories=...)``.
        self.tags = tags

    def resolve(self, fileids, categories):
        """
        Turn a (fileids, categories) filter into a list of file ids.

        Returns the file ids belonging to ``categories`` when categories are
        given, otherwise ``fileids`` unchanged (possibly ``None``).

        Raises
        ------
        ValueError
            If both ``fileids`` and ``categories`` are specified.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)

        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Yield every entry of every selected JSON file, one at a time.

        All selected files are read, not just the first one. The entries
        are whatever iterating the decoded JSON produces.
        """
        fileids = self.resolve(fileids, categories)

        # abspaths() falls back to every file id of the corpus when
        # ``fileids`` is None and pairs each path with its encoding.
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with codecs.open(path, 'r', encoding=encoding) as f:
                data = json.load(f)
            # The file is closed before its entries are handed out.
            yield from data

    def html(self, fileids=None, categories=None):
        """
        Yield the readability summary (main article HTML) of each document.

        Documents readability cannot parse are reported on stdout and
        skipped.
        """
        for doc in self.docs(fileids, categories):
            try:
                # summary() extracts the article body from a full HTML page
                yield Paper(doc).summary()
            except Unparseable as e:
                print("Could not parse HTML: {}".format(e))
                continue

    def paras(self, fileids=None, categories=None):
        """Yield the text of every element whose tag is listed in ``self.tags``."""
        for html in self.html(fileids, categories):
            soup = bs4.BeautifulSoup(html, 'html.parser')
            # Use the instance's tag list so the ``tags`` constructor
            # argument is honoured instead of the module-level default.
            for element in soup.find_all(self.tags):
                yield element.text
            # release the parse tree once the document is exhausted
            soup.decompose()

    def sents(self, fileids=None, categories=None):
        """Yield each sentence of each paragraph."""
        for paragraph in self.paras(fileids, categories):
            for sentence in sent_tokenize(paragraph):
                yield sentence

    def words(self, fileids=None, categories=None):
        """Yield each word/punctuation token of each sentence."""
        for sentence in self.sents(fileids, categories):
            for token in wordpunct_tokenize(sentence):
                yield token

    def tokenize(self, fileids=None, categories=None):
        """
        Yield one list per paragraph; each item of that list is a sentence
        given as a list of (token, part-of-speech tag) pairs.
        """
        for paragraph in self.paras(fileids, categories):
            yield [
                pos_tag(wordpunct_tokenize(sent))
                for sent in sent_tokenize(paragraph)
            ]

    def describe(self, fileids=None, categories=None):
        """
        Perform a single pass over the selected part of the corpus and
        return a dictionary of summary metrics.

        Keys: ``files``, ``paras``, ``sents``, ``words``, ``awps`` (average
        words per sentence), ``vocab`` (distinct tokens), ``lexdiv``,
        ``ppdoc`` (paragraphs per file), ``sppar`` (sentences per paragraph)
        and ``secs`` (processing time). Ratios whose denominator is zero are
        reported as 0.0 instead of raising ZeroDivisionError.
        """
        started = time()

        def ratio(numerator, denominator):
            # an empty selection must not crash the report
            return float(numerator) / float(denominator) if denominator else 0.0

        # structures to perform counting
        counts = nltk.FreqDist()
        tokens = nltk.FreqDist()

        # One pass: every paragraph is split into sentences and every
        # sentence into tokens exactly the way sents() and words() do it,
        # so the HTML is parsed once rather than three times.
        for para in self.paras(fileids, categories):
            counts['paras'] += 1
            for sent in sent_tokenize(para):
                counts['sents'] += 1
                for word in wordpunct_tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # number of files covered by the filter (all files when unfiltered)
        n_fileids = len(self.resolve(fileids, categories) or self.fileids())

        return {
            # number of files
            'files': n_fileids,
            # number of paragraphs
            'paras': counts['paras'],
            # number of sentences
            'sents': counts['sents'],
            # number of words
            'words': counts['words'],
            # average number of words per sentence
            'awps': ratio(counts['words'], counts['sents']),
            # size of vocabulary, i.e. number of unique terms
            'vocab': len(tokens),
            # total words divided by vocabulary size (average number of
            # occurrences per distinct term)
            'lexdiv': ratio(counts['words'], len(tokens)),
            # average number of paragraphs per file
            'ppdoc': ratio(counts['paras'], n_fileids),
            # average number of sentences per paragraph
            'sppar': ratio(counts['sents'], counts['paras']),
            # total processing time
            'secs': time() - started,
        }


In [17]:
def w2v_to_numpy(model):
    """Extract the normalised embedding matrix and vocabulary maps of a model.

    Parameters:
    ===========
    model (gensim.Word2Vec): a trained gensim model

    Returns:
    ========
    embeddings (numpy.ndarray): copy of the model's normed word vectors,
                                one row per word
    idx, iidx (tuple): idx maps word -> row number,
                       iidx maps row number -> word
    """
    keyed_vectors = model.wv
    embeddings = deepcopy(keyed_vectors.get_normed_vectors())

    # Row i of the matrix belongs to the i-th entry of index_to_key.
    iidx = dict(enumerate(keyed_vectors.index_to_key))
    idx = {word: row for row, word in iidx.items()}
    return embeddings, (idx, iidx)

In [18]:
def near_neighbors (embs, query, word2rownum, rownum2word, k=5):
  """ Get the `k` nearest neighbors for a `query`

  Similarity is the dot product between the query's row and every row of
  `embs` (the cosine similarity when the rows have unit length).

  Parameters:
  ===========
  embs (numpy.ndarray): The embeddings.
  query (str): Word whose nearest neighbors are being found
  word2rownum (dict): Map word to row number in the embeddings array
  rownum2word (dict): Map rownum from embeddings array to word
  k (int, default=5): The number of nearest neighbors

  Returns:
  ========
  neighbors (list): list of near neighbors, most similar first;
                    at most k items, each a (word, similarity) pair.
                    The query word itself is never included.

  Raises:
  =======
  KeyError: if `query` is not a key of `word2rownum`.
  """
  query_row = word2rownum[query]
  sims = np.dot (embs, embs[query_row])

  # A stable sort keeps the ordering of tied similarities reproducible.
  ranked = np.argsort (-sims, kind='stable')

  # Remove the query by its row number instead of assuming that it ranks
  # first: with unnormalised or duplicated vectors another row can score
  # as high or higher than the query's own row.
  ranked = ranked[ranked != query_row]
  return [(rownum2word[index], sims[index]) for index in ranked[:k]]

In [19]:
def procrustes(A, B):
    """
    Solve the orthogonal Procrustes problem for aligning B to A.

    Returns the matrix R = U V^T taken from the singular value
    decomposition B^T A = U S V^T; right-multiplying B by R (``B.dot(R)``)
    rotates it onto A.
    https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem
    """
    cross_covariance = np.dot(B.T, A)
    left, _singular_values, right_t = np.linalg.svd(cross_covariance)
    return np.dot(left, right_t)

def intersect_vocab (idx1, idx2):
  """ Intersect the two vocabularies

  The shared words are listed in the order in which they appear in `idx1`,
  so the result is the same on every run (iterating a set of strings is
  not reproducible across interpreter sessions).

  Parameters:
  ===========
  idx1 (dict): the mapping for vocabulary in the first group
  idx2 (dict): the mapping for vocabulary in the second group

  Returns:
  ========
  common_vocab (list): the words present in both vocabularies
  common_idx, common_iidx (tuple): common_idx maps each shared word to its
                                   position in common_vocab and common_iidx
                                   maps the position back to the word
  """
  common_vocab = [v for v in idx1 if v in idx2]

  common_iidx = dict (enumerate (common_vocab))
  common_idx = {v:i for i,v in common_iidx.items()}
  return common_vocab, (common_idx, common_iidx)

def align_matrices(mat1, mat2, idx1, idx2):
    """Restrict two embedding matrices to their shared vocabulary and rotate
    the second one onto the first.

    Parameters:
    ===========
    mat1 (numpy.ndarray): embedding matrix for first group
    mat2 (numpy.ndarray): embedding matrix for second group
    idx1 (dict): word -> row number mapping for the first group
    idx2 (dict): word -> row number mapping for the second group

    Returns:
    ========
    remapped_mat1 (numpy.ndarray): rows of mat1 for the shared words
    rotated_mat2 (numpy.ndarray): rows of mat2 for the shared words,
                                  multiplied by the Procrustes rotation
    (common_idx, common_iidx) (tuple): word -> row and row -> word mappings
                                       valid for both returned matrices
    """
    shared_words, shared_maps = intersect_vocab(idx1, idx2)
    shared_idx, shared_iidx = shared_maps

    # Pull the shared words out of each matrix in the same order so that
    # row i describes the same word in both.
    base = mat1[[idx1[word] for word in shared_words], :]
    moving = mat2[[idx2[word] for word in shared_words], :]

    rotation = procrustes(base, moving)
    return base, np.dot(moving, rotation), (shared_idx, shared_iidx)

In [14]:
# Name the 2020 file explicitly. The reader's default `fileids` is whatever
# DOC_ID held when the class cell ran, so relying on it makes this cell load
# the 2021 file on a fresh Restart & Run All.
# NOTE(review): '2020_html.json' is inferred from the DOC_ID / CAT_PATTERN
# naming scheme -- confirm the file name on disk.
corpus2020 = JOURNALCorpusReader("C:/_harvester/data/html-by-year/", fileids=['2020_html.json'])

inline_text = []
for para in corpus2020.paras():
    # Keep only nouns, adjectives, verbs and adverbs, with punctuation
    # stripped from each token.
    texts_out = [
        re.sub(r'[^\w\s]*', '', token.text)
        for token in nlp(para)
        if token.pos_ in ('NOUN', 'ADJ', 'VERB', 'ADV')
    ]
    if texts_out:
        inline_text.append(texts_out)

# Drop tokens of three characters or fewer.
texts2020 = [[t for t in text if len(t) > 3] for text in inline_text]
#pp.pprint(inline_text[0:25])

In [20]:
# Name the 2021 file explicitly rather than relying on the reader's default
# `fileids`, which is fixed to the value DOC_ID had when the class cell ran.
corpus2021 = JOURNALCorpusReader("C:/_harvester/data/html-by-year/", fileids=['2021_html.json'])

inline_text = []
for para in corpus2021.paras():
    # Keep only nouns, adjectives, verbs and adverbs, with punctuation
    # stripped from each token.
    texts_out = [
        re.sub(r'[^\w\s]*', '', token.text)
        for token in nlp(para)
        if token.pos_ in ('NOUN', 'ADJ', 'VERB', 'ADV')
    ]
    if texts_out:
        inline_text.append(texts_out)

# Drop tokens of three characters or fewer.
texts2021 = [[t for t in text if len(t) > 3] for text in inline_text]
#pp.pprint(inline_text[0:25])

In [21]:
# Summary statistics (file, paragraph, sentence and word counts plus derived
# ratios and timing) for the 2020 corpus reader.
corpus2020.describe()

{'files': 1,
 'paras': 31342,
 'sents': 88607,
 'words': 2136357,
 'awps': 24.110476598914307,
 'vocab': 47838,
 'lexdiv': 44.65815878590242,
 'ppdoc': 31342.0,
 'sppar': 2.8271010146129796,
 'secs': 201.5404977798462}

In [22]:
# Summary statistics (file, paragraph, sentence and word counts plus derived
# ratios and timing) for the 2021 corpus reader.
corpus2021.describe()

{'files': 1,
 'paras': 24479,
 'sents': 70443,
 'words': 1802191,
 'awps': 25.583677583294296,
 'vocab': 44561,
 'lexdiv': 40.44323511590853,
 'ppdoc': 24479.0,
 'sppar': 2.8776910821520487,
 'secs': 150.24201607704163}

In [23]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

w2v_2020 = Word2Vec(min_count=20, # (int, optional) – Ignores all words with total frequency lower than this.
                     window=20, # Maximum distance between the current and predicted word within a sentence.
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=15,
                     # leave one core free, but never ask for zero workers
                     # (cores - 1 is 0 on a single-core machine)
                     workers=max(1, cores - 1))

In [29]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create Dictionary
id2word2020 = Dictionary(texts2020)

# Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
id2word2020.filter_extremes(no_below=20, no_above=0.5)

# Bag-of-words representation of the documents. It gets its own name so that
# `corpus2020` keeps referring to the corpus reader and the cells that call
# the reader can be re-run.
bow2020 = [id2word2020.doc2bow(text) for text in texts2020]

pp.pprint(bow2020[0:25])

# Word2Vec is trained on the token lists, not on the bag-of-words vectors.
w2v_2020.build_vocab(texts2020, progress_per=100)

w2v_2020.train(texts2020, total_examples=w2v_2020.corpus_count, epochs=30, report_delay=1)


[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(2, 1),
  (3, 2),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1)],
 [(3, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)],
 [(2, 1), (3, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)],
 [(8, 1), (24, 1), (25, 1), (26, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1)],
 [(2, 1),
  (3, 2),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1)],
 [(3, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)],
 [(2, 1), (3, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)],
 [(8, 1), (24, 1), (25, 1), (26, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1)],
 [(2, 1),
  (3, 2),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1)],
 [(3, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)],
 [(2, 1), (3, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)],
 [(8, 1), (24, 1), (25, 1), (26, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1)],
 [(2, 1),
  (3, 2),
  (

(10204058, 22402410)

In [30]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

w2v_2021 = Word2Vec(min_count=20, # (int, optional) – Ignores all words with total frequency lower than this.
                     window=20, # Maximum distance between the current and predicted word within a sentence.
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=15,
                     # leave one core free, but never ask for zero workers
                     # (cores - 1 is 0 on a single-core machine)
                     workers=max(1, cores - 1))

In [31]:
# Create Dictionary
id2word2021 = Dictionary(texts2021)

# Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
id2word2021.filter_extremes(no_below=20, no_above=0.5)

# Bag-of-words representation of the documents. It gets its own name so that
# `corpus2021` keeps referring to the corpus reader and the cells that call
# the reader can be re-run.
bow2021 = [id2word2021.doc2bow(text) for text in texts2021]

# Word2Vec is trained on the token lists, not on the bag-of-words vectors.
w2v_2021.build_vocab(texts2021, progress_per=100)

w2v_2021.train(texts2021, total_examples=w2v_2021.corpus_count, epochs=30, report_delay=1)


(7887693, 18430860)

In [33]:
# Create the output directory first so that saving does not fail on a
# machine where it does not exist yet.
os.makedirs("C:/_harvester/data/word2vec-models", exist_ok=True)

# Plain string literals: the paths contain no placeholders, so no f-string.
w2v_2020.save("C:/_harvester/data/word2vec-models/word2vec-2020.model")
w2v_2021.save("C:/_harvester/data/word2vec-models/word2vec-2021.model")

In [34]:
word_list = ['person', 'people', 'virus', 'cough', 'respiratory']

# For each probe word, print the words the 2020 model ranks as most similar,
# followed by a blank separator.
for probe in word_list:
    for neighbor, score in w2v_2020.wv.most_similar(positive=[probe]):
        pp.pprint(f"{neighbor}: {score}")
    print("\n")

'contact: 0.4746364951133728'
'compatible: 0.4046168327331543'
'close: 0.4037652909755707'
'exposure: 0.366028368473053'
'confirmed: 0.36508429050445557'
'case: 0.36319348216056824'
'transmission: 0.3622850477695465'
'contacts: 0.3617348372936249'
'absence: 0.345529705286026'
'source: 0.3453209698200226'


'older: 0.4922367334365845'
'more: 0.469707727432251'
'People: 0.46366068720817566'
'less: 0.4067952334880829'
'younger: 0.3945811688899994'
'poorer: 0.39318808913230896'
'adults: 0.3846714496612549'
'disadvantaged: 0.3812831938266754'
'Black: 0.3796895146369934'
'also: 0.35838523507118225'


'infection: 0.5655866861343384'
'infections: 0.5163565874099731'
'respiratory: 0.5100288987159729'
'transmission: 0.503086507320404'
'spread: 0.4813520312309265'
'widespread: 0.47528985142707825'
'coronavirus: 0.47013017535209656'
'influenza: 0.46524468064308167'
'causes: 0.4646841287612915'
'CoV2: 0.4604160189628601'


'symptoms: 0.566379725933075'
'signs: 0.53143709897995'
'fever: 0.5189049839

In [35]:
word_list = ['person', 'people', 'virus', 'cough', 'respiratory']

# For each probe word, print the words the 2021 model ranks as most similar,
# followed by a blank separator.
for probe in word_list:
    for neighbor, score in w2v_2021.wv.most_similar(positive=[probe]):
        pp.pprint(f"{neighbor}: {score}")
    print("\n")

'learning: 0.5907158255577087'
'school: 0.5628183484077454'
'Schools: 0.5621341466903687'
'instruction: 0.5525521039962769'
'students: 0.551885724067688'
'schools: 0.5310417413711548'
'extracurricular: 0.530912458896637'
'kindergarten: 0.5203924179077148'
'close: 0.5197715759277344'
'hybrid: 0.5129963755607605'


'African: 0.5883188843727112'
'Caribbean: 0.5746883749961853'
'immigrants: 0.5463264584541321'
'counterparts: 0.541668713092804'
'American: 0.49978429079055786'
'focuses: 0.479608416557312'
'diabetes: 0.46489831805229187'
'People: 0.4639800786972046'
'Additionally: 0.46270835399627686'
'often: 0.46045953035354614'


'causes: 0.7586001753807068'
'coronavirus: 0.6103140115737915'
'host: 0.5569154620170593'
'antibodies: 0.5551154613494873'
'circulation: 0.5453903079032898'
'infectious: 0.5447540283203125'
'respiratory: 0.5352216958999634'
'surface: 0.5305452346801758'
'CoV2: 0.5154407620429993'
'transmission: 0.5141725540161133'


'shortness: 0.894736111164093'
'breath: 0.8729056

In [36]:
# Convert each model's embeddings to a numpy array.
#   idx  : dictionary mapping word to row number
#   iidx : dictionary mapping row number to word
embs2020, (idx2020, iidx2020) = w2v_to_numpy(model=w2v_2020)
embs2021, (idx2021, iidx2021) = w2v_to_numpy(model=w2v_2021)

In [38]:
# Compare the ten nearest neighbors of the same word in both models. The
# headings name the corpora actually used (2020 and 2021).
query = 'virus'
print (f'Near neighbors for {query} in the 2020 corpus')
for item in near_neighbors (embs2020, query, idx2020, iidx2020, k=10):
  print (item)
print ()
print (f'Near neighbors for {query} in the 2021 corpus')
for item in near_neighbors (embs2021, query, idx2021, iidx2021, k=10):
  print (item)

Near neighbors for virus in the 2018 corpus
('infection', 0.56558675)
('infections', 0.51635665)
('respiratory', 0.51002896)
('transmission', 0.5030865)
('spread', 0.48135212)
('widespread', 0.4752899)
('coronavirus', 0.4701302)
('influenza', 0.4652447)
('causes', 0.4646842)
('CoV2', 0.46041608)

Near neighbors for virus in the 2019 corpus
('causes', 0.7586002)
('coronavirus', 0.61031395)
('host', 0.5569155)
('antibodies', 0.55511546)
('circulation', 0.54539025)
('infectious', 0.544754)
('respiratory', 0.53522164)
('surface', 0.53054523)
('CoV2', 0.51544076)
('transmission', 0.5141725)


In [39]:
# Restrict both embedding matrices to the shared vocabulary and rotate the
# 2021 rows onto the 2020 rows.
(
    _2020_aligned_embs,
    _2021_aligned_embs,
    (common_idx, common_iidx),
) = align_matrices(embs2020, embs2021, idx2020, idx2021)

In [40]:
# Five nearest neighbors of 'person' in the aligned 2020 embeddings.
near_neighbors(
    embs=_2020_aligned_embs,
    query='person',
    word2rownum=common_idx,
    rownum2word=common_iidx,
    k=5,
)

[('contact', 0.47463652),
 ('compatible', 0.4046168),
 ('close', 0.40376526),
 ('exposure', 0.36602837),
 ('confirmed', 0.36508432)]

In [41]:
# Five nearest neighbors of 'person' in the aligned 2021 embeddings.
near_neighbors(
    embs=_2021_aligned_embs,
    query='person',
    word2rownum=common_idx,
    rownum2word=common_iidx,
    k=5,
)

[('learning', 0.59071594),
 ('school', 0.5628184),
 ('Schools', 0.56213427),
 ('students', 0.55188584),
 ('schools', 0.5310418)]

In [42]:
journal_words = ['disease','person', 'child', 'adult', 'cough', 'respiratory', 'symptom']

# Pair every word with the dot product of its aligned 2020 and 2021 vectors.
journal_words = [
    (term, np.dot(_2020_aligned_embs[common_idx[term]], _2021_aligned_embs[common_idx[term]]))
    for term in journal_words
]

# Highest score first.
for term, similarity in sorted(journal_words, key=lambda pair: pair[1], reverse=True):
    print(term, similarity)

disease 0.6502698
symptom 0.6223781
respiratory 0.62005043
cough 0.5967292
child 0.42598912
adult 0.39087787
person 0.36146486
