In [None]:
"""
    script: JournalLDA.ipynb
    @author alan hamm(pqn7)

    resources:
        Applied Text Analysis with Python by Benjamin Bengfort, Rebecca Bilbro, 
        and Tony Ojeda(O'Reilly). 978-1-491-96304-3.

        https://radimrehurek.com/gensim/auto_examples/howtos/run_compare_lda.html

"""

In [1]:
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader
import nltk.data
from nltk import sent_tokenize, pos_tag, wordpunct_tokenize
import en_core_web_lg
import gensim
from gensim.models import ldamulticore
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
# https://github.com/buriy/python-readability
from readability.readability import Unparseable
from readability.readability import Document as Paper

# https://docs.python.org/3/library/time.html
import time

# https://beautiful-soup-4.readthedocs.io/en/latest/
import bs4

# https://docs.python.org/3/library/codecs.html
import codecs

# https://docs.python.org/3/library/json.html
import json

import re 

import os

import pprint as pp

import multiprocessing
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from time import time  # To time our operations

from sklearn.manifold import TSNE
from nltk.corpus import stopwords

from gensim.models import Word2Vec
from matplotlib import pyplot as plt

import numpy as np

from tqdm import tqdm

import pandas as pd
#import modin.pandas as pd


import torch
from tqdm import tqdm
import csv

In [2]:
#%%
# ---------------------------------------------------------------------------
# Notebook configuration: everything a reader might tune lives in this cell.
# ---------------------------------------------------------------------------

# Regex handed to the corpus reader to select which files belong to the
# corpus (file names ending in "<digits>_html.json").
# An earlier approach built the list by scanning the directory for one year.
DOC_ID = r'.*([\d]+_html\.json)'

# Regex whose first capture group is used as the category of a file.
CAT_PATTERN = r'^(.*?)[\W]*?\d{,4}?_html\.json'

# HTML tags whose text is extracted from each article.
# 'li' is deliberately left out, e.g.
# <li>The Centers for Disease Control and Prevention (CDC) cannot attest to
# the accuracy of a non-federal website.</li>
TAGS = ['p']

# Encoding passed to the CorpusReader class.
ENCODING = 'utf8'

# Directory paths.
JSON_OUT = "C:/_harvester/data/json-outputs/"

# Range of topic counts to search: minimum, maximum and step size.
MIN_TOPICS = 100
MAX_TOPICS = 505
STEP_BY = 2

# When True the lemma of each token is kept; when False the surface text is.
LEMMATIZATION = True

# Extra stop words collected from observed output. Order and duplicates are
# kept exactly as they were entered.
# 'policie' is listed because spaCy lemmatization can make errors with
# pluralization (e.g. "rabie" for "rabies").
_OBSERVED_STOP_WORDS = (
    'icon', 'website', 'mmwr', 'citation', 'author', 'report', 'formatting', 'format', 'regarding',
    'system', 'datum', 'link', 'linking', 'federal', 'data', 'tract', 'census', 'study', 'question',
    'conduct', 'report', 'including', 'top', 'summary', 'however', 'name', 'known', 'figure', 'return',
    'page', 'view', 'affiliation', 'pdf', 'law', 'version', 'list', 'endorsement', 'review',
    'article', 'download', 'reference', 'publication', 'discussion', 'table', 'vol', 'message',
    'information', 'web', 'notification', 'policy', 'policie',
    'acknowledgment', 'altmetric',
    'abbreviation', 'figure', 'service', 'imply', 'current', 'source',
    'trade', 'address', 'addresses', 'program', 'organization', 'provided', 'copyrighted', 'copyright',
    'already', 'topic', 'art', 'e.g', 'eg',
)

# Final stop-word list: NLTK's English list followed by the observed extras.
stop_words = [*stopwords.words('english'), *_OBSERVED_STOP_WORDS]

# Pretrained spaCy model used for POS tagging/filtering; the parser and NER
# components are disabled.
nlp = en_core_web_lg.load(disable=['parser', 'ner'])

In [3]:
import codecs
import json
import bs4
import re
import nltk
from time import time

class JOURNALCorpusReader(CategorizedCorpusReader, CorpusReader):
    """A corpus reader for CDC Journal articles.

    Each corpus file is read with ``json.load``; every item of the loaded
    object is handed to readability's ``Document`` (imported as ``Paper``)
    and the resulting summary HTML is split into paragraphs, sentences and
    word tokens.
    """

    def __init__(self, root, tags=TAGS, fileids=DOC_ID, encoding=ENCODING, **kwargs):
        """Initialise both parent readers.

        Args:
            root: directory that holds the corpus files.
            tags: HTML tag name(s) whose text is extracted by ``paras``.
            fileids: regex (or list) selecting the files of the corpus.
            encoding: text encoding of the corpus files.
            **kwargs: forwarded to ``CategorizedCorpusReader``; if no
                ``cat_*`` argument is supplied, ``cat_pattern`` defaults to
                ``CAT_PATTERN``.
        """
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        self.tags = tags

    def resolve(self, fileids=None, categories=None):
        """Return the file ids for ``categories`` if given, else ``fileids``."""
        if categories is not None:
            return self.fileids(categories)

        return fileids

    def _docs_with_ids(self, fileids=None, categories=None):
        """Yield ``(fileid, parsed_json)`` for every selected corpus file.

        Keeping the file id next to the document lets ``html`` report the
        file actually being read, even when only a subset is requested.
        """
        selected = self.resolve(fileids, categories)
        # abspaths(None) falls back to every file of the corpus.
        located = self.abspaths(selected, include_encoding=True, include_fileid=True)
        for path, encoding, fileid in located:
            with codecs.open(path, 'r', encoding=encoding) as f:
                yield fileid, json.load(f)

    def docs(self, fileids=None, categories=None):
        """Yield the parsed JSON content of each selected file.

        The ``fileids`` / ``categories`` arguments are honoured; with both
        left as ``None`` every corpus file is read.
        """
        for _, doc in self._docs_with_ids(fileids, categories):
            yield doc

    def html(self, fileids=None, categories=None):
        """Yield readability's summary HTML for each item of each document.

        Items that readability cannot parse are reported and skipped.
        """
        for fileid, doc in self._docs_with_ids(fileids, categories):
            pp.pprint(f"The file {fileid} is being processed in HTML()")
            for sentence in doc:
                try:
                    yield Paper(sentence).summary()
                except Unparseable as e:
                    print("Could not parse HTML: {}".format(e))
                    print(f"the fileid {fileid}")
                    pp.pprint(sentence)
                    print("\n")
                    continue

    # Not referenced by any method of this class; retained so that external
    # code reading the attribute keeps working.
    para_dict = dict()
    def paras(self, fileids=None, categories=None):
        """Yield the text of each configured tag that contains a letter."""
        for html in self.html(fileids, categories):
            soup = bs4.BeautifulSoup(html, 'html.parser')
            # Use the tags given to the constructor rather than the global.
            for element in soup.find_all(self.tags):
                if re.search(r'[a-zA-Z]+', element.text):
                    yield element.text
            soup.decompose()

    # Not referenced by any method of this class; retained for compatibility.
    sent_dict = dict()
    def sents(self, fileids=None, categories=None):
        """Yield each sentence of each paragraph (``sent_tokenize``)."""
        for paragraph in self.paras(fileids, categories):
            for sentence in sent_tokenize(paragraph):
                yield sentence

    # Not referenced by any method of this class; retained for compatibility.
    word_dict = dict()
    def words(self, fileids=None, categories=None):
        """Yield each token of each sentence (``wordpunct_tokenize``)."""
        for sentence in self.sents(fileids, categories):
            for token in wordpunct_tokenize(sentence):
                yield token

    def generate(self, fileids=None, categories=None):
        """Collect all selected paragraphs into a dict.

        Returns:
            dict mapping a running integer index (starting at 0) to the
            paragraph text, in the order produced by ``paras``.
        """
        # English Articles All Series:
        #   2010:142838it, 2011:160805it, 2012:123248it, 2013:137446it, 2014:121235it, 2015:151360it,
        #   2016:230592it, 2017:    , 2018:     , 2019: 12642it
        para_dict = dict()
        for idx, para in tqdm(enumerate(self.paras(fileids, categories))):
            para_dict[idx] = para

        return para_dict

In [4]:
# Build the reader over the 2010s HTML-by-year directory; tags, fileids and
# encoding fall back to the TAGS / DOC_ID / ENCODING defaults.
_corpus = JOURNALCorpusReader(root='/_harvester/data/html-by-year/10s')
# Last expression of the cell: show the file ids matched by the reader.
_corpus.fileids()

['2010_html.json',
 '2011_html.json',
 '2012_html.json',
 '2013_html.json',
 '2014_html.json',
 '2015_html.json',
 '2016_html.json',
 '2017_html.json',
 '2018_html.json',
 '2019_html.json']

In [5]:
# generate() walks every paragraph of the corpus and returns a dict mapping a
# running integer index to the paragraph text (a dict, despite the name).
corpus_tuple = _corpus.generate()

360it [00:00, 801.30it/s]

'The file 2010_html.json is being processed in HTML()'


926232it [03:09, 3906.86it/s]

'The file 2011_html.json is being processed in HTML()'


1911781it [07:20, 3673.12it/s]

'The file 2012_html.json is being processed in HTML()'


2747487it [10:12, 2951.23it/s]

'The file 2013_html.json is being processed in HTML()'


3638418it [13:13, 2533.61it/s]

'The file 2014_html.json is being processed in HTML()'


4355765it [16:05, 941.39it/s] 

'The file 2015_html.json is being processed in HTML()'


5291581it [19:28, 425.89it/s] 

'The file 2016_html.json is being processed in HTML()'


6953824it [25:11, 4140.06it/s]

'The file 2017_html.json is being processed in HTML()'


8592480it [31:13, 2965.02it/s]

'The file 2018_html.json is being processed in HTML()'


8670894it [31:53, 2523.77it/s]

'The file 2019_html.json is being processed in HTML()'


8686550it [32:22, 4472.56it/s]


In [None]:
#for idx, paras in corpus_tuple.items():
#    pp.pprint(paras)

In [6]:
from time import time
import spacy

# One list of kept tokens per paragraph; paragraphs with no kept tokens are
# dropped.
texts_out = []

# number of stopwords found
stopword_count = nltk.FreqDist()

# Set membership is constant-time; testing against the stop_words list would
# rescan the whole list twice for every token.
stop_word_set = frozenset(stop_words)

# Only content-bearing parts of speech are kept.
KEPT_POS = frozenset(['NOUN', 'ADJ', 'VERB', 'ADV'])

pp.pprint("Executing POS/LEMMATIZATION")

t = time()
# nlp.pipe streams the paragraphs through spaCy in batches, which is the
# documented way to process many texts; results arrive in input order.
for doc in tqdm(nlp.pipe(corpus_tuple.values()), total=len(corpus_tuple)):
    inner_text = []

    for token in doc:
        if token.pos_ not in KEPT_POS or len(token.text) <= 1:
            continue

        # Lemma or surface form, depending on the LEMMATIZATION switch.
        term = token.lemma_ if LEMMATIZATION else token.text

        # A token is a stop word if either its text or its lemma is listed.
        if token.text.lower() in stop_word_set or token.lemma_.lower() in stop_word_set:
            stopword_count[term] += 1
        else:
            inner_text.append(term)

    if inner_text:
        texts_out.append(inner_text)

#pp.pprint(texts_out)
pp.pprint('Time to finish spaCy filter: {} mins'.format(round((time() - t) / 60, 2)))

NameError: name 'year' is not defined

In [None]:
import json

# Persist the filtered tokens before bigrams are appended to them.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened explicitly as UTF-8; the platform default encoding may not be able
# to represent them and would raise UnicodeEncodeError.
filename = 'C:/_harvester/data/tokenized-sentences/10s/tokenized_sents-wo-bigrams.json'
with open(filename, 'w', encoding='utf-8') as jsonfile:
    json.dump(texts_out, jsonfile, ensure_ascii=False)

In [None]:
# Compute bigrams.
from gensim.models import Phrases

# How many of the most frequent candidates to show; printing the whole
# vocabulary would flood the notebook output.
BIGRAM_PREVIEW = 50

# Train the phrase model; only pairs that appear 20 times or more qualify.
bigram = Phrases(texts_out, min_count=20)

# freqDist object for bigrams: keep the count recorded in the phrase model's
# vocabulary for every key that contains the '_' delimiter.
# NOTE(review): assumes vocab keys are str (gensim 4.x) - confirm version.
bigram_freq = nltk.FreqDist()
for ngram, count in bigram.vocab.items():
    if '_' in ngram:
        bigram_freq[ngram] = count

# Preview of the most frequent delimiter-joined candidates.
pp.pprint(bigram_freq.most_common(BIGRAM_PREVIEW))

# add bigrams to texts_out to be included in corpus
# NOTE: this extends texts_out in place, so re-running the cell appends the
# bigrams a second time.
for doc_tokens in texts_out:
    # Materialise the phrased document first so the list is not read while
    # it is being extended.
    phrased = bigram[doc_tokens]
    doc_tokens.extend([token for token in phrased if '_' in token])

In [None]:
#pp.pprint(texts_out)

In [None]:
#tokenized_sents = pd.DataFrame(texts_out)
#tokenized_sents.to_parquet(r"C:\_harvester\data\lda-models\2010s_html.json\tokenized_sents-w-bigrams.parquet")
#pp.pprint(texts_out)
# Persist the token lists after bigrams have been appended.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened explicitly as UTF-8 rather than with the platform default encoding.
# The variable name is kept as spelled in case other cells refer to it.
fliename2 = "C:/_harvester/data/tokenized-sentences/10s/tokenized_sents-w-bigrams.json"
with open(fliename2, 'w', encoding='utf-8') as jsonfile:
    json.dump(texts_out, jsonfile, ensure_ascii=False)