# Exploring _Chronicling America_

Enter a search below and wait. Older newspapers tend to take more time to process: they have fewer pictures and therefore more text. (They also don't use modern spelling, so the spellchecker spends more time on them.)

In [1]:
from collections import Counter, namedtuple
from difflib import HtmlDiff
import io
from math import ceil
from operator import attrgetter, itemgetter
from pathlib import Path
import re
from tempfile import NamedTemporaryFile
from time import sleep
from urllib.parse import quote, urlencode
from xml.etree import ElementTree as ET

import folium
from folium.plugins import MarkerCluster
import ipywidgets as widgets
import nltk
from nltk.corpus import brown, stopwords
from nltk.stem.snowball import SnowballStemmer
from PIL import Image
from pydash import py_
from requests import get, post
import spacy

import bingsc_settings as sc

from IPython.display import display, HTML

In [2]:
# Bing Maps API key, read from a local file kept outside the notebook.
# read_text() opens and closes the file itself, so no file handle is left open.
BMAPSKEY = Path('./bingMapsKey').read_text().strip()

# spaCy pipeline used for named-entity recognition.
nlp = spacy.load("en_core_web_sm")

# English stemmer (used to merge variants of the same entity) and the
# English stopword list (used when counting words).
stemmer = SnowballStemmer(language='english')
sws = stopwords.words('english')

In [3]:
# Reference statistics from the Brown Corpus:
#   brownfreqs - occurrences of each lower-cased word
#   brownlen   - total number of tokens in the corpus
#   brownprobs - relative frequency of each lower-cased word
brownfreqs = Counter(map(str.lower, brown.words()))
brownlen = len(brown.words())
brownprobs = dict(
    (word, occurrences / brownlen)
    for word, occurrences in brownfreqs.items()
)

In [4]:
# A named entity found in a page: surface text, lemma, stem, character
# offset of its first appearance, entity label, and occurrence count.
Entity = namedtuple(
    "Entity",
    "text lemma stem first_appearance kind count",
)

# Wikidata details for an entity.
EntityData = namedtuple(
    'EntityData',
    'name description url lat long',
)

# A newspaper with its place of publication and geocoded coordinates.
Paper = namedtuple("Paper", "title place coords")

In [5]:
# Blue-bordered output area for status messages.
statusWidget = widgets.Output(layout=dict(border='2px solid blue'))

# Progress bar for the per-page processing loop; doAnalysis sets `max`
# to the number of search results before it starts counting.
progressbar = widgets.IntProgress(
    description='Loading:',
    orientation='horizontal',
    bar_style='success', # 'success', 'info', 'warning', 'danger' or ''
    min=0,
    max=1,
    step=1,
    value=0,
)

# Output areas for the results grid; each `grid_area` names the cell of
# the GridBox template the box is placed in.
urlbox = widgets.Output(
    layout=dict(border='1px solid black', grid_area='header'))
entitiesbox = widgets.Output(
    layout=dict(border='1px solid black', grid_area='entities'))
nlpbox = widgets.Output(
    layout=dict(border='1px solid black', grid_area='nlp'))
mapbox = widgets.Output(
    layout=dict(border='1px solid black', grid_area='map'))

In [6]:
def geocodecity(citystate):
    """
    Look up a US city with the Bing Maps Locations API.

    `citystate` must be a string of the form "City, State" containing
    exactly one comma.

    Returns whatever the JSON response holds at
    `resourceSets.0.resources.0.point.coordinates` (None when the response
    has no such entry), or False when `citystate` is not in the expected
    form.
    """
    try:
        (city, state) = citystate.split(',')
    except (AttributeError, TypeError, ValueError):
        # Not a string, or not exactly one comma: nothing to geocode.
        return False

    # Let requests build the query string so that spaces and punctuation in
    # the names are percent-encoded, and use HTTPS so the API key is not
    # sent in the clear.
    url = "https://dev.virtualearth.net/REST/v1/Locations"
    params = {
        'countryRegion': 'US',
        'adminDistrict': state.strip(),
        'locality': city.strip(),
        'includeNeighborhood': 'true',
        'maxResults': 1,
        'key': BMAPSKEY,
    }

    return py_.get(get(url, params=params).json(),
                   'resourceSets.0.resources.0.point.coordinates')

In [7]:
SC_MAX_LENGTH = 10000

def bing_spellcheck(text):
    """
    Send `text` to the Bing Spell Check API, breaking it into chunks of
    whole sentences if necessary to stay below the maximum input length
    of SC_MAX_LENGTH characters per request.

    Before sending, every run of characters other than ASCII letters,
    digits, spaces and . ? ! , ' is replaced by a single space.

    Returns the corrected text as one string ('' for empty input). Text the
    service does not flag, or flags without a suggestion, is passed through
    unchanged.
    """
    if not text:
        return ''

    #Many (most? all?) of these documents seem to contain
    #some characters that break the spell checker.
    #This regex replaces anything but letters, numbers,
    #and common punctuation with spaces.
    text = re.sub(r'[^a-zA-Z0-9\.\?!,\' ]+', ' ', text)
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return ''

    #Sentences per chunk if the text were split evenly over the smallest
    #number of requests that could hold it.
    bit_length = ceil(len(sentences) / ceil(len(text) / SC_MAX_LENGTH))

    scparams = {
        'mkt':'en-us',
        'mode':'proof'
    }
    scheaders = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Ocp-Apim-Subscription-Key': sc.KEY1,
    }

    corrected = []
    while sentences:
        #Start from the even share and shrink it a tenth at a time until
        #the chunk fits. Always take at least one sentence so that the
        #outer loop is guaranteed to make progress.
        for tenths in range(10, 0, -1):
            scount = max(1, ceil(bit_length * tenths / 10))
            bit = ' '.join(sentences[:scount])
            if len(bit) <= SC_MAX_LENGTH:
                break
        else:
            #Even the smallest share is too long: fall back to a single
            #sentence. If that sentence alone exceeds the limit it is
            #still sent as is.
            scount = 1
            bit = sentences[0]

        del sentences[:scount]

        rsp = post(sc.ENDPOINT, headers=scheaders, params=scparams,
                   data={'text': bit})
        scdata = rsp.json()

        #Rebuild the chunk from left to right: untouched text up to each
        #flagged token, then the first suggestion in its place. Offsets
        #refer to the text that was sent, so no running correction for
        #length changes is needed.
        pieces = []
        cursor = 0
        flagged = sorted(scdata.get('flaggedTokens', []),
                         key=lambda repl: repl['offset'])
        for repl in flagged:
            suggestions = repl.get('suggestions') or []
            start = repl['offset']
            if not suggestions or start < cursor:
                #Nothing to substitute, or this flag overlaps one that
                #was already replaced.
                continue
            token = repl['token']
            sug = suggestions[0].get('suggestion', token)

            pieces.append(bit[cursor:start])
            pieces.append(sug)
            cursor = start + len(token)
        pieces.append(bit[cursor:])

        corrected.append(''.join(pieces))

    #Sentences inside a chunk are joined with spaces; join the chunks the
    #same way so words at chunk boundaries do not run together.
    return ' '.join(corrected)

In [8]:
def searchCA(year1:int, year2:int, searchterm:str, count:int):
    """
    Search Chronicling America for English-language pages.

    year1, year2 -- bounds of the year range, in either order.
    searchterm   -- text to search for (sent as the `proxtext` parameter).
    count        -- number of results to request (sent as `rows`).

    Shows a link to the human-readable results page in `urlbox` and
    returns the `items` list from the JSON version of the same search.
    """
    #URL template for searching Chronicling America.
    searchurl = 'https://chroniclingamerica.loc.gov/search/pages/results?'

    urlbox.clear_output()

    #Accept the years in either order.
    startyear, endyear = sorted((year1, year2))

    searchterms = {
        'searchType': 'basic',
        'dateFilterType': 'yearRange',
        'language': 'eng',
        #urlencode() percent-encodes the value itself; quoting it here as
        #well would double-encode it (a space would arrive as "%20").
        'proxtext': searchterm,
        'date1': startyear,
        'date2': endyear,
        'rows': count,
    }

    url = searchurl + urlencode(searchterms)
    with urlbox:
        display(HTML(f'<a href="{url}" target="_blank">{url}</a>'))

    #Same query again, this time asking for JSON.
    jsonurl = searchurl + urlencode({**searchterms, 'format': 'json'})
    papersearch = get(jsonurl).json()
    return papersearch['items']

In [9]:
def getEntities(page_data:dict):
    """
    Spell-check one page's OCR text and extract its named entities.

    page_data -- a search result item; its 'ocr_eng' entry is the text used.

    Returns a tuple (sctext, entities): the spell-checked text and a list
    of Entity tuples in order of first appearance. Entities whose stems
    match case-insensitively are merged into the first one found, whose
    `count` is the number of occurrences. Entities labelled DATE, MONEY,
    CARDINAL or ORDINAL, and those of three characters or fewer, are
    left out.
    """
    text = page_data['ocr_eng']
    if not text:
        #Nothing to analyse; also avoids sending empty text to the
        #spell checker.
        return ('', [])

    sctext = bing_spellcheck(text)
    scytext = nlp(sctext)

    skipped_labels = ('DATE', 'MONEY', 'CARDINAL', 'ORDINAL')

    #One pass, keyed by lower-cased stem. Dicts keep insertion order, so
    #the result is in order of first appearance.
    merged = {}
    for ent in scytext.ents:
        if ent.label_ in skipped_labels or len(ent.text) <= 3:
            continue
        stem = stemmer.stem(ent.text)
        key = stem.lower()
        if key in merged:
            first = merged[key]
            merged[key] = first._replace(count=first.count + 1)
        else:
            merged[key] = Entity(ent.text, ent.lemma_, stem,
                                 ent.start_char, ent.label_, 1)

    return (sctext, list(merged.values()))

In [10]:
def getWikidata(entities:list, limit:int=100):
    """
    Look up entities on Wikidata by English Wikipedia title.

    entities -- sequence of Entity tuples (only the first field, the
                entity text, is used as the title).
    limit    -- only the first `limit` entities are looked up.

    Titles are sent in batches of 50. Returns a list of EntityData tuples,
    one for each returned entity whose key does not start with '-'.

    NOTE: the longitude is stored in the `lat` field and the latitude in
    the `long` field (False when the entity has no P625 claim). createMap
    reads them back as [ent.long, ent.lat], so the two swaps cancel out.
    Change both together or not at all.
    """
    batch_size = 50
    ents = entities[:limit]
    wikigeturl = "https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&languages=en&format=json&"

    entdata = {}
    for first in range(0, len(ents), batch_size):
        srch = '|'.join(e[0] for e in ents[first:first + batch_size])
        results = get(wikigeturl + urlencode({'titles': srch})).json()
        #A response without an 'entities' entry contributes nothing
        #instead of aborting the whole lookup.
        entdata.update({k: v for k, v in results.get('entities', {}).items()
                        if not k.startswith('-')})

    return [
        EntityData(py_.get(ent, 'labels.en.value'),
                   py_.get(ent, 'descriptions.en.value'),
                   f'https://www.wikidata.org/wiki/{idx}',
                   py_.get(ent, 'claims.P625.0.mainsnak.datavalue.value.longitude', False),
                   py_.get(ent, 'claims.P625.0.mainsnak.datavalue.value.latitude', False))
        for idx, ent in entdata.items()
    ]

In [11]:
def createMap(paper_data, entity_data):
    """
    Build a folium map with two marker clusters: gray markers for the
    newspapers in `paper_data` and blue markers for the entities in
    `entity_data` that have both coordinates set.

    Newspapers are placed at [coords[0], coords[1]]. Entities are placed
    at [ent.long, ent.lat]; getWikidata fills those two fields the other
    way round, so this order matches what it stored.

    Returns the folium Map.
    """
    fmap = folium.Map()

    # Newspapers.
    papers_layer = MarkerCluster()
    papers_layer.add_to(fmap)
    for paper in paper_data:
        paper_marker = folium.Marker(
            location=[paper.coords[0], paper.coords[1]],
            popup=f'<small>newspaper</small><h3>{paper.title}</h3>{paper.place}',
            icon=folium.Icon(color='gray'),
        )
        paper_marker.add_to(papers_layer)

    # Named entities; those without usable coordinates are skipped.
    entities_layer = MarkerCluster()
    entities_layer.add_to(fmap)
    for ent in entity_data:
        if not (ent.lat and ent.long):
            continue
        entity_marker = folium.Marker(
            location=[ent.long, ent.lat],
            popup=f'<small>named entity</small><h3>{ent.name}</h3>{ent.description}',
            icon=folium.Icon(color='blue'),
        )
        entity_marker.add_to(entities_layer)

    return fmap

In [12]:
def _content_word_counts(tokens):
    """Count lower-cased tokens, leaving out stopwords and words shorter than four characters."""
    lowered = (token.lower() for token in tokens)
    return Counter(word for word in lowered
                   if len(word) >= 4 and word not in sws)


def _merge_entities(entities):
    """Merge entities whose stems match case-insensitively, summing their counts into the first one seen."""
    merged = {}
    for ent in entities:
        key = ent.stem.lower()
        if key in merged:
            first = merged[key]
            merged[key] = first._replace(count=first.count + ent.count)
        else:
            merged[key] = ent
    return list(merged.values())


def doAnalysis(btn):
    """
    Click handler for the "Go" button (`btn` is not used).

    Runs the search described by the input widgets, spell-checks each
    resulting page and extracts its entities, then fills the output
    widgets: Wikidata entries for the most frequent entities, the words
    that are most over-represented compared with the Brown Corpus, and a
    map of newspapers and entities.
    """
    progressbar.value = 0
    for box in (statusWidget, entitiesbox, nlpbox, mapbox):
        box.clear_output()

    with statusWidget:
        display(HTML('Getting data from <i>Chronicling America</i>'))

    inpt = searchCA(year1box.value,
                     year2box.value,
                     searchbox.value,
                     resultsslider.value)

    with statusWidget:
        display(HTML('Getting named entities and preparing per-page NLP statistics.'))

    progressbar.max = len(inpt)

    alltexts = []
    allentities = []
    paperlocs = []
    for item in inpt:
        pop = geocodecity(item['place_of_publication'])
        if pop:
            paperlocs.append(Paper(item['title'],
                                  item['place_of_publication'],
                                  pop))

        sctext, entities = getEntities(item)
        alltexts.append(sctext)
        allentities += entities

        progressbar.value += 1

    with statusWidget:
        display(HTML('Preparing search-wide NLP statistics.'))

    #Statistics over ALL pages, not just the last one processed.
    corpustext = '\n\n'.join(alltexts)
    corpustokens = nltk.word_tokenize(corpustext)
    corpusfreqs = _content_word_counts(corpustokens)
    corpuslen = len(corpustokens)
    #An empty corpus has no counted words, so nothing is divided by zero.
    corpusprobs = {k: v/corpuslen for k, v in corpusfreqs.items()}
    #Score = frequency here minus frequency in the Brown Corpus. Words
    #missing from Brown get a reference frequency of 1, so their score is
    #never positive and they never reach the list below.
    #NOTE(review): presumably meant to keep OCR noise out - confirm.
    upwords = {k: v - brownprobs.get(k, 1) for k, v in corpusprobs.items()}

    #Merge entities across pages, most frequent first.
    allentities = sorted(_merge_entities(allentities),
                         key=attrgetter("count"), reverse=True)
    entitydata = getWikidata(allentities)

    fmap = createMap(paperlocs, entitydata)

    entdisplay = '\n'.join([
        f"""<p><b><a href="{ed.url}" target="_blank">{ed.name}</a></b>
        <br/>{ed.description}</p>"""
        for ed in entitydata
    ])
    #Top 20 words by score, keeping only those scoring above .005.
    nlpdisplay = '\n'.join([
       f"<li><b>{x[0]}</b></li>" for x in
            sorted(upwords.items(), key=itemgetter(1), reverse=True)[:20]
            if x[1] > .005])

    with entitiesbox:
        display(HTML(entdisplay))

    with nlpbox:
        display(HTML('<ul>'+nlpdisplay+'</ul>'))

    with mapbox:
        display(fmap)

    with statusWidget:
        display(HTML('<b>Done.</b>'))

In [13]:
# Free-text search terms.
searchbox = widgets.Text(
    description='Search',
    placeholder='Enter search terms'
)

# Year range; both boxes accept 1789 through 1963.
year1box = widgets.BoundedIntText(
    description='Start:',
    value=1900,
    min=1789,
    max=1963,
    step=1,
)

year2box = widgets.BoundedIntText(
    description='End:',
    value=1963,
    min=1789,
    max=1963,
    step=1,
)

# Number of search results to request (1 to 100).
resultsslider = widgets.IntSlider(
    description='# of Results',
    value=15,
    min=1,
    max=100,
    step=1,
    orientation='horizontal',
    continuous_update=False,
    disabled=False,
    readout=True,
    readout_format='d'
)

# Button that starts the analysis.
analyseButton = widgets.Button(
    description='Go',
    tooltip='Run',
    icon='', # (FontAwesome names without the `fa-` prefix)
    disabled=False
)
analyseButton.on_click(doAnalysis)

# Layout: years | search terms and result count | button.
ui = widgets.HBox([
    widgets.VBox([year1box, year2box]),
    widgets.VBox([searchbox, resultsslider]),
    analyseButton,
])

display(ui)

HBox(children=(VBox(children=(BoundedIntText(value=1900, description='Start:', max=1963, min=1789), BoundedInt…

In [14]:
# Status messages stacked above the progress bar.
widgets.VBox(children=[statusWidget, progressbar])

VBox(children=(Output(layout=Layout(border='2px solid blue')), IntProgress(value=0, bar_style='success', descr…

In [16]:
# Results grid: the search link across the top, entities beside the word
# list in the middle, and the map across the bottom.
widgets.GridBox(
    layout=widgets.Layout(
        grid_template_areas='''
            "header header"
            "entities nlp"
            "map map"
            ''',
        grid_template_columns='70% auto',
        grid_template_rows='auto auto auto',
        grid_gap='5px 5px',
        width='100%',
    ),
    children=[urlbox, entitiesbox, nlpbox, mapbox],
)

GridBox(children=(Output(layout=Layout(border='1px solid black', grid_area='header')), Output(layout=Layout(bo…