
# 02 - Parse HTML and insert into MongoDB

The previous notebook used Selenium to save  Scopus queries (each with up to 200 abstracts) as HTML files.  This notebook loads the previously-saved HTML files and extracts the abstract text (along with some other info like the journal, year, authors, etc).  The extracted information is then saved to a Mongo database.  


### Imports

In [34]:
import pandas as pd
import os
from bs4 import BeautifulSoup
import pymongo
import string
import numpy as np
import sys
import unicodedata
import nltk
import nltk.data
import re
import csv

### Miscellaneous helper functions

In [35]:
# Function to convert generic string to unicode
# http://stackoverflow.com/questions/22474701/mongodb-insertion-shows-strings-in-documents-must-be-valid-utf-8

def to_unicode (d):
    """Return `d` as a unicode text string.

    Byte strings are decoded with the interpreter's default codec, and any
    undecodable byte is replaced (U+FFFD) instead of raising, so the result
    is always valid text that can be stored in a MongoDB document.

    Values that are already text are returned unchanged; trying to decode
    them a second time would raise TypeError.
    """
    text_type = type (u'')  # `unicode` on Python 2, `str` on Python 3
    if isinstance (d, text_type):
        return d
    return text_type (d, errors = 'replace')

##########################################

# Function to parse the filename of HTML file and return its separate components
# Splits the filename on underscores and returns the values:
# Year
# Journal index
# Electronic ISSN
# File index for given journal (can be non-zero if the journal / year had more than 200 articles)

def parse_html_filepath (fp_html):
    """Split a saved-query HTML filename into its underscore-separated fields.

    The directory part and the extension of `fp_html` are discarded; the
    remaining name is expected to have at least four '_'-separated fields:
    <year>_<journal index>_<e-ISSN>_<file number>.

    Returns a tuple (year, journal_index, e_issn, file_index) where year and
    journal_index are ints, e_issn is the raw string, and file_index is the
    file number minus one (i.e. zero-based).
    """
    filename = os.path.basename (fp_html)
    stem, _extension = os.path.splitext (filename)
    fields = stem.split ('_')

    year = int (fields[0])
    journal_idx = int (fields[1])
    e_issn = fields[2]
    file_idx = int (fields[3]) - 1  # filenames count files from 1
    return (year, journal_idx, e_issn, file_idx)

###############################################

# Function to extract a 'minipath' from a path string -- this just returns the last n path
# elements.
def minipath (fp, n_elements = 2):
    """Return the last `n_elements` '/'-separated components of `fp`.

    The components are re-joined with '/'.  If `fp` has fewer components
    than requested (or `n_elements` is 0), the whole path comes back.
    """
    trailing_parts = fp.split ('/')[-n_elements:]
    return '/'.join (trailing_parts)

###############################################

# Predicate used to drop 'Kill / Restart Chromedriver' remark rows from the logfile dataframe:
# returns True when the given string contains 'CHROMEDRIVER'.

def is_chromedriver_remark (s):
    """Return True if `s` contains the (case-sensitive) text 'CHROMEDRIVER'."""
    marker = 'CHROMEDRIVER'
    return marker in s
    
################################################

# Function to remove any non-printable character from string

def remove_unprintable (s):
    """Return `s` with every character not in `string.printable` removed.

    The result is always a string (built with join), so this behaves the
    same on Python 2 and Python 3 -- `filter` would return an iterator on
    Python 3.

    Best-effort: a value that cannot be filtered character by character
    (for example a float, or None) is returned unchanged instead of raising.
    """
    try:
        return ''.join (ch for ch in s if ch in string.printable)
    except TypeError:
        # Not iterable, or its items are not characters -- leave it alone.
        return s


### Function to open an HTML file and return soup and list of < li > objects, each of which contains information for one article

In [36]:
# Open an html doc and:
#   - convert to beautifulSoup object
#   - return li_list, which is a list of html <li> objects, each of which
#     contains the information for a given article.
#   - The soup object and the <li> list used for subsequent HTML parsing.

def scopus_query_html_to_soup (fp_html):
    """Parse a saved Scopus results page.

    Reads the file at `fp_html`, parses it with BeautifulSoup (lxml parser),
    and collects every <li> found inside the element whose id is
    'srchResultsList'.

    Returns (soup, li_list): the parsed page and the list of <li> elements.
    """
    with open (fp_html) as html_file:
        page_source = html_file.read()

    soup = BeautifulSoup (page_source, 'lxml')
    results_container = soup.find (attrs = {'id':'srchResultsList'})
    return soup, results_container.find_all ('li')
    

### Function to parse the soup for a single article.  Requires soup object, as well as list of < li > objects, and the index ('doc_num') for desired article

In [37]:
# Given a doc number, function to return the abstract, title, etc. from the HTML
# Doc num passed in should be 0-indexed: it is used directly as the index into li_list.
# The field for the abstract is 1-indexed, so 1 is added to the doc num when
# looking up the abstract.
# li_list is a list of <li> elements, found previously.

# If no abstract was returned by Scopus, abstract field is NONE.

def get_doc_info (soup, doc_num, li_list):
    """Extract the fields for one article from a parsed Scopus results page.

    Args:
        soup: BeautifulSoup object for the whole results page.
        doc_num: ZERO-based position of the article on the page.  It is used
            directly as the index into `li_list`; 1 is added when building
            the abstract element id ('previewAbstract<N>').
        li_list: list of the page's <li> result elements.

    Returns:
        Tuple (title, abstract, article_url, n_citations, citations_url,
        author_names, author_urls).  All strings are UTF-8 encoded.
        The abstract is 'NONE' when the page shows '[No abstract available]'.
        If the article's HTML cannot be parsed, the placeholder tuple
        ('', '', '', 0, '', [], []) is returned instead of raising.
    """
    # The whole HTML-walking section is best-effort.  Known failure mode:
    # at least once the <li> had a bunch of <a> tags BEFORE the one with the
    # article title and link, so a lookup returned None / a tag without the
    # expected text or href.
    try:
        ab = soup.find (name = None, attrs = {'id':'previewAbstract' + str (doc_num + 1)})
        ab_text = ab.text.strip()
        if ab_text.startswith ('[No abstract available]'):
            ab_text = 'NONE'
        else:
            ab_text = ab.text.replace ('\n', '  ').strip()  # Remove line breaks in the abstract

        li_key = li_list[doc_num]  # Result item for this article (0-indexed)

        # The anchor inside the 'docTitle' element carries both title and url
        ti_key = li_key.find (name = None, attrs = {'class':'docTitle'})
        a_key = ti_key.find ('a')
        ti_text = a_key.text
        doc_url = a_key['href']

        # 'dataCol3' holds one anchor per (displayed) author
        author_col = li_key.find (name = None, attrs = {'class':'dataCol3'})
        a_list = author_col.find_all ('a')
        author_list_names = [a.text for a in a_list]
        author_list_urls = [a['href'] for a in a_list]

        # 'dataCol6' is the 'cited by' column
        citation_col = li_key.find (name = None, attrs = {'class':'dataCol6'})
    except Exception:
        return ('', '', '', 0, '', [], [])

    # With zero citations the column has no usable anchor, so the lookups
    # below fail; treat that as "no citations".
    try:
        cite_tag = citation_col.find ('a')
        n_citations = int (cite_tag.text.split()[0])
        url_citations = cite_tag['href']
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        n_citations = 0
        url_citations = ''

    # Return all the strings encoded as UTF-8, so they can be written out
    # without any grief.
    author_list_names = [name.encode ('UTF-8') for name in author_list_names]
    author_list_urls = [url.encode ('UTF-8') for url in author_list_urls]
    url_citations = url_citations.encode ('UTF-8')
    ti_text = ti_text.encode ('UTF-8')
    ab_text = ab_text.encode ('UTF-8')
    doc_url = doc_url.encode ('UTF-8')

    return (ti_text, ab_text, doc_url, n_citations, url_citations,
            author_list_names, author_list_urls)


### Functions to clean up the abstract text

1) Remove copyright and funding info.      
2) Add space between sentences, if not present.   

In [38]:
def replace_copyright_symbol (s):
    """Return `s` with each '©' replaced by the word 'COPYRIGHT'."""
    symbol, placeholder = '©', 'COPYRIGHT'
    return s.replace (symbol, placeholder)

def is_clean_sentence (s, dirty_list = ('FUNDING','COPYRIGHT','PUBLISH','PUBLISHED')):
    """Return True if sentence `s` contains none of the words in `dirty_list`.

    The check is case-insensitive (the sentence is upper-cased, so entries in
    `dirty_list` should be upper case) and a '©' in the sentence counts as
    the word 'COPYRIGHT'.  Matching is by substring, so 'PUBLISH' already
    covers 'PUBLISHED'; both are kept in the default for compatibility.

    The default is a tuple rather than a list so that it cannot be mutated
    between calls; any iterable of strings may be passed.
    """
    s = replace_copyright_symbol (s).upper()
    return not any (d in s for d in dirty_list)

# Function to separate sentences, where no space is present.  This seems to be issue with
# copyright lines and structured abstract headings.  For example, ...Elsevier, Ltd.Chronic...
# will add a space after the period.  Likewise, 'of the participants.OBJECTIVES:' will
# add space after the period.  Essentially, a space will be added after any period that
# is directly followed by a letter.

def separate_sentences (s):
    """Insert a space after any period that is directly followed by a letter.

    Example: 'Elsevier, Ltd.Chronic' -> 'Elsevier, Ltd. Chronic'.
    Periods followed by digits, punctuation or whitespace are left alone.
    """
    # The class must be [a-zA-Z]: a range ending in lowercase 'z' (A-z) also
    # matches the punctuation between 'Z' and 'a' ( [ \ ] ^ _ ` ).
    return re.sub (r'\.([a-zA-Z])', r'. \1', s)

def clean_abstract (s):
    """Clean one abstract and return it as a single string.

    Steps, in order:
      1. Replace '©' with 'COPYRIGHT'.  This must happen before step 2,
         which would otherwise strip the symbol.
      2. Drop every character not in `string.printable`.
      3. Strip surrounding whitespace and add missing spaces after periods.
      4. Split into sentences with the module-level `sent_detector` and drop
         the sentences rejected by `is_clean_sentence` (funding / copyright /
         publisher lines), then re-join the rest with single spaces.

    If step 4 fails, the text from step 3 is printed and returned as-is.
    """
    s = replace_copyright_symbol (s)
    s = ''.join (ch for ch in s if ch in string.printable)  # Remove unprintable weirdness.
    s = s.strip()
    s = separate_sentences (s)
    try:
        sentences = sent_detector.tokenize (s)
        s = ' '.join ([sent for sent in sentences if is_clean_sentence (sent)])
    except Exception:
        # Best-effort: show the abstract that could not be tokenized and
        # fall through to return it unfiltered.
        print (s)
    return s


### Functions to convert British English to American English

In [39]:
def load_british_american_map (fp_brit_am):
    """Load a tab-separated spelling file into a dict.

    Each row's first column becomes a key and its second column the value
    (per the caller: British spelling -> American spelling).  Rows with fewer
    than two columns, including blank lines, are skipped.
    """
    # Mode 'rU' (universal newlines) is what Python 2 needs, but Python 3.11+
    # rejects it; on Python 3 the csv module wants newline='' instead.
    if sys.version_info[0] < 3:
        f = open (fp_brit_am, 'rU')
    else:
        f = open (fp_brit_am, 'r', newline = '')

    r = {}
    with f:
        for row in csv.reader (f, delimiter = '\t'):
            if len (row) < 2:
                continue
            r[row[0]] = row[1]
    return r
        
def replacer_factory(spelling_dict):
    """Build a callback for `re.sub` that looks each match up in `spelling_dict`.

    The returned function receives a match object and returns the mapped
    spelling when the matched text is a key of `spelling_dict`; otherwise it
    returns the matched text unchanged.
    """
    def substitute(found):
        matched_text = found.group(0)
        if matched_text in spelling_dict:
            return spelling_dict[matched_text]
        return matched_text
    return substitute

def limie_to_yankee(text, brit_to_am_dict):
    """Return `text` with every whole word found in `brit_to_am_dict` replaced.

    Words are matched exactly (case-sensitive) against the dict keys; words
    that are not keys are left as they are.
    """
    whole_word = re.compile(r'\b\w+\b')  # this pattern matches whole words only
    return whole_word.sub(replacer_factory(brit_to_am_dict), text)

#from english_american_dictionary import ame_to_bre_spellings
#text = 'I am the conceptualised tyre full of haem'
#print limie_to_yankee (text, brit_to_am_dict)

### Functions for writing abstracts to MongoDB

In [40]:
# Function to build simple dictionary from the elements extracted from 
# a single article's HTML. This dictionary corresponds to JSON that will 
# be saved in the Mongo db.

def build_doc_dict(title='dummyTitle', authors=None, \
                    journal='dummyJournal', year=1776, abstract='dummyAb', \
                    idx_article_in_journal=69, n_cit=69, url_cit=None, url_art=None,\
                    url_authors_list=None, e_issn = 'xxxx-xxxx'):
    """Build the dictionary (Mongo document) describing one article.

    Every argument is stored under the corresponding upper-case key; see the
    returned literal for the exact key names.

    `authors` and `url_authors_list` default to None, meaning "use the
    placeholder value" (['I.P. Freely', 'The other author'] and []
    respectively).  A fresh list is created on every call, so documents
    built from the defaults never share (and cannot corrupt) one list.
    """
    if authors is None:
        authors = ['I.P. Freely', 'The other author']
    if url_authors_list is None:
        url_authors_list = []

    return {        'TITLE':title,
                    'AUTHORS':authors,
                    'JOURNAL':journal,
                    'E_ISSN':e_issn,
                    'YEAR':year,
                    'ABSTRACT':abstract,
                    'ARTICLE_IDX':idx_article_in_journal,
                    'N_CITATIONS':n_cit,
                    'URL_CITATIONS':url_cit,
                    'URL_ARTICLE':url_art,
                    'URL_AUTHORS_LIST':url_authors_list}

##########################################################

# Function to open (if extant) or create (if not) the collection from the 
# Mongo database.  Collection is analogous to a table in SQL db.

def create_or_get_collection (db, col_name, verbose = False):
    """Return the collection `col_name` from database `db`.

    `db.get_collection` is tried first; if it raises, the collection is
    created with `db.create_collection`.  With `verbose` set, a line is
    printed saying which of the two happened.
    """
    # NOTE(review): pymongo's get_collection appears to hand back a
    # collection handle without checking that the collection exists, in which
    # case the fallback below never runs -- confirm against the installed
    # pymongo version.
    try:
        col = db.get_collection (col_name)
    except Exception:
        col = db.create_collection (col_name)
        if verbose: print ('Created Collection')
    else:
        if verbose: print ('Returned extant Collection')
    return col

# MAIN 

Load the saved HTML files, use Beautiful Soup to extract the abstracts and other 
content, and save them to a MongoDB

In [42]:
# ---- Configuration and one-time setup for the parsing loop ----

# Directory with the HTML files
fp_dir_in = '/Users/bryanfry/projects/proj_asksci/files_out/scopus_query_out/'  

# Path to log file with all the scraped HTML pages.  Load it as a pandas dataframe.
# The loop below reads these columns from it: YEAR, HTML_FILE, E_ISSN,
# JOURNAL_TITLE, 2015_SNIP and N_ARTICLES.
fp_html_log_in = '/Users/bryanfry/projects/proj_asksci/files_out/scopus_query_out/_SCOPUS_QRY_LOG.csv'
df_html_all_years = pd.read_csv (fp_html_log_in)
df_html_all_years.YEAR = df_html_all_years.YEAR.apply(str) # Convert year to string (not int), to match year_list

# Give path to file mapping British to American spellings, and build dictionary with mappings
# NOTE(review): 'britsh' looks like a typo, but the path must match the file on disk -- confirm before changing.
fp_brit_am = '/Users/bryanfry/projects/proj_asksci/files_in/britsh_to_american.txt'
brit_to_am_dict = load_british_american_map (fp_brit_am)

# Set the range of HTML files to read (0-indexed, end inclusive).
# The range is applied to each year's rows of the log separately.
html_idx_start = 0
html_idx_end = 10000 # 10000 > number of files --> go to end

# Used to turn (page index, position on page) into an article index within the journal
max_articles_per_html = 200

# List of years to process, as strings
year_list = ['2015','2013','2011','2009','2007']

db_name = 'abstract_db' # Name of the mongo db
col_name = db_name + '_col' # Name of the collection in the db (just need one collection)

# Open the database and get / create the collection.
# MongoClient() is called with no arguments, so pymongo's default connection settings apply.
client = pymongo.MongoClient()
db = client.get_database (db_name)
col = create_or_get_collection (db, col_name)

# Sentence detector -- Punkt tokenizer.  clean_abstract() reads this module-level name.
sent_detector = nltk.data.load ('tokenizers/punkt/english.pickle')

# For each year: walk that year's rows of the HTML log, parse every saved
# results page, and insert one Mongo document per article found on the page.
for year in year_list:

    df_html = df_html_all_years[df_html_all_years.YEAR == year]
    df_html = df_html.sort_values (['HTML_FILE'])
    # Filter chromedriver reset remarks from the dataframe.
    # NOTE(review): this tests the YEAR column, which has just been filtered
    # to equal `year`, so it cannot remove anything.  Presumably the remark
    # lives in another column -- confirm against the log file.
    df_html = df_html[df_html.YEAR.apply (is_chromedriver_remark) == False]

    # Now loop on the rows of the log file.  Each one corresponds to a saved HTML file
    idx_stop = min (html_idx_end + 1, len (df_html))
    for html_idx in range (html_idx_start, idx_stop):

        html_log_item = df_html.iloc[html_idx]  # Get a row from the HTML log file.
        fn_html = html_log_item.HTML_FILE
        e_issn = html_log_item.E_ISSN
        title_j = html_log_item.JOURNAL_TITLE
        # The next two values are not used below; kept as notebook globals.
        SNIP_2015 = html_log_item['2015_SNIP']
        n_tot_art = html_log_item.N_ARTICLES

        # Rows whose file name starts with 'ERR' have no page to parse
        if fn_html[0:3] == 'ERR':
            continue

        print ('Parsing HTML from file = ' + fn_html)

        fp_html = os.path.join (fp_dir_in, fn_html)  # Prepend dir to get HTML filepath
        # Get items from the HTML filename.  The year gets its own name so
        # the loop variable `year` is not overwritten.
        year_in_filename, journal_idx, issn, page_idx_in_journal = parse_html_filepath (fp_html)

        soup, li_list = scopus_query_html_to_soup (fp_html)  # Get soup and list of <li> tags

        if not li_list:
            # Nothing to store; insert_many() rejects an empty document list.
            print ('No articles found in file = ' + fn_html)
            continue

        # The journal title is the same for every article on the page, so
        # clean it and convert it to unicode once, outside the article loop.
        title_j = remove_unprintable (title_j)
        title_j_unicode = to_unicode (title_j)

        # Now we loop on the individual <li> tags in the soup.  Each one is data for one article.
        # A new dictionary is added to this list for each article.
        list_doc_dict = []
        for li_idx, _li in enumerate (li_list):

            # Compute article index in journal (can be over 200 if there are
            # multiple HTML dumps for one journal)
            idx_article_in_journal = li_idx + (max_articles_per_html * page_idx_in_journal)

            # Parse HTML for an <li>
            (title, abstract, url_art, n_cit, url_citations, auth_list,
             url_auth_list) = get_doc_info (soup, li_idx, li_list)

            # Convert British to American English, if present
            abstract = limie_to_yankee (abstract, brit_to_am_dict)

            # Clean the abstract text (remove copyright line, etc, remove unprintables)
            abstract = clean_abstract (abstract)

            # Remove non-printable weirdo characters from title and authors
            title = remove_unprintable (title)
            auth_list = [remove_unprintable (author) for author in auth_list]

            # Build the dictionary to write to Mongo (as JSON)
            doc_dict = build_doc_dict (title, auth_list, title_j_unicode, year_in_filename,
                                       abstract, idx_article_in_journal, n_cit,
                                       url_citations, url_art, url_auth_list, e_issn)

            list_doc_dict.append (doc_dict)
        col.insert_many (list_doc_dict)

print ('DONE')


Parsing HTML from file = 2015_00003_1935-8237_01.html
Parsing HTML from file = 2015_00004_1529-1006_01.html
Parsing HTML from file = 2015_00005_15452085_01.html
Parsing HTML from file = 2015_00006_15206890_01.html
Parsing HTML from file = 2015_00007_1931-7883_01.html
Parsing HTML from file = 2015_00008_14606976_01.html
Parsing HTML from file = 2015_00010_1553-877X_01.html
Parsing HTML from file = 2013_00000_15390756_01.html
Parsing HTML from file = 2013_00002_0079-6425_01.html
Parsing HTML from file = 2013_00003_1935-8237_01.html
Parsing HTML from file = 2013_00004_1529-1006_01.html
Parsing HTML from file = 2013_00007_1931-7883_01.html
Parsing HTML from file = 2013_00008_14606976_01.html
Parsing HTML from file = 2013_00010_1553-877X_01.html
Parsing HTML from file = 2011_00000_15390756_01.html
Parsing HTML from file = 2011_00002_0079-6425_01.html
Parsing HTML from file = 2011_00003_1935-8237_01.html
Parsing HTML from file = 2011_00004_1529-1006_01.html
Parsing HTML from file = 2011_0000