In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
import time
import pickle 
import re

In [2]:
#get the data from the website

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

#this cell as markdown to protect from overwrite. Change to code type before running program to include data dict
#instantiate the dictionary with a datadict entry, keep it in a seperate cell in case something goes wronge
#with the scraing you don't have to start building from scratch by accident
tag_dict = {'data_dictionary':
            [
    ['urls, case number is in url'],
    ['case numbers (including non-linked ones)']
            ]
           }

In [13]:

#iterate through the ascii codes for a-z for the url pattern (need to include skip elements)
for i in range(122, 123):
    
    #get the html from the index page
    index_page_html = simple_get("https://www.dos.ny.gov/coog/oml_listing/o" + chr(i) + ".html")
    
    #to pass over the letters with no page--note this doesn't work and needs to be patched
    if index_page_html == False:
        pass
    
    else:

        #parse the page with bs
        soup = BeautifulSoup(index_page_html, "html.parser")

        #allows us to grab alternating td from the page with low overhead
        count = -1

        #grabs the second table
        table =  soup.find_all('table')[1]

        for item in table.find_all('td'):

            #grabs the text from the leftside td, which is keyword name
            if count == -1:
                text = item.text

            #makes a list of the links and the non-linked codes for rulings from the rightside td
            else:
                tag_dict[text] = [[a['href'] for a in item.find_all('a')], item.text]

            #flips the sign on the count to grab from right/left
            count *= -1

        #to be polite to the server
        time.sleep((np.random.randint(100,300))/100)



    #         for data in row.find_all('td'):
    #             if count == -1:
    #                 text = data.text

    #             else:
    #                 link = data.find_all('a')
    #             print(text, link)
    #             count *= -1

{'data_dictionary': [['urls, case number is in url'],
  ['case numbers (including non-linked ones)']],
 'Abstention from Voting': [['http://docs.dos.ny.gov/coog/otext/o2198.htm',
   'http://docs.dos.ny.gov/coog/otext/o3103.htm'],
  '931, 2198, 3103'],
 'Action taken Outside of meeting': [['http://docs.dos.ny.gov/coog/otext/o3680.htm',
   'http://docs.dos.ny.gov/coog/otext/o3732.htm',
   'http://docs.dos.ny.gov/coog/otext/o3821.htm',
   'http://docs.dos.ny.gov/coog/otext/o3899.htm',
   'http://docs.dos.ny.gov/coog/otext/o4113.htm',
   'http://docs.dos.ny.gov/coog/otext/o5416.html',
   'http://docs.dos.ny.gov/coog/otext/o5531.html'],
  '3680\r\n    , 3732\r\n    , 3821\r\n    , 3899\r\n    , 4113, 5416, 5531 '],
 'Ad hoc Committee, Advisory Body': [['http://docs.dos.ny.gov/coog/otext/o2673.htm',
   'http://docs.dos.ny.gov/coog/otext/o3261.htm',
   'http://docs.dos.ny.gov/coog/otext/o3346.htm'],
  '509, 544, 723, 842, 1702, 1758, 2673, \r\n      3261, 3346'],
 'Adjournment': [['http://doc

In [15]:
#add this feature to the data_dictionary
tag_dict['data_dictionary'].append(['unicode html version of the ruling'])

#iter through keys and add new feature to features list in value
for key in tag_dict:
    tag_dict[key].append([])
    
    #iter through the urls and get the html of the ruling
    for url in tag_dict[key][0]:
        ruling = simple_get(url)
        
        #handle dead links
        if ruling == None:
            tag_dict[key][2].append('dead link')
            
        #only store the div with the content, the rest is formatting and links
        else:
            soup = BeautifulSoup(ruling, "html.parser")
            tag_dict[key][2].append(soup.find_all('div', id='mainContent')[0].prettify())

            #take between 1.0 and 5.0 seconds between calls
            time.sleep((np.random.randint(10,50)/10))
        



Error during requests to urls, case number is in url : Invalid URL 'urls, case number is in url': No schema supplied. Perhaps you meant http://urls, case number is in url?


In [16]:
tag_dict


{'data_dictionary': [['urls, case number is in url'],
  ['case numbers (including non-linked ones)'],
  ['unicode html version of the ruling', 'dead link'],
  []],
 'Abstention from Voting': [['http://docs.dos.ny.gov/coog/otext/o2198.htm',
   'http://docs.dos.ny.gov/coog/otext/o3103.htm'],
  '931, 2198, 3103',
  ['<div id="mainContent">\n <p>\n </p>\n <!-- InstanceBeginEditable name="Content" -->\n March, 9, 1993\n <p>\n </p>\n <p>\n </p>\n <p>\n  Mrs. Sondra Bauernfeind\n  <br/>\n  Chairman\n  <br/>\n  Sullivan County Conservative Party\n  <br/>\n  73 Brittman Road\n  <br/>\n  Mongaup Valley, NY 12762-5004\n </p>\n <p>\n  The staff of the Committee on Open Government is authorized to\r\n          issue advisory opinions. The ensuing staff advisory opinion is\r\n          based solely upon the facts presented in your correspondence.\n </p>\n <p>\n  Dear Mrs. Bauernfeind:\n </p>\n <p>\n  I have received your letter of March 3 in which you raised\r\n          questions relating to the Op

In [24]:
tag_dict['data_dictionary'][2] = ['dead link']

In [25]:
#add this feature to the data_dictionary
tag_dict['data_dictionary'].append(['clean text for bow (no whitespace)'])


#storing this for now, but maybe this should just be part of the 
#pipline for the LDA topic modeling 

#iter through the dict
for key in tag_dict:
    
    #add the new feature list to the value
    tag_dict[key].append([])
    for i in range(len(tag_dict[key][2])):
        
        #pass over dead links
        if tag_dict[key][2][i] == 'dead link':
            pass
        
        #clean up the text for bag of words (bow) approach
        else:

            #it was pickled as unicode, so first we reparse and get the element we need
            goup = BeautifulSoup(tag_dict[key][2][i]).div

            #then we take out some JS that was included
            goup = goup.get_text().replace('document.write("OML-AO-"+DocName);', '')

            #and remove the whitespace because for bow it doesn't matter
            goup = re.sub(r'\s+',' ', goup )

            #replace escaped single quotes with double quotes
            goup = re.sub(r'\'', '"', goup)

            tag_dict[key][3].append(goup)


In [29]:
with open('FOILAdvisoryDecisiondataWithText.pickle', 'wb') as f:
    # Pickle the dictionary using the highest protocol available.
    pickle.dump(tag_dict, f, pickle.HIGHEST_PROTOCOL)

In [28]:
tag_dict['Abstention from Voting'][3]

[' March, 9, 1993 Mrs. Sondra Bauernfeind Chairman Sullivan County Conservative Party 73 Brittman Road Mongaup Valley, NY 12762-5004 The staff of the Committee on Open Government is authorized to issue advisory opinions. The ensuing staff advisory opinion is based solely upon the facts presented in your correspondence. Dear Mrs. Bauernfeind: I have received your letter of March 3 in which you raised questions relating to the Open Meetings Law. Specifically, you asked whether "an abstention count[s] as a "yes" vote, a negative vote or a removal of one"s person from the meeting and this changing the number present for a quorum." Similarly, you asked "what number or percentage of the member body must be present to constitute a quorum. In this regard, I offer the following comments. First, the Open Meetings Law applies to meeting of public bodies, and §102(2) of the Law defines the phrase "public body" to mean: "...any entity for which a quorum is required in order to conduct public busine

In [None]:
tester = pickle.load(open('FOILAdvisoryDecisiondataWithText.pickle', 'wb'))

In [35]:
with open('FOILAdvisoryDecisiondataWithText.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    tester = pickle.load(f)

In [32]:
len(tester)    

290

In [33]:
tester['Abstention from Voting']

[['http://docs.dos.ny.gov/coog/otext/o2198.htm',
  'http://docs.dos.ny.gov/coog/otext/o3103.htm'],
 '931, 2198, 3103',
 ['<div id="mainContent">\n <p>\n </p>\n <!-- InstanceBeginEditable name="Content" -->\n March, 9, 1993\n <p>\n </p>\n <p>\n </p>\n <p>\n  Mrs. Sondra Bauernfeind\n  <br/>\n  Chairman\n  <br/>\n  Sullivan County Conservative Party\n  <br/>\n  73 Brittman Road\n  <br/>\n  Mongaup Valley, NY 12762-5004\n </p>\n <p>\n  The staff of the Committee on Open Government is authorized to\r\n          issue advisory opinions. The ensuing staff advisory opinion is\r\n          based solely upon the facts presented in your correspondence.\n </p>\n <p>\n  Dear Mrs. Bauernfeind:\n </p>\n <p>\n  I have received your letter of March 3 in which you raised\r\n          questions relating to the Open Meetings Law.\n </p>\n <p>\n  Specifically, you asked whether "an abstention count[s] as a\r\n          \'yes\' vote, a negative vote or a removal of one\'s person from the\r\n          meeti

In [None]:
#this is to clean the documents and add taggs to important sections



In [None]:
# document structure tags: opening paragraph, closing paragraph, middle paragraph, quotation from judicial ruling
# domain knowledge tag: NER court cases
# 