In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pprint import pprint
from tqdm import tqdm

## Statutes POC (proof-of-concept)

In [2]:
# Registry of statute sources, keyed by state name. Each entry carries the
# chapter page URL ('str_url') and the provision text to match ('str_match').
dict_url = {}
dict_url['Missouri'] = dict(
    str_url='https://revisor.mo.gov/main/OneChapter.aspx?chapter=435',
    str_match='Notice of arbitration provisions required.  (8/28/1980)',
)
# Show the configured registry.
print('State Statutes:')
pprint(dict_url)

State Statutes:
{'Missouri': {'str_match': 'Notice of arbitration provisions required.  '
                           '(8/28/1980)',
              'str_url': 'https://revisor.mo.gov/main/OneChapter.aspx?chapter=435'}}


## Pick a State

In [3]:
# Select the state to work with and unpack its two settings in one pass.
str_state = 'Missouri'
str_url, str_match = (dict_url[str_state][key] for key in ('str_url', 'str_match'))
# Echo the selection, one field per line.
print(
    f'State: {str_state}',
    f'URL: {str_url}',
    f'Matching String: {str_match}',
    sep='\n',
)

State: Missouri
URL: https://revisor.mo.gov/main/OneChapter.aspx?chapter=435
Matching String: Notice of arbitration provisions required.  (8/28/1980)


## Get info from page

In [4]:
# Fetch the chapter page. requests applies no timeout by default, so set one
# explicitly to keep the notebook from hanging if the server stops responding.
r = requests.get(str_url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()
# Parse the response body into a navigable tree.
soup = BeautifulSoup(r.content, 'html.parser')

In [5]:
# Locate the section whose provisions text equals `str_match` and build the
# URL of that section's own page (`str_url_new`).

# get tables (find_all is the current bs4 spelling; findAll is a legacy alias)
list_tables = soup.find_all('table')
# get the table we want: the second one on the page
# NOTE(review): the positional index depends on the page's current layout — confirm if the markup changes.
if len(list_tables) < 2:
    raise ValueError(f'Expected at least 2 tables at {str_url}, found {len(list_tables)}')
table = list_tables[1]
# get rows, skipping the first one
list_rows = table.find_all('tr')[1:]

# iterate through rows and find the one where the provision matches what we are looking for
for row in list_rows:
    # get cells
    list_cols = row.find_all('td')
    # a row needs both a section cell and a provisions cell to be a candidate
    if len(list_cols) < 2:
        continue
    # get section link; rows without one are skipped explicitly rather than
    # by swallowing the TypeError raised when subscripting None
    tag_section_link = list_cols[0].find('a', href=True)
    if tag_section_link is None:
        continue
    # get section
    str_section = list_cols[0].text.strip()
    # get section url
    str_section_url = tag_section_link['href']
    # get provisions
    str_provisions = list_cols[1].text
    # logic: exact, whitespace-sensitive comparison against the configured text
    if str_provisions == str_match:
        if 'PageSelect.' not in str_section_url:
            raise ValueError(f'Unexpected section link format: {str_section_url!r}')
        # everything before '/OneChapter' in the chapter URL
        str_url_new_pre = str_url.split('/OneChapter')[0]
        str_url_new_inter = 'OneSection'
        # everything after 'PageSelect.' in the section link
        str_url_new_post = str_section_url.split('PageSelect.')[1]
        str_url_new = f'{str_url_new_pre}/{str_url_new_inter}.{str_url_new_post}'
        break
else:
    # No row matched: stop here instead of hitting a NameError (or silently
    # reusing a stale `str_url_new` from an earlier run) at the print below.
    raise ValueError(f'No provision matching {str_match!r} found at {str_url}')

# message
print(f'New URL: {str_url_new}')

New URL: https://revisor.mo.gov/main/OneSection.aspx?section=435.460&bid=24234&hl=


In [6]:
# Fetch the section page. requests applies no timeout by default, so set one
# explicitly to keep the notebook from hanging if the server stops responding.
r = requests.get(str_url_new, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()
# Parse the response body into a navigable tree (replaces the chapter-page soup).
soup = BeautifulSoup(r.content, 'html.parser')

In [7]:
# Extract the description fragments and the example text from the section
# page, then print a summary.

# soup.find returns None when nothing matches; check up front so a layout
# change produces a clear error rather than an AttributeError on None.
tag_description = soup.find('p', {'class': 'norm'})
tag_example = soup.find('p', {'class': 'indentblk norm'})
if tag_description is None:
    raise ValueError(f"No <p class='norm'> element found at {str_url_new}")
if tag_example is None:
    raise ValueError(f"No <p class='indentblk norm'> element found at {str_url_new}")

# get description: take the segment following the first em dash, split it on
# commas, drop the first two pieces and the last one, and trim each remainder
list_dash_parts = tag_description.text.split('—')
if len(list_dash_parts) < 2:
    raise ValueError(f'Description paragraph at {str_url_new} contains no em dash')
list_str_description = [
    str_piece.strip()
    for str_piece in list_dash_parts[1].strip().split(',')[2:-1]
]
# get example
str_example = tag_example.text

print(f'State: {str_state}')
print('')
print(f'Original URL: {str_url}')
print('')
print(f'New URL: {str_url_new}')
print('')
print('Description:')
for str_description in list_str_description:
    print(f'- {str_description}')
print('')
print('Example:')
print(str_example)

State: Missouri

Original URL: https://revisor.mo.gov/main/OneChapter.aspx?chapter=435

New URL: https://revisor.mo.gov/main/OneSection.aspx?section=435.460&bid=24234&hl=

Description:
- the space provided for signatures a statement
- in ten point capital letters

Example:
"THIS CONTRACT CONTAINS A BINDING ARBITRATION PROVISION WHICH MAY BE ENFORCED BY THE PARTIES."
