In [1]:
import jsonlines
import os
import pandas as pd
import pickle
import re

def replace_semicolon(text, threshold=10):
    '''
    Get rid of semicolons.

    The text is split into fragments at every semicolon and each fragment is
    stripped of surrounding whitespace. Empty fragments (from ";;" or a
    leading/trailing ";") are dropped. The remaining fragments are joined
    back together: a fragment with more than `threshold` whitespace-separated
    words is treated as its own sentence (first character upper-cased and,
    when something precedes it, introduced by ". "); a shorter fragment is
    attached to what precedes it with ", ".

    text: string to rewrite.
    threshold: word count above which a fragment becomes a sentence.
    Returns the new text, without any leading or trailing separator.
    '''
    pieces = []
    for fragment in text.split(';'):
        fragment = fragment.strip()  # Clear off spaces
        if not fragment:
            # Nothing between two semicolons: emitting a separator here
            # would leave a dangling ", " in the output.
            continue
        # Check word count
        if len(fragment.split()) > threshold:
            # Long enough to stand alone: capitalise and end the previous sentence
            fragment = fragment[0].upper() + fragment[1:]
            separator = ". "
        else:
            separator = ", "
        # Only put a separator *between* fragments, never before the first one
        if pieces:
            pieces.append(separator)
        pieces.append(fragment)

    return "".join(pieces)

# Pre-compiled patterns used by clean_text. They are written as raw strings so
# that regex escapes such as \. \( and \s reach the re module untouched instead
# of being (mis)interpreted as Python string escapes.

# "U.S.C." / "USC." / "u.s.c." ...: optional inner dots, at least one trailing dot.
USC_re = re.compile(r'[Uu]\.*[Ss]\.*[Cc]\.+')
# Parenthesised text that contains a space and no further '(' - short
# enumerators such as "(a)" have no space and are therefore not matched.
PAREN_re = re.compile(r'\([^(]+\ [^\(]+\)')
# Single punctuation characters that are stripped outright (includes backslash).
BAD_PUNCT_RE = re.compile(r'([%s])' % re.escape(r'"#%&\*\+/<=>@[\]^{|}~_'), re.UNICODE)
# A newline, optional spaces/tabs and backticks, then an enumerator such as "(a)" or "(12)".
BULLET_RE = re.compile(r'\n[\ \t]*`*\([a-zA-Z0-9]*\)')
# Two or more consecutive dashes.
DASH_RE = re.compile(r'--+')
# Any run of whitespace (spaces, tabs, newlines).
WHITESPACE_RE = re.compile(r'\s+')
# Two sentence/clause marks with at most spaces between them, e.g. ". ," or ",.".
EMPTY_SENT_RE = re.compile(r'[,\.]\ *[\.,]')
# Everything before the first ASCII letter of the text.
FIX_START_RE = re.compile(r'^[^A-Za-z]*')
# A period immediately followed by a letter; group 1 captures the letter.
FIX_PERIOD = re.compile(r'\.([A-Za-z])')
# Section headings: "SECTION 12.", "SEC. 3." at the start of a line, or "Sec. 3.".
SECTION_HEADER_RE = re.compile(r'SECTION [0-9]{1,2}\.|\nSEC\.* [0-9]{1,2}\.|Sec\.* [0-9]{1,2}\.')

def clean_text(text):
    """
    Normalise raw bill text into a single line of plain sentences.

    Borrowed from the FNDS text processing with additional logic added in.
    Note: we do not take care of token breaking - assume SPACY's tokenizer
    will handle this for us.

    Steps, in order: tag section headers, collapse common acronyms, drop
    parentheticals and enumeration bullets, strip unwanted punctuation and
    dash runs, squash whitespace, turn semicolon-separated fragments into
    sentences/clauses, then tidy the start of the text, period spacing and
    quotes.

    text: raw string to clean.
    Returns the cleaned string; section headers appear as '<SECTION-HEADER>'.
    """

    # Indicate section headers, we need them for features. A bare token is
    # used at this point because '<' and '>' are stripped by BAD_PUNCT_RE
    # below; the angle brackets are added at the very end.
    text = SECTION_HEADER_RE.sub('SECTION-HEADER', text)
    # For simplicity later, remove '.' from the most common acronyms
    text = text.replace("U.S.", "US")
    text = text.replace('SEC.', 'Section')
    text = text.replace('Sec.', 'Section')
    text = USC_re.sub('USC', text)

    # Remove parentheticals because they are almost always references to laws.
    # We could add a special tag, but we just remove them for now.
    # Note we don't get rid of nested parens because that is a complex re.
    text = PAREN_re.sub('', text)

    # Get rid of enums as bullets or ` as bullets
    text = BULLET_RE.sub(' ', text)

    # Clean html (must run before '&' is stripped as bad punctuation)
    text = text.replace('&lt;all&gt;', '')

    # Remove annoying punctuation that's not relevant
    text = BAD_PUNCT_RE.sub('', text)

    # Get rid of long sequences of dashes - these are formatting
    text = DASH_RE.sub(' ', text)

    # Remove newlines, tabs, and extra spaces
    text = WHITESPACE_RE.sub(' ', text)

    # If we ended up with "empty" sentences - get rid of them
    text = EMPTY_SENT_RE.sub('.', text)

    # Attempt to create sentences from bullets
    text = replace_semicolon(text)

    # Get rid of anything that's not a letter from the start of the text
    text = FIX_START_RE.sub('', text)
    # Sometimes periods get formatted weird, make sure there is a space
    # between a period and the start of the next sentence. The replacement is
    # a raw string so that \g<1> is a regex group reference, not a Python
    # string escape.
    text = FIX_PERIOD.sub(r". \g<1>", text)

    # Fix quotes
    text = text.replace('``', '"')
    text = text.replace("''", '"')

    # Add special punct back in
    text = text.replace('SECTION-HEADER', '<SECTION-HEADER>')

    return text

## First data set: US train

In [2]:
# Read the raw US training split. The file is JSON Lines (one JSON record per
# line), hence lines=True.
new_data = pd.read_json('us_train_data_final.jsonl', lines=True)

In [3]:
# Pre-process: add a cleaned copy of every free-text column, leaving the raw
# columns untouched
new_data['clean_text'] = new_data['text'].map(clean_text)
new_data['clean_summary'] = new_data['summary'].map(clean_text)
new_data['clean_title'] = new_data['title'].map(clean_text)

In [4]:
# Write resulting dataset as JSON Lines (one record per row)
new_data.to_json('us_train_data_final_clean.jsonl', lines=True, orient='records')

In [5]:
# Read back and confirm that everything is fine: the following cells work from
# the file that was just written rather than from the in-memory frame
new_data_clean = pd.read_json('us_train_data_final_clean.jsonl', lines=True)

In [6]:
# Example: cleaned text of the first row (label 0) after the round trip
new_data_clean.loc[0,'clean_text']

'<SECTION-HEADER> SHORT TITLE. This Act may be cited as the "South Utah Valley Electric Conveyance Act". <SECTION-HEADER> DEFINITIONS. In this Act: District. The term "District" means the South Utah Valley Electric Service District, organized under the laws of the State of Utah. Electric distribution system. The term "Electric Distribution System" means fixtures, irrigation, or power facilities lands, distribution fixture lands, and shared power poles. Fixtures. The term "fixtures" means all power poles, cross-members, wires, insulators and associated fixtures, including substations, that comprise those portions of the Strawberry Valley Project power distribution system that are rated at a voltage of 12.5 kilovolts and were constructed with Strawberry Valley Project revenues. And any such fixtures that are located on Federal lands and interests in lands. Irrigation or power facilities lands. The term "irrigation or power facilities lands" means all Federal lands and interests in lands 

In [7]:
# Prepare the column that will receive the sentences: fill it with a
# placeholder 0 and give it object dtype straight away, so that each cell is
# able to hold a list
new_data_clean['clean_sentences'] = pd.Series(0, index=new_data_clean.index, dtype=object)
# Number of rows, used by the splitting cell
size_new_data_clean = len(new_data_clean)

In [8]:
# Split text into sentences
print(size_new_data_clean)
# Naive splitter: break each cleaned text on the literal ". " that clean_text
# leaves between sentences (the final period therefore stays on the last piece).
# Series.map builds the whole column in one pass and does not rely on the index
# labels being exactly 0..n-1.
new_data_clean['clean_sentences'] = new_data_clean['clean_text'].map(lambda text: text.split(". "))

28408


In [9]:
# Sanity check: inspect the sentence list of an arbitrary row (label 1000)
new_data_clean.loc[1000,'clean_sentences']

['<SECTION-HEADER> SHORT TITLE',
 'This Act may be cited as the "Legacy IRA Act"',
 '<SECTION-HEADER> TAX-FREE DISTRIBUTIONS FROM INDIVIDUAL RETIREMENT ACCOUNTS FOR CHARITABLE PURPOSES',
 'In General',
 'Paragraph (8) of section 408(d) of the Internal Revenue Code of 1986 is amended to read as follows: Distributions for charitable purposes',
 'In general',
 'No amount shall be includible in gross income by reason of a qualified charitable distribution',
 'Limitations',
 'In general',
 'The aggregate amount excluded from gross income by subparagraph (A) for a taxable year shall not exceed $400,000',
 'Organization and entity specific limitations',
 'The amount excluded from gross income by subparagraph (A) for a taxable year shall not exceed $100,000, in the case of any distribution described in subparagraph (i)(I), and $400,000, in the case of any distribution described in subparagraph (i)(II)',
 'Qualified charitable distribution',
 "For purposes of this paragraph, the term `qualified

In [10]:
# Write results: same frame plus the clean_sentences column, as JSON Lines
new_data_clean.to_json('us_train_data_final_clean_with_sentences.jsonl', lines=True, orient='records')

In [11]:
# Read one last time and confirm that everything is fine (checks that the list
# column can be loaded back from the file)
new_data_clean_with_sentences = pd.read_json('us_train_data_final_clean_with_sentences.jsonl', lines=True)

In [12]:
# Example: sentence list of the first row (label 0) as reloaded from disk
new_data_clean_with_sentences.loc[0,"clean_sentences"]

['<SECTION-HEADER> SHORT TITLE',
 'This Act may be cited as the "South Utah Valley Electric Conveyance Act"',
 '<SECTION-HEADER> DEFINITIONS',
 'In this Act: District',
 'The term "District" means the South Utah Valley Electric Service District, organized under the laws of the State of Utah',
 'Electric distribution system',
 'The term "Electric Distribution System" means fixtures, irrigation, or power facilities lands, distribution fixture lands, and shared power poles',
 'Fixtures',
 'The term "fixtures" means all power poles, cross-members, wires, insulators and associated fixtures, including substations, that comprise those portions of the Strawberry Valley Project power distribution system that are rated at a voltage of 12.5 kilovolts and were constructed with Strawberry Valley Project revenues',
 'And any such fixtures that are located on Federal lands and interests in lands',
 'Irrigation or power facilities lands',
 'The term "irrigation or power facilities lands" means all Fed

## Second data set: US test

In [13]:
# Read the raw US test split. The file is JSON Lines (one JSON record per
# line), hence lines=True.
new_data_2 = pd.read_json('us_test_data_final.jsonl', lines=True)

In [14]:
# Pre-process: add a cleaned copy of every free-text column, leaving the raw
# columns untouched
new_data_2['clean_text'] = new_data_2['text'].map(clean_text)
new_data_2['clean_summary'] = new_data_2['summary'].map(clean_text)
new_data_2['clean_title'] = new_data_2['title'].map(clean_text)

In [15]:
# Write resulting dataset as JSON Lines (one record per row)
new_data_2.to_json('us_test_data_final_clean.jsonl', lines=True, orient='records')

In [16]:
# Read back and confirm that everything is fine: the following cells work from
# the file that was just written rather than from the in-memory frame
new_data_2_clean = pd.read_json('us_test_data_final_clean.jsonl', lines=True)

In [17]:
# Example: cleaned text of the first row (label 0) after the round trip
new_data_2_clean.loc[0,'clean_text']

'<SECTION-HEADER> SHORT TITLE. This Act may be cited as the "Public Safety Officers\' Benefits Improvement Act of 2016". <SECTION-HEADER> REPORTS. Section 1205 of title I of the Omnibus Crime Control and Safe Streets Act of 1968 is amended in subsection (a), by inserting "Rules, regulations, and procedures issued under this part may include regulations based on standards developed by another Federal agency for programs related to public safety officer death or disability claims." before the last sentence, in subsection (b) by inserting "(1)" before "In making". And by adding at the end the following: In making a determination under section 1201, the Bureau shall give substantial weight to the evidence and all findings of fact presented by a State, local, or Federal administrative or investigative agency regarding eligibility for death or disability benefits.". And by adding at the end the following: (1)(A) Not later than 30 days after the date of enactment of this subsection, the Burea

In [18]:
# Prepare the column that will receive the sentences: fill it with a
# placeholder 0 and give it object dtype straight away, so that each cell is
# able to hold a list
new_data_2_clean['clean_sentences'] = pd.Series(0, index=new_data_2_clean.index, dtype=object)
# Number of rows, used by the splitting cell
size_new_data_2_clean = len(new_data_2_clean)

In [19]:
# Split text into sentences
print(size_new_data_2_clean)
# Naive splitter: break each cleaned text on the literal ". " that clean_text
# leaves between sentences (the final period therefore stays on the last piece).
# Series.map builds the whole column in one pass and does not rely on the index
# labels being exactly 0..n-1.
new_data_2_clean['clean_sentences'] = new_data_2_clean['clean_text'].map(lambda text: text.split(". "))

5014


In [20]:
# Sanity check: inspect the sentence list of an arbitrary row (label 1000)
new_data_2_clean.loc[1000,'clean_sentences']

['<SECTION-HEADER> SHORT TITLE',
 'This Act may be cited as the "Veterans Advocacy Act of 2007"',
 '<SECTION-HEADER> PILOT PROGRAM ON PROVISION OF LEGAL ASSISTANCE TO ASSIST VETERANS AND MEMBERS OF THE ARMED FORCES RECEIVE HEALTH CARE, BENEFITS, AND SERVICES',
 'Pilot Program Required',
 'In general',
 'The Secretary of Veterans Affairs shall carry out a pilot program to assess the feasibility and advisability of utilizing eligible entities to provide legal services to assist veterans and members of the Armed Forces in applying for and receiving health care, benefits, and services',
 'Consultation',
 'The Secretary of Veterans Affairs shall carry out the pilot program in consultation with the Secretary of Defense',
 'Grants',
 'In general',
 'The Secretary of Veterans Affairs shall carry out the pilot program through the award of grants to eligible entities selected by the panel established in accordance with subsection (d)(1) for the provision of legal services at no cost to members o

In [21]:
# Write results: same frame plus the clean_sentences column, as JSON Lines
new_data_2_clean.to_json('us_test_data_final_clean_with_sentences.jsonl', lines=True, orient='records')

In [22]:
# Read one last time and confirm that everything is fine (checks that the list
# column can be loaded back from the file)
new_data_2_clean_with_sentences = pd.read_json('us_test_data_final_clean_with_sentences.jsonl', lines=True)

In [23]:
# Example: sentence list of the first row (label 0) as reloaded from disk
new_data_2_clean_with_sentences.loc[0,"clean_sentences"]

['<SECTION-HEADER> SHORT TITLE',
 'This Act may be cited as the "Public Safety Officers\' Benefits Improvement Act of 2016"',
 '<SECTION-HEADER> REPORTS',
 'Section 1205 of title I of the Omnibus Crime Control and Safe Streets Act of 1968 is amended in subsection (a), by inserting "Rules, regulations, and procedures issued under this part may include regulations based on standards developed by another Federal agency for programs related to public safety officer death or disability claims." before the last sentence, in subsection (b) by inserting "(1)" before "In making"',
 'And by adding at the end the following: In making a determination under section 1201, the Bureau shall give substantial weight to the evidence and all findings of fact presented by a State, local, or Federal administrative or investigative agency regarding eligibility for death or disability benefits."',
 'And by adding at the end the following: (1)(A) Not later than 30 days after the date of enactment of this subse

## Third data set: CA test

In [24]:
# Read the raw CA test split. The file is JSON Lines (one JSON record per
# line), hence lines=True.
new_data_3 = pd.read_json('ca_test_data_final.jsonl', lines=True)

In [25]:
# Pre-process: add a cleaned copy of every free-text column, leaving the raw
# columns untouched
new_data_3['clean_text'] = new_data_3['text'].map(clean_text)
new_data_3['clean_summary'] = new_data_3['summary'].map(clean_text)
new_data_3['clean_title'] = new_data_3['title'].map(clean_text)

In [26]:
# Write resulting dataset as JSON Lines (one record per row)
new_data_3.to_json('ca_test_data_final_clean.jsonl', lines=True, orient='records')

In [27]:
# Read back and confirm that everything is fine: the following cells work from
# the file that was just written rather than from the in-memory frame
new_data_3_clean = pd.read_json('ca_test_data_final_clean.jsonl', lines=True)

In [28]:
# Example: cleaned text of the first row (label 0) after the round trip
new_data_3_clean.loc[0,'clean_text']

'The people of the State of California do enact as follows: <SECTION-HEADER> The Legislature finds and declares all of the following: (1) Since 1899 congressionally chartered veteransâ€™ organizations have provided a valuable service to our nationâ€™s returning service members. These organizations help preserve the memories and incidents of the great hostilities fought by our nation, and preserve and strengthen comradeship among members. These veteransâ€™ organizations also own and manage various properties including lodges, posts, and fraternal halls. These properties act as a safe haven where veterans of all ages and their families can gather together to find camaraderie and fellowship, share stories, and seek support from people who understand their unique experiences. This aids in the healing process for these returning veterans, and ensures their health and happiness. As a result of congressional chartering of these veteransâ€™ organizations, the United States Internal Revenue Ser

In [29]:
# Prepare the column that will receive the sentences: fill it with a
# placeholder 0 and give it object dtype straight away, so that each cell is
# able to hold a list
new_data_3_clean['clean_sentences'] = pd.Series(0, index=new_data_3_clean.index, dtype=object)
# Number of rows, used by the splitting cell
size_new_data_3_clean = len(new_data_3_clean)

In [30]:
# Split text into sentences
print(size_new_data_3_clean)
# Naive splitter: break each cleaned text on the literal ". " that clean_text
# leaves between sentences (the final period therefore stays on the last piece).
# Series.map builds the whole column in one pass and does not rely on the index
# labels being exactly 0..n-1.
new_data_3_clean['clean_sentences'] = new_data_3_clean['clean_text'].map(lambda text: text.split(". "))

1237


In [31]:
# Sanity check: inspect the sentence list of an arbitrary row (label 1000)
new_data_3_clean.loc[1000,'clean_sentences']

['The people of the State of California do enact as follows: <SECTION-HEADER> Section 56804 of the Government Code is amended to read: 56804',
 'For any proposal that includes a disincorporation, the executive officer shall prepare, or cause to be prepared by contract, a comprehensive fiscal analysis',
 'This analysis shall become part of the report required pursuant to Section 56665',
 'Data used for the analysis shall be from the most recent fiscal year for which data is available, preceding the issuances of the certificate of filing',
 'When data requested by the executive officer in the notice to affected agencies, pursuant to paragraph (2) of subdivision (b) of Section 56658, is unavailable, the analysis shall document the source and methodology of the data used',
 'The analysis shall review and document each of the following: The direct and indirect costs incurred by the city proposed for disincorporation for providing public services during the three fiscal years immediately pre

In [32]:
# Write results: same frame plus the clean_sentences column, as JSON Lines
new_data_3_clean.to_json('ca_test_data_final_clean_with_sentences.jsonl', lines=True, orient='records')

In [33]:
# Read one last time and confirm that everything is fine (checks that the list
# column can be loaded back from the file)
new_data_3_clean_with_sentences = pd.read_json('ca_test_data_final_clean_with_sentences.jsonl', lines=True)

In [34]:
# Example: sentence list of the first row (label 0) as reloaded from disk
new_data_3_clean_with_sentences.loc[0,"clean_sentences"]

['The people of the State of California do enact as follows: <SECTION-HEADER> The Legislature finds and declares all of the following: (1) Since 1899 congressionally chartered veteransâ€™ organizations have provided a valuable service to our nationâ€™s returning service members',
 'These organizations help preserve the memories and incidents of the great hostilities fought by our nation, and preserve and strengthen comradeship among members',
 'These veteransâ€™ organizations also own and manage various properties including lodges, posts, and fraternal halls',
 'These properties act as a safe haven where veterans of all ages and their families can gather together to find camaraderie and fellowship, share stories, and seek support from people who understand their unique experiences',
 'This aids in the healing process for these returning veterans, and ensures their health and happiness',
 'As a result of congressional chartering of these veteransâ€™ organizations, the United States Inte

In [35]:
# A few weird symbols...
# The raw 'text' column (before clean_text is applied) already contains
# sequences such as "â€™", so they are present in the loaded data and are not
# introduced by the cleaning step.
# NOTE(review): looks like UTF-8 punctuation decoded with a single-byte codec
# somewhere upstream - confirm against the source file.
new_data_3.loc[0,'text']

'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nThe Legislature finds and declares all of the following:\n(a) (1) Since 1899 congressionally chartered veteransâ€™ organizations have provided a valuable service to our nationâ€™s returning service members. These organizations help preserve the memories and incidents of the great hostilities fought by our nation, and preserve and strengthen comradeship among members.\n(2) These veteransâ€™ organizations also own and manage various properties including lodges, posts, and fraternal halls. These properties act as a safe haven where veterans of all ages and their families can gather together to find camaraderie and fellowship, share stories, and seek support from people who understand their unique experiences. This aids in the healing process for these returning veterans, and ensures their health and happiness.\n(b) As a result of congressional chartering of these veteransâ€™ organizations, the United States Inter