In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution, so edits to library code are picked up without a restart.
%reload_ext autoreload
%autoreload 2

# Legal and Regulatory Analysis

In 2024, Senator Roger Wicker [published a report](https://insidedefense.com/sites/insidedefense.com/files/documents/2024/dec/12192024_wicker1.pdf) on strengthening the defense industrial base and reforming defense acquisition.  The latter involves cutting red tape by streamlining regulations.  

In this tutorial, we will analyze parts of the Federal Acquisition Regulations (FAR) to identify which parts of it are driven by statutory requirements.


For illustration purposes, we will focus our analysis on Part 9 of the FAR: *contractor qualifications*.


In [None]:
# | notest

# File-name prefixes of the FAR parts to analyze ("9." corresponds to Part 9).
part_prefixes = ["9."]


In [None]:
# | notest

from onprem import LLM
from onprem.ingest import load_single_document, extract_files
from onprem import utils as U
from tqdm import tqdm

import pandas as pd


# Remove the column-width limit so long cell values are displayed in full rather than truncated.
pd.set_option('display.max_colwidth', None)

## STEP 1: Download the Data

We will first download the HTML version of the FAR.

In [None]:
# | notest

import os
import tempfile
import zipfile

# URL of the ZIP archive containing the HTML edition of the FAR
url = "https://www.acquisition.gov/sites/default/files/current/far/zip/html/FARHTML.zip"

# Scratch directory holding both the archive and its extracted contents.
# It is deliberately not deleted here: later cells read the extracted files.
temp_dir = tempfile.mkdtemp()
zip_path = os.path.join(temp_dir, "FARHTML.zip")

# Download the ZIP file
U.download(url, zip_path, verify=True)

# Extract the ZIP file next to the archive
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(temp_dir)

print(f"\nFiles extracted to: {temp_dir}")

# List the extracted tree once and reuse it for both the filter and the total count.
all_files = list(extract_files(temp_dir))

# str.startswith accepts a tuple of prefixes, so no per-prefix loop is needed.
# The selection is sorted so that the sample below and all downstream
# processing order are reproducible from run to run.
prefixes = tuple(part_prefixes)
filenames = sorted(
    fname
    for fname in all_files
    if fname.lower().endswith('.html') and os.path.basename(fname).startswith(prefixes)
)
print(f'Total files: {len(all_files)}')
print(f'Number of files of interest: {len(filenames)}')
print('Sample:')
for fname in filenames[:5]:
    print(f'\t{fname}')

[██████████████████████████████████████████████████]
Files extracted to: /tmp/tmp3xa2rj1w
Total files: 3900
Number of files of interest: 106
Sample:
	/tmp/tmp3xa2rj1w/dita_html/9.406-3.html
	/tmp/tmp3xa2rj1w/dita_html/9.505-4.html
	/tmp/tmp3xa2rj1w/dita_html/9.201.html
	/tmp/tmp3xa2rj1w/dita_html/9.406-5.html
	/tmp/tmp3xa2rj1w/dita_html/9.104-1.html


## STEP 2: Text Extraction

We'll extract text from each of the HTML files.

In [None]:
# | notest

# Build a lookup from each file's base name (e.g. '9.505-4.html') to the text
# of the first document returned by the loader for that file.
content = {}
for filename in tqdm(filenames, total=len(filenames)):
    documents = load_single_document(filename)
    text = documents[0].page_content
    section_key = os.path.basename(filename)
    content[section_key] = text

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 106/106 [00:06<00:00, 16.76it/s]


In [None]:
# | notest

# Spot-check the extracted text of one section to confirm the extraction step worked.
print(content['9.505-4.html'])

9.505-4 Obtaining access to proprietary information.

(a) When a contractor requires proprietary information from others to perform a Government contract and can use the leverage of the contract to obtain it, the contractor may gain an unfair competitive advantage unless restrictions are imposed. These restrictions protect the information and encourage companies to provide it when necessary for contract performance. They are not intended to protect information-

(1) Furnished voluntarily without limitations on its use; or

(2) Available to the Government or contractor from other sources without restriction.

(b) A contractor that gains access to proprietary information of other companies in performing advisory and assistance services for the Government must agree with the other companies to protect their information from unauthorized use or disclosure for as long as it remains proprietary and refrain from using the information for any purpose other than that for which it was furnished.

## STEP 3: Setup LLM and Test Prompt

Next, we will set up the LLM, construct a prompt for this task, and test it on a small sample of passages from the FAR.

Since the FAR is a publicly available document, we will use a cloud LLM (i.e., `gpt-4o-mini`) for this task.

In [None]:
# | notest

# Cloud-hosted gpt-4o-mini; temperature is pinned to 0 for this extraction task.
llm = LLM(
    model_url='openai://gpt-4o-mini',
    mute_stream=True,
    temperature=0,
)



In [None]:
# | notest

# Few-shot prompt template for statute extraction.
# The '--CONTENT--' placeholder is replaced with the text of a FAR section
# before the prompt is sent to the model; the model is expected to answer
# with one statute per line, or the single token NA.
prompt = """
Given text from the Federal Acquisition Regulations (FAR), extract a list of explicitly cited statutes.
If there are no explicitly cited statutes,  return NA.  If there are, return a list of cited statutes with each statute on a separate line.  
Do not include references to the FAR itself which are numbers with dots or dashes (e.g., 1.102-1, 3.104).

# Example 1:

<TEXT>
(2)A violation, as determined by the Secretary of Commerce, of any agreement of the group known as the "Coordination Committee" for purposes of the Export Administration Act of 1979 (50 U.S.C. App. 2401, et seq.) or any similar bilateral or multilateral export control agreement.

<STATUTES>
50 U.S.C. App. 2401 

# Example 2:

<TEXT>
9.400 Scope of subpart.
(a) This subpart-

(1) Prescribes policies and procedures governing the debarment and suspension of contractors by agencies for the causes given in 9.406-2 and 9.407-2;

(2) Provides for the listing of contractors debarred, suspended, proposed for debarment, and declared ineligible (see the definition of "ineligible" in 2.101); and

(3) Sets forth the consequences of this listing.

<STATUTES>

NA

# Example 3:

<TEXT>

--CONTENT--

<STATUTES>
"""

In [None]:
# | notest

# Dry run of the prompt on a handful of Part 9 sections before processing everything.
samples = [ '9.104-1.html', '9.104-2.html', '9.104-3.html', '9.104-4.html', '9.104-5.html', '9.104-6.html', '9.104-7.html']
results = []
for sample in samples:
    output = llm.prompt(prompt.replace('--CONTENT--', content[sample]))
    # Strip every line BEFORE filtering: otherwise a padded "NA" (e.g. "NA " or
    # "NA\r") slips past the check and blank lines are recorded as statutes.
    cited = [line.strip() for line in output.splitlines()]
    results.extend((sample, statute) for statute in cited if statute and statute != 'NA')

In [None]:
# | notest

# Tabulate the (section, statute) pairs from the sample run and preview the first rows.
df = pd.DataFrame(
    results,
    columns=['Section', 'Statute'],
)
df.head()

Unnamed: 0,Section,Statute
0,9.104-5.html,Pub. L. 113-235
1,9.104-6.html,41 U.S.C. 2313(d)(3)
2,9.104-7.html,Pub. L. 113-235


## STEP 4: Run Analyses on FAR



In [None]:
# | notest

# Run the extraction prompt over every selected FAR section and collect
# (file name, cited statute) pairs.
results = []
for k in tqdm(content, total=len(content)):
    output = llm.prompt(prompt.replace('--CONTENT--', content[k]))
    # Strip every line BEFORE filtering: otherwise a padded "NA" (e.g. "NA " or
    # "NA\r") slips past the check and blank lines are recorded as statutes.
    cited = [line.strip() for line in output.splitlines()]
    results.extend((k, statute) for statute in cited if statute and statute != 'NA')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 106/106 [00:55<00:00,  1.90it/s]


In [None]:
# | notest

# One row per (FAR section, cited statute) pair, ordered by section file name.
df = (
    pd.DataFrame(results, columns=['FAR Section', 'Cited Statute'])
    .sort_values(by='FAR Section')
)
df.head(50)

Unnamed: 0,FAR Section,Cited Statute
46,9.103.html,15 U.S.C. 637
31,9.104-5.html,Pub. L. 113-235
7,9.104-6.html,41 U.S.C. 2313
0,9.104-7.html,Pub. L. 113-235
27,9.105-2.html,Pub. L. 111-212
44,9.106-4.html,15 U.S.C. 637
28,9.107.html,41 U.S.C. chapter 85
4,9.108-1.html,6 U.S.C. 395(c)
3,9.108-1.html,6 U.S.C. 395(b)
25,9.108-2.html,Pub. L. 110-161
