# Use LLM to extract certain data based on definition

This notebook reads in selected state bills and uses an LLM to extract specific data based on a set of definitions, e.g. whether a bill's scope covers the government's use of AI/ADS.

## Import

In [None]:
import os
import re
import glob
import shutil

import time
import json
import pickle

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm


In [None]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document

In [None]:
# Load the IPython `dotenv` extension and run it so variables from a local
# `.env` file are available (presumably the API key for the LLM — confirm).
%load_ext dotenv
%dotenv

In [None]:
# Plain-text layout used to describe a single bill.
_BILL_SUMMARY_TEMPLATE = '''\
Bill ID: {bill_id}
Jurisdiction: {jurisdiction}
Session: {session}
Bill progress: {progress} %
Summary:
{openai_summary}

'''

# Fields substituted into the template, in the order they appear.
_summary_columns = ['bill_id', 'jurisdiction', 'session', 'progress', 'openai_summary']


def _render_bill_summary(row):
    """Fill the bill template with the values of one row."""
    return _BILL_SUMMARY_TEMPLATE.format(**row)


# One formatted text block per bill, indexed by `openstates_id`.
# NOTE(review): `procsum` is not defined in the visible cells — confirm where it is loaded.
summary_texts = (
    pd.DataFrame(procsum)
    .set_index('openstates_id')
    .loc[:, _summary_columns]
    .apply(_render_bill_summary, axis=1)
)

In [None]:
# Which topic-model run to read, and how far above chance a bill's topic
# probability must be for the bill to count as belonging to that topic.
topic_source = 'zeroshotv1-bertopics'
topic_file = f'data/proc/01-topics/{topic_source}/doc-topic-prob.csv'
thres_over_chance = 3
max_prob_thres = 0.9

# Bill-by-topic probability table, restricted to (and ordered like) the bills in `df`.
df_ids = df['openstates_id']
topic_df = (
    pd.read_csv(topic_file)
    .set_index('openstates_id')
    .loc[df_ids]
    .drop(columns=['bill_id'])
)
# Chance level is a uniform split over the topic columns; the threshold is a
# multiple of chance, capped at `max_prob_thres`.
chance_level = 1 / len(topic_df.columns)
thres_prob = min(thres_over_chance * chance_level, max_prob_thres)

# topic name -> list of openstates ids whose probability exceeds the threshold.
topic_dict = (
    topic_df
    .melt(
        var_name='topic',
        value_name='prob',
        ignore_index=False
    )
    .query('prob > @thres_prob')
    .reset_index()
    .groupby('topic')
    ['openstates_id']
    .agg(list)
    .to_dict()
)
topics = list(topic_dict.keys())

# A plain set union also works when no bill passes the threshold
# (`np.concatenate` raises ValueError on an empty list of arrays).
num_bills_with_topic = len(set().union(*topic_dict.values()))

print(f'''\
Topic source = "{topic_source}" 
Number of topics = {len(topics)}
Number of bills from `df` = {len(df)}
Number of bills after applying topics threshold = {num_bills_with_topic}\
''')

In [None]:
# Which topic-model run to read, and how far above chance a bill's topic
# probability must be for the bill to count as belonging to that topic.
topic_source = 'zeroshotv2-bertopics'
topic_file = f'data/proc/01-topics/{topic_source}/doc-topic-prob.csv'
thres_over_chance = 3
max_prob_thres = 0.7

# Bill-by-topic probability table, restricted to (and ordered like) the bills in `df`.
df_ids = df['openstates_id']
topic_df = (
    pd.read_csv(topic_file)
    .set_index('openstates_id')
    .loc[df_ids]
    .drop(columns=['bill_id'])
)
# Chance level is a uniform split over the topic columns; the threshold is a
# multiple of chance, capped at `max_prob_thres`.
chance_level = 1 / len(topic_df.columns)
thres_prob = min(thres_over_chance * chance_level, max_prob_thres)

# topic name -> list of openstates ids whose probability exceeds the threshold.
topic_dict = (
    topic_df
    .melt(
        var_name='topic',
        value_name='prob',
        ignore_index=False
    )
    .query('prob > @thres_prob')
    .reset_index()
    .groupby('topic')
    ['openstates_id']
    .agg(list)
    .to_dict()
)
topics = list(topic_dict.keys())

# A plain set union also works when no bill passes the threshold
# (`np.concatenate` raises ValueError on an empty list of arrays).
num_bills_with_topic = len(set().union(*topic_dict.values()))

print(f'''\
Topic source = "{topic_source}" 
Number of topics = {len(topics)}
Number of bills from `df` = {len(df)}
Number of bills after applying topics threshold = {num_bills_with_topic}\
''')

In [None]:
# Upper bound on how many bill summaries are packed into a single Document.
max_bills_per_chunk = 40
topic_docs = {}

for topic, os_ids in topic_dict.items():
    # Split the topic's bills into near-equal batches of at most
    # `max_bills_per_chunk` ids each.
    num_bills = len(os_ids)
    num_batches = int(np.ceil(num_bills / max_bills_per_chunk))
    batch_ids = np.array_split(np.array(os_ids), num_batches)

    batch_docs = []
    for ids in batch_ids:
        # One Document per batch: the batch's formatted summaries joined by newlines.
        sum_texts = '\n'.join(summary_texts.loc[ids])
        batch_docs.append(Document(
            page_content=sum_texts,
            metadata={
                'topic': topic,
                'topic_source': topic_source,
                'openstates_ids': ids,
            },
        ))
    topic_docs[topic] = batch_docs

In [None]:
# Number of Documents (batches) built for each topic.
{name: len(batches) for name, batches in topic_docs.items()}

In [None]:
# Summarise every topic with a "refine" summarisation chain: one result dict
# per topic is appended to `topic_summaries`.
# NOTE(review): `llm` is not assigned in this cell; in the visible notebook it is
# only bound as the loop variable of `for llm in llms:` — confirm cell order.
topic_summaries = []
for topic, docs in tqdm(topic_docs.items(), total=len(topic_docs)):
    # Initial prompt: the topic name is baked in with `%`, `{text}` is left for
    # the prompt template to fill.
    prompt_template = """You are a helpful assistant for legislators and researchers. 
    Write a concise summary about the topic %s of the bills, and if needed give examples of bills:
    {text}
    CONCISE SUMMARY:""" %(topic)
    prompt = PromptTemplate.from_template(prompt_template)

    # Refinement prompt with `{existing_answer}` and `{text}` placeholders.
    # NOTE(review): adjacent literals are concatenated with no separator, so
    # several sentences run together (e.g. "summary(only if needed)").
    refine_template = (
        "Your job is to produce a final summary of the bills about the topic " + topic + "\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary"
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary."
        "If the context isn't useful, return the original summary."
        "If needed, provide some bill examples."
        "The final output should have between 10 to 20 sentences."
    )

    refine_prompt = PromptTemplate.from_template(refine_template)
    chain = load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=prompt,
        refine_prompt=refine_prompt,
        return_intermediate_steps=True,
        input_key="input_documents",
        output_key="output_text",
    )
    # Run the chain over this topic's Documents and keep the full result dict.
    result = chain({"input_documents": docs})
    topic_summaries.append(dict(
        topic = topic,
        topic_source = topic_source,
        result = result
    ))

In [None]:
# Flatten the nested result dicts (nested keys joined with "_"), drop the two
# bulky per-document columns, and write the table for the current topic source.
_summary_table = pd.json_normalize(topic_summaries, sep='_')
_summary_table = _summary_table.drop(
    columns=['result_input_documents', 'result_intermediate_steps']
)
_summary_table.to_json(f'data/proc/02-summary/summary-topic-{topic_source}.json')

In [None]:
# Display the bill table for inspection.
df

In [None]:
# List the contents of the summary output directory.
!ls data/proc/02-summary/

In [None]:
# Print each topic name in upper case, then its final summary and a blank line.
for x in topic_summaries:
    print(x['topic'].upper(), x['result']['output_text'], '', sep='\n')

In [None]:
# List the individual files written under the summary output directory.
!ls data/proc/02-summary/*


In [None]:
# Saved per-topic summaries, one file per topic source.
_topic_summary_files = [
    'data/proc/02-summary/summary-topic-zeroshotv1-bertopics.json',
    'data/proc/02-summary/summary-topic-zeroshotv2-bertopics.json',
]

# Stack both sources into one table with a fresh 0..n-1 index.
ts_df = pd.concat(
    [pd.read_json(path) for path in _topic_summary_files],
    ignore_index=True,
)

In [None]:
# Look at the summary text stored under index label 10.
ts_df.loc[10, 'result_output_text']

In [None]:
# YAML front matter placed at the top of the generated report (page geometry,
# fonts and LaTeX header includes). Kept as a raw string so the LaTeX
# backslashes are not treated as escapes.
yaml_template = r'''---
geometry:
    - margin=0.5in
output: pdf_document
colorlinks: true
fontsize: 9pt
toc: false
urlcolor: "violet"
header-includes:
    - \usepackage{titling}
    - \setlength{\droptitle}{-7em}
    - \pagenumbering{gobble}
    - \setlength{\parindent}{0em}
    - \usepackage{sansmathfonts}
    - \usepackage[T1]{fontenc}
    - \renewcommand*\familydefault{\sfdefault} 
    - \usepackage{wrapfig}
    - \usepackage{booktabs}
    - \usepackage[export]{adjustbox}
    - \newcommand{\forceindent}{\leavevmode{\parindent=1em\indent}}
    - \usepackage{fvextra}
    - \DefineVerbatimEnvironment{Highlighting}{Verbatim}{breaklines,commandchars=\\\{\}}
---
'''

# Fixed "Method" section of the report; the per-topic summaries are appended
# to `text` afterwards. (Spelling of "formatting" in the closing note fixed.)
text = yaml_template + '''\

# Demonstrative summary of bills (from 2023-2024) based on topics

## Method

### Initial summary

These are early summarization attempts of bills using OpenAI ChatGPT with the following prompt:

> *You are a helpful assistant for legislators and researchers.*
>
> *Write a concise bullet point summary of the following text, keep it maximum 10 bullet points and do not repeat:*
>
> `TEXT`

### Topic summary

Because each bill can usually be assigned a topic (or a few) using topic modelling, we can summarize the resulting summaries for each topic, with some "targeting/focusing".

Below is part of an example starting prompt:

> *You are a helpful assistant for legislators and researchers*.
>
> *Write a concise summary about the topic **Algorithmic Discrimination or Auditing** of the bills, and if needed give examples of bills*.
> 
> `TEXT`

Because of the limits of context for using LLM, we'd have to split into chunks of text, summarize them (i.e. summarize subsets of initial summaries), and refine the process.

Below is the prompt for refinement, also topic-focused:

> *Your job is to produce a final summary of the bills about the topic **Algorithmic Discrimination or Auditing***.
>
> *We have provided an existing summary up to a certain point*: `EXISTING_ANSWER`.
>
> *We have the opportunity to refine the existing summary (only if needed) with some more context below.*
>
> *Given the new context, refine the original summary.*
>
> `TEXT`
>
> *If the context isn't useful, return the original summary.*
>
> *If needed, provide some bill examples.*
>
> *The final output should have between 10 to 20 sentences.*

Note: some of the formatting and variables of the actual prompts have been changed here to highlight the ideas of the prompts being used, not the exact copies of the prompts.

'''

# Short provenance note printed under each topic-source heading.
source_explanations = {
    'zeroshotv1-bertopics': 'These topics are curated from Citris Policy Lab, with some modifications from early iteration of our AI legislative tracker.',
    'zeroshotv2-bertopics': 'These topics are curated from our ongoing Scorecard Development categories.'
}

# Identifier-like tokens in the summaries are wrapped in backticks so they
# render as code: two capitals, "_", a bracketed part, "_", then a capitalised
# token. Presumably bill ids such as `CA_[2023-2024]_AB 331` — confirm.
# Compiled once instead of on every row.
bill_id_pattern = re.compile(r'([A-Z]{2}_\[[^\]]+\]_[A-Z][-_\s]?\w+)')

# Collect the pieces and join once at the end.
sections = []
for source in ts_df['topic_source'].unique():
    # A source without a registered explanation gets an empty note instead of
    # aborting the whole report with a KeyError.
    explanation = source_explanations.get(source, '')
    sections.append(f'## Topic source from: `{source}`\n{explanation}\n\n')

    # Re-number topics from 1 within each source.
    source_rows = ts_df.query('topic_source == @source').reset_index(drop=True)
    for i, row in source_rows.iterrows():
        sections.append(f'### {i+1}. {row["topic"]}\n\n')
        sections.append(
            bill_id_pattern.sub(r'`\1`', row['result_output_text']) + '\n\n'
        )

text += ''.join(sections)

print(text)

## Read and select data

The following selects curated data (i.e. bills that have met a certain threshold of AI/ADS-related words, usually relative to the length of the document). However, to widen the search, the code below also includes bills not marked as curated by that threshold, as long as they contain `artificial intelligence` or `automat(ed|ic) decision`. Additionally, only **state** bills proposed in 2023-2024 are selected.

In [None]:
# Load the bill records and lowercase the full bill text in one pass.
df = pd.read_json('data/bill_data.json').assign(
    text=lambda frame: frame['text'].str.lower()
)


In [None]:
# For every bill, pool the values of all `query_*` columns into one
# space-separated string of unique terms.
_query_terms = df.filter(regex='^query_').agg(
    lambda row: ' '.join(set(np.concatenate(list(row)))),
    axis=1
)

# Flag the bill when the pooled terms mention "artificial intelligence" or an
# "automat… decision" phrase.
df['has_ai_ads'] = _query_terms.str.extract(
    r'(artificial intelligence|automat\w* decision)',
    expand=False
).notna()

In [None]:
# Keep bills that are curated or flagged by `has_ai_ads`, whose `first_date`
# contains "2023" or "2024", and whose jurisdiction code is not "US".
# The multi-line expression is flattened to a single line before being passed
# to `query`; the index is then renumbered.
df = (
    df
    .query(
        '''
        (curated or has_ai_ads)
        and
        first_date.str.contains("2023|2024", regex=True)
        and 
        jurisdiction_code != "US"
    '''.replace('\n',' ').strip()
    )
    .reset_index(drop=True)
)

# Number of bills left after filtering.
len(df)

## Split text

Some bills are quite long, so break them into chunks.

In [None]:
# Splitter used to break long bill texts into chunks of `chunk_size` with
# `chunk_overlap` shared between neighbouring chunks (units are tokens, per
# the TokenTextSplitter name — confirm against the LangChain docs).
text_splitter = TokenTextSplitter(
    chunk_size=10000,
    chunk_overlap=200,
)

In [None]:
# Long free-text columns that are left out of the per-bill metadata.
_text_columns = {'text', 'raw_text', 'abstract'}

# One entry per bill: its id, the chunked text, the remaining columns as
# metadata, and the number of chunks.
docs = []
for _, row in df.iterrows():
    text = row['text']
    metadata = {k: v for k, v in row.items() if k not in _text_columns}

    chunks = text_splitter.split_documents([Document(page_content=text)])
    docs.append({
        'bill_id': metadata['bill_id'],
        'chunks': chunks,
        'metadata': metadata,
        'num_chunks': len(chunks),
    })


In [None]:
# (number of bills, total number of chunks across all bills)
len(docs), sum(len(bill['chunks']) for bill in docs)

## Definitions

These are some definitions and instructions for handling them.

In [None]:
# Variables the LLM is asked to extract from each bill. Each entry carries:
#   name        - the JSON key expected in the model output
#   description - the definition shown to the model
#   instruction - how the model should answer, plus the name of the extra
#                 variable to emit when the answer is "Yes"
# These strings are inserted verbatim into the prompts; typos in the governance
# body description and the harmonization instruction have been corrected.
definitions = [
    dict(
        name = "has_government_scope",
        description = '''\
Indicates whether the bill has government scope: \
a bill has government scope if it governs the government's use of artificial intelligence (AI) or \
automated decision systems (ADS) in its operations. \
This scope specifically focuses on the **government**'s use and procurement of these technologies.''',
        instruction = '''\
- First, answer only "Yes" or "No".
- If "Yes", also include 1-2 sentence excerpts from the text to support the government scope, \
label variable as `excerpt_government_scope`.'''
    ),
    dict(
        name = "has_ai_governance_body",
        description = '''\
Indicates whether the bill designates, indicates or establishes an AI governance body: \
an AI governance body is a group of people within a government entity or organization that \
has the authority to manage and oversee the use of AI or ADS by that entity or organization.''',
        instruction = '''\
- First, answer only "Yes" or "No".
- If "Yes", also include the name(s) of the governance body, \
label variable as `ai_governance_body_names`.'''
    ),
    dict(
        name = "has_harmonization",
        description = '''\
Indicates whether the bill outlines intent or strategy to harmonize legislation between state and federal government. \
Harmonization is defined as cooperation between different state and federal jurisdictions \
to make laws identical or at least more similar.''',
        instruction = '''\
- First, answer only "Yes" or "No".
- If "Yes", also include 1-2 sentence excerpts from the text to support existence of harmonization, \
label variable as `excerpt_harmonization`.'''
    ),
]

definitions

In [None]:
# Layout of one numbered definition entry inside the prompt.
_definition_template = '''\
{index}. `{name}`: {description}

Instruction: 
{instruction}
'''

# Numbered list (starting at 1) of all definitions, as inserted into the prompts.
def_str = '\n'.join(
    _definition_template.format(index=position, **definition)
    for position, definition in enumerate(definitions, start=1)
)

print(def_str)

## Create LLM chain

Note: Technically there are probably better ways to do this, such as structured data extraction with `pydantic`.

The following uses summary chain with refinement instead as some documents are long and broken into chunks.

In [None]:
# note: more models can be input here if need be
llms = [
    ChatOpenAI(
        temperature=0,
        model_name="gpt-3.5-turbo-0125",
    ),
]

In [None]:
# Initial extraction prompt. The definitions text is baked in with `%`, while
# `{text}` is left as a placeholder for the prompt template.
prompt_template = """You are a helpful assistant for legislators, researchers and lawyers.
You are given a task to read a bill and extract necessary information from them.
Below are the variables and instructions:

%s 

Use only the definitions and follow instructions here.
Only use the existing text as reference. Do not make things up.
If you know an answer is empty, just use an empty string "". 
If you do not know an answer for a variable, just answer as "unknown".

Please output as a JSON format.

Here is the text:

{text}

JSON_OUTPUT:""" %(def_str)
prompt = PromptTemplate.from_template(prompt_template)

# Refinement prompt with `{existing_answer}` and `{text}` placeholders; the
# definitions text is appended at the end via `%`.
# NOTE(review): the backslash continuation joins "JSON output" and
# "(only if needed)" without a space, and "Your job is extract" lacks "to".
refine_template = '''
Your job is extract necessary information about a bill.
We have provided an existing JSON output up to a certain point: 

```json
{existing_answer}
```

We have the opportunity to refine the existing JSON output\
(only if needed) with some more context below.
------------
{text}
------------
Given the new context, refine the original JSON output.
If the context isn't useful, return the original JSON output.

The final answer should be a JSON output with only these variables:

%s
''' %(def_str)

refine_prompt = PromptTemplate.from_template(refine_template)


# Build one refine-style extraction chain per configured model, labelled with
# the model's name so outputs can be keyed by (model, bill_id).
chains = []
for llm in llms:
    # Resolve the label afresh for every model: prefer `model_name`, then
    # `model`, and fall back to the class name. Previously an LLM exposing
    # neither attribute reused the previous iteration's label (or raised
    # NameError on the first iteration).
    model_source = (
        getattr(llm, 'model_name', None)
        or getattr(llm, 'model', None)
        or type(llm).__name__
    )
    chains.append(dict(
        model = model_source,
        chain = load_summarize_chain(
            llm=llm,
            chain_type="refine",
            question_prompt=prompt,
            refine_prompt=refine_prompt,
            return_intermediate_steps=True,
            input_key="input_documents",
            output_key="output_text",
        )
    ))


## Extract from chain

In [None]:
# Extraction results keyed by (model, bill_id).
outputs = {}

for i, bill in tqdm(enumerate(docs), total=len(docs)):
    bill_id = bill['bill_id']
    for chain_dict in chains:
        model, chain = chain_dict['model'], chain_dict['chain']

        # Only call the model for (model, bill) pairs not processed yet.
        outkey = (model, bill_id)
        if outkey not in outputs:
            result = chain({"input_documents": bill['chunks']})
            outputs[outkey] = {
                'model': model,
                'bill_id': bill_id,
                'extraction': result['output_text'],
                'metadata': bill['metadata'],
            }


In [None]:
# Sanity check: every key matches the (model, bill_id) stored in its value.
assert all(
    key == (entry['model'], entry['bill_id'])
    for key, entry in outputs.items()
)

## Process output

In [None]:
# Metadata fields copied from each bill into the result table.
_METADATA_COLUMNS = (
    'session',
    'classification',
    'jurisdiction',
    'jurisdiction_code',
    'introduced_chamber',
    'first_date',
    'title',
    'curated',
    'has_ai_ads',
)

# Widest `{ ... }` span in the model output; DOTALL lets `.` cross newlines
# (same match as the previous `(.|\n)+` form, without the per-character
# alternation).
_JSON_OBJECT = re.compile(r'\{.+\}', flags=re.DOTALL)


def _parse_extraction(output):
    """Return the JSON object embedded in one model output as a dict.

    Raises:
        ValueError: if the output contains no `{...}` span or the span is not
            valid JSON. The message names the model and the bill so the
            offending output can be found.
    """
    found = _JSON_OBJECT.search(output['extraction'])
    if found is None:
        raise ValueError(
            f"No JSON object in the {output['model']} output "
            f"for bill {output['bill_id']!r}"
        )
    try:
        return json.loads(found.group())
    except json.JSONDecodeError as err:
        # JSONDecodeError is a ValueError subclass, so the exception type is
        # unchanged for callers; this only adds the bill and model to the message.
        raise ValueError(
            f"Invalid JSON in the {output['model']} output "
            f"for bill {output['bill_id']!r}: {err}"
        ) from err


# One row per (model, bill): identifiers, selected metadata, then the
# variables parsed from the model's JSON answer.
dfe = pd.DataFrame([dict(
    bill_id = x['bill_id'],
    model = x['model'],
    **{
        k: v for k, v in x['metadata'].items()
        if k in _METADATA_COLUMNS
    },
    **_parse_extraction(x)
) for x in outputs.values()])

In [None]:
def _yes_no_to_bool(value):
    """Convert one model answer to a boolean flag.

    - real booleans are kept as they are (previously, in a column mixing
      booleans and strings, `True` was silently turned into `False`);
    - other non-string values, including missing ones, count as "No";
    - strings are compared case-insensitively after trimming whitespace;
      anything other than "", "NO" or "YES" (e.g. "unknown") becomes NaN.
    """
    if isinstance(value, (bool, np.bool_)):
        return bool(value)
    if not isinstance(value, str):
        return False
    return {'': False, 'NO': False, 'YES': True}.get(value.strip().upper(), np.nan)


# Normalise every flag-like column that is not already boolean.
for k in dfe.filter(regex='has_|is_').columns:
    if dfe[k].dtype == bool:
        continue
    dfe[k] = dfe[k].map(_yes_no_to_bool)


# The model only emits these detail variables for "Yes" answers, so a column
# can be missing altogether when no bill qualified; treat that as all-empty.
for _detail_col in (
    'ai_governance_body_names',
    'excerpt_government_scope',
    'excerpt_harmonization',
):
    if _detail_col not in dfe.columns:
        dfe[_detail_col] = ''

# Governance body names may come back as a string or as a list of strings;
# lists are joined with "; " and everything is title-cased.
dfe['ai_governance_body_names'] = dfe['ai_governance_body_names'].fillna('').apply(
    lambda x: x.title() if isinstance(x, str) else '; '.join(x).title()
)

dfe['excerpt_government_scope'] = dfe['excerpt_government_scope'].fillna('')
dfe['excerpt_harmonization'] = dfe['excerpt_harmonization'].fillna('')

# Consistency checks: a flag is set exactly when its supporting detail is non-empty.
assert all(dfe['has_government_scope'] == (dfe['excerpt_government_scope'] != ""))
assert all(dfe['has_ai_governance_body'] == (dfe['ai_governance_body_names'] != ""))
assert all(dfe['has_harmonization'] == (dfe['excerpt_harmonization'] != ""))

In [None]:
# Display the extraction table for inspection.
dfe

In [None]:
# Show the distinct values of each extracted flag on one line.
_flag_columns = ['has_ai_governance_body', 'has_government_scope', 'has_harmonization']
print(*(dfe[col].unique() for col in _flag_columns))

In [None]:
# Write the extraction table to CSV without the row index.
dfe.to_csv('data/memo-criteria-extract.csv', index=False)