# Find keywords and create report

This notebook uses the output of `extract.ipynb` to restrict the analysis to government-scoped state bills, then searches those bills for keywords in several categories using regular expressions.

## Import

In [None]:
import os
import glob
import shutil
import re
from typing import Dict, List

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from tqdm.contrib.concurrent import process_map
from functools import partial

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Apply the local matplotlib style sheet; the relative path is resolved
# against the current working directory.
plt.style.use('mpl.sty.yml')

## Read data

### Bill data

In [None]:
# Load the bill records and lower-case the bill text once, up front.
df = (
    pd.read_json('data/bill_data.json')
    .assign(text=lambda bills: bills['text'].str.lower())
)

In [None]:
# Flag bills whose `query_*` columns mention "artificial intelligence" or an
# "automat* decision" term.
# Assumes every `query_*` cell holds a list of strings -- TODO confirm.
df['has_ai_ads'] = (
    df.filter(regex='^query_')
    # Pool the distinct entries of all query columns into one string per row.
    .agg(lambda row: ' '.join(set(np.concatenate(list(row)))), axis=1)
    .str.extract(r'(artificial intelligence|automat\w* decision)', expand=False)
    .notna()
)

In [None]:
# Keep bills that are curated or AI/ADS-flagged, whose first date mentions
# 2023 or 2024, and whose jurisdiction code is not "US"; index by bill id.
df = (
    df
    .query(
        '''
        (curated or has_ai_ads)
        and
        first_date.str.contains("2023|2024", regex=True)
        and 
        jurisdiction_code != "US"
    '''.replace('\n',' ').strip()
    )
    .reset_index(drop=True)
    .set_index('bill_id')
)

# Collapse each bill's list of classifications into one '; '-separated string.
df['classification'] = df['classification'].map('; '.join)
df

### Extracted data

In [None]:
# Load the extraction results; missing free-text fields become empty strings.
extract_df = (
    pd.read_csv('data/memo-criteria-extract.csv')
    .fillna(dict.fromkeys(
        [
            'excerpt_government_scope',
            'ai_governance_body_names',
            'excerpt_harmonization',
        ],
        '',
    ))
    .reset_index(drop=True)
)

In [None]:
# Sanity check: list the jurisdiction codes present in the extraction output.
extract_df['jurisdiction_code'].unique()

In [None]:
# Quick summary of the extraction output; the scope flag is counted once per bill.
print(f"Total state bills: {extract_df['bill_id'].nunique()}")
print(f"First date: {extract_df['first_date'].min()}")
print(
    "Has gov scoped suggested by ChatGPT: "
    f"{extract_df.drop_duplicates('bill_id')['has_government_scope'].sum()}"
)

In [None]:
# Keep only the rows the LLM flagged as government-scoped, keyed by bill id.
extract_df = (
    extract_df
    .loc[extract_df['has_government_scope']]
    .set_index('bill_id')
)

In [None]:
# Inspect the governance-body names returned by the extraction.
# Note: commissions are usually shorter-term bodies, so they are not considered.
extract_df['ai_governance_body_names'].unique()

### Combine

In [None]:
# Attach the bill data to the extraction results (left join on the shared
# `bill_id` index). Only the extraction columns that the bill frame does not
# already have are kept on the left, so no column appears twice.
# A list is used instead of a set so the resulting column order is the same
# on every run; set iteration order for strings changes between kernels.
df = (
    extract_df
    .filter([col for col in extract_df.columns if col not in df.columns])
    .join(df)
)

In [None]:
# Display the combined frame for a visual check.
df

## Keyword regular expressions

In [None]:
# Upper bounds on how many filler words may sit between two paired terms.
# Used when the AI/impact/board term comes first:
max_words_before = 4
# Used when the keyword comes first and the AI/impact term follows:
max_words_after = 12
# Harmonization phrasing is a bit more complicated, so allow a wider gap:
max_words_harmonize = 15

In [None]:
def _one_of(*alternatives):
    """Join regex alternatives with `|` inside a single capturing group."""
    return '(' + '|'.join(alternatives) + ')'


# Separator tolerated between two words: optional whitespace around at most
# one character that is neither a word character nor `.`, `?` or `!`.
space_regex = r'\s*[^\w\.\?\!]?\s*'

# Terms that count as a mention of AI / automated decision systems.
AI_regex = _one_of(
    r'\bai\b',
    fr'(generative{space_regex})?artificial{space_regex}intelligence',
    fr'automat(ed|ic){space_regex}decision{space_regex}(making)?{space_regex}systems?',
    fr'frontier{space_regex}model',
    fr'(face|facial|iris|gait){space_regex}(recog|match)\w*',
)

# Terms that count as a (standing) government body.
board_regex = _one_of(
    fr'governance{space_regex}bod(y|ies)',
    r'\w*cabinet',
    fr'(governance{space_regex})?board',
    r'council',
    fr'(ethic\w*){space_regex}commission',
    r'division',
    r'\boffice\b',
    r'department',
    r'agency',
    r'branch',
    # Deliberately left out:
    # r'institut\w*', # usually academic, not government
    # r'committee', # tend to be short-term?
    # r'commission', # tend to be short-term?
)

# Impact-style nouns; `risk` may carry a prefix, e.g. "high-risk".
impact_regex = r'(impact|use|\w*[^\w]?risk)'

In [None]:
# Keyword categories searched in every bill. Each entry carries:
#   name    - category label (becomes a column / report heading)
#   pattern - regex source assembled from the shared building blocks
#   desc    - Markdown description embedded verbatim in the PDF report
#
# In the "term, then up to N words, then keyword" alternatives a
# `{space_regex}` separator follows the first term. Without it the word-gap
# group `(\w+\s+)` would have to start directly after the term, so plain
# phrases such as "artificial intelligence inventory" could never match.
patterns = [
    dict(
        name = 'AIOfficer',
        pattern = fr'(chief)?{space_regex}{AI_regex}{space_regex}officer',
        desc = '''
This is detecting `chief`, then a few potential AI-related terms:

- `AI`
- `artificial intelligence`
- `automated decision (making) system`
- `frontier model`
- `face/facial/iris/gait recog/match`

then `officer`.

'''
    ),
    dict(
        name = 'GovBody',
        pattern = (
            '(' + (r'|'.join([
                fr'{AI_regex}{space_regex}{board_regex}',
                fr'{board_regex}{space_regex}(of|on|for){space_regex}{AI_regex}',
            ])) + ')'
        ),
        desc = '''
This is detecting AI-related terms:

- `AI`
- `artificial intelligence`
- `automated decision (making) system`
- `frontier model`
- `face/facial/iris/gait recog/match`

then board-related terms:

- `governance body`
- `cabinet`
- `board`
- `council`
- `division`
- `office` (but not `officer`)
- `department`
- `agency`
- `branch`
- `(ethics) commission`

This detects both:

- AI-related terms, then board-related terms;
- as well as: board-related terms, followed by `of|on|for` then AI-related terms.

This is a difficult category to capture correctly with just word detection and will need manual verification.

'''
    ),
    dict(
        name = 'Inventory',
        pattern = (
            '(' + (r'|'.join([
                fr'{AI_regex}{space_regex}(\w+\s+){{,{max_words_before}}}inventor(y|ies)',
                fr'inventor(y|ies){space_regex}(\w+\s+){{,{max_words_after}}}{AI_regex}',
                fr'{impact_regex}{space_regex}(\w+\s+){{,{max_words_before}}}inventor(y|ies)',
                fr'inventor(y|ies){space_regex}(\w+\s+){{,{max_words_after}}}{impact_regex}',
            ])) + ')'
        ),
        desc = '''
This is detecting either:

- AI-related terms:
    - `AI`
    - `artificial intelligence`
    - `automated decision (making) system`
    - `frontier model`
    - `face/facial/iris/gait recog/match`
- or impact related terms:
    - `impact`
    - `use`
    - `risk`
    
then the word `inventory`.

This also detects the other way around: `inventory`, then AI/impact-related terms.

Note that this also allows for some words in between of AI/impact-related terms and `inventory`
to allow more flexible capturing of such pairing.

'''
    ),
    dict(
        name = 'Procurement',
        pattern = (
            '(' + (r'|'.join([
                fr'{AI_regex}{space_regex}(\w+\s+){{,{max_words_before}}}(procur|purchas|acqui(r|s))\w*',
                fr'(procur|purchas|acqui(r|s))\w*{space_regex}(\w+\s+){{,{max_words_after}}}{AI_regex}',
            ])) + ')'
        ),
        desc = '''
This is detecting AI-related terms:

- `AI`
- `artificial intelligence`
- `automated decision (making) system`
- `frontier model`
- `face/facial/iris/gait recog/match`
    
then the procurement-related terms:

- `procure`
- `purchase`
- `acquire` (or `acquis` for `acquisition`)

This also allows the other way around: procurement- then AI-related terms.

Note that this also allows for some words in between of AI-related terms 
and procurement-related terms to allow more flexible capturing of such pairing.
'''
    ),
    dict(
        name = 'ImpactAssess',
        pattern = (
            '(' + (r'|'.join([
                fr'{impact_regex}{space_regex}(assess|evaluat|manage)\w*',
                fr'(assess|evaluat|manage){space_regex}(\w+\s+){{,{max_words_after}}}{impact_regex}',
            ])) + ')'
        ),
        desc = '''
This is detecting impact-related terms:

- `impact`
- `use`
- `risk`

then the assessment-related verbs:

- `assess`
- `evaluate`
- `manage`

This also allows the other way around: assessment- then impact-related terms.

For impact-related then verbs, no flexible wording in between is considered.

However, for the other way around, verb then impact, some words in between are allowed.

'''
    ),
    dict(
        name = 'PotentialHarmonize',
        pattern = (
            '(' + (r'|'.join([
                fr'{board_regex}{space_regex}(\w+\s+){{,{max_words_harmonize}}}(harmoniz|collab|coordinat)\w*',
                fr'(harmoniz|collab|coordinat)\w*(\w+\s+){{,{max_words_harmonize}}}{space_regex}{board_regex}',
            ])) + ')'
        ),
        desc = '''
This is detecting board-related terms:

- `governance body`
- `cabinet`
- `board`
- `council`
- `division`
- `office` (but not `officer`)
- `department`
- `agency`
- `branch`
- `(ethics) commission`  

then possible harmonization verbs:

- `harmonize`
- `collaborate`
- `coordinate`

This detects also the other way around and also allows for flexible wordings between.

This is a very difficult category to operationalize, highly prone to false positives. This really needs manual verification.

'''
    ),
]


# Compile every category once so the per-bill search can reuse the objects.
for p in patterns:
    # The pattern is lower-cased before compiling. Gotcha: .lower() would
    # also turn an uppercase escape such as `\S` into `\s`, so keep the
    # pattern sources lowercase.
    p['regex'] = re.compile(p['pattern'].lower())


## Perform keyword search

In [None]:
# One {'bill_id': ..., 'text': ...} record per row, ready to hand to workers.
text_records = (
    df['text']
    .reset_index()
    .to_dict(orient='records')
)

In [None]:
def query_1_bill(record, patterns=()):
    """Collect every regex hit, per keyword category, for a single bill.

    Parameters
    ----------
    record : dict
        Must provide ``'bill_id'`` and ``'text'`` (the string to search).
    patterns : iterable of dict
        Each entry needs ``'name'`` (used as the output key) and ``'regex'``
        (a compiled pattern, or any object exposing ``finditer``).

    Returns
    -------
    dict
        ``{'bill_id': <id>, <name>: [match, ...], ...}``. Each match has every
        whitespace character replaced by a plain space and is stripped at both
        ends; categories without hits map to an empty list.
    """
    text = record['text']
    out = dict(bill_id=record['bill_id'])
    for p in patterns:
        out[p['name']] = [
            # Raw string: '\s' in a normal literal is an invalid escape.
            re.sub(r'\s', ' ', hit.group()).strip()
            for hit in p['regex'].finditer(text)
        ]
    return out

# Run the keyword search over all bills using 4 worker processes.
search_bill = partial(query_1_bill, patterns=patterns)
results = process_map(search_bill, text_records, max_workers=4)

## Process output

In [None]:
# Long format: one row per (bill, category, matched keyword).
kwdf = (
    pd.melt(
        pd.DataFrame(results),
        id_vars='bill_id',
        var_name='keyword_category',
        value_name='keyword',
    )
    # Empty hit lists explode to NaN, which the dropna below removes.
    .explode('keyword')
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# One row per (bill, keyword category): the distinct keywords found, plus the
# bill metadata and the category description used in the report.
# NOTE(review): if `df` holds more than one row per bill_id, the first merge
# multiplies rows and inflates the bill counts downstream -- confirm.
cdf = (
    kwdf
    .groupby(['bill_id','keyword_category'])
    # sorted(): a bare set has no stable order, which made the keyword order
    # in the report change from run to run.
    ['keyword'].agg(lambda x: sorted(set(x)))
    .to_frame('keywords')
    .reset_index()
    .merge(
        df.reset_index()
        .filter([
            'bill_id', 'identifier', 'session', 'classification',
            'jurisdiction', 'jurisdiction_code', 'title', 'first_date',
            'has_government_scope', 'excerpt_government_scope',
            'has_ai_governance_body', 'ai_governance_body_names',
            'has_harmonization', 'excerpt_harmonization',
            'source', 'openstates_url', 'plural_url',
        ]),
        on='bill_id',
    )
    .merge(
        pd.DataFrame(patterns)
        .filter(['name', 'desc'])
        .rename(columns={
            'name': 'keyword_category',
            'desc': 'category_description'
        }),
        on='keyword_category',
    )
)
cdf

In [None]:
# Number of distinct bills left in the merged keyword table.
cdf['bill_id'].nunique()

## Visualize bill counts

In [None]:
# Count rows of `cdf` (one per bill and category) per jurisdiction and category.
vdf = (
    cdf
    .value_counts(subset=['jurisdiction', 'jurisdiction_code', 'keyword_category'])
    .rename('num_bills')
    .reset_index()
)

vdf
    

In [None]:
# One horizontal bar panel per keyword category, in the order of `patterns`.
bill_count_grid = sns.catplot(
    vdf,
    y='jurisdiction_code',
    x='num_bills',
    col='keyword_category',
    col_order=pd.DataFrame(patterns)['name'],
    kind='bar',
    height=15,
    aspect=0.3,
    sharex=False,
    sharey=True,
    width=0.7,
    color='#6baed6',
)

# Cosmetic tweaks; each call acts on the same grid.
bill_count_grid.tick_params(left=True, labelleft=True)
bill_count_grid.despine(trim=True, offset=10)
bill_count_grid.set_xlabels(label='# Bills')
bill_count_grid.set_ylabels(label='State')
bill_count_grid.set_titles('{col_name}')
bill_count_grid.tight_layout()

# Saved to disk because the report includes this file as its figure.
plt.savefig('docs/bill_count.pdf')

## Create detailed reports

In [None]:
# Start of the Markdown report: YAML front matter (LaTeX options),
# the report title, and the bill-count figure saved at docs/bill_count.pdf.
# Raw string so the LaTeX backslashes are kept as written.
text = r'''
---
geometry:
    - margin=0.5in
output: pdf_document
colorlinks: true
fontsize: 9pt
toc: false
urlcolor: "violet"
header-includes:
    - \usepackage{titling}
    - \setlength{\droptitle}{-7em}
    - \pagenumbering{gobble}
    - \setlength{\parindent}{0em}
    - \usepackage{sansmathfonts}
    - \usepackage[T1]{fontenc}
    - \usepackage{graphicx}
    - \renewcommand*\familydefault{\sfdefault} 
    - \usepackage{wrapfig}
    - \usepackage{booktabs}
    - \usepackage[export]{adjustbox}
    - \newcommand{\forceindent}{\leavevmode{\parindent=1em\indent}}
    - \usepackage{fvextra}
    - \DefineVerbatimEnvironment{Highlighting}{Verbatim}{breaklines,commandchars=\\\{\}}
---

# State bills with found keywords from Memo 2024

\begin{figure}[h]
    \centering
    \includegraphics[width=\textwidth, angle=0]{docs/bill_count.pdf}
    \caption{\textbf{Preliminary government-scoped} bill counts per ``categories'' across states}
\end{figure}

'''

In [None]:
# Prompt text reproduced in the report's Methods section.
# Presumably mirrors the prompt used in `extract.ipynb` -- keep in sync (TODO confirm).
# `{text}` is a literal placeholder here: this string is never formatted in
# this notebook, only inserted as-is into the report.
main_prompt = r'''
You are a helpful assistant for legislators, researchers and lawyers.
You are given a task to read a bill and extract necessary information from them.
Below are the variables and instructions:

1. `has_government_scope`: Indicates whether the bill has government scope: a bill has government scope if it governs the government's use of artificial intelligence (AI) or automated decision systems (ADS) in its operations. This scope specifically focuses on the **government**'s use and procurement of these technologies.

Instruction: 
- First, answer only "Yes" or "No".
- If "Yes", also include 1-2 sentence excerpts from the text to support the government scope, label variable as `excerpt_government_scope`.

2. `has_ai_governance_body`: Indicates whether the bill designates, indicates or establishes an AI governance body: an AI governance body is a group of people in the within a government entity or organization that has the authority to manage and oversee the use of AI or ADS by that entity or organization.

Instruction: 
- First, answer only "Yes" or "No".
- If "Yes", also include the name(s) of the governance body, label variable as `ai_governance_body_names`.

3. `has_harmonization`: Indicates whether the bill outlines intent or strategy to harmonize legislation between state and federal government. Harmonization is defined as cooperation between different state and federal jurisdictions \ 
to make laws identical or at least more similar.

Instruction: 
- First, answer only "Yes" or "No".
- If "Yes", also include 1-2 sentence excerpts from the text to support existence of hamornization, label variable as `excerpt_harmonization`.
 

Use only the definitions and follow instructions here.
Only use the existing text as reference. Do not make things up.
If you know an answer is empty, just use an empty string "". 
If you do not know an answer for a variable, just answer as "unknown".

Please output as a JSON format.

Here is the text:

{text}

JSON_OUTPUT:
'''

In [None]:
# Append the Methods section; the prompt is interpolated into a fenced
# `text` block so it is rendered verbatim.
# Gotcha: `+=` means re-running this cell appends the section again.
text += fr'''

## Methods

### OpenAI extraction

Below is the main prompt (there is also a refinement prompt when the text is long not shown here):

```text
{main_prompt}
```

'''

In [None]:
# Markdown template for one keyword-category subsection of the report.
category_section = '''
### *{keyword_category}*

{category_description}

'''

# One (category, description) pair per category that appears in `cdf`.
category_rows = (
    cdf[['keyword_category', 'category_description']]
    .drop_duplicates()
)

# Iterating over plain tuples keeps the empty case correct: with
# DataFrame.apply(axis=1) an empty frame comes back as a DataFrame, and
# joining that would write the column names into the report.
# Gotcha: `+=` means re-running this cell appends the section again.
text += '''

### Keyword detection categories

''' + '\n'.join(
    category_section.format(
        keyword_category=category,
        category_description=description,
    )
    for category, description in category_rows.itertuples(index=False, name=None)
)


In [None]:
# Fields shown in each bill's header block; empty strings are printed as 'NA'.
detail_fields = [
    'bill_id',
    'jurisdiction',
    'session',
    'title',
    'has_government_scope',
    'excerpt_government_scope',
    'has_ai_governance_body',
    'ai_governance_body_names',
    'has_harmonization',
    'excerpt_harmonization',
]

# Markdown template for the header block of one bill.
bill_header = '''
### {index}. `{bill_id}`

- Title: *{title}*
- From: {jurisdiction}, session `{session}`
- OpenAI extraction results:
    - `has_government_scope`: {has_government_scope}
        - relevant excerpt: *{excerpt_government_scope}*
    - `has_ai_governance_body`: {has_ai_governance_body}
        - governance bodies: *{ai_governance_body_names}*
    - `has_harmonization`: {has_harmonization}
        - relevant excerpt: *{excerpt_harmonization}*

'''

# Markdown template for one keyword category under a bill.
category_item = '''
    - *{keyword_category}*
{keyword_list}
'''

# Collect the pieces in a list and join once at the end.
detail_parts = ['''
## Bill details
''']

# groupby(sort=False) yields bills in order of first appearance, the same
# order as cdf['bill_id'].unique(), without rescanning `cdf` for every bill.
for index, (bill_id, sel_cdf) in enumerate(cdf.groupby('bill_id', sort=False), start=1):
    # Bill-level fields are repeated on every row of the group; read the first.
    first_row = sel_cdf.iloc[0].to_dict()
    extract_dict = {}
    for field in detail_fields:
        value = first_row[field]
        extract_dict[field] = 'NA' if isinstance(value, str) and not value else value

    detail_parts.append(bill_header.format(**extract_dict, index=index))
    detail_parts.append('\n    \n- Keyword category detection results:\n\n')

    for _, row in sel_cdf.iterrows():
        detail_parts.append(category_item.format(
            keyword_category=row['keyword_category'],
            keyword_list='\n'.join('\t\t- *{}*'.format(x) for x in row['keywords']),
        ))

bill_detail_texts = ''.join(detail_parts)

with open('docs/bill_details.md', 'w') as f:
    f.write(text + bill_detail_texts)
    
!pandoc -s docs/bill_details.md -o docs/bill_details.pdf
