In [5]:
import pandas as pd
from bayes_opt import BayesianOptimization
from langchain import PromptTemplate
from helper import generate_hypothetical_embeddings, preprocess_single_document, get_response_from_query, f_path, PROMPT_TEMPLATE_MODEL
from dotenv import find_dotenv, load_dotenv
import logging
import os
import numpy as np


# Configure the root logger: "timestamp - level - message", INFO and above.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)


# Locate a .env file and load its variables into the process environment.
# NOTE(review): presumably this provides credentials that the helper functions
# need, which is why it runs before the embeddings are created - confirm.
load_dotenv(find_dotenv())
# NOTE(review): `query_memory` is not referenced by any cell visible here -
# confirm it is still needed.
query_memory = []


# Created once and passed to every document-preprocessing call made by `objective`.
embeddings = generate_hypothetical_embeddings()

In [6]:
# Prompt template for extracting law-enforcement personnel from transcript text.
# It is filled from two variables: `question` and `docs`.
#
# NOTE: this assignment rebinds the notebook-level name PROMPT_TEMPLATE_MODEL,
# which the first cell also imports from `helper`. No visible cell passes this
# object to `get_response_from_query`, so whether the helper uses this template
# or its own copy cannot be determined from the notebook - TODO confirm.
#
# NOTE(review): the template lists "officer" twice in each list of titles, and
# its guidelines mention "police reports" / "the report" while the opening
# sentence says "court transcripts". Confirm the wording is intentional before
# editing the string: any change alters the prompt text.
PROMPT_TEMPLATE_MODEL = PromptTemplate(
    input_variables=["question", "docs"],
    template="""
    As an AI assistant, my role is to meticulously analyze court transcripts and extract information about law enforcement personnel.
    The names of law enforcement personnel will be prefixed by one of the following titles: officer, detective, deputy, lieutenant, 
    sergeant, captain, officer, coroner, investigator, criminalist, patrolman, or technician.

    Query: {question}

    Transcripts: {docs}

    The response will contain:

    1) The name of a officer, detective, deputy, lieutenant, 
       sergeant, captain, officer, coroner, investigator, criminalist, patrolman, or technician - 
       if an individual's name is not associated with one of these titles they do not work in law enforcement.
       Please prefix the name with "Officer Name: ". 
       For example, "Officer Name: John Smith".

    2) If available, provide an in-depth description of the context of their mention. 
       If the context induces ambiguity regarding the individual's employment in law enforcement, 
       remove the individual.
       Please prefix this information with "Officer Context: ". 

    Continue this pattern of identifying persons, until all law enforcement personnel are identified.  

    Additional guidelines for the AI assistant:
    - Titles may be abbreviated to the following Sgt., Cpl, Cpt, Det., Ofc., Lt., P.O. and P/O
    - Titles "Technician" and "Tech" might be used interchangeably.
    - Derive responses from factual information found within the police reports.
    - If the context of an identified person's mention is not clear in the report, provide their name and note that the context is not specified.
    - Do not extract information about victims and witnesses
""",
)


In [7]:
# Query strings to evaluate. The optimization loop runs one Bayesian search for
# every (search space, query) pair, and `objective` sends the current query to
# `get_response_from_query`.
QUERIES = [
    "Identify individuals, by name, with the specific titles of officers, sergeants, lieutenants, captains, detectives, homicide officers, and crime lab personnel in the transcript. Specifically, provide the context of their mention related to key events in the case, if available.",
]

def preprocess_documents_in_directory(directory_path, embeddings, chunk_size, chunk_overlap):
    """Preprocess every Word document found directly inside a directory.

    Args:
        directory_path: Directory to scan. Sub-directories are not visited.
        embeddings: Forwarded unchanged to ``preprocess_single_document``.
        chunk_size: Forwarded unchanged to ``preprocess_single_document``.
        chunk_overlap: Forwarded unchanged to ``preprocess_single_document``.

    Returns:
        list: One ``preprocess_single_document`` result per ``.docx`` file,
        ordered by filename. Empty when the directory holds no such file.

    Raises:
        OSError: If ``directory_path`` cannot be listed.
    """
    dbs = []

    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # processing (and log) order identical from run to run.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.docx'):
            continue
        # Word writes a "~$<name>.docx" owner/lock file next to any document
        # that is currently open. It is not a real document, so skip it
        # instead of handing it to the loader.
        if filename.startswith('~$'):
            continue

        file_path = os.path.join(directory_path, filename)
        dbs.append(preprocess_single_document(file_path, embeddings, chunk_size, chunk_overlap))

    return dbs

In [8]:
# Search spaces tried one after another by the optimization loop. Each dict maps
# an argument name of `objective` to its (lower, upper) bound.
# NOTE(review): in the first space chunk_overlap can equal chunk_size (both can
# be 3000), and in the second space the overlap range (1000-2000) extends above
# the smallest chunk_size (1500). Text splitters commonly reject an overlap
# larger than the chunk size - confirm `preprocess_single_document` tolerates
# this, or tighten the bounds.
# NOTE(review): the optimizer output logged below shows float parameter values
# (e.g. 2.126e+03), so presumably the helpers round chunk sizes and `k` - confirm.
pbounds_list = [
    {
        'chunk_size': (3000, 6000),
        'chunk_overlap': (1500, 3000),
        'k': (1, 5),
    },
    {
        'chunk_size': (1500, 3000),
        'chunk_overlap': (1000, 2000),
        'k': (1, 10),
    },
    {
        'chunk_size': (750, 1500),
        'chunk_overlap': (250, 500),
        'k': (1, 40),
    },
]


def objective(chunk_size, chunk_overlap, k):
    """Target function handed to the Bayesian optimizer.

    Re-preprocesses every document under ``f_path`` with the given chunking
    settings, runs the current query against each resulting store, and adds
    up the per-document values returned by ``get_response_from_query``.

    Note:
        The query text is NOT a parameter: it is read from the module-level
        name ``query``, which the optimization loop binds before each search.

    Args:
        chunk_size: Forwarded to ``preprocess_documents_in_directory``.
        chunk_overlap: Forwarded to ``preprocess_documents_in_directory``.
        k: Forwarded to ``get_response_from_query``.

    Returns:
        The sum of the per-document results (0 when there are no documents).
        The name suggests these are token counts - confirm against the helper.
    """
    total_token_count = 0
    # All documents are preprocessed first; the queries run afterwards, one store at a time.
    for document_store in preprocess_documents_in_directory(f_path, embeddings, chunk_size, chunk_overlap):
        total_token_count += get_response_from_query(document_store, query, k)
    return total_token_count


def sample_posterior(optimizer, param_bounds, n_samples=1000):
    """Draw from the optimizer's GP posterior on a chunk_size x chunk_overlap grid.

    The surrogate model is fitted on *all* optimized parameters, so it must be
    queried with one column per parameter, in the optimizer's own column
    order (``optimizer.space.keys``). Parameters that are not part of the
    grid (here ``k``) are held at their best observed value
    (``optimizer.max['params']``).

    Args:
        optimizer: A fitted ``BayesianOptimization`` instance.
        param_bounds: Mapping with ``'chunk_size'`` and ``'chunk_overlap'``
            entries, each a ``(lower, upper)`` pair.
        n_samples: Grid resolution per axis; the grid therefore contains
            ``n_samples ** 2`` points.

    Returns:
        tuple: ``(xy_samples, f_samples)`` where ``xy_samples`` has shape
        ``(n_samples ** 2, 2)`` with columns ``(chunk_size, chunk_overlap)``
        and ``f_samples`` has shape ``(n_samples ** 2,)``: one posterior draw
        per grid point, so ``f_samples[i]`` belongs to ``xy_samples[i]``.

    Raises:
        ValueError: If a non-grid parameter has to be fixed but the optimizer
            has no best observation yet.
    """
    size_low, size_high = param_bounds['chunk_size']
    overlap_low, overlap_high = param_bounds['chunk_overlap']

    x_samples = np.linspace(size_low, size_high, num=n_samples)
    y_samples = np.linspace(overlap_low, overlap_high, num=n_samples)
    x_grid, y_grid = np.meshgrid(x_samples, y_samples)
    xy_samples = np.stack([x_grid.ravel(), y_grid.ravel()]).T

    grid_columns = {
        'chunk_size': xy_samples[:, 0],
        'chunk_overlap': xy_samples[:, 1],
    }

    # Assemble the GP input with exactly the columns (and order) it was fitted on.
    fixed_params = None
    gp_columns = []
    for name in optimizer.space.keys:
        if name in grid_columns:
            gp_columns.append(grid_columns[name])
            continue
        if fixed_params is None:
            best = optimizer.max
            if not best:
                raise ValueError(
                    "optimizer has no observations; cannot fix non-grid parameters"
                )
            fixed_params = best['params']
        gp_columns.append(np.full(len(xy_samples), fixed_params[name], dtype=float))
    gp_inputs = np.column_stack(gp_columns)

    mu, sigma = optimizer._gp.predict(gp_inputs, return_std=True)

    # One draw per grid point. Drawing an (n_samples, n_points) matrix would
    # need ~8 GB at the default resolution and would not line up with
    # xy_samples row by row.
    f_samples = np.random.normal(loc=mu, scale=sigma)

    return xy_samples, f_samples

# Layout of the final table. Observed optimizer trials fill the first five
# columns; posterior-grid rows fill "Query" plus the "Sampled_*" columns.
# Cells that do not apply to a row are left as NaN.
RESULT_COLUMNS = [
    'Query', 'Chunk_Size', 'Chunk_Overlap', 'k', 'Tokens',
    'Sampled_Chunk_Size', 'Sampled_Chunk_Overlap', 'Sampled_k', 'Sampled_Tokens',
]

# Per-run frames are collected here and concatenated once at the end:
# DataFrame.append no longer exists in pandas 2.x, and growing a frame row by
# row is quadratic in the number of rows.
result_frames = []

for pbounds in pbounds_list:
    # NOTE: `objective` reads the module-level name `query`, so this loop must
    # stay at top level and keep this exact loop-variable name.
    for query in QUERIES:
        optimizer = BayesianOptimization(
            f=objective,
            pbounds=pbounds,
            verbose=2,
            random_state=1,
        )

        optimizer.maximize(
            init_points=2,
            n_iter=5,
        )

        # One row per evaluated trial.
        result_frames.append(pd.DataFrame([
            {
                'Query': query,
                'Chunk_Size': res['params']['chunk_size'],
                'Chunk_Overlap': res['params']['chunk_overlap'],
                'k': res['params']['k'],
                'Tokens': res['target'],
            }
            for res in optimizer.res
        ]))

        xy_samples, f_samples = sample_posterior(optimizer, pbounds)
        xy_samples = np.asarray(xy_samples)
        f_samples = np.asarray(f_samples)
        # Rows are paired by position, so exactly one sampled value per grid point is required.
        if f_samples.ndim != 1 or len(f_samples) != len(xy_samples):
            raise ValueError(
                "sample_posterior must return one sampled value per grid point; got "
                f"f_samples with shape {f_samples.shape} for {len(xy_samples)} points"
            )

        # One row per grid point, built column-wise; scalars are broadcast.
        result_frames.append(pd.DataFrame({
            "Query": query,
            "Sampled_Chunk_Size": xy_samples[:, 0],
            "Sampled_Chunk_Overlap": xy_samples[:, 1],
            "Sampled_k": optimizer.max["params"]["k"],
            "Sampled_Tokens": f_samples,
        }))

if result_frames:
    results_df = pd.concat(result_frames, ignore_index=True).reindex(columns=RESULT_COLUMNS)
else:
    results_df = pd.DataFrame(columns=RESULT_COLUMNS)

2023-07-26 12:21:10,505 - INFO - Processing Word document: ../../data/convictions/transcripts/evaluate\Adams_Exhibit Volumes FILED.docx
2023-07-26 12:21:10,543 - INFO - Text loaded from Word document: ../../data/convictions/transcripts/evaluate\Adams_Exhibit Volumes FILED.docx


|   iter    |  target   | chunk_... | chunk_... |     k     |
-------------------------------------------------------------


2023-07-26 12:21:13,309 - INFO - Performing query...
2023-07-26 12:22:01,999 - INFO - Processing Word document: ../../data/convictions/transcripts/evaluate\Adams_Exhibit Volumes FILED.docx
2023-07-26 12:22:02,033 - INFO - Text loaded from Word document: ../../data/convictions/transcripts/evaluate\Adams_Exhibit Volumes FILED.docx


| [0m1        [0m | [0m341.0    [0m | [0m2.126e+03[0m | [0m5.161e+03[0m | [0m1.0      [0m |


2023-07-26 12:22:06,038 - INFO - Performing query...


In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists before writing.
OUTPUT_CSV = 'output/optimization_results.csv'
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
results_df.to_csv(OUTPUT_CSV, index=False)