In [233]:
import os
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.chains import LLMChain
from dotenv import find_dotenv, load_dotenv
import pandas as pd
import logging
import pprint
from helper import summarize_context, generate_hypothetical_embeddings, f_path, PROMPT_TEMPLATE_HYDE
import re


# Configure logging for the script: every record is emitted as
# "<timestamp> - <level> - <message>" at INFO level and above.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)


# Load settings through python-dotenv before any client object is created.
# NOTE(review): presumably this supplies the OpenAI API key — confirm.
load_dotenv(find_dotenv())
# NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
query_memory = []


# Tunable parameters; process_query() also records each of them in every output row.
CHUNK_SIZE = 500  # chunk_size given to RecursiveCharacterTextSplitter
CHUNK_OVERLAP = 250  # chunk_overlap given to RecursiveCharacterTextSplitter
TEMPERATURE = 0  # passed to get_response_from_query() as its temperature argument
k = 20  # number of chunks requested from the similarity search

In [234]:

def clean_name(officer_name):
    """Remove law-enforcement title prefixes from a name.

    Every occurrence of a recognised title (Detective, Officer, Deputy,
    Captain, Cpl, Sergeant, Lieutenant, Tech/Technician, Investigator),
    optionally followed by a period and then whitespace, is deleted.

    Args:
        officer_name: Name string that may carry one or more title prefixes.

    Returns:
        The name with the recognised titles removed; text that contains no
        title is returned unchanged.
    """
    # The leading \b keeps the match at the start of a word; without it the
    # tail of an ordinary name could be consumed ("Eric Paul" -> "EriPaul").
    # "Cpl" is matched as a whole case-insensitive abbreviation rather than
    # as a single character from the set C/P/L.
    return re.sub(
        r"\b(?:Detective|Officer|Deputy|Captain|[Cc][Pp][Ll]|Sergeant|Lieutenant|Techn?i?c?i?a?n?|Investigator)\.?\s+",
        "",
        officer_name,
    )

def extract_officer_data(response):
    """Parse the model's free-text answer into one dict per officer.

    The answer is cut into sections at blank lines ("\\n\\n"). A section
    yields a record only when it contains the "Officer Name:",
    "Officer Context:" and "Officer Role:" labels in that order; sections
    without all three (introductions, truncated entries) are ignored.

    Args:
        response: Raw text returned by the language model.

    Returns:
        A list of dicts keyed "Officer Name", "Officer Context" and
        "Officer Role", each value stripped of surrounding whitespace.
    """
    entry_pattern = re.compile(
        r"Officer Name:\s*(.*)\s*Officer Context:\s*(.*)\s*Officer Role:\s*(.*)"
    )
    field_names = ("Officer Name", "Officer Context", "Officer Role")

    records = []
    for section in response.split("\n\n"):
        found = entry_pattern.search(section)
        if found is None:
            continue
        # Capture groups come back in label order, matching field_names.
        records.append(
            {label: value.strip() for label, value in zip(field_names, found.groups())}
        )
    return records


In [235]:
# Role reference text: each role name is followed by bullet points describing
# duties associated with it. process_query() hands this string to
# get_response_from_query(), which fills the prompt's {roles} placeholder with it.
ROLES_PROMPT = """
US-IPNO-Exonerations: Model Evaluation Guide 
Roles:
Lead Detective
•	Coordinates with other detectives and law enforcement officers on the case.
•	Liaises with the prosecutor's office, contributing to legal strategy and court proceedings.
•	May be involved in obtaining and executing search warrants.
•	Could be called to testify in court about the investigation.
Detective
•	Might gather evidence from crime scenes.
•	Collaborates with other detectives, patrol officers, and forensic analysts.
•	May follow up on leads, which could involve surveillance or undercover work.
•	Can work with informants to gather intelligence.
Interrogator
•	Often conducts multiple rounds of questioning with a suspect.
•	Works with other investigators to develop a line of questioning based on evidence.
•	Ensures the suspect's rights are maintained throughout the process.
•	Statements obtained can become crucial pieces of evidence.
Officer on Scene
•	First responding officers often provide initial reports that frame the subsequent investigation.
•	Might coordinate with emergency medical services if there are injuries.
•	Often identifies and interviews immediate witnesses.
•	May need to make immediate decisions to preserve life or apprehend suspects.
Arresting Officer
•	Often writes a report detailing the circumstances of the arrest.
•	May testify in court about the arrest and the suspect's demeanor or statements at the time.
•	Might have to physically subdue the suspect if they resist arrest.
•	Ensures all procedures are correctly followed to avoid potential legal issues later.
Criminalist
•	Often specializes in specific types of evidence such as ballistics, trace evidence, or digital forensics.
•	Documents findings in detailed reports that can become part of the court record.
•	Can be called to testify as expert witnesses in court.
•	Often works closely with detectives and other law enforcement to provide context for their findings.
Transporting Officer
•	Needs to maintain security during transport to prevent escapes.
•	Ensures the individual's rights and well-being are maintained during transport.
•	Might also be responsible for managing paperwork or property associated with the transported individual.
•	May be called to testify about the individual's behavior or statements during transport.
Supervising Officer
•	Oversees the work of other officers and ensures procedures are correctly followed.
•	May coordinate resources and personnel for investigations or operations.
•	Often reviews and signs off on reports and paperwork.
•	Might be called to testify about department policies or the conduct of officers under their supervision.
Patrol Officer
•	Often the first to respond to a crime scene or incident.
•	Carries out routine patrols and responds to emergency and non-emergency calls.
•	May conduct preliminary investigations, gather evidence, and take witness statements.
•	Often makes initial arrests and might be called to testify about their observations.
Crime Scene Investigator
•	Collects, catalogs, and preserves physical evidence from crime scenes.
•	Often works closely with detectives to understand what kind of evidence to look for.
•	May specialize in certain types of evidence or crime scenes.
•	Documents the crime scene through photographs, sketches, and detailed reports.
Informant Handler/Coordinator
•	Manages the relationship with the informant.
•	Assesses the credibility of the information provided by the informant.
•	Shares relevant information from the informant with detectives and other law enforcement personnel working on the case.
•	Ensures that the use of the informant complies with law enforcement policies and legal guidelines.
•	May be called to testify about the information provided by the informant, while protecting the informant's identity.
•	In some cases, might work to provide protection or other resources for the informant.
"""

In [236]:

# Prompt used for the extraction call. It has three placeholders:
#   {question} - the query text,
#   {docs}     - the retrieved transcript chunks joined into one string,
#   {roles}    - the role reference text.
# The "Officer Name: " / "Officer Context: " / "Officer Role: " prefixes it
# requests are the labels extract_officer_data() searches for, so the wording
# of those prefixes must stay in sync with that parser.
# NOTE(review): the "column ... will be filled with a value of 1" example under
# item 3 looks like a leftover from a tabular output format — confirm intent.
# NOTE(review): "officer" appears twice in both title lists, and "a officer" /
# "officer's who" are typos; left untouched because editing the prompt text
# changes model behaviour.
PROMPT_TEMPLATE_MODEL = PromptTemplate(
    input_variables=["roles" ,"question", "docs"],
    template="""
    As an AI assistant, my role is to meticulously analyze court transcripts, traditional officer roles, and extract information about law enforcement personnel.
    The names of law enforcement personnel will be prefixed by one of the following titles: officer, detective, deputy, lieutenant, 
    sergeant, captain, officer, coroner, investigator, criminalist, patrolman, or technician.

    Query: {question}

    Transcripts: {docs}

    Roles: {roles}

    The response will contain:

    1) The name of a officer, detective, deputy, lieutenant, 
       sergeant, captain, officer, coroner, investigator, criminalist, patrolman, or technician - 
       if an individual's name is not associated with one of these titles they do not work in law enforcement.
       Please prefix the name with "Officer Name: ". 
       For example, "Officer Name: John Smith".

    2) If available, provide an in-depth description of the context of their mention. 
       If the context induces ambiguity regarding the individual's employment in law enforcement, 
       remove the individual.
       Please prefix this information with "Officer Context: ". 

    3) Review the context to discern the role of the officer.
       Please prefix this information with "Officer Role: "
      For example, the column "Officer Role: Lead Detective" will be filled with a value of 1 for officer's who were the lead detective.


    Continue this pattern of identifying persons, until all law enforcement personnel are identified.  

    Additional guidelines for the AI assistant:
    - Titles may be abbreviated to the following Sgt., Cpl, Cpt, Det., Ofc., Lt., P.O. and P/O
    - Titles "Technician" and "Tech" might be used interchangeably.
    - Derive responses from factual information found within the police reports.
    - If the context of an identified person's mention is not clear in the report, provide their name and note that the context is not specified.
    - Do not extract information about victims and witnesses
""",
)

In [237]:
def preprocess_document(file_path, embeddings):
    """Load a Word document, split it into chunks and index them with FAISS.

    Args:
        file_path: Path of the .docx file to load.
        embeddings: Embeddings object handed to FAISS.from_documents.

    Returns:
        The FAISS vector store built from the document's chunks.
    """
    logger.info(f"Processing Word document: {file_path}")

    documents = Docx2txtLoader(file_path).load()
    logger.info(f"Text loaded from Word document: {file_path}")

    # Chunk size and overlap come from the module-level tunables.
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )
    chunks = splitter.split_documents(documents)

    return FAISS.from_documents(chunks, embeddings)

In [238]:

def get_response_from_query(db, query, temperature, k, ROLES_PROMPT):
    """Retrieve the chunks most similar to *query* and ask GPT-4 to extract officers.

    Args:
        db: Vector store exposing ``similarity_search(query, k=...)``.
        query: Question text, used both for retrieval and in the prompt.
        temperature: Sampling temperature applied to the chat model.
        k: Number of chunks to retrieve.
        ROLES_PROMPT: Role reference text substituted for the prompt's
            ``{roles}`` placeholder.

    Returns:
        A ``(response, docs)`` tuple: the model's text answer and the list of
        retrieved documents that were supplied as context.
    """
    logger.info("Performing query...")
    docs = db.similarity_search(query, k=k)
    docs_page_content = " ".join(d.page_content for d in docs)

    # Temperature is a model setting, so it belongs on the ChatOpenAI
    # constructor. Passing it to chain.run() only adds an unused chain input
    # and leaves the model at its default temperature.
    llm = ChatOpenAI(model_name="gpt-4", temperature=temperature)

    chain = LLMChain(llm=llm, prompt=PROMPT_TEMPLATE_MODEL)
    response = chain.run(roles=ROLES_PROMPT, question=query, docs=docs_page_content)
    print(response)

    return response, docs

In [239]:
# Questions put to every document; process_query() runs each entry once per
# .docx file and records the query text alongside the extracted rows.
QUERIES = [
    "Identify individuals, by name, with the specific titles of officers, sergeants, lieutenants, captains, detectives, homicide officers, and crime lab personnel in the transcript. Specifically, provide the context of their mention related to key events in the case, if available.",
]

def process_query(embeddings):
    """Run every query against each .docx file in ``f_path`` and save a CSV per file.

    A file is skipped when "<file_name>.csv" already exists in the same
    directory. Otherwise the document is indexed, each query in QUERIES is
    answered, the answer is parsed into officer rows, and every row is tagged
    with the run's settings before the rows are written out.

    Args:
        embeddings: Embeddings object forwarded to preprocess_document().
    """
    for file_name in os.listdir(f_path):
        if not file_name.endswith(".docx"):
            continue

        csv_output_path = os.path.join(f_path, f"{file_name}.csv")
        if os.path.exists(csv_output_path):
            logger.info(f"CSV output for {file_name} already exists. Skipping...")
            continue

        db = preprocess_document(os.path.join(f_path, file_name), embeddings)

        rows = []
        for query in QUERIES:
            raw_answer, _ = get_response_from_query(db, query, TEMPERATURE, k, ROLES_PROMPT)

            for row in extract_officer_data(raw_answer):
                # Key order here determines the order of the trailing CSV columns.
                row.update(
                    {
                        "Query": query,
                        "Prompt Template for Hyde": PROMPT_TEMPLATE_HYDE,
                        "Prompt Template for Model": PROMPT_TEMPLATE_MODEL,
                        "Chunk Size": CHUNK_SIZE,
                        "Chunk Overlap": CHUNK_OVERLAP,
                        "Temperature": TEMPERATURE,
                        "k": k,
                        "hyde": "1",
                    }
                )
                rows.append(row)

        pd.DataFrame(rows).to_csv(csv_output_path, index=False)

In [240]:
def main():
    """Obtain embeddings from generate_hypothetical_embeddings() and pass them to process_query()."""
    process_query(generate_hypothetical_embeddings())


# Only start the run when this module is the entry point, not when it is imported.
if __name__ == "__main__":
    main()

2023-07-26 15:19:45,152 - INFO - Processing Word document: ../../data/convictions/transcripts/evaluate\Adams_Exhibit Volumes FILED.docx
2023-07-26 15:19:45,213 - INFO - Text loaded from Word document: ../../data/convictions/transcripts/evaluate\Adams_Exhibit Volumes FILED.docx
2023-07-26 15:19:51,743 - INFO - Performing query...


Based on the provided transcript, here's the list of identified law enforcement personnel with their respective context and roles:

1. Officer Name: Sgt. William Townsend
   Officer Context: Sgt. Townsend was part of the crime lab personnel who arrived at the crime scene to process it for latent prints and physical evidence under the instruction of Detective Gebbia. He supervised the crime scene search and lifted several latents from the residence for further investigation.
   Officer Role: Supervising Officer, Crime Scene Investigator

2. Officer Name: Detective Gebbia
   Officer Context: Detective Gebbia was one of the detectives present at the crime scene. He instructed Sgt. Townsend to process the scene, supervised the crime scene search, and conducted interviews.
   Officer Role: Lead Detective

3. Officer Name: Technician Norville Orazio
   Officer Context: Technician Norville Orazio was part of the crime lab personnel who arrived at the crime scene to assist in processing it.
  