In [6]:
import os
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.chains import LLMChain
from dotenv import find_dotenv, load_dotenv
from langchain.llms import OpenAI
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
import re
import pandas as pd
import logging
from summarizer import Summarizer
import pprint


# Emit timestamped INFO-level (and above) log records for the whole script.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)


# Load environment variables from the .env file located by find_dotenv()
# (presumably the OpenAI credentials — TODO confirm).
load_dotenv(find_dotenv())
# NOTE(review): not referenced anywhere else in the visible code.
query_memory = []


# Passed as chunk_size / chunk_overlap to RecursiveCharacterTextSplitter in
# preprocess_single_document.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 250
# Passed by process_query as the `temperature` argument of
# get_response_from_query.
TEMPERATURE = 0
# Number of chunks requested from the vector store per query
# (similarity_search(..., k=k) in get_response_from_query).
k = 20


# Extraction prompt for the chat model. ``{question}`` receives the query and
# ``{docs}`` the concatenated text of the retrieved transcript chunks.
# The "Officer Name: " / "Officer Context: " prefixes requested here are the
# markers the response parser splits on, so they must not be reworded.
PROMPT_TEMPLATE_MODEL = PromptTemplate(
    input_variables=["question", "docs"],
    template="""
    As an AI assistant, my role is to meticulously analyze court transcripts and extract information about law enforcement personnel.
    The names of law enforcement personnel will be prefixed by one of the following titles: officer, detective, deputy, lieutenant, 
    sergeant, captain, coroner, investigator, criminalist, patrolman, or technician.

    Query: {question}

    Transcripts: {docs}

    The response will contain:

    1) The name of an officer, detective, deputy, lieutenant, 
       sergeant, captain, coroner, investigator, criminalist, patrolman, or technician - 
       if an individual's name is not associated with one of these titles they do not work in law enforcement.
       Please prefix the name with "Officer Name: ". 
       For example, "Officer Name: John Smith".

    2) If available, provide an in-depth description of the context of their mention. 
       If the context induces ambiguity regarding the individual's employment in law enforcement, 
       remove the individual.
       Please prefix this information with "Officer Context: ". 

    Continue this pattern of identifying persons, until all law enforcement personnel are identified.  

    Additional guidelines for the AI assistant:
    - Titles may be abbreviated to the following Sgt., Cpl, Cpt, Det., Ofc., Lt., P.O. and P/O
    - Titles "Technician" and "Tech" might be used interchangeably.
    - Derive responses from factual information found within the court transcripts.
    - If the context of an identified person's mention is not clear in the transcript, provide their name and note that the context is not specified.
    - Do not extract information about victims and witnesses
""",
)


# Prompt used by the HyDE chain: the LLM answers ``{question}`` with a
# hypothetical passage, and that passage (not the raw query) is what gets
# embedded by HypotheticalDocumentEmbedder.
PROMPT_TEMPLATE_HYDE = PromptTemplate(
    input_variables=["question"],
    template="""
    You're an AI assistant specializing in criminal justice research. 
    Your main focus is on identifying the names and providing detailed context of mention for each law enforcement personnel. 
    This includes police officers, detectives, deputies, lieutenants, sergeants, captains, technicians, coroners, investigators, patrolmen, and criminalists, 
    as described in court transcripts.
    Be aware that the titles "Detective" and "Officer" might be used interchangeably.
    Be aware that the titles "Technician" and "Tech" might be used interchangeably.

    Question: {question}

    Roles and Responses:""",
)


# Law-enforcement titles (and the abbreviations the extraction prompt allows)
# that may accompany a name. Every alternative is matched as a whole word, so
# letters inside ordinary names ("Bill", "Ollie") are never mistaken for a
# title.
_TITLE_ALTERNATIVES = (
    r"Detective|Det|Officer|Ofc|Deputy|Captain|Capt|Cpt|Corporal|Cpl|"
    r"Sergeant|Sgt|Lieutenant|Lt|Technician|Tech|Investigator|Coroner|"
    r"Criminalist|Patrolman|P\.O|P/O"
)
# A title with its optional trailing period, e.g. "Sgt." or "Officer".
_OFFICER_TITLE_RE = re.compile(rf"\b(?:{_TITLE_ALTERNATIVES})\b\.?", re.IGNORECASE)
# Same, plus the whitespace separating the title from the name that follows.
_OFFICER_TITLE_PREFIX_RE = re.compile(
    rf"\b(?:{_TITLE_ALTERNATIVES})\b\.?\s+", re.IGNORECASE
)


def clean_name(officer_name):
    """Return *officer_name* with law-enforcement titles removed.

    Only whole-word titles followed by whitespace are stripped (matching is
    case-insensitive), so ``"Sgt. John Smith"`` becomes ``"John Smith"`` while
    ``"Bill Quinton"`` is returned unchanged.
    """
    return _OFFICER_TITLE_PREFIX_RE.sub("", officer_name)


def extract_officer_data(formatted_response):
    """Parse "Officer Name"/"Officer Context" lines into a list of records.

    The input is expected to contain lines such as ``Officer Name 1: Sgt. John
    Smith`` followed by ``Officer Context 1: ...``. Leading/trailing whitespace
    on each line is ignored, and everything after the first colon is the value.

    Returns:
        A list of dicts with the keys ``"Officer Name"`` (title stripped),
        ``"Officer Context"`` and ``"Officer Title"`` (``""`` when the name
        carries no recognised title). One record is produced per context
        line, attributed to the most recent name line.
    """
    officer_data = []
    officer_name = None  # None until the first "Officer Name" line is seen
    officer_title = ""

    for raw_line in formatted_response.split("\n"):
        line = raw_line.strip()

        if line.startswith("Officer Name"):
            # partition() yields "" when the colon is missing instead of
            # raising like split(":", 1)[1] would.
            raw_name = line.partition(":")[2].strip()
            title_match = _OFFICER_TITLE_RE.search(raw_name)
            officer_title = title_match.group() if title_match else ""
            officer_name = clean_name(raw_name)
        elif line.startswith("Officer Context"):
            if officer_name is None:
                # A context with no preceding name cannot be attributed to
                # anyone; skip it rather than crash on an unbound name.
                logging.getLogger(__name__).warning(
                    "Skipping context line with no preceding officer name: %r", line
                )
                continue
            officer_data.append(
                {
                    "Officer Name": officer_name,
                    "Officer Context": line.partition(":")[2].strip(),
                    "Officer Title": officer_title,
                }
            )

    return officer_data


def generate_hypothetical_embeddings():
    """Build and return the HyDE embedder.

    A default ``OpenAI`` LLM is combined with ``PROMPT_TEMPLATE_HYDE`` in an
    ``LLMChain``; that chain and a default ``OpenAIEmbeddings`` instance are
    then handed to ``HypotheticalDocumentEmbedder``.
    """
    hyde_chain = LLMChain(llm=OpenAI(), prompt=PROMPT_TEMPLATE_HYDE)
    return HypotheticalDocumentEmbedder(
        llm_chain=hyde_chain,
        base_embeddings=OpenAIEmbeddings(),
    )


def get_response_from_query(db, query, temperature, k):
    """Retrieve the chunks most similar to *query* and extract officers from them.

    Args:
        db: Vector store exposing ``similarity_search(query, k=...)``.
        query: Natural-language question, used both for retrieval and as the
            ``question`` input of ``PROMPT_TEMPLATE_MODEL``.
        temperature: Sampling temperature for the chat model.
        k: Number of chunks to retrieve.

    Returns:
        A ``(officer_data, docs)`` tuple: the records produced by
        ``extract_officer_data`` and the retrieved documents.
    """
    logger.info("Performing query...")
    docs = db.similarity_search(query, k=k)
    pprint.pprint(docs)
    docs_page_content = " ".join(d.page_content for d in docs)
    pprint.pprint(docs_page_content)

    # Temperature is a model setting. Passing it to chain.run() only adds an
    # unused chain input, leaving the model at its default temperature.
    llm = ChatOpenAI(model_name="gpt-3.5-turbo-0613", temperature=temperature)
    chain = LLMChain(llm=llm, prompt=PROMPT_TEMPLATE_MODEL)
    response = chain.run(question=query, docs=docs_page_content)

    # The first piece of the split is whatever precedes the first
    # "Officer Name:" marker (empty, or model preamble) — never an officer.
    _preamble, *entries = response.split("Officer Name:")

    numbered_entries = []
    for i, entry in enumerate(entries, start=1):
        if not entry.strip():
            continue
        numbered = entry.replace("Officer Context:", f"Officer Context {i}:")
        numbered_entries.append(f"Officer Name {i}:{numbered}\n\n")
    formatted_response = "".join(numbered_entries)

    officer_data = extract_officer_data(formatted_response)
    return officer_data, docs


# QUERIES = [
#     "Identify individuals, by name, with the specific titles of police officers, sergeants, lieutenants, captains, detectives, homicide officers, and crime lab personnel in the transcript. Provide the context of their mention, if available.",
#     "List individuals, by name, specifically titled as police officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel mentioned in the transcript and provide the context of their mention.",
#     "Locate individuals, by name, specifically referred to as police officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript and explain their context of mention.",
#     "Highlight individuals, by name, explicitly titled as police officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript and describe their context of mention.",
#     "Outline individuals, by name, directly identified as police officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript and specify their context of mention.",
#     "Pinpoint individuals, by name, directly labeled as police officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript and provide their context of mention.",
# ]


# Questions run against every document: process_query calls
# get_response_from_query once per entry and stores the query text in the
# "Query" column next to each extracted officer.
QUERIES = [
    "Identify individuals, by name, with the specific titles of officers, sergeants, lieutenants, captains, detectives, homicide officers, and crime lab personnel in the transcript. Specifically, provide the context of their mention related to key events in the case, if available.",
    "List individuals, by name, directly titled as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel mentioned in the transcript. Provide the context of their mention in terms of any significant decisions they made or actions they took.",
    "Locate individuals, by name, directly referred to as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Explain the context of their mention in relation to their interactions with other individuals in the case.",
    "Highlight individuals, by name, directly titled as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Describe the context of their mention, specifically noting any roles or responsibilities they held in the case.",
    "Outline individuals, by name, directly identified as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Specify the context of their mention in terms of any noteworthy outcomes or results they achieved.",
    "Pinpoint individuals, by name, directly labeled as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Provide the context of their mention, particularly emphasizing any significant incidents or episodes they were involved in.",

]


def summarize_context(context):
    """Summarize *context* with a freshly created ``Summarizer``.

    The summarizer is called with ``min_length=60`` and its result is joined
    into a single string before being returned.
    """
    summarizer = Summarizer()
    return "".join(summarizer(context, min_length=60))


def preprocess_single_document(file_path, embeddings):
    """Load one Word document, chunk it and index the chunks in FAISS.

    Args:
        file_path: Path of the ``.docx`` file to load.
        embeddings: Embedding object passed to ``FAISS.from_documents``.

    Returns:
        The FAISS vector store built from the document's chunks.
    """
    logger.info(f"Processing Word document: {file_path}")
    loaded_pages = Docx2txtLoader(file_path).load()
    logger.info(f"Text loaded from Word document: {file_path}")

    # Chunking is controlled by the module-level CHUNK_SIZE / CHUNK_OVERLAP.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    chunks = splitter.split_documents(loaded_pages)

    return FAISS.from_documents(chunks, embeddings)


def process_query(embeddings, doc_directory="../../data/convictions/transcripts"):
    """Run every query in ``QUERIES`` against each transcript and save a CSV.

    For each ``.docx`` file in *doc_directory* that does not yet have a
    ``<file_name>.csv`` next to it, the document is indexed, all queries are
    executed, and the extracted officer records (annotated with the run
    parameters) are written to that CSV.

    Args:
        embeddings: Embedding object forwarded to ``preprocess_single_document``.
        doc_directory: Directory holding the transcripts; defaults to the
            location the script has always used.
    """
    # sorted() makes the processing order deterministic; os.listdir() returns
    # entries in arbitrary order.
    for file_name in sorted(os.listdir(doc_directory)):
        if not file_name.endswith(".docx"):
            continue
        if file_name.startswith("~$"):
            # Lock file Word creates while a document is open — not a transcript.
            continue

        csv_output_path = os.path.join(doc_directory, f"{file_name}.csv")
        if os.path.exists(csv_output_path):
            logger.info(f"CSV output for {file_name} already exists. Skipping...")
            continue

        file_path = os.path.join(doc_directory, file_name)
        db = preprocess_single_document(file_path, embeddings)

        output_data = []
        for query in QUERIES:
            officer_data, _ = get_response_from_query(db, query, TEMPERATURE, k)
            for item in officer_data:
                # Record the full run configuration with every row so results
                # from different settings can be compared later.
                item.update(
                    {
                        "Query": query,
                        "Prompt Template for Hyde": PROMPT_TEMPLATE_HYDE,
                        "Prompt Template for Model": PROMPT_TEMPLATE_MODEL,
                        "Chunk Size": CHUNK_SIZE,
                        "Chunk Overlap": CHUNK_OVERLAP,
                        "Temperature": TEMPERATURE,
                        "k": k,
                        "hyde": "1",
                    }
                )
            output_data.extend(officer_data)

        pd.DataFrame(output_data).to_csv(csv_output_path, index=False)

def main():
    """Create the HyDE embedder and pass it to ``process_query``."""
    process_query(generate_hypothetical_embeddings())


if __name__ == "__main__":
    main()

2023-07-21 10:53:42,241 - INFO - Processing Word document: ../../data/convictions/transcripts\Jimmy Bass Trial Transcript Complete.docx
2023-07-21 10:53:42,288 - INFO - Text loaded from Word document: ../../data/convictions/transcripts\Jimmy Bass Trial Transcript Complete.docx
2023-07-21 10:53:47,514 - INFO - Performing query...


[Document(page_content="an investigator with the police department, also, and Ollie\nWhite, also an investigator with the police department, came\nto the scene and Charles Anderson, deputy sheriff with the\nBolivar County Sheriff's Department; Butch Prescott, an\ninvestigator with the Bolivar County Sheriff's Department,\nand several auxiliary officers were coming 10/8 at that time.\nQ. And what happens when you, as an investigator,\ngo to a scene of a crime such as this?\nQuinton - Direct.\n31\n-\n......", metadata={'source': '../../data/convictions/transcripts\\Jimmy Bass Trial Transcript Complete.docx'}),
 Document(page_content="32\nYes, sir.\nA.\nQ. Were there any suspects there at the scene when\nyou arrived?\nNo, sir.\nA.\nDid you know who might have been involved when you\nQ.\narrived?\nA.\nNo, sir.\nAnd what then took place?\nQ.\nMyself and Officer Ollie White went out to the\nA.\nhospital to attempt to get a description of some type from\nthe victim. But at that time we were t

2023-07-21 10:54:11,438 - INFO - Performing query...


[Document(page_content='about persons that she had seen in the lineups that had\nbeen presented to her by Mr. Quinton and other officers and\nher identification of persons in those lineups and also the\nVCR lineup. The lineups had been talked about in direct\nexamination and cross-examination. All of this has been\ntalked about by the various witnesses in the case, at least\nMr. Quinton and Mrs. Townsend. The photographs have not\nbeen offered into evidence, any of the lineups, but they', metadata={'source': '../../data/convictions/transcripts\\Jimmy Bass Trial Transcript Complete.docx'}),
 Document(page_content="at the police station.\nBY MR. MELLEN: That's all.\nBILL QUINTON\nupon being called to testify as a rebuttal witness for and\non behalf of the State, after having been previously sworn,\n-\ntestified as follows, to-wit:\nDIRECT EXAMINATION BY MR. MELLEN:\nQ. Mr. Quinton, you have testified earlier, so\nwe know where you work and so forth. I want to ask you,\nif in the investig

2023-07-21 10:54:28,284 - INFO - Performing query...


[Document(page_content="an investigator with the police department, also, and Ollie\nWhite, also an investigator with the police department, came\nto the scene and Charles Anderson, deputy sheriff with the\nBolivar County Sheriff's Department; Butch Prescott, an\ninvestigator with the Bolivar County Sheriff's Department,\nand several auxiliary officers were coming 10/8 at that time.\nQ. And what happens when you, as an investigator,\ngo to a scene of a crime such as this?\nQuinton - Direct.\n31\n-\n......", metadata={'source': '../../data/convictions/transcripts\\Jimmy Bass Trial Transcript Complete.docx'}),
 Document(page_content='investigating?\nA. Mary Townsend, who had been employed there at the\n61 Quiki for about eight or nine days, had been shot in an\narmed robbery.\nQ. Okay. Then your investigation was of an armed\nrobbery?\nQuinton - Direct\n32\nYes, sir.\nA.\nQ. Were there any suspects there at the scene when\nyou arrived?\nNo, sir.\nA.\nDid you know who might have been invo

2023-07-21 10:54:48,597 - INFO - Performing query...


[Document(page_content='investigating?\nA. Mary Townsend, who had been employed there at the\n61 Quiki for about eight or nine days, had been shot in an\narmed robbery.\nQ. Okay. Then your investigation was of an armed\nrobbery?\nQuinton - Direct\n32\nYes, sir.\nA.\nQ. Were there any suspects there at the scene when\nyou arrived?\nNo, sir.\nA.\nDid you know who might have been involved when you\nQ.\narrived?\nA.\nNo, sir.\nAnd what then took place?\nQ.\nMyself and Officer Ollie White went out to the\nA.', metadata={'source': '../../data/convictions/transcripts\\Jimmy Bass Trial Transcript Complete.docx'}),
 Document(page_content="an investigator with the police department, also, and Ollie\nWhite, also an investigator with the police department, came\nto the scene and Charles Anderson, deputy sheriff with the\nBolivar County Sheriff's Department; Butch Prescott, an\ninvestigator with the Bolivar County Sheriff's Department,\nand several auxiliary officers were coming 10/8 at that time.\

2023-07-21 10:55:13,439 - INFO - Performing query...


[Document(page_content="are present with their attorneys and the State's attorneys\nare present. A question has arisen concerning the cross-\nIn Chambers - Jury Out\n79\nexamination of the witness and victim, Mary Townsend.\nDuring cross-examination, Mr. Wong was asking her questions\nabout persons that she had seen in the lineups that had\nbeen presented to her by Mr. Quinton and other officers and\nher identification of persons in those lineups and also the\nVCR lineup. The lineups had been talked about in direct", metadata={'source': '../../data/convictions/transcripts\\Jimmy Bass Trial Transcript Complete.docx'}),
 Document(page_content="at the police station.\nBY MR. MELLEN: That's all.\nBILL QUINTON\nupon being called to testify as a rebuttal witness for and\non behalf of the State, after having been previously sworn,\n-\ntestified as follows, to-wit:\nDIRECT EXAMINATION BY MR. MELLEN:\nQ. Mr. Quinton, you have testified earlier, so\nwe know where you work and so forth. I want to

2023-07-21 10:55:34,347 - INFO - Performing query...


[Document(page_content="32\nYes, sir.\nA.\nQ. Were there any suspects there at the scene when\nyou arrived?\nNo, sir.\nA.\nDid you know who might have been involved when you\nQ.\narrived?\nA.\nNo, sir.\nAnd what then took place?\nQ.\nMyself and Officer Ollie White went out to the\nA.\nhospital to attempt to get a description of some type from\nthe victim. But at that time we were told by the -- there\nwas a doctor there, strange to me, a female, who was the\nemergency room physician, told us that we couldn't talk with", metadata={'source': '../../data/convictions/transcripts\\Jimmy Bass Trial Transcript Complete.docx'}),
 Document(page_content="an investigator with the police department, also, and Ollie\nWhite, also an investigator with the police department, came\nto the scene and Charles Anderson, deputy sheriff with the\nBolivar County Sheriff's Department; Butch Prescott, an\ninvestigator with the Bolivar County Sheriff's Department,\nand several auxiliary officers were coming 10/8