In [1]:
import os
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.chains import LLMChain
from dotenv import find_dotenv, load_dotenv
from langchain.llms import OpenAI
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
import re
import pandas as pd
import logging
from summarizer import Summarizer
import pprint


# Module-wide logging: timestamped INFO-level messages on the root logger.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)


# Load environment variables from the nearest .env file.
# NOTE(review): presumably this supplies the OpenAI API key -- confirm.
load_dotenv(find_dotenv())
# Not referenced anywhere else in the visible code.
query_memory = []


# Chunking parameters handed to RecursiveCharacterTextSplitter in
# preprocess_single_document().
CHUNK_SIZE = 500
CHUNK_OVERLAP = 250
# Sampling temperature passed to get_response_from_query() by process_query().
TEMPERATURE = 0
# Number of chunks requested from the vector store per query.
k = 10


# Prompt used by get_response_from_query() for the extraction call.
# {question} receives the query text and {docs} the concatenated transcript
# chunks.  The "Officer Name: " / "Officer Context: " prefixes requested here
# are the markers that get_response_from_query() and extract_officer_data()
# parse, so they must not be reworded.
PROMPT_TEMPLATE_MODEL = PromptTemplate(
    input_variables=["question", "docs"],
    template="""
    As an AI assistant, my role is to meticulously analyze court transcripts and extract information about law enforcement personnel.
    The names of law enforcement personnel will be prefixed by one of the following titles: officer, detective, deputy, lieutenant, 
    sergeant, captain, coroner, investigator, criminalist, patrolman, or technician.

    Query: {question}

    Transcripts: {docs}

    The response will contain:

    1) The name of an officer, detective, deputy, lieutenant, 
       sergeant, captain, coroner, investigator, criminalist, patrolman, or technician - 
       if an individual's name is not associated with one of these titles they do not work in law enforcement.
       Please prefix the name with "Officer Name: ". 
       For example, "Officer Name: John Smith".

    2) If available, provide an in-depth description of the context of their mention. 
       If the context induces ambiguity regarding the individual's employment in law enforcement, 
       remove the individual.
       Please prefix this information with "Officer Context: ". 

    Continue this pattern of identifying persons, until all law enforcement personnel are identified.  

    Additional guidelines for the AI assistant:
    - Titles may be abbreviated to the following Sgt., Cpl, Cpt, Det., Ofc., Lt., P.O. and P/O
    - Titles "Technician" and "Tech" might be used interchangeably.
    - Derive responses from factual information found within the court transcripts.
    - If the context of an identified person's mention is not clear in the transcript, provide their name and note that the context is not specified.
    - Do not extract information about victims and witnesses
""",
)


# Prompt for the HyDE (hypothetical document) chain built in
# generate_hypothetical_embeddings(); {question} receives the query text.
PROMPT_TEMPLATE_HYDE = PromptTemplate(input_variables=["question"], template="""
    You're an AI assistant specializing in criminal justice research. 
    Your main focus is on identifying the names and providing detailed context of mention for each law enforcement personnel. 
    This includes police officers, detectives, deputies, lieutenants, sergeants, captains, technicians, coroners, investigators, patrolmen, and criminalists, 
    as described in court transcripts.
    Be aware that the titles "Detective" and "Officer" might be used interchangeably.
    Be aware that the titles "Technician" and "Tech" might be used interchangeably.

    Question: {question}

    Roles and Responses:""")


# Rank titles, and the abbreviations the extraction prompt allows, that may
# precede an officer's name.  Every alternative is anchored with \b so only
# whole words match; the previous pattern contained a bare `[CcPpLl]` class
# that deleted any c/p/l letter followed by whitespace ("Paul Smith" ->
# "PauSmith").
_TITLE_PREFIX_RE = re.compile(
    r"\b(?:Detective|Det|Officer|Ofc|Deputy|Captain|Cpt|Corporal|Cpl"
    r"|Sergeant|Sgt|Lieutenant|Lt|Technician|Tech|Investigator|Coroner"
    r"|Criminalist|Patrolman|P\.O|P/O)\b\.?\s+",
    re.IGNORECASE,
)


def clean_name(officer_name):
    """Return *officer_name* with rank titles removed.

    Every whole-word title or abbreviation (optionally followed by a period)
    that is followed by whitespace is deleted, so "Det. Sgt. Jane Doe"
    becomes "Jane Doe".  Matching is case-insensitive.  Text that contains
    no title is returned unchanged.

    Args:
        officer_name: Name string, possibly prefixed by one or more titles.

    Returns:
        The name with title prefixes stripped.
    """
    return _TITLE_PREFIX_RE.sub("", officer_name)


# Whole-word rank titles and abbreviations, with an optional trailing period.
# Anchored with \b so that a stray letter inside a name is never reported as
# a title (the previous pattern's bare `[CcPpLl]` class matched the first
# c/p/l letter anywhere in the name).
_OFFICER_TITLE_RE = re.compile(
    r"\b(?:Detective|Det|Officer|Ofc|Deputy|Captain|Cpt|Corporal|Cpl"
    r"|Sergeant|Sgt|Lieutenant|Lt|Technician|Tech|Investigator|Coroner"
    r"|Criminalist|Patrolman|P\.O|P/O)\b\.?",
    re.IGNORECASE,
)


def extract_officer_data(formatted_response):
    """Parse "Officer Name" / "Officer Context" lines into records.

    The text is scanned line by line (surrounding whitespace ignored).  A
    line starting with "Officer Name" sets the current name and title; a
    line starting with "Officer Context" emits one record for the current
    name.  Only the text on the context line itself is captured, and a name
    that is never followed by a context line produces no record.

    Args:
        formatted_response: Newline-separated text containing the markers.

    Returns:
        A list of dicts with the keys "Officer Name", "Officer Context" and
        "Officer Title" (the title is "" when none is recognised).
    """
    officer_data = []
    officer_name = None  # no name line seen yet
    officer_title = ""

    for raw_line in formatted_response.split("\n"):
        line = raw_line.strip()

        if line.startswith("Officer Name"):
            # partition() tolerates a missing colon (yields an empty name)
            # where indexing the result of split() raised IndexError.
            raw_name = line.partition(":")[2].strip()
            title_match = _OFFICER_TITLE_RE.search(raw_name)
            officer_title = title_match.group() if title_match else ""
            officer_name = clean_name(raw_name)
        elif line.startswith("Officer Context"):
            if officer_name is None:
                # A context with no preceding name cannot be attributed to
                # anyone; previously this raised UnboundLocalError.
                continue
            officer_data.append(
                {
                    "Officer Name": officer_name,
                    "Officer Context": line.partition(":")[2].strip(),
                    "Officer Title": officer_title,
                }
            )

    return officer_data


def generate_hypothetical_embeddings():
    """Build the embedding objects and return the plain OpenAI embeddings.

    A HyDE embedder is assembled from an OpenAI LLM chain driven by
    PROMPT_TEMPLATE_HYDE, but it is not the value returned: callers receive
    the base ``OpenAIEmbeddings`` instance.
    NOTE(review): this looks deliberate, since process_query() records
    "hyde": "0" for every row -- confirm before switching the return value.
    """
    hyde_chain = LLMChain(llm=OpenAI(), prompt=PROMPT_TEMPLATE_HYDE)
    base_embeddings = OpenAIEmbeddings()

    # Constructed exactly as before; the resulting object is discarded.
    HypotheticalDocumentEmbedder(
        llm_chain=hyde_chain,
        base_embeddings=base_embeddings,
    )

    return base_embeddings


def get_response_from_query(db, query, temperature, k):
    """Retrieve chunks for *query*, ask GPT-4 for officers, and parse the reply.

    Args:
        db: Vector store exposing ``similarity_search_with_score``.
        query: Natural-language question to search for and to ask the model.
        temperature: Sampling temperature for the chat model.
        k: Number of chunks to retrieve.

    Returns:
        A 2-tuple ``(officer_data, response)``: the records produced by
        extract_officer_data() and the raw model reply.  Callers in this
        file unpack two values; the previous trailing-comma return yielded
        a 1-tuple, which made that unpacking fail.
    """
    logger.info("Performing query...")
    scored_docs = db.similarity_search_with_score(query, k=k)

    # Order by score, highest first, then place the top third first, the
    # bottom third second and the middle third last.  Slices of a sorted
    # list are already sorted, so no per-third re-sort is needed.
    # NOTE(review): LangChain's FAISS wrapper documents this score as an L2
    # distance (lower = more similar).  If so, "highest" here means *least*
    # similar and the closest chunks end up in the middle of the prompt --
    # confirm the intended ordering.
    ranked = sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
    third = len(ranked) // 3
    highest_third = ranked[:third]
    middle_third = ranked[third:2 * third]
    lowest_third = ranked[2 * third:]
    reordered = highest_third + lowest_third + middle_third

    docs_page_content = " ".join(pair[0].page_content for pair in reordered)

    # The temperature has to be set on the model.  Passing it to chain.run()
    # only adds an unused chain input, leaving the model at its default.
    llm = ChatOpenAI(model_name="gpt-4", temperature=temperature, verbose=True)
    chain = LLMChain(llm=llm, prompt=PROMPT_TEMPLATE_MODEL, verbose=True)
    response = chain.run(question=query, docs=docs_page_content)

    # Everything before the first "Officer Name:" marker is preamble, not an
    # officer, so it is dropped.  Numbering still starts at 1 for the first
    # officer, as it did when the preamble was empty.
    segments = response.split("Officer Name:")[1:]
    formatted_parts = []
    for i, segment in enumerate(segments, start=1):
        if not segment.strip():
            continue
        numbered = segment.replace("Officer Context:", "Officer Context " + str(i) + ":")
        formatted_parts.append(f"Officer Name {i}:{numbered}\n\n")
    formatted_response = "".join(formatted_parts)

    officer_data = extract_officer_data(formatted_response)
    return officer_data, response



# QUERIES = [
    # "Identify individuals, by name, with the specific titles of officers, sergeants, lieutenants, captains, detectives, homicide officers, and crime lab personnel in the transcript. Specifically, provide the context of their mention related to key events in the case, if available.",
#     "List individuals, by name, directly titled as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel mentioned in the transcript. Provide the context of their mention in terms of any significant decisions they made or actions they took.",
#     "Locate individuals, by name, directly referred to as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Explain the context of their mention in relation to their interactions with other individuals in the case.",
    # "Highlight individuals, by name, directly titled as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Describe the context of their mention, specifically noting any roles or responsibilities they held in the case.",
    # "Outline individuals, by name, directly identified as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Specify the context of their mention in terms of any noteworthy outcomes or results they achieved.",
    # "Pinpoint individuals, by name, directly labeled as officers, sergeants, lieutenants, captains, detectives, homicide units, and crime lab personnel in the transcript. Provide the context of their mention, particularly emphasizing any significant incidents or episodes they were involved in.",
# ]


# Queries that process_query() runs against every document on every
# iteration; each extracted record stores the query that produced it.
QUERIES = [
    "Identify individuals, by name, with the specific titles of officers, sergeants, lieutenants, captains, detectives, homicide officers, and crime lab personnel in the transcript. Specifically, provide the context of their mention related to key events in the case, if available.",
]


def summarize_context(context):
    """Summarize *context* with a freshly constructed ``Summarizer``.

    The summarizer is called with ``min_length=60`` and its output is
    joined into a single string, which is returned.
    """
    summarizer = Summarizer()
    return "".join(summarizer(context, min_length=60))


def preprocess_single_document(file_path, embeddings):
    """Load one Word document, split it into chunks, and index it in FAISS.

    Args:
        file_path: Path of the .docx file to load.
        embeddings: Embedding object handed to ``FAISS.from_documents``.

    Returns:
        The FAISS store built from the document's chunks.
    """
    logger.info(f"Processing Word document: {file_path}")

    documents = Docx2txtLoader(file_path).load()
    logger.info(f"Text loaded from Word document: {file_path}")

    # Chunk size and overlap come from the module-level constants.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    chunks = splitter.split_documents(documents)

    return FAISS.from_documents(chunks, embeddings)


def process_query(
    embeddings,
    doc_directory="../../../data/convictions/transcripts/iterative",
    iteration_times=6,
):
    """Run every query against every .docx transcript and write one CSV each.

    For each ``*.docx`` file in *doc_directory* that does not already have a
    ``<file_name>.csv`` next to it, the document is indexed once, every
    query in QUERIES is run *iteration_times* times, and all extracted
    officer records (annotated with the run parameters) are written to that
    CSV.

    Args:
        embeddings: Embedding object used to index each document.
        doc_directory: Folder scanned for .docx files and used for output.
        iteration_times: How many times the query set is repeated per file.

    Returns:
        None.
    """
    for file_name in os.listdir(doc_directory):
        if not file_name.endswith(".docx"):
            continue

        csv_output_path = os.path.join(doc_directory, f"{file_name}.csv")
        if os.path.exists(csv_output_path):
            logger.info(f"CSV output for {file_name} already exists. Skipping...")
            continue

        file_path = os.path.join(doc_directory, file_name)

        # The index depends only on the file and the embeddings, so build it
        # once per file instead of re-embedding on every iteration.
        db = preprocess_single_document(file_path, embeddings)

        output_data = []
        for iteration in range(1, iteration_times + 1):
            for query in QUERIES:
                # get_response_from_query() returns a tuple whose first item
                # is the officer list; indexing works whatever its length,
                # whereas two-target unpacking raised ValueError on the
                # one-element tuple.
                officer_data = get_response_from_query(db, query, TEMPERATURE, k)[0]
                for item in officer_data:
                    item.update(
                        {
                            "Query": query,
                            "Prompt Template for Hyde": PROMPT_TEMPLATE_HYDE,
                            "Prompt Template for Model": PROMPT_TEMPLATE_MODEL,
                            "Chunk Size": CHUNK_SIZE,
                            "Chunk Overlap": CHUNK_OVERLAP,
                            "Temperature": TEMPERATURE,
                            "k": k,
                            "hyde": "0",
                            "iteration": iteration,
                        }
                    )
                output_data.extend(officer_data)

        pd.DataFrame(output_data).to_csv(csv_output_path, index=False)

def main():
    """Build the embeddings and run the query pipeline over the transcripts."""
    process_query(generate_hypothetical_embeddings())


if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
2023-08-04 16:46:28,155 - INFO - Processing Word document: ../../../data/convictions/transcripts/iterative\(C) Det. Martin Venezia Testimony - Trial One.docx
2023-08-04 16:46:28,200 - INFO - Text loaded from Word document: ../../../data/convictions/transcripts/iterative\(C) Det. Martin Venezia Testimony - Trial One.docx
2023-08-04 16:46:30,775 - INFO - Loading faiss with AVX2 support.
2023-08-04 16:46:30,776 - INFO - Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
2023-08-04 16:46:30,777 - INFO - Loading faiss.
2023-08-04 16:46:30,813 - INFO - Successfully loaded faiss.
2023-08-04 16:46:30,854 - INFO - Performing query...


TypeError: 'Document' object is not subscriptable