In [1]:
import pandas as pd
import langchain
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory, ConversationSummaryMemory, ConversationSummaryBufferMemory
from langchain.memory import ChatMessageHistory
from langchain.schema import message_to_dict
from langchain.callbacks import get_openai_callback
import threading
import json
import time
from collections import OrderedDict
from openai import OpenAI

from langchain.prompts.chat import (
            ChatPromptTemplate,
            SystemMessagePromptTemplate,
            AIMessagePromptTemplate,
            HumanMessagePromptTemplate,
            MessagesPlaceholder
        )
from langchain.chains import LLMChain
from langchain_core.messages import AIMessage, HumanMessage
import re
import openai
import pinecone
import numpy as np
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import chromadb

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader, TextLoader
from chromadb.utils import embedding_functions

  from tqdm.autonotebook import tqdm


In [2]:
# Shared OpenAI client used for both embeddings and chat completions.
# The API key is intentionally not written in the notebook: when no `api_key`
# argument is given, the client reads it from the OPENAI_API_KEY environment
# variable, so export that variable before starting the kernel.
openai_client = OpenAI()

In [3]:
# Load the question catalogue from a plain-text file.
# NOTE(review): the pairing below assumes the file alternates "code:" lines and
# question lines — confirm against the actual file.
with open('search_data_simple_GDPR.txt', 'r') as file:
    codes_questions = file.readlines()

# Drop the newline / tab characters surrounding every line.
questions = [item.strip('\n\t') for item in codes_questions]

# Even positions hold the codes, odd positions the matching questions.
# zip() stops at the shorter slice, so a trailing line without a partner
# (e.g. a blank last line) is ignored instead of raising IndexError.
questions_all = {
    code.strip(':'): question
    for code, question in zip(questions[0::2], questions[1::2])
}


In [4]:
# LangChain Chroma vector-store wrapper created with default arguments.
# NOTE(review): `chroma` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
chroma = Chroma()
# Native chromadb client created with default arguments; analyse_policies()
# obtains its collection from this object.
client = chromadb.PersistentClient()

In [5]:
def get_embedding(input):
    """Return the embedding of ``input`` from the ``text-embedding-ada-002`` model.

    One request is sent through the module-level ``openai_client``; the value
    returned is the ``embedding`` attribute of the first entry in the
    response's ``data`` list.
    """
    embedding_response = openai_client.embeddings.create(
        model='text-embedding-ada-002',
        input=input,
    )
    first_entry = embedding_response.data[0]
    return first_entry.embedding

In [7]:
def chunk_text(text, chunk_size=15000, chunk_overlap=1000, max_chunk_size=21000):
    """Split ``text`` into sentence-aligned chunks, each prefixed with a short overlap.

    The text is split into sentences on whitespace that follows ``.`` or ``?``
    (with look-behinds that avoid splitting after abbreviation-like patterns).
    Sentences are accumulated, each followed by a single space, until adding
    the next one would push the chunk body past ``chunk_size``; the body is
    then emitted and a new one is started. Every chunk after the first is
    prefixed with the last three sentences of the chunk emitted before it.

    No text is ever dropped:
      * a sentence longer than ``chunk_size`` becomes a chunk of its own;
      * a sentence longer than ``max_chunk_size`` is cut into consecutive
        slices of at most ``max_chunk_size`` characters, one chunk per slice
        (slices are not reused as overlap for the following chunk);
      * blank sentences (e.g. from an empty text) are skipped.

    Args:
        text: The text to split.
        chunk_size: Target maximum length of a chunk body, in characters.
        chunk_overlap: Accepted for signature compatibility; not used. The
            overlap is always the last three sentences of the previous chunk.
        max_chunk_size: Hard cap for a single sentence before it is sliced.

    Returns:
        list[str]: The chunks in document order. The size limits apply to the
        chunk body; the overlap prefix is added on top of it.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks = []
    previous_sentences = []  # sentences of the most recently emitted chunk
    current_chunk = ''
    current_sentences = []

    def emit(body, body_sentences):
        """Append ``body`` to ``chunks`` with the overlap taken from the previous chunk."""
        nonlocal previous_sentences
        overlap = ' '.join(previous_sentences[-3:])
        # Keep a separating space so the overlap is not glued to the body.
        chunks.append(overlap + ' ' + body if overlap else body)
        previous_sentences = body_sentences

    for sentence in sentences:
        if not sentence.strip():
            continue
        oversized = len(sentence) > max_chunk_size
        # Only flush a non-empty buffer, otherwise an empty chunk would be emitted.
        if current_chunk and (oversized or len(current_chunk) + len(sentence) > chunk_size):
            emit(current_chunk, current_sentences)
            current_chunk, current_sentences = '', []
        if oversized:
            # A single sentence that can never fit: hard-split it instead of losing it.
            for start in range(0, len(sentence), max_chunk_size):
                emit(sentence[start:start + max_chunk_size] + ' ', [])
            continue
        current_chunk += sentence + ' '
        current_sentences.append(sentence)

    if current_chunk:
        emit(current_chunk, current_sentences)

    return chunks

In [8]:
# Load the policies to analyse. The final cell passes this object to
# analyse_policies() as its `policies` mapping (policy id -> policy text).
# The encoding is spelled with its canonical name, matching the other JSON load.
with open('abstract_policies.json', 'r', encoding='utf-8') as file:
    abstracts_json = json.load(file)

In [9]:
def add_data_to_chroma(parts, policyid, collection_chroma):
    """Embed every chunk of one policy and store it in a Chroma collection.

    Each chunk is embedded with ``get_embedding`` and added together with its
    text and a ``{"source": <policy id>}`` metadata entry.

    Args:
        parts: Iterable of text chunks belonging to the policy.
        policyid: Identifier of the policy; stored as a string.
        collection_chroma: Chroma collection that receives the records.
    """
    source = str(policyid)
    for index, chunk in enumerate(parts):
        collection_chroma.add(
            embeddings=[get_embedding(chunk)],
            documents=[chunk],
            metadatas=[{"source": source}],
            # The same collection is shared by all policies, so a bare chunk
            # index would collide with the ids of previously stored policies.
            # Prefixing the policy id keeps every record id unique.
            ids=[f"{source}-{index}"]
        )

In [10]:
def add_closing_quote(string):
    """Append a single quote when ``string`` holds an odd number of them.

    Strings with an even number of single quotes (including none) are
    returned unchanged.
    """
    has_unpaired_quote = string.count("'") % 2 == 1
    return string + "'" if has_unpaired_quote else string

In [11]:
def add_matching_braces(string):
    """Balance the curly braces of ``string`` by adding the missing ones.

    The string is scanned once. Every ``}`` that has no earlier unmatched
    ``{`` gets an opening brace prepended to the string, and every ``{`` that
    is never closed gets a closing brace appended. Already balanced strings
    are returned unchanged.

    Braces are counted wherever they occur, including inside quoted text, so
    a brace that is part of a string value is treated as structural.

    Args:
        string: Text (typically a truncated JSON object) to repair.

    Returns:
        str: ``string`` with balanced curly braces.
    """
    open_count = 0      # '{' seen so far that still wait for their '}'
    missing_openers = 0  # '}' seen with no '{' left to pair with

    for char in string:
        if char == '{':
            open_count += 1
        elif char == '}':
            if open_count:
                open_count -= 1
            else:
                missing_openers += 1

    # An unmatched '}' needs an opener before it; unclosed '{' need closers after.
    return '{' * missing_openers + string + '}' * open_count

In [12]:
def parse_response(response):
    """Decode the JSON answer stored for each key of ``response``.

    Every value is first passed through ``add_matching_braces``, printed, and
    then parsed with ``json.loads``. A falsy ``response`` (``None`` or an
    empty mapping) yields an empty dict.
    """
    parsed_answers = {}
    if not response:
        return parsed_answers

    for answer_key, raw_answer in response.items():
        repaired_answer = add_matching_braces(raw_answer)
        print(repaired_answer)
        parsed_answers[answer_key] = json.loads(repaired_answer)
    return parsed_answers

In [13]:
def policy_responses(policyid, policy, embedded_questions, collection_chroma):
    """Answer every question for one policy using its closest stored chunk.

    The policy is chunked with ``chunk_text`` and stored through
    ``add_data_to_chroma``. For each question the single nearest chunk is
    looked up in the collection and sent, together with the question, to the
    ``gpt-4-1106-preview`` chat model with a JSON-object response format.

    Args:
        policyid: Identifier of the policy being analysed.
        policy: Full text of the policy.
        embedded_questions: Mapping of question id to a dict with the keys
            ``'Embedding'`` (query vector) and ``'Text'`` (question wording).
        collection_chroma: Chroma collection used for storage and lookup.

    Returns:
        dict: question id -> raw message content returned by the model.
        Questions for which no chunk of this policy is found are left out.
    """
    policy_chunks = chunk_text(policy)
    add_data_to_chroma(policy_chunks, policyid, collection_chroma)
    policy_response = {}
    for questionid, question_content in embedded_questions.items():
        # The collection is shared between policies, so restrict the search to
        # records whose "source" metadata is this policy; otherwise the nearest
        # chunk may come from a different policy.
        matches = collection_chroma.query(
            query_embeddings=question_content['Embedding'],
            n_results=1,
            where={"source": str(policyid)}
        )['documents']
        if not matches or not matches[0]:
            # Nothing stored for this policy: skip instead of raising IndexError.
            print(f"No stored chunk found for policy {policyid}; skipping question {questionid}")
            continue
        closest_chunk = matches[0][0]

        context = """For the following policy:""" + f"{closest_chunk}" + """ answer the following question with a "Yes"/"No" answer and if the answer is "Yes" then provide an extract
        from the policy that is LONGER than 200 characters and best fits the answer, otherwise return an empty string, nothing else that differs from this. Make sure that your response
        is only in a JSON format like this and DO NOT PROVIDE ANY ADDITIONAL TEXT: `{"Answer": "Your answer", "Extract": "Extract from the policy"}`, where " Your Answer" 
        represents your answer to the question, "Extract from the policy" is the best fit extract (make sure it is composed of whole sentences and also don't include any quatiotion marks). Here is the question: """ f"{question_content['Text']}"
        print(len(context))

        response = openai_client.chat.completions.create(
                    model="gpt-4-1106-preview",
                    response_format={ "type": "json_object" },
                    messages=[{"role":"user","content": context}]
                )
        policy_response[questionid] = response.choices[0].message.content
    return policy_response
    

In [14]:
def extract_alphanumeric(text):
    """Return ``text`` with everything except ASCII letters and digits removed."""
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    return non_alphanumeric.sub('', text)

In [15]:
def get_response_by_policy_id(data, policy_id):
    """Look up the entry stored under ``policy_id`` in ``data``.

    Returns the stored value, or ``None`` when ``policy_id`` is not a key.
    """
    return data.get(policy_id)

In [16]:
def get_non_cached_questions(cached_questions, original_questions, question_embeddings):
    """Select the question embeddings that still need an answer.

    When ``cached_questions`` is falsy and there is at least one original
    question, ``question_embeddings`` itself is returned. Otherwise the result
    holds the embedding of every original question whose id is missing from
    ``cached_questions``. With no original questions the result is empty.
    """
    pending_embeddings = {}
    for question_id in original_questions.keys():
        if not cached_questions:
            # Nothing cached at all: every embedding is still pending.
            return question_embeddings
        if question_id in cached_questions.keys():
            continue
        pending_embeddings[question_id] = question_embeddings[question_id]
    return pending_embeddings

    

In [17]:
# Read the question set stored as JSON into `questions_for_answering`.
with open('search_questions.json', 'r', encoding='utf-8') as questions_file:
    questions_for_answering = json.load(questions_file)

In [18]:
# Shallow copy of every loaded policy, keyed exactly as in `abstracts_json`.
test_policies = dict(abstracts_json.items())

In [21]:
def save_responses(responses, filepath):
    """Write ``responses`` to ``filepath`` as JSON indented by four spaces.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(filepath, "w") as output_handle:
        json.dump(responses, output_handle, indent=4)

In [22]:
def save_response_txt(policy_id, response_content, file_path):
    """Append one policy's response to a text log.

    Two lines are written (policy id, then response) followed by a blank
    line; the file is opened in append mode so earlier entries are kept.
    """
    entry_lines = [
        f"PolicyID: {policy_id}\n",
        f"Response: {response_content}\n\n",
    ]
    with open(file_path, "a") as log_handle:
        log_handle.writelines(entry_lines)

In [23]:
def save_response_trial(policy_id, response, file_path):
    """Store ``response`` under ``policy_id`` in a JSON file, keeping other entries.

    The current content of ``file_path`` is loaded, the entry for
    ``policy_id`` is set (replacing any previous one), and the whole mapping
    is written back with a four-space indent. A missing or empty file is
    treated as an empty mapping, so a freshly obtained response is not lost
    just because the cache file has not been created yet.

    Args:
        policy_id: Key under which the response is stored.
        response: JSON-serialisable value to store.
        file_path: Path of the JSON cache file.

    Raises:
        json.JSONDecodeError: If the file exists with non-empty, invalid JSON.
    """
    try:
        with open(file_path, "r") as file:
            raw_content = file.read()
    except FileNotFoundError:
        raw_content = ""
    existing_data = json.loads(raw_content) if raw_content.strip() else {}

    existing_data[policy_id] = response

    with open(file_path, "w") as file:
        json.dump(existing_data, file, indent=4)

In [24]:
def analyse_policies(policies, questions_for_answering, cached_responses, cached_question_embeddings, question_embeddings_filepath, cached_response_filepath):
    """Answer the given questions for every policy, reusing cached work.

    Step 1 resolves an embedding for each question: cached ones are reused,
    missing ones are computed with ``get_embedding`` and the embeddings file
    is rewritten after each new one. The file always receives the previously
    cached embeddings plus the new ones, so existing cache entries are never
    dropped.

    Step 2 walks the non-empty policies. Questions that already have a cached
    answer for the policy are not asked again; a policy with nothing left to
    ask is neither re-embedded nor re-saved. New answers are merged with the
    cached ones before being saved through ``save_response_trial``.

    Args:
        policies: Mapping of policy id to policy text; falsy texts are skipped.
        questions_for_answering: Mapping of question id to question text.
        cached_responses: Mapping of policy id to previously saved answers.
        cached_question_embeddings: Mapping of question id to a dict with the
            keys ``"Embedding"`` and ``"Text"``.
        question_embeddings_filepath: JSON file that stores question embeddings.
        cached_response_filepath: JSON file that stores the answers per policy.

    Returns:
        dict: policy id -> mapping of question id to answer (cached answers
        merged with the newly obtained ones).
    """
    policies_responses = {}
    questions_embeddings = {}
    # Content of the embeddings file: what was cached before plus what is added now.
    persisted_embeddings = dict(cached_question_embeddings)
    for questionid, question in questions_for_answering.items():
        if questionid in cached_question_embeddings:
            questions_embeddings[questionid] = cached_question_embeddings[questionid]
            continue
        question_content = {"Embedding": get_embedding(question), "Text": question}
        questions_embeddings[questionid] = question_content
        persisted_embeddings[questionid] = question_content
        # Saved after every new embedding so an interrupted run keeps its progress.
        with open(question_embeddings_filepath, "w") as file:
            json.dump(persisted_embeddings, file, indent=4)

    collection_name = "whizz"
    for policyid, policy in policies.items():
        if not policy:
            continue
        cached_answers = get_response_by_policy_id(cached_responses, policyid)
        missing_question_embeddings = get_non_cached_questions(cached_answers, questions_for_answering, questions_embeddings)
        if not missing_question_embeddings:
            # Nothing left to ask: do not re-embed the policy and do not
            # overwrite its cached answers with an empty result.
            policies_responses[policyid] = cached_answers or {}
            continue
        collection_chroma = client.get_or_create_collection(name=collection_name)
        response = policy_responses(policyid, policy, missing_question_embeddings, collection_chroma)
        # Keep earlier answers; new answers win when a question id appears in both.
        merged_response = {**(cached_answers or {}), **response}
        save_response_trial(policyid, merged_response, cached_response_filepath)
        policies_responses[policyid] = merged_response
    return policies_responses


In [None]:
# Driver cell: load both caches and run the analysis over every loaded policy.
# Both paths below are empty placeholders and must be filled in before running.
# Each file is opened for reading and parsed as JSON here, so it has to exist
# and contain valid JSON beforehand.
cached_filepath = '' # filepath of json where answers are saved
question_embeddings_filepath = '' # filepath for json where questions are embedded
with open(cached_filepath, "r") as file:
    cached_responses = json.load(file)
with open(question_embeddings_filepath, "r") as file:
    question_embeddings = json.load(file)
# NOTE(review): this call uses `abstracts_json` and `questions_all`, while
# `test_policies` and `questions_for_answering` are also defined in earlier
# cells — confirm these are the intended inputs.
responses = analyse_policies(abstracts_json, questions_all, cached_responses, question_embeddings, question_embeddings_filepath, cached_filepath)
