In [55]:
import pandas as pd
import langchain
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory, ConversationSummaryMemory, ConversationSummaryBufferMemory
from langchain.memory import ChatMessageHistory
from langchain.schema import message_to_dict
from langchain.callbacks import get_openai_callback
import threading
import json
import time
from collections import OrderedDict
from openai import OpenAI

from langchain.prompts.chat import (
            ChatPromptTemplate,
            SystemMessagePromptTemplate,
            AIMessagePromptTemplate,
            HumanMessagePromptTemplate,
            MessagesPlaceholder
        )
from langchain.chains import LLMChain
from langchain_core.messages import AIMessage, HumanMessage
import re
import openai
import pinecone
import numpy as np
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import chromadb

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader, TextLoader
from chromadb.utils import embedding_functions
from langchain_openai.embeddings import OpenAIEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from langchain_core.documents.base import Document
from langchain_qdrant import Qdrant
from langchain_experimental.text_splitter import SemanticChunker


In [52]:
# Client for the Qdrant server reachable at localhost:6333; every collection
# and search call in this notebook goes through it.
client = QdrantClient(location="http://localhost:6333")

In [2]:
# Keep the secret out of the notebook: with no `api_key` argument the OpenAI
# client reads the key from the OPENAI_API_KEY environment variable and raises
# right away if it is not set.
openai_client = OpenAI()

In [3]:
# The questions file alternates between a code line ("<code>:") and the line
# holding the question text for that code.
with open('search_data_simple_GDPR.txt', 'r', encoding='utf-8') as file:
    codes_questions = file.readlines()

# Trim surrounding whitespace (including '\r' from Windows line endings) and
# drop blank lines so a stray separator cannot shift the code/question pairing.
questions = [line.strip() for line in codes_questions if line.strip()]

# An odd count means a code without a question (or the reverse); fail with a
# clear message instead of an IndexError halfway through the pairing.
if len(questions) % 2 != 0:
    raise ValueError(
        f"Expected alternating code/question lines, got {len(questions)} non-empty lines"
    )

# Map each code (without its ':' delimiter) to the question on the next line.
questions_all = {
    code.strip(':').strip(): question
    for code, question in zip(questions[0::2], questions[1::2])
}


In [99]:
# Load the policy abstracts. The rest of the notebook consumes this as a
# mapping of policy source -> policy text.
with open('abstract_policies.json', 'r', encoding='utf-8') as file:
    abstracts_json = json.load(file)

In [22]:
def get_embedding(input, model='text-embedding-ada-002'):
    """Return the OpenAI embedding vector for one piece of text.

    Args:
        input: Text to embed. The name shadows the builtin but is kept so
            existing callers keep working.
        model: Name of the OpenAI embedding model. The Qdrant collection in
            this notebook is created with ``size=1536``, so only switch to a
            model whose output dimension matches the collection.

    Returns:
        The ``embedding`` field of the first item in the API response.
    """
    response = openai_client.embeddings.create(model=model, input=input)
    return response.data[0].embedding

In [23]:
# OpenAIEmbeddings falls back to the OPENAI_API_KEY environment variable when
# no key is passed, which keeps the secret out of the notebook.
text_splitter = SemanticChunker(OpenAIEmbeddings())

In [100]:
# Wrap every policy text in a Document and remember which source it came from.
documents = [
    Document(page_content=policy_text, metadata={"source": policy_source})
    for policy_source, policy_text in abstracts_json.items()
]

In [102]:
# Split each policy into chunks. The per-policy Qdrant filter used later relies
# on every chunk still carrying its parent's `source` metadata.
docs = text_splitter.split_documents(documents=documents)


In [104]:
# Policies to evaluate: a shallow copy of every loaded abstract, in file order.
test_policies = dict(abstracts_json.items())

In [106]:
# Questions to ask: a shallow copy of every parsed code -> question pair.
questions_for_answering = dict(questions_all.items())

In [59]:
# Create the collection only on first use so re-running the cell is harmless.
# The vector size of 1536 has to match the length of the embeddings stored in it.
collection_missing = not client.collection_exists(collection_name="collection_openai")
if collection_missing:
    client.create_collection(
        collection_name="collection_openai",
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

In [108]:
# Embed the chunk texts in order so that doc_embeddings[i] belongs to docs[i].
chunk_texts = [chunk.page_content for chunk in docs]
doc_embeddings = list(map(get_embedding, chunk_texts))

In [109]:
# docs and doc_embeddings are produced by different cells; zip() would silently
# drop the surplus if one of them is stale, so verify the pairing first.
if len(docs) != len(doc_embeddings):
    raise ValueError(
        f"{len(docs)} chunks but {len(doc_embeddings)} embeddings - re-run the embedding cell"
    )

# Point ids are the chunk positions, so re-running this cell rewrites the same ids.
points = [
    PointStruct(
        id=index,
        vector=embedding,
        payload={"document_chunk": doc.page_content, "metadata": doc.metadata},
    )
    for index, (doc, embedding) in enumerate(zip(docs, doc_embeddings))
]

# Send the points in batches instead of issuing one request per chunk.
UPSERT_BATCH_SIZE = 64
for start in range(0, len(points), UPSERT_BATCH_SIZE):
    client.upsert(
        collection_name="collection_openai",
        points=points[start:start + UPSERT_BATCH_SIZE],
    )

In [110]:
# Embed each question once up front so the per-policy loop can reuse the vector.
questions_with_embeddings = {
    questionid: {"Text": question_text, "Embedding": get_embedding(question_text)}
    for questionid, question_text in questions_for_answering.items()
}

In [111]:
# For every policy, answer each question using the single best-matching chunk
# of that policy as context.
# Result shape: {policy source: {question id: raw model reply (JSON text) or None}}.
responses_all = {}
for link in test_policies:
    policy_response = {}
    for questionid, question_info in questions_with_embeddings.items():
        question = question_info["Text"]
        question_embedding = question_info["Embedding"]
        # Restrict the similarity search to chunks stored for this policy only.
        search_result = client.search(
            collection_name="collection_openai",
            query_vector=question_embedding,
            query_filter={
                "must": [
                    {
                        "key": "metadata.source",
                        "match": {
                            "value": link
                        }
                    }
                ]
            },
            limit=1
        )
        if not search_result:
            # Nothing is indexed for this policy (e.g. it produced no chunks).
            # Record the gap explicitly instead of aborting the whole run with
            # an IndexError and without inventing an answer.
            print(f"No indexed chunk found for policy {link!r}; skipping question {questionid!r}")
            policy_response[questionid] = None
            continue
        closest_chunk = search_result[0].payload['document_chunk']
        # The prompt text is part of the experiment and is kept exactly as written.
        context = """For the following policy:""" + f"{closest_chunk}" + """ answer the following question with a "Yes"/"No" answer and if the answer is "Yes" then provide an extract
        from the policy that is LONGER than 200 characters and best fits the answer, otherwise return an empty string, nothing else that differs from this. Make sure that your response
        is only in a JSON format like this and DO NOT PROVIDE ANY ADDITIONAL TEXT: `{"Answer": "Your answer", "Extract": "Extract from the policy"}`, where " Your Answer" 
        represents your answer to the question, "Extract from the policy" is the best fit extract (make sure it is composed of whole sentences and also don't include any quatiotion marks). Here is the question: """ f"{question}"

        response = openai_client.chat.completions.create(
                    model="gpt-3.5-turbo-0125",
                    response_format={ "type": "json_object" },
                    messages=[{"role":"user","content": context}]
                )
        # Stored as the raw reply string; it is not parsed into a dict here.
        policy_response[questionid] = response.choices[0].message.content
    responses_all[link] = policy_response

In [113]:
# open("") always fails, so the output needs a real path; change it as needed.
responses_path = "policy_responses.json"
with open(responses_path, 'w', encoding="utf-8") as json_file:  # save responses to a json
    json.dump(responses_all, json_file, indent=4)