In [None]:
import json
from pathlib import Path
from pprint import pprint


# Read the raw policies file; the resulting mapping's keys are used as policy
# URLs and its values as the policy content by the cells below.
file_path = 'abstract_policies.json'
data = json.loads(Path(file_path).read_text(encoding='utf-8'))

In [None]:
# Reshape the {url: content} mapping into a list of {'url', 'content'} records
# wrapped under a top-level "messages" key.
# Iterating over items() avoids the unused running counter and the second
# dictionary lookup per entry.
messages = [{'url': link, 'content': content} for link, content in data.items()]

new_json = {'messages': messages}

In [None]:
# Serialize the reshaped records to a JSON string, pretty-printed with a 4-space indent.
json_object = json.dumps(new_json, indent=4)

In [None]:
# Persist the serialized records; the same path is read back by the JSONLoader cell.
with open("abstract_policies_transformed.json", "w") as outfile:
    outfile.write(json_object)

In [None]:
import os
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
import torch

# Ask PyTorch to release its cached GPU memory before the models are loaded.
torch.cuda.empty_cache()

# Last expression of the cell, so the report is shown as the cell output.
# (device=None and abbreviated=False are the function's defaults.)
torch.cuda.memory_summary()

In [None]:
from langchain_community.document_loaders import JSONLoader

In [None]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Attach the record's ``url`` field to a document's metadata.

    Passed to ``JSONLoader`` as its ``metadata_func`` callback.

    Args:
        record: One JSON record; its ``"url"`` entry is read with ``dict.get``.
        metadata: Metadata dictionary for the document; updated in place.

    Returns:
        The same ``metadata`` object with ``"url"`` set (``None`` when the
        record has no ``"url"`` key).
    """
    metadata.update(url=record.get("url"))
    return metadata

In [None]:
# Load one document per entry of the top-level "messages" array: the entry's
# "content" field is used as the document content and metadata_func copies
# the entry's URL into the metadata.
loader = JSONLoader(
    file_path="abstract_policies_transformed.json",
    jq_schema=".messages[]",
    content_key="content",
    metadata_func=metadata_func,
)
docs = loader.load()

In [None]:
# Sanity check: show the first loaded document as the cell output.
docs[0]

In [None]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Embedding model used by both the semantic chunker and the FAISS index.
model_name = "BAAI/bge-base-en"

# Normalized embeddings, so that similarity corresponds to cosine similarity.
encode_kwargs = {'normalize_embeddings': True}

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    encode_kwargs=encode_kwargs,
    model_kwargs={'device': 'cuda'},  # embeddings are computed on the GPU
)


In [None]:
from langchain_experimental.text_splitter import SemanticChunker
# Chunker driven by the embedding model; split points are chosen with the
# "percentile" breakpoint threshold type.
text_splitter = SemanticChunker(
    embeddings=model_norm,
    breakpoint_threshold_type="percentile",
)

In [None]:
# Split every loaded policy document into chunks with the semantic chunker.
splits = text_splitter.split_documents(docs)

In [None]:
from langchain_community.vectorstores import FAISS

# Embed the chunks with model_norm and build the FAISS vector store from them.
db = FAISS.from_documents(splits, model_norm)

In [None]:
# GPTQ-quantized chat model used to answer the compliance questions.
model_name_or_path = "TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ"
model_basename = "model"

use_strict = False
use_triton = False

# Loaded once; the tokenizer is later handed to the text-generation pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# NOTE(review): these values are assumed to match the checkpoint being loaded
# -- confirm against the repository's quantization settings.
# To load a different branch (e.g. "gptq-8bit-64g-actorder_True"), pass
# `revision=...` to from_quantized and adjust this config accordingly.
quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=False,
)

# Load the quantized weights onto the first GPU.
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    use_safetensors=True,
    strict=use_strict,
    model_basename=model_basename,
    device="cuda:0",
    use_triton=use_triton,
    quantize_config=quantize_config,
)


In [None]:
def filter_documents(vector_store, query_metadata, question, k=4, fetch_k=20):
    """Return the chunks most similar to ``question`` that match ``query_metadata``.

    The metadata filter is handed to the vector store's ``similarity_search``
    so that it is applied to the ``fetch_k`` nearest candidates *before* the
    result is cut down to ``k``. Filtering only after a plain top-``k`` search
    discards every chunk of the requested policy whenever chunks from other
    policies rank higher.

    Args:
        vector_store: Store exposing
            ``similarity_search(query, k=..., filter=..., fetch_k=...)``
            (the FAISS store built in this notebook does).
        query_metadata: Mapping of metadata key to required value, e.g.
            ``{"url": "https://..."}``. Empty or ``None`` disables filtering.
        question: Query text used for the similarity search.
        k: Maximum number of documents to return.
        fetch_k: Number of nearest candidates fetched before filtering; raise
            it when the index holds many policies.

    Returns:
        A list of documents whose metadata contains every key/value pair of
        ``query_metadata``, in the order returned by the store.
    """
    metadata_filter = dict(query_metadata) if query_metadata else None
    candidates = vector_store.similarity_search(
        question, k=k, filter=metadata_filter, fetch_k=fetch_k
    )
    if not metadata_filter:
        return list(candidates)

    # Defensive re-check in case the store ignores or loosely applies the filter.
    required = metadata_filter.items()
    return [
        doc for doc in candidates
        if all(item in doc.metadata.items() for item in required)
    ]

In [None]:
# Function to get metadata filter from user input
def get_metadata_filter(user_input):
    """Extract the policy URL named on a ``Policy:`` line of ``user_input``.

    The URL is the first whitespace-delimited token after ``Policy:``. When
    the line lists several comma-separated URLs, or continues with the
    question text, only the first URL is used. If several lines carry a
    ``Policy:`` marker, the last one wins.

    Splitting the line on every ``":"`` would cut the URL at its scheme
    (``"https"``), so the text after the marker is taken as a whole instead.

    Args:
        user_input: Prompt text, possibly spanning several lines.

    Returns:
        ``{"url": <policy url>}`` when a policy is named, otherwise ``{}``.
    """
    query_metadata = {}
    for line in user_input.split('\n'):
        _, marker, remainder = line.partition('Policy:')
        if not marker:
            continue
        tokens = remainder.split()
        if not tokens:
            continue
        # Drop the comma that separates entries of a multi-URL list.
        url = tokens[0].rstrip(',')
        if url:
            query_metadata['url'] = url
    return query_metadata

In [None]:
# Sample user message: a "Policy: <url>" marker followed by one GDPR question.
prompt = " Policy: https://www.clarityenglish.com/privacy.php Do you provide the information about the identity and the contact details of the controllers and, where applicable, of the controller's representative?Companies which do not have their seat in the EEA should appoint a representative within the EU. ."
# System instructions plus the sample message, laid out in SYSTEM/USER/ASSISTANT form.
# NOTE(review): `template` is only referenced by the commented-out print at the
# end of this cell; the "Example:" label directly precedes the USER turn --
# confirm that this is the intended prompt layout.
template=f'''SYSTEM:You are a highly knowledgeable assistant with a strong foundation in GDPR principles and guidelines, as established by the European Union.
Your expertise encompasses data privacy, individual rights under GDPR, data processing requirements, and the obligations of data controllers and processors.

ANSWER THE QUESTION WITH YES/NO:
'Yes': if you assume that the policy has the requirements specified in the question
'No':otherwise

---------------------------------------------------------------
Example:
USER: {prompt}
ASSISTANT:
'''

# Silence the spurious transformers messages emitted when a pipeline wraps an
# AutoGPTQ model.
logging.set_verbosity(logging.CRITICAL)

print("*** Pipeline:")

# Generation settings forwarded to the text-generation pipeline.
# NOTE(review): temperature/top_p are sampling parameters and no `do_sample`
# flag is passed here -- confirm whether sampling is actually in effect.
generation_settings = dict(
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15,
)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# print(pipe(template)[0]['generated_text'])

In [None]:
# Wrap the transformers pipeline so that LangChain can call it as an LLM.
llm = HuggingFacePipeline(pipeline=pipe)

# Retrieval QA chain over the FAISS index: two chunks are retrieved per query
# and the source documents are returned alongside the answer.
top2_retriever = db.as_retriever(search_kwargs={"k": 2})
conversation_chain = RetrievalQA.from_llm(
    llm=llm,
    retriever=top2_retriever,
    return_source_documents=True,
)

In [None]:
import textwrap

# Function to answer yes/no question based on filtered documents
# def answer_yes_no_question(filtered_docs, question):
#     context = " ".join([doc.page_content for doc in filtered_docs])[:4000]
#     response = conversation_chain( f"Context: {context}\nQuestion: {question}")
#     return response

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its own line breaks.

    Each newline-separated line is wrapped independently with
    ``textwrap.fill`` and the results are joined with newlines again, so
    existing line breaks (including empty lines) are kept.

    Args:
        text: Text to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped text as a single string.
    """
    return '\n'.join(
        textwrap.fill(paragraph, width=width) for paragraph in text.split('\n')
    )

def llm_chatbot(question, max_docs=3, max_context_chars=None):
    """Answer ``question`` from chunks of the policy named in its ``Policy:`` line.

    The policy URL is parsed from the question, matching chunks are fetched
    from the ``db`` vector store, their text is joined into a context string
    and the module-level ``llm`` is prompted with that context and question.

    Args:
        question: Prompt text containing a ``Policy: <url>`` line and the
            question itself.
        max_docs: Maximum number of retrieved chunks placed in the context.
        max_context_chars: Optional hard limit on the context length in
            characters; ``None`` (the default) leaves the context untruncated.

    Returns:
        The text produced by ``llm``. It is also printed, so the loops that
        call this function for its output keep working.
    """
    query_metadata = get_metadata_filter(question)

    # Keep only the best-ranked chunks that belong to the requested policy.
    filtered_documents = filter_documents(db, query_metadata, question)[:max_docs]

    context = " ".join(doc.page_content for doc in filtered_documents)
    # The earlier length check ran only after the whole context had been
    # assembled, so it never limited anything; truncation is now explicit
    # and opt-in.
    if max_context_chars is not None:
        context = context[:max_context_chars]

    # The misspelled `use_gup` keyword argument was dropped from this call.
    answer = llm(f"Context: {context}\nQuestion: {question}")
    print(answer)
    return answer

In [None]:
from datetime import datetime


# Time one end-to-end llm_chatbot call; the "Policy:" line of this prompt
# lists three URLs followed by a single GDPR question.
start = datetime.now()
llm_chatbot("""Policy: https://www.clarityenglish.com/privacy.php, https://www.iris.co.uk/privacy-policy/, https://www.edukey.co.uk/privacy/ 
            Do you provide the information about the identity and the contact details of the controllers and, where applicable, of the controller's representative?Companies which do not have their seat in the EEA should appoint a representative within the EU. .
""")

end = datetime.now()
# Elapsed wall-clock time of the call.
print(end-start)

In [None]:
# Ask PyTorch to release its cached GPU memory after the test call.
torch.cuda.empty_cache()

# Last expression of the cell, so the report is shown as the cell output.
# (device=None and abbreviated=False are the function's defaults.)
torch.cuda.memory_summary()

In [None]:
# Read the requirement list. The context manager closes the file handle, and
# the explicit encoding matches how the JSON input is opened in this notebook.
with open("search_data_simple_GDPR.txt", "r", encoding="utf-8") as f:
    text = f.read().split("\n")

# Keep the requirement lines with tabs removed. Lines starting with "GDPR" are
# skipped, and so are blank lines (such as the empty string a trailing newline
# produces), which would otherwise be sent to the model as an empty question.
gdprs = [
    line.replace("\t", '')
    for line in text
    if line.strip() and not line.startswith("GDPR")
]

gdprs

In [None]:
# Policies to check. The stray leading space in front of the third URL was
# removed so that it is spelled like the other two.
policies = [
    'https://www.clarityenglish.com/privacy.php',
    'https://www.iris.co.uk/privacy-policy/',
    'https://www.edukey.co.uk/privacy/',
]

# Ask every GDPR requirement for every policy, recording the wall-clock
# duration of each call in `times`.
times = []
for policy in policies:
    for gdpr in gdprs:
        start = datetime.now()

        llm_chatbot(f"""Policy: {policy}
           {gdpr}""")
        end = datetime.now()
        times.append(end-start)



In [None]:
import numpy as np

# Total of the per-call durations collected in `times` by the loop above.
np.array(times).sum()

In [None]:
# NOTE(review): presumably the total from the previous cell, in seconds,
# converted to minutes. The literal is hand-copied -- confirm, and consider
# deriving it from `times` so that it survives a re-run.
717.63/60

In [None]:
# Re-read the raw policies mapping; its keys are used below to pick the
# policies for the larger runs.
file_path = 'abstract_policies.json'
data = json.loads(Path(file_path).read_text(encoding='utf-8'))


In [None]:
# Preview the ten policy keys (positions 30-39) used in the next run.
list(data)[30:40]

In [None]:
# Ten policies (positions 30-39) checked against the first ten requirements;
# the wall-clock duration of every call is collected in `times`.
policies = list(data)[30:40]
times = []
for policy in policies:
    for gdpr in gdprs[:10]:
        start = datetime.now()
        llm_chatbot(f"""Policy: {policy}
           {gdpr}""")
        end = datetime.now()
        times += [end - start]





In [None]:
import numpy as np

# Total of the per-call durations collected in `times` by the loop above.
np.array(times).sum()


In [None]:
# NOTE(review): presumably the total from the previous cell, in seconds,
# converted to minutes. The literal is hand-copied -- confirm, and consider
# deriving it from `times` so that it survives a re-run.
536.480860/60

In [None]:
# Fifty policies (positions 68-117) checked against every requirement; the
# wall-clock duration of every call is collected in `times`.
policies = list(data)[68:118]
times = []
for policy in policies:
    for gdpr in gdprs:
        start = datetime.now()
        llm_chatbot(f"""Policy: {policy}
           {gdpr}""")
        end = datetime.now()
        times += [end - start]



