# Loading LLM and Embedding model

In [5]:
import os
from langchain.schema import HumanMessage
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

model = ChatOpenAI()
embeddings = OpenAIEmbeddings()

# PDF loader

This part of the code will load the PDF documents and separate them by pages. The content of the PDFs is stored in objects of the Document class that has two elements: page_content and metadata. Metadata is useful for storing data of the data we want to include such as page number, section, document, document summary, etc.

To add more metadata, I created a 'library' csv file where I include all the extra information I want the documents to have.

In [20]:
from pathlib import Path
def extract_folder(doc):
    path = Path(doc.metadata['source'])
    return path.parent.name

def extract_name(doc):
    path = Path(doc.metadata['source'])
    return path.name

def input_metadata(doc, df):
    name = extract_name(doc)
    #print(name)
    df_copy = df[df['file_name'] == name].reset_index(drop=True)
    doc.metadata['field'] = df_copy.loc[0,'field']
    doc.metadata['link'] = df_copy.loc[0,'link']
    doc.metadata['description'] = df_copy.loc[0,'description']

In [17]:
import pandas as pd 
df = pd.read_csv('./library.csv', delimiter=';')
df

Unnamed: 0,file_name,field,link,description
0,EDS0 Introduction Data Science in Context.pdf,Theory,https://github.com/arturofredes/Ethical_AI_RAG...,
1,EDS1 Ethical Foundations.pdf,Theory,https://github.com/arturofredes/Ethical_AI_RAG...,
2,"EDS2 Legitimacy, values and decisions.pdf",Theory,https://github.com/arturofredes/Ethical_AI_RAG...,
3,EDS3 Fundamental Limits of ML.pdf,Theory,https://github.com/arturofredes/Ethical_AI_RAG...,
4,EDS4 Bias and Fairness I.pdf,Theory,https://github.com/arturofredes/Ethical_AI_RAG...,
5,Microsoft_Standards.pdf,Applied,https://github.com/arturofredes/Ethical_AI_RAG...,


In [18]:
from langchain_community.document_loaders import PyPDFLoader
#load multiple documents
loaders = [
PyPDFLoader(".\documents\EDS0 Introduction Data Science in Context.pdf"),
PyPDFLoader(".\documents\EDS1 Ethical Foundations.pdf"),
PyPDFLoader(".\documents\EDS2 Legitimacy, values and decisions.pdf"),
PyPDFLoader(".\documents\EDS3 Fundamental Limits of ML.pdf"),
PyPDFLoader(".\documents\EDS4 Bias and Fairness I.pdf"),
PyPDFLoader(".\documents\Microsoft_Standards.pdf")
]
docs = []
for l in loaders:
    docs.extend(l.load())

In [21]:
#añadir metadata departamento
for doc in docs:
    input_metadata(doc,df)
docs

[Document(page_content='Jordi Vitrià\nIntroduction  +  Data Science in Context', metadata={'source': '.\\documents\\EDS0 Introduction Data Science in Context.pdf', 'page': 0, 'field': 'Theory', 'link': 'https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS0%20Introduction%20Data%20Science%20in%20Context.pdf', 'description': nan}),
 Document(page_content='•Data science has the potential to be both beneﬁcial (Improved Decision-Making, Predictive Analytics, Personalized Services, Efﬁciency and Automation, etc.) and detrimental (Privacy Concerns, Bias and Fairness Issues, Security Risks, Loss of Jobs, Data Manipulation, etc.) to individuals (individual harms) and/or to society (sistemic risks).  •To help eliminate/mitigate any adverse effects, we must seek to understand the potential impact of our work for people. •In this course, we will explore the social and ethical ramiﬁcations of the choices we make at the different stages of the data analysis pipeline, from data coll

# Text splitter
In this step, we will divide the text into chunks, which are the pieces of informations that will be embedded and stored in the data base. We wil later retrieve them and give them as context to the LLM.

In [24]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
#splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
texts = text_splitter.split_documents(docs)

In [25]:
texts[3].metadata

{'source': '.\\documents\\EDS0 Introduction Data Science in Context.pdf',
 'page': 3,
 'field': 'Theory',
 'link': 'https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS0%20Introduction%20Data%20Science%20in%20Context.pdf',
 'description': nan}

# Embeddings y Vector store (chroma)

## Creating database
We will use Chroma, an opensource vector store included in langchain to store our chunks and embeddings. When we create the database, the chunks we created will be embedded using the selected model, in our case OpenAi's ada

In [27]:
from langchain.vectorstores import Chroma
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=embeddings,
                                 persist_directory=persist_directory)

This step will persist the database in memory, so next time we will only have to load it

In [28]:
vectordb.persist()

## Loading database

In [26]:
from langchain.vectorstores import Chroma
# Now we can load the persisted database from disk, and use it as normal. 
persist_directory = 'db'
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embeddings)

## Updating metadata
In some cases we would like to update the metadata once the database was created, we would do so with the following code.

In [3]:
from langchain_core.documents.base import Document

In [4]:
ids = vectordb.get()['ids']
docs = vectordb.get()['documents']
meta = vectordb.get()['metadatas']

In [6]:
from pathlib import Path
def extract_folder(metadata):
    path = Path(metadata['source'])
    return path.parent.name

def extract_name(metadata):
    path = Path(metadata['source'])
    return path.name

def input_metadata(doc, df):
    name = meta['source']
    #print(name)
    df_copy = df[df['file_name'] == name].reset_index(drop=True)
    doc.metadata['field'] = df_copy.loc[0,'field']
    doc.metadata['link'] = df_copy.loc[0,'link']
    doc.metadata['description'] = df_copy.loc[0,'description']

In [None]:
import pandas as pd 
df = pd.read_csv('.library.csv',delimiter=';')
df

In [8]:
errors =[]
for i in range(len(ids)):
    try:
        input_metadata(df,meta[i])
        doc = Document(page_content=docs[i], metadata=meta[i])
        vectordb.update_document(ids[i],doc)
    except Exception as e:
        errors.append(e)
        continue

In [9]:
print(len(errors))
errors[0]

15


KeyError(0)

In [None]:
vectordb.get(ids[820])

In [23]:
vectordb.persist()

## Filtering

We can filter the database before retrieving. This should help us to get a more specific response and get the programme running faster

In [29]:
vectordb.get(where={'field' : 'Theory'})

{'ids': ['a4ebdddc-e2db-11ee-bd0e-38fc98f3447b',
  'a4ebdddd-e2db-11ee-9fb0-38fc98f3447b',
  'a4ebddde-e2db-11ee-b583-38fc98f3447b',
  'a4ebdddf-e2db-11ee-adc4-38fc98f3447b',
  'a4ebdde0-e2db-11ee-99f8-38fc98f3447b',
  'a4ebdde1-e2db-11ee-b148-38fc98f3447b',
  'a4ebdde2-e2db-11ee-abb6-38fc98f3447b',
  'a4ebdde3-e2db-11ee-b9d3-38fc98f3447b',
  'a4ebdde4-e2db-11ee-a615-38fc98f3447b',
  'a4ebdde5-e2db-11ee-9415-38fc98f3447b',
  'a4ebdde6-e2db-11ee-8b06-38fc98f3447b',
  'a4ebdde7-e2db-11ee-bcb2-38fc98f3447b',
  'a4ebdde8-e2db-11ee-bef6-38fc98f3447b',
  'a4ebdde9-e2db-11ee-a9f4-38fc98f3447b',
  'a4ebddea-e2db-11ee-80d9-38fc98f3447b',
  'a4ebddeb-e2db-11ee-9102-38fc98f3447b',
  'a4ebddec-e2db-11ee-828e-38fc98f3447b',
  'a4ebdded-e2db-11ee-a630-38fc98f3447b',
  'a4ebddee-e2db-11ee-9210-38fc98f3447b',
  'a4ebddef-e2db-11ee-a930-38fc98f3447b',
  'a4ebddf0-e2db-11ee-b353-38fc98f3447b',
  'a4ebddf1-e2db-11ee-8ba3-38fc98f3447b',
  'a4ebddf2-e2db-11ee-aec7-38fc98f3447b',
  'a4ebddf3-e2db-11ee-b310-

# Retriever
This piece is the search engine, and will look for data similar to the user query in our database. We can select how many references we want to extract.

In [30]:
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

We could also add filters to the retriever

In [None]:
retriever = vectordb.as_retriever(search_kwargs ={"filter":{"field":"Theory"},"k":1})

# QA chain
Finally, we will put all of this together to create a QA chain. It will take the users query and return an answer using the provided knowledge

In [31]:
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# create the chain to answer questions 
qa_chain = RetrievalQA.from_chain_type(llm=model, 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

#try qa with sources
qa_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=model, 
    retriever=retriever, 
    return_source_documents=True
)

In [32]:
# full example
query = "Why is ethics important in AI?"
llm_response = qa_chain(query)
llm_response

  warn_deprecated(


{'query': 'Why is ethics important in AI?',
 'result': 'Ethics is important in AI to ensure that the development and deployment of AI technologies are done in a responsible and considerate manner. It helps in challenging the status quo, identifying deficits and blind spots, and ensuring that AI systems are transparent, fair, and accountable. Ethical principles also guide companies in protecting privacy, avoiding harm, and complying with regulations, ultimately contributing to the well-being of employees, customers, partners, and communities affected by AI technologies.',
 'source_documents': [Document(page_content='27Why Ethics?   in technology, data science, AI…', metadata={'field': 'Theory', 'link': 'https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS0%20Introduction%20Data%20Science%20in%20Context.pdf', 'page': 26, 'source': '.\\documents\\EDS0 Introduction Data Science in Context.pdf'}),
  Document(page_content='30In an ideal world, our ethical beliefs shape law 

In [33]:
# full example
query = "What do elephants eat?"
llm_response = qa_sources(query)
llm_response

{'question': 'What do elephants eat?',
 'answer': "I don't know.\n",
 'sources': '',
 'source_documents': [Document(page_content='2', metadata={'field': 'Theory', 'link': 'https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS2%20Legitimacy%2C%20values%20and%20decisions.pdf', 'page': 1, 'source': '.\\documents\\EDS2 Legitimacy, values and decisions.pdf'}),
  Document(page_content='The human factor', metadata={'field': 'Theory', 'link': 'https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS4%20Bias%20and%20Fairness%20I.pdf', 'page': 64, 'source': '.\\documents\\EDS4 Bias and Fairness I.pdf'}),
  Document(page_content='36', metadata={'field': 'Theory', 'link': 'https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS0%20Introduction%20Data%20Science%20in%20Context.pdf', 'page': 35, 'source': '.\\documents\\EDS0 Introduction Data Science in Context.pdf'})]}

## Custom Prompt Retrievers
In some cases, we would like to modify the prompt of the QA chain. We will modify the prompt so it answers even though there is no provided context.

In [41]:
# Build prompt
from langchain.prompts import PromptTemplate
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
template = """
Answer the question based on the information from the context. If there's no information in the context, answer the question, but you must notify that the information is not in the documentation. Furthermore mark 'yes' or 'no' between the tags (at the end) depending on wether there is or not information.
Context: {context}
Question: {question}
Answer:<answer>

<INFO>yes/no<INFO>
"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

# Run chain
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(model,
                                       verbose=False,
                                       # retriever=vectordb.as_retriever(),
                                       retriever=retriever,
                                       return_source_documents=True,
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})

We will use regular expressions to process our final output

In [49]:
import re

def extract_tag(text):
    """
    Extracts if there is INFO/NOINFO tag
    """
    # Regular expression pattern to find text between <Answer> and </Answer>
    pattern = "<INFO>(.*?)<INFO>"  
    # Use re.findall to find all occurrences that match the pattern
    matches = re.findall(pattern, text) 
    return matches[0].lower()


def remove_last_line(text):
    # Split the string into a list of lines
    lines = text.split('\n')
    # Remove the last line
    lines = lines[:-1]
    # Join the list back into a string
    modified_string = '\n'.join(lines)
    return(modified_string)


## Cite sources
def process_llm_response(llm_response):
    text = llm_response['result']
    try:
        # Assuming extract_tag and remove_last_line are defined elsewhere
        info = extract_tag(text)
        text = remove_last_line(text)
    except Exception as e:
        info = 'no'
    if info == 'yes':
        text += '\n\nFurther information in:'
    else:
        text += '\n\nMay be useful:'

    # Use a dictionary to group pages by document
    documents = {}
    for source in llm_response["source_documents"]:
        source_name = source.metadata['source']
        page = source.metadata['page']
        link = source.metadata['link']
        if source_name not in documents:
            documents[source_name] = {'pages': [page], 'link': link}
        else:
            if page not in documents[source_name]['pages']:
                documents[source_name]['pages'].append(page)

        # Append the aggregated information to the text
    for source_name, info in documents.items():
        pages_text = ', '.join(str(page) for page in info['pages'])
        text += f'\n**Document:** {source_name}, **Pages** {pages_text} \n{info["link"]}'

    return text

As we can see, now the LLM will answer question outside of the scope of the documents

In [50]:
resp = qa_chain.invoke("Why is ethics important in AI?")
print(process_llm_response(resp))

Ethics is important in AI to challenge the status quo, identify deficits and blind spots, gather insights and perspectives from various stakeholders, ensure transparency, fairness, accountability, privacy, and prevent harm caused by AI technologies. It also helps companies comply with existing and anticipated regulations governing AI and industry best practices. 


Further information in:
**Document:** .\documents\EDS0 Introduction Data Science in Context.pdf, **Pages** 26 
https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS0%20Introduction%20Data%20Science%20in%20Context.pdf
**Document:** .\documents\EDS1 Ethical Foundations.pdf, **Pages** 29 
https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS1%20Ethical%20Foundations.pdf
**Document:** .\documents\EDS2 Legitimacy, values and decisions.pdf, **Pages** 8 
https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS2%20Legitimacy%2C%20values%20and%20decisions.pdf


In [51]:
resp = qa_chain.invoke("What do elephants eat?")
print(process_llm_response(resp))

The information about what elephants eat is not provided in the context. 


May be useful:
**Document:** .\documents\EDS2 Legitimacy, values and decisions.pdf, **Pages** 1 
https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS2%20Legitimacy%2C%20values%20and%20decisions.pdf
**Document:** .\documents\EDS4 Bias and Fairness I.pdf, **Pages** 64 
https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS4%20Bias%20and%20Fairness%20I.pdf
**Document:** .\documents\EDS0 Introduction Data Science in Context.pdf, **Pages** 35 
https://github.com/arturofredes/Ethical_AI_RAG/blob/main/documents/EDS0%20Introduction%20Data%20Science%20in%20Context.pdf


# Chat history
In conversation it may be important to leverage previous user messages to get a better retrieval. First, we will see an example on how to create a conversational chatbo using langchain, and the next step would be to create an agent that synthesises all previous interactions into a question

In [12]:
from langchain.prompts import ChatMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain.schema import HumanMessage, AIMessage
from langchain.chains.conversation.memory import ConversationSummaryMemory
from langchain.chains import LLMChain

prompt = ChatPromptTemplate.from_messages([
("system", "You are a helpful assistant"),
MessagesPlaceholder(variable_name='chat_history'),
("human","{question}"),
])

chat_chain = LLMChain(llm=model, prompt=prompt)
chat_hist=[]
user_input = ''

while user_input != 'exit':
    user_input = input()
    print(user_input)
    response = chat_chain.invoke({'question':user_input, "chat_history":chat_hist})
    print(response['text'])
    chat_hist.append(HumanMessage(user_input))
    chat_hist.append(AIMessage(response['text']))


Hola me llamo arturo
¡Hola, Arturo! ¿Cómo puedo asistirte hoy?
Me gusta la escalada y me gustaria que me dieras consejos para mejorar
¡Genial, Arturo! Me alegra que te guste la escalada. Aquí te dejo algunos consejos que podrían ayudarte a mejorar:

1. **Fortalecimiento de la fuerza de agarre**: Tu capacidad para sostener y mover tu cuerpo depende en gran medida de la fuerza de tu agarre. Considera ejercicios de fortalecimiento de la mano y el antebrazo.

2. **Mejora tu técnica de pies**: Muchas personas se centran en sus manos y brazos cuando escalan, pero tus pies también juegan un papel importante. Trabaja en la colocación precisa de los pies y el equilibrio.

3. **Acondicionamiento físico general**: La escalada es un deporte de todo el cuerpo que requiere más que solo fuerza en la parte superior del cuerpo. El entrenamiento de la fuerza central, la flexibilidad y el acondicionamiento cardiovascular pueden ser de gran ayuda.

4. **Practica la visualización**: Antes de hacer un movim

Now, we would like to apply this to our RAG system. For that, I will ask the LLM to reformulate the question given a list of messages.

In [52]:
from langchain.prompts import ChatMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain.schema import HumanMessage, AIMessage
from langchain.chains.conversation.memory import ConversationSummaryMemory
from langchain.chains import LLMChain

reformulating_prompt = ChatPromptTemplate.from_messages([
("system", "As an assistant within a Retrieval-Augmented Generation (RAG) system, your role is to interpret the conversation with the user and formulate it into a succinct question. This question should accurately capture the user's intent, leveraging specific keywords to ensure that the system's response aligns closely with what the user is seeking. It's crucial to maintain a high level of semantic similarity between the user's request and your question to the system. This approach helps in retrieving the most relevant information or answer from the database, enhancing the user experience."),
MessagesPlaceholder(variable_name='chat_history'),
("human","{question}"),
])

chat_chain = LLMChain(llm=model, prompt=reformulating_prompt)
chat_hist=[]
user_input = ''

while user_input != 'exit':
    user_input = input()
    chat_hist.append(HumanMessage(user_input))
    print(user_input)
    refor = chat_chain.invoke({'question':user_input, "chat_history":chat_hist})
    print(refor['text'])
    resp = qa_chain.invoke(refor['text'])['result']
    print(resp)
    chat_hist.append(AIMessage(resp))


What is utilitarism?
Could you please provide an overview of utilitarianism and its key principles?
Utilitarianism is a theory that focuses on whether an action maximizes happiness and well-being for all affected individuals. The key principle of utilitarianism is to aim for the greatest good for the greatest number of people. Utilitarian calculus allows for the possibility of sacrificing some individuals for the greater good, as long as it benefits society as a whole.
How can it be applied to ethics in AI?
To what extent can utilitarianism be utilized in ethical considerations within the field of artificial intelligence (AI)?
Utilitarianism can be utilized in ethical considerations within the field of artificial intelligence (AI) to the extent that it focuses on maximizing overall well-being and minimizing harm for the greatest number of people. 

<INFO>yes<INFO>
exit
If you have any more questions in the future, feel free to ask. Goodbye!
There is no information provided in the conte

# New splitting and retrieving strategies

## Parent document retriever
- It is possible that inside one of the chunks we talk about different things. This results in more generic embeddings and can cause problems when looking for very specific piece of information.
- On the other hand, if we split the text into chunks too small, the LLM will not have enough context to answer the question.

One way of going around this is using Parent document retriever. This allows us to use small chunks sucha as sentences for embeddings (getting better accuracy), while returning a bigger window of context to which that sentence belongs (getting a better answer.)

In [1]:
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever

## Text Splitting & Docloader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.document_loaders import PyPDFLoader

In [14]:
from pathlib import Path
def extract_folder(doc):
    path = Path(doc.metadata['source'])
    return path.parent.name

In [2]:
from pathlib import Path
def extract_folder(doc):
    path = Path(doc.metadata['source'])
    return path.parent.name
#load multiple documents
loaders = [
PyPDFLoader(".\documents\desarrollos\Documento funcional icecream 2023 v0.3.pdf"),
PyPDFLoader(".\documents\desarrollos\Derivación de pedidos.pdf")
]
docs = []
for l in loaders:
    docs.extend(l.load_and_split())

#añadir metadata departamento
for doc in docs:
    dep = extract_folder(doc)
    doc.metadata['departamento'] = dep
docs

In [26]:
#Creating the databases and retriever

from langchain.storage._lc_store import create_kv_docstore
from langchain.storage.file_system import LocalFileStore
fs = LocalFileStore("./docdb_edelvives")
store = create_kv_docstore(fs)
#parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

vectorstore = Chroma(collection_name="split_parents", embedding_function=embeddings, persist_directory="db_edelvives")
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    #parent_splitter=parent_splitter,
)
retriever.add_documents(docs, ids=None)

In [37]:
retrieved_docs = retriever.get_relevant_documents("Como funciona el envío y compra?")
retrieved_docs[0].page_content

IndexError: list index out of range

In [31]:
vectorstore = None

In [35]:
# load the retriever
persist_directory = 'db_edelvives'
vectorstore = Chroma(persist_directory=persist_directory, 
                embedding_function=embeddings)



In [36]:
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    #parent_splitter=parent_splitter,
)

## SelfQueryingRetrieval
Did not work very well

In [12]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

metadata_field_info = [
    AttributeInfo(
        name="source",
        description="Nombre del documento original",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="page",
        description="página",
        type="integer",
    ),    
    AttributeInfo(
        name="area",
        description="departamento/area de la empresa que usa este documento",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="link",
        description="link al documento",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="descrpcion",
        description="Descripcion del documento original",
        type="string or list[string]",
    ),
]
document_content_description = "Chunk of text from a document"

In [13]:
retriever = SelfQueryRetriever.from_llm(
    model,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [14]:
# This example only specifies a relevant query
retriever.get_relevant_documents("Quiero documentos del area de desarrollo")

[]

## Compressor