In [None]:
!pip install langchain openai tiktoken chromadb

In [None]:
import os

# Resolve the OpenAI API key without ever hard-coding it in the notebook.
try:
    # On Google Colab the key is read from the notebook's secret store.
    from google.colab import userdata
except ImportError:
    # Not running on Colab: fall back to the process environment.
    # The value is None when the variable is unset.
    OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
else:
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import (SystemMessage, HumanMessage, AIMessage)
from langchain.schema import document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone, Chroma
from langchain.chains import ConversationalRetrievalChain, RetrievalQAWithSourcesChain, RetrievalQA, OpenAIModerationChain, SequentialChain, LLMChain, SimpleSequentialChain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

In [None]:
from langchain.document_loaders import CSVLoader, PyPDFLoader, PyPDFDirectoryLoader, DirectoryLoader, UnstructuredFileLoader

In [None]:
# Chat model shared by the rest of the notebook.
# temperature=0 is used to minimise sampling randomness in the answers.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
    model_name='gpt-3.5-turbo',
)

Document loading

In [None]:
!pip install unstructured unstructured[pdf]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

def rag(directory: str, chunk_size: int = 200, chunk_overlap: int = 0, k: int = 20) -> "ConversationalRetrievalChain":
  """Build a conversational retrieval (RAG) chatbot over the files in a directory.

  Steps: load every file found under ``directory``, split the documents into
  token-sized chunks, embed the chunks with OpenAI embeddings, index them in a
  Chroma vector store, and wrap the resulting retriever together with the
  notebook-level ``llm`` in a ``ConversationalRetrievalChain`` that keeps the
  chat history in a ``ConversationBufferMemory``.

  Relies on the notebook-level globals ``llm`` and ``OPENAI_API_KEY``.

  Args:
    directory: Path of the directory whose files are loaded.
    chunk_size: Maximum length of each chunk, measured in ``cl100k_base`` tokens.
    chunk_overlap: Number of tokens shared between consecutive chunks.
    k: Number of chunks the retriever returns for each question.

  Returns:
    The configured chain. Call it with ``{"question": ...}``; the reply is
    stored under the ``'answer'`` key.

  Raises:
    ValueError: If ``chunk_size``, ``chunk_overlap`` or ``k`` is out of range,
      or if the directory yields no documents / no chunks to index.
  """
  import uuid  # local import: only needed to name the vector-store collection

  # Validate up front so a bad argument fails before any loading or embedding cost.
  if chunk_size < 1:
    raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
  if not 0 <= chunk_overlap < chunk_size:
    raise ValueError(
        f"chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, got {chunk_overlap}"
    )
  if k < 1:
    raise ValueError(f"k must be >= 1, got {k}")

  loader = DirectoryLoader(directory, loader_cls=UnstructuredFileLoader)

  input_docs = loader.load()
  print(f"{len(input_docs)} documents loaded")
  if not input_docs:
    raise ValueError(f"No documents could be loaded from {directory!r}")

  # Token counter used by the splitter
  tokenizer = tiktoken.get_encoding('cl100k_base')

  def length_function(text: str) -> int:
    return len(tokenizer.encode(text))

  # Define the splitter
  text_splitter = RecursiveCharacterTextSplitter(
      length_function=length_function,  # measure length in tokens rather than characters
      chunk_size=chunk_size,  # maximum number of units (tokens) in each chunk
      chunk_overlap=chunk_overlap,
  )

  # Split documents in chunks
  doc_chunks = text_splitter.split_documents(input_docs)
  print("Input Data - Now you have {0} number of chunks.".format(len(doc_chunks)))
  if not doc_chunks:
    raise ValueError(f"The documents in {directory!r} produced no text chunks to index")

  # Define embedding model and vector DB
  embed_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

  # A unique collection name per call keeps repeated runs of this function in
  # the same kernel from sharing (and appending to) one default collection.
  # NOTE(review): whether the default collection is shared across calls depends
  # on the installed chromadb version; the unique name is harmless either way.
  vDB = Chroma.from_documents(
      doc_chunks,
      embed_model,
      collection_name=f"rag-{uuid.uuid4().hex}",
  )

  # NOTE(review): with the defaults, up to k * chunk_size = 4000 tokens of
  # retrieved text may be passed to the model per question — confirm the
  # chosen model's context window can hold that plus the history and the answer.
  retriever = vDB.as_retriever(search_kwargs={'k': k})

  # Define memory; input/output keys tell it which chain fields form a turn
  memory = ConversationBufferMemory(
      memory_key="chat_history",
      return_messages=True,
      input_key='question',
      output_key='answer',
  )

  chatbot = ConversationalRetrievalChain.from_llm(
      llm=llm,
      retriever=retriever,
      memory=memory,
  )

  return chatbot

In [None]:
# Build the chatbot over every file found in the documents folder.
documents_dir = '/content/documents'
chatbot = rag(documents_dir)

In [None]:
# First turn of the conversation: ask the chain and show its reply.
question = 'A quanto ammontano le risorse finanziarie complessivamente messe a disposizione dalla Camera di Commercio?'
response = chatbot({"question": question})
answer = response['answer']

print("Question: " + question)
print("Answer: " + answer)
# NOTE(review): a 'sources' entry is not printed here — the chain built by rag()
# is not configured to return source documents; confirm before re-enabling.

# Keep a handle on the conversation history returned alongside the answer.
chat_history = response['chat_history']

In [None]:
# Follow-up turn on the same chatbot; return_only_outputs=True asks the chain
# to return just its output fields rather than echoing the inputs as well.
question = """Quali sono i requisiti per partecipare al bando?"""
response = chatbot({"question": question}, return_only_outputs=True)
answer = response['answer']

print("Question: " + question)
print("Answer: " + answer)