Initialization of variables

In [None]:
!pip install openai

# Webbased Loader Only

In [None]:
!pip install llama_index

In [None]:
from llama_index import GPTVectorStoreIndex, download_loader

# 'Where can I send draft application to research data management for comments?'
# 'Where can I find examples of data management plans?'

input_text = 'Where can I find examples of data management plans?'

SimpleWebPageReader = download_loader("SimpleWebPageReader") # convert static html to text https://llamahub.ai/l/web-simple_web

loader = SimpleWebPageReader()
documents = loader.load_data(urls=['https://www.forschungsdaten.uni-bonn.de/en/services/dmps', 'https://uni-tuebingen.de/en/research/research-infrastructure/research-data-management-rdm/'])
index = GPTVectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
response = query_engine.query(input_text)
print(response)

# https://python.langchain.com/docs/use_cases/question_answering/

# WebBaseLoader and Retriever

In [None]:
!pip install langchain
!pip install chromadb

Find the most relevant documents for the query (using vector store)

In [15]:
# Workflow from https://python.langchain.com/docs/use_cases/question_answering/

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import SVMRetriever


# Loading
#loader = WebBaseLoader("https://www.forschungsdaten.uni-bonn.de/en/services/dmps")
loader = WebBaseLoader(["https://www.forschungsdaten.uni-bonn.de/en/services/dmps", "https://uni-tuebingen.de/en/research/research-infrastructure/research-data-management-rdm/"])

data = loader.load()

# Splitting
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)
all_splits = text_splitter.split_documents(data)

# Embedding
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

# Similarity search
# https://python.langchain.com/docs/integrations/vectorstores/chroma#basic-example

#question = "Which contact person?"
#question = "Where can I find examples of data management plans?"
question = "What is research data management?"
docs = vectorstore.similarity_search(question)
#docs = vectorstore.similarity_search_with_relevance_scores(question)

print(docs[0])
print(docs[0].page_content)

page_content="Research data management (RDM) is an essential prerequisite for the digital preservation, reusability and archiving of scientific data. In line with the recommendations of the German Rectors' Conference, the Research Data Alliance and the E-Science Strategy of the state of Baden-Württemberg, the University of Tübingen supports responsible and sustainable handling of research data." metadata={'description': 'Research data management (RDM) is an essential prerequisite for the digital preservation, reusability and archiving of scientific data.', 'language': 'en-GB', 'source': 'https://uni-tuebingen.de/en/research/research-infrastructure/research-data-management-rdm/', 'title': 'Research Data Management (RDM) | University of Tübingen'}
Research data management (RDM) is an essential prerequisite for the digital preservation, reusability and archiving of scientific data. In line with the recommendations of the German Rectors' Conference, the Research Data Alliance and the E-Sci

Using a retriever instead of a vector store

In [18]:
# https://python.langchain.com/docs/integrations/retrievers/svm
svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
docs_svm=svm_retriever.get_relevant_documents(question)
#print(docs_svm)

for doc in docs_svm:
  print(doc)
  print(doc.metadata)
len(docs_svm)

page_content="Research data management (RDM) is an essential prerequisite for the digital preservation, reusability and archiving of scientific data. In line with the recommendations of the German Rectors' Conference, the Research Data Alliance and the E-Science Strategy of the state of Baden-Württemberg, the University of Tübingen supports responsible and sustainable handling of research data." metadata={'source': 'https://uni-tuebingen.de/en/research/research-infrastructure/research-data-management-rdm/', 'title': 'Research Data Management (RDM) | University of Tübingen', 'description': 'Research data management (RDM) is an essential prerequisite for the digital preservation, reusability and archiving of scientific data.', 'language': 'en-GB'}
{'source': 'https://uni-tuebingen.de/en/research/research-infrastructure/research-data-management-rdm/', 'title': 'Research Data Management (RDM) | University of Tübingen', 'description': 'Research data management (RDM) is an essential prerequi

4

In [None]:
!pip install langchain
!pip install langchainhub

Formulate an answer based on the most relevant documents for the query

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain import hub

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
rag_prompt = hub.pull("rlm/rag-prompt")

qa_chain = RetrievalQA.from_chain_type(
    llm,
    #retriever=vectorstore.as_retriever(),
    retriever=svm_retriever,
    chain_type_kwargs={"prompt": rag_prompt},
     return_source_documents=True
)

#question = "What is research data management?"
#question = "Which contact person?"
#question = "Where can I send draft application to research data management for comments?"
#question = "What is the aim of the national research data infrastructure?"
question = "What are useful examples of DMPs?"

result = qa_chain({"query": question})
print(result["result"])
print(result['source_documents'])

Useful examples of DMPs include liber-dmp and dmp-examples on Zenodo, DMPs of the Rio Journal, DMPs of the LIBER Research Data Management Working Group, DMPs from DMP Tool, DMPs of the DCC, DMPs from DMP online, and DMPs at McGill University.
[Document(page_content='Example DMPs\n\n\n\n\nDMPs on Zenodo : liber-dmp and dmp-examples\nDMPs of the Rio Journal\nDMPs of the LIBER Research Data Management Working Group\nHints and examples at the HU Berlin\nDMPs from DMP Tool\nDMPs of the DCC\nDMPS from DMP online\nDMPs at the McGill University\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDiscipline-specific resources:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHumanities', metadata={'source': 'https://www.forschungsdaten.uni-bonn.de/en/services/dmps', 'title': 'DMPs and RDM in grant applications — Research Data Service Center', 'description': 'Our DMP-Service supports you in creating a data management plan (DMP) for your project or application for third party funding.', 'language': 'en'}), Document(page_co

Pick a prompt so that the answer returned is in German

In [None]:
!pip install langchain
!pip install langchainhub
!pip install pip install tiktoken

In [14]:
# Adapted from https://python.langchain.com/docs/use_cases/question_answering/vector_db_qa#custom-prompts
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain import hub
from langchain.retrievers import SVMRetriever
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate

# Load the data

loader = WebBaseLoader(["https://www.forschungsdaten.uni-bonn.de/en/services/dmps", "https://uni-tuebingen.de/en/research/research-infrastructure/research-data-management-rdm/"])

data = loader.load()

# Splitting
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)
all_splits = text_splitter.split_documents(data)

# initialize the retriever
svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())

# Specify the prompt
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
rag_prompt = hub.pull("rlm/rag-prompt")

prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer in German:""" # in English then in German
en_to_german_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)


qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=svm_retriever,
    chain_type_kwargs={"prompt": rag_prompt},
    #chain_type_kwargs={"prompt": en_to_german_prompt},
     return_source_documents=True
)

#question = "What is research data management?"
#question = "Which contact person?"
#question = "Where can I send draft application to research data management for comments?"
#question = "What is the aim of the national research data infrastructure?"
question = "What are useful examples of DMPs?"

result = qa_chain({"query": question})
# Print the question
print(result["result"])
# Print the source documents where the answer was found
print(result['source_documents'])

Useful examples of DMPs include liber-dmp and dmp-examples on Zenodo, DMPs of the Rio Journal, DMPs of the LIBER Research Data Management Working Group, DMPs from DMP Tool, DMPs of the DCC, DMPs from DMP online, and DMPs at McGill University.
[Document(page_content='Example DMPs\n\n\n\n\nDMPs on Zenodo : liber-dmp and dmp-examples\nDMPs of the Rio Journal\nDMPs of the LIBER Research Data Management Working Group\nHints and examples at the HU Berlin\nDMPs from DMP Tool\nDMPs of the DCC\nDMPS from DMP online\nDMPs at the McGill University\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDiscipline-specific resources:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHumanities', metadata={'source': 'https://www.forschungsdaten.uni-bonn.de/en/services/dmps', 'title': 'DMPs and RDM in grant applications — Research Data Service Center', 'description': 'Our DMP-Service supports you in creating a data management plan (DMP) for your project or application for third party funding.', 'language': 'en'}), Document(page_co

# BeautifulSoup for Website Parsing

In [19]:
!pip install beautifulsoup4



In [None]:
import nltk
from urllib.request import urlopen
from bs4 import BeautifulSoup


#url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
url = "https://www.forschungsdaten.uni-bonn.de/en/services/dmps"
html = urlopen(url).read()
#raw = nltk.clean_html(html)
raw = BeautifulSoup(html).get_text() # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text

print(raw)