In [83]:
%conda env list
# To work in the project environment, run `conda activate docs-chatbot` in a
# terminal; the command is left commented out and is not executed by this cell.

# conda environments:
#
default               *  /home/studio-lab-user/.conda/envs/default
docs-chatbot             /home/studio-lab-user/.conda/envs/docs-chatbot
studiolab                /home/studio-lab-user/.conda/envs/studiolab
studiolab-safemode       /opt/amazon/sagemaker/safemode-home/.conda/envs/studiolab-safemode
base                     /opt/conda


Note: you may need to restart the kernel to use updated packages.


In [84]:
%pip install langchain openai unstructured pdf2image pinecone-client tiktoken pypandoc ipywidgets

Note: you may need to restart the kernel to use updated packages.


## Imports and initial setup

In [86]:
import os
from dotenv import load_dotenv, find_dotenv

# Langchain and openai
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader

In [87]:
# Locate the .env file, then load the variables it defines into the environment
dotenv_path = find_dotenv()
load_dotenv(dotenv_path=dotenv_path)

True

## Load data

In [98]:
# Convert from rst to md
import os
import pypandoc

# When True, every .rst file under docs_path is converted to a sibling .md file;
# when False this cell does nothing.
convertToMarkdown = False


if convertToMarkdown:
    docs_path = 'feide-docs-master/'
    # Walk the whole tree so files in nested folders are converted as well
    for root, dirs, files in os.walk(docs_path):
        for file in files:
            if not file.endswith('.rst'):
                continue

            # Full path to the source .rst file
            source_file_path = os.path.join(root, file)

            # Same folder and base name as the source, with a .md extension
            output_file_path = os.path.join(root, os.path.splitext(file)[0] + '.md')

            # Convert the file
            output = pypandoc.convert_file(source_file_path, 'markdown', format='rst')

            # Write as UTF-8 explicitly: the platform default codec may not be
            # able to encode non-ASCII characters in the converted text.
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write(output)


In [89]:
# Load every Markdown file below the docs folder (the glob is recursive)
loader = DirectoryLoader('feide-docs-master/', glob="**/*.md", show_progress=True, use_multithreading=True)
docs = loader.load()

# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(docs)} document(s) in your data')

# Show the size of one sample document. The index is arbitrary, so guard it:
# indexing past the end of a smaller corpus would raise IndexError.
sample_index = 30
if len(docs) > sample_index:
    print (f'There are {len(docs[sample_index].page_content)} characters in your document')

100%|██████████| 196/196 [00:13<00:00, 14.65it/s]

You have 196 document(s) in your data
There are 540 characters in your document





In [90]:
# Note: with PyPDFLoader the input is already split per page, so this would be
# a second split. Splitting is optional; try it out on your own data.

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2000,
)
texts = text_splitter.split_documents(docs)

In [91]:
# Report how many chunks the splitter produced from the loaded documents
print (f'Now you have {len(texts)} documents')

Now you have 426 documents


## Create embeddings of your documents to get ready for semantic search

In [92]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [93]:
# The key is read from the OPENAI_APIKEY environment variable (None if unset)
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_APIKEY"))

In [94]:
# Connect the Pinecone client; both settings are read from the environment
pinecone.init(
    environment=os.getenv("PINECONE_API_ENV"),  # next to api key in console
    api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
)
index_name = "platon2"  # name of the Pinecone index used by this notebook

In [95]:
# When True, the chunks in `texts` are embedded and uploaded to the index;
# when False, the already-populated index is opened instead.
reIndex = False

if reIndex:
    # `texts` is the splitter output; each chunk supplies both the text to embed
    # and the metadata stored next to it.
    docsearch = Pinecone.from_texts(
        [t.page_content for t in texts],
        embeddings,
        metadatas=[t.metadata for t in texts],
        index_name=index_name,
    )
else:
    docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

In [96]:
def queryDoc(input):
    """Run a similarity search against the vector index and print the hits.

    Args:
        input: Free-text query passed to ``docsearch.similarity_search``.
            The name shadows the ``input`` builtin inside this function; it is
            kept so existing keyword callers continue to work.

    Returns:
        None. The text of the first returned document is printed, followed by
        the full result list. If the search returns nothing, a short notice is
        printed instead.
    """
    pages = docsearch.similarity_search(input)

    # An empty result list would make pages[0] raise IndexError
    if not pages:
        print("No matching documents found.")
        return

    # Text of the first returned document, then every returned document
    print(pages[0].page_content)
    print(pages)

In [97]:
# Exercise the similarity search on its own, before any question-answering chain is involved
queryDoc("OpenID Connect secret?")

OpenID Connect details </reference/oauth_oidc/openid_connect_details>{.interpreted-text
role="doc"}
[Document(page_content='OpenID Connect details </reference/oauth_oidc/openid_connect_details>{.interpreted-text\nrole="doc"}', metadata={}), Document(page_content='OpenID Connect and OAuth 2.0\n\n::: {.toctree maxdepth="1"}\nuserids openid_connect_details legacy_userinfo\n:::', metadata={}), Document(page_content='OpenID Connect details\n\nOpenID Connect (OIDC) is a simple standardized identity (authentication)\nlayer on top of OAuth 2.0.\n\nAfter a successful login, the user agent is in possession of an access\ntoken and an ID token <id_token>{.interpreted-text role="ref"}. The\naccess token looks the same as for plain OAuth2. The ID token is a\nsigned JSON Web Token with info about\nthe user. The information is also available from the userinfo endpoint\ndescribed in the reference doc:\n\nUserinfo Endpoint </reference/apis/userinfo>{.interpreted-text\nrole="doc"}.\n\nDiscovery and confi

## Query Platon documentation

In [99]:
# Completion model used for answering; the key comes from OPENAI_APIKEY (None if unset)
llm = OpenAI(openai_api_key=os.getenv("OPENAI_APIKEY"))

In [100]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [101]:
# Question-answering chain of type "stuff", built on the LLM created above
chain = load_qa_chain(llm=llm, chain_type="stuff")

In [102]:
def query(query):
    """Answer a question from documents retrieved out of the vector index.

    Args:
        query: The question text. It is used both to retrieve documents via
            ``docsearch.similarity_search`` and as the question given to the
            QA chain.

    Returns:
        Whatever ``chain.run`` returns for the retrieved documents and question.
    """
    question = query
    matching_docs = docsearch.similarity_search(question)
    answer = chain.run(question=question, input_documents=matching_docs)
    return answer

In [103]:
# Question (Norwegian): "I want to set up an API where Feide is used to control
# access; how do I proceed?"
r = query("Jeg ønsker å sette opp et API hvor Feide brukes for styre tilgangen, hvordan går jeg frem?")
r

' To set up an API where Feide is used to manage access, first fill out the application form to become a service provider in Feide. Then, Feide administrators must register accounts at the Feide OpenIdP. Once those steps are completed, send an email to kontakt@sikt.no with the usernames of the Feide administrators. After that, you can manage access to services through Feide, by enabling other login providers in addition to Feide, and choosing which host organizations or individual schools will have access to activate the service in the customer portal under the "Host organization" tab when editing the service.'

In [104]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

def chatbot_interactive(user_input):
    """Answer ``user_input`` and wrap the reply in a bordered HTML widget.

    Args:
        user_input: Question text forwarded to ``query``.

    Returns:
        A ``widgets.HTML`` instance whose value is the answer inside a styled
        ``<div>`` element.
    """
    import html  # local import so the function does not depend on another cell

    response = query(user_input)
    # The answer is generated text: escape it so characters such as < and &
    # are displayed literally and are not interpreted as markup.
    safe_response = html.escape(response)
    out = widgets.HTML(
        value='<div style="padding: 2em; margin: 3em 1em; border: 1px solid #aaa;">' + safe_response + '</div>'
    )
    return out

# `manual_name` is an interact option, not a widget argument. Passed as a plain
# keyword it is treated as a value for a `manual_name` parameter of
# chatbot_interactive, which does not exist, so it never reaches the button.
# Setting it through .options() labels the run button "YAY".
interact_manual.options(manual_name="YAY")(
    chatbot_interactive,
    user_input=widgets.Textarea(value='Hvordan gjør man …… i Feide?', rows=5, description='Spørsmål:',
                                layout=widgets.Layout(width='90%')),
)

interactive(children=(Textarea(value='Hvordan gjør man …… i Feide?', description='Spørsmål:', layout=Layout(wi…

<function __main__.chatbot_interactive(user_input)>