In [5]:
# Show the conda environments known to this machine (see the listing in the cell output).
%conda env list
# Disabled on purpose: conda activate docs-chatbot

# conda environments:
#
default               *  /home/studio-lab-user/.conda/envs/default
docs-chatbot             /home/studio-lab-user/.conda/envs/docs-chatbot
studiolab                /home/studio-lab-user/.conda/envs/studiolab
studiolab-safemode       /opt/amazon/sagemaker/safemode-home/.conda/envs/studiolab-safemode
base                     /opt/conda


Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install langchain openai unstructured pdf2image pinecone-client tiktoken

Note: you may need to restart the kernel to use updated packages.


## Imports and initial setup

In [23]:
import os
from dotenv import load_dotenv, find_dotenv

# Langchain and openai
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader

In [24]:
# Locate the project's .env file, then load its entries
# (the API keys read via os.environ further down come from here).
dotenv_path = find_dotenv()
load_dotenv(dotenv_path=dotenv_path)

True

## Load data

In [25]:
# Load every Markdown file below docs/ (recursive glob) into a list of documents.
loader = DirectoryLoader('docs/', glob="**/*.md", show_progress=True, use_multithreading=True)
docs = loader.load()

# Sanity check: how many documents were loaded, and how large one sample is.
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(docs)} document(s) in your data')
if docs:
    # Index 30 is only a spot check; clamp it so a smaller corpus does not raise IndexError.
    sample_index = min(30, len(docs) - 1)
    print (f'There are {len(docs[sample_index].page_content)} characters in your document')

100%|██████████| 75/75 [00:04<00:00, 15.82it/s]

You have 75 document(s) in your data
There are 310 characters in your document





In [37]:
# Break the loaded documents into smaller chunks (chunk_size=2000, no overlap).
# With PyPDFLoader the input is already split per page, so this would be a
# second split; it is optional, so try it on your own data.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2000,
)
text_chunks = text_splitter.split_documents(docs)

In [38]:
# Report how many chunks the splitter produced.
chunk_count = len(text_chunks)
print(f'Now you have {chunk_count} documents')

Now you have 131 documents


In [43]:
# Build one metadata dict per chunk, index-aligned with text_chunks:
#   "source" - file name of the originating document without its extension
#   "path"   - the source path exactly as recorded in the chunk's metadata
# os.path.basename/splitext replace split(os.sep)[-1].split('.')[0] so that file
# names containing extra dots (e.g. "v1.2-notes.md") keep their full stem
# instead of being cut at the first dot.
metadata_list = [
    {
        "source": os.path.splitext(os.path.basename(chunk.metadata['source']))[0],
        "path": chunk.metadata['source'],
    }
    for chunk in text_chunks
]

## Create embeddings of your documents to get ready for semantic search

In [39]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [40]:
# Embedding model used for indexing and for querying.
# The key is read from the OPENAI_APIKEY environment variable (None if unset).
embeddings = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_APIKEY"),
)

In [20]:
# Name of the Pinecone index to use: put the name of your own index here.
index_name = "platon2"

# Connect the Pinecone client. Both values come from the environment:
#   PINECONE_API_KEY - find at app.pinecone.io
#   PINECONE_API_ENV - shown next to the API key in the console
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_API_ENV"),
)

In [48]:
# Switch: True uploads the chunk texts and their metadata to the index,
# False attaches to the index as it already exists.
reIndex = False

docsearch = (
    Pinecone.from_texts(
        [chunk.page_content for chunk in text_chunks],
        embeddings,
        metadatas=metadata_list,
        index_name=index_name,
    )
    if reIndex
    else Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)
)

In [51]:
# Run a sample similarity search against the vector store.
query = "Hvordan sette opp logging til humio fra paas?"
pages = docsearch.similarity_search(query)

# Show the first returned document; an empty result list must not raise IndexError.
if pages:
    print(pages[0].page_content)
else:
    print("No matching documents found for the query.")

Felles logging

Humio is a Log Analytics plattform delivered by Cyber Security Center for Research and Education (eduCSC).

Support Requests

Send an email to kontakt@sikt.no for any request regarding users, repositories, views, etc in Humio.

Web Console

Humio Web Console

Our Humio instance is available for all Sikt employees that need to analyse logs for their servies. It uses Feide for login. A user must be created for you before you can log in.

Humio Documentation

Some of the main concepts in Humio is described in this section. See the official docs for more in-depth information.

Repository

A Repository is a way of organizing data storage in Humio. Within Humio, each repository has its own set of users, dashboards, saved queries, and parsers.

The Sandbox Repository

In Humio there is by default a Sandbox repository for testing data, executing queries, and testing new features of the Humio software.

Views

There may be a situation in which you want a subset of a repository. 

## Query Platon documentation

In [52]:
# Completion model used by the chains below; key read from OPENAI_APIKEY.
llm = OpenAI(
    openai_api_key=os.getenv("OPENAI_APIKEY"),
)

In [53]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [54]:
# Question-answering chain of type "stuff", driven by `llm`.
chain = load_qa_chain(
    llm,
    chain_type="stuff",
)

In [55]:
def queryPlaton(query):
    """Answer `query` from the indexed documentation.

    Looks up documents similar to `query` in the global `docsearch` store and
    passes them, together with the question, to the global `chain`.

    Args:
        query: The question text.

    Returns:
        Whatever `chain.run` returns for those documents and that question.
    """
    matches = docsearch.similarity_search(query)
    answer = chain.run(input_documents=matches, question=query)
    return answer

In [56]:
# Try the helper with one sample question; the bare `r` displays the answer.
sample_question = "Tilbyr Platon noen sikkerhetstjenester?"
r = queryPlaton(sample_question)
r

' Ja, Platon tilbyr flere sikkerhetstjenester, inkludert container scanning i Gitlab CI pipeline, kartlegging av avhengigheter (Dependency Track) og lagring av hemmeligheter og nøkler på en sikker og pålitelig måte med Platons Vault cluster.'

In [57]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

def chatbot_interactive(user_input):
    """Answer `user_input` with queryPlaton and wrap the answer in an HTML widget.

    Args:
        user_input: The question text typed by the user.

    Returns:
        A `widgets.HTML` instance whose value is the answer inside a bordered <div>.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    response = queryPlaton(user_input)
    # The answer is model-generated text: escape it so characters such as
    # "<" or "&" are shown literally instead of being interpreted as markup.
    # The wrapper is closed with </div> (it was previously closed with </textarea>).
    return widgets.HTML(
        value='<div style="padding: 2em; margin: 3em 1em; border: 1px solid #aaa;">'
              + escape(str(response))
              + '</div>'
    )

# Render the question box plus a button that runs chatbot_interactive on click.
# `manual_name` (the button label) is an interact option, so it is set via
# .options(); passed as a plain keyword it is treated as an abbreviation for a
# function argument and never reaches the button.
interact_manual.options(manual_name="YAY")(
    chatbot_interactive,
    user_input=widgets.Textarea(value='Hva tilbyr Platon av backupløsninger?', rows=5, description='Spørsmål:',
                                layout=widgets.Layout(width='90%')),
)

interactive(children=(Textarea(value='Hva tilbyr Platon av backupløsninger?', description='Spørsmål:', layout=…

<function __main__.chatbot_interactive(user_input)>

## Misc

In [None]:
# Sample free-form prompt for the plain LLM.
text = (
    "What would be a good company name for a company that makes colorful socks?"
)
# Uncomment to send it to the model:
# print(llm(text))

In [None]:
# Prompt template with a single placeholder, {product}.
prompt = PromptTemplate(
    template="What is a good name for a company that makes {product}?",
    input_variables=["product"],
)

In [None]:
#print(prompt.format(product="colorful socks"))

In [None]:
# Use a dedicated name here: queryPlaton() reads the global `chain` (the QA
# chain) every time it is called, so rebinding `chain` to this LLMChain would
# break the chatbot widget after a full run of the notebook.
# Try it with: company_name_chain.run("colorful socks")
company_name_chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
#chain.run("colorful socks")

## Memory: Add State to Chains and Agents

In [None]:
from langchain import OpenAI, ConversationChain

# Conversation chain on top of `llm`, created with verbose=True.
conversation = ConversationChain(verbose=True, llm=llm)

# Ask one question and print the reply.
question = "Can you help me with calculating 2 + 2?"
output = conversation.predict(input=question)
print(output)

## Chat model

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

# Chat model with temperature 0; key read from OPENAI_APIKEY.
chat = ChatOpenAI(
    temperature=0,
    openai_api_key=os.getenv("OPENAI_APIKEY"),
)

In [None]:
# Single-turn request: one human message, no system message.
single_turn = [
    HumanMessage(content="Translate this sentence from English to French. I love programming."),
]
chat(single_turn)
# -> AIMessage(content="J'aime programmer.", additional_kwargs={})

In [None]:
# Same request, with the instruction moved into a system message.
system_msg = SystemMessage(content="You are a helpful assistant that translates English to French.")
human_msg = HumanMessage(content="I love programming.")
messages = [system_msg, human_msg]
chat(messages)
# -> AIMessage(content="J'aime programmer.", additional_kwargs={})

In [None]:
# Two independent conversations sent in one generate() call; each gets its own
# system message followed by the sentence to translate.
batch_messages = [
    [
        SystemMessage(content="You are a helpful assistant that translates English to French."),
        HumanMessage(content=sentence),
    ]
    for sentence in ("I love programming.", "I love artificial intelligence.")
]
result = chat.generate(batch_messages)
result
# -> LLMResult(generations=[[ChatGeneration(text="J'aime programmer.", generation_info=None, message=AIMessage(content="J'aime programmer.", additional_kwargs={}))], [ChatGeneration(text="J'aime l'intelligence artificielle.", generation_info=None, message=AIMessage(content="J'aime l'intelligence artificielle.", additional_kwargs={}))]], llm_output={'token_usage': {'prompt_tokens': 57, 'completion_tokens': 20, 'total_tokens': 77}})