# OLM Docs FAQ/Slack Bot


## Import dependencies and credentials

In [2]:
# pip install langchain langchain-openai python-dotenv nltk openai chromadb tiktoken

In [4]:
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
from typing import Optional

from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
from dotenv import load_dotenv
import re
import requests
import pandas as pd
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import MarkdownTextSplitter
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain_openai import ChatOpenAI
from langchain.schema import StrOutputParser
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

import time
import chromadb
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
from dotenv import load_dotenv, find_dotenv

### Step 1:
In the `credentials.env` file in the folder, change the XXX value for the `OPENAI_API_KEY` to your actual OPENAI API KEY

In [5]:
load_dotenv(find_dotenv("credentials.env"), override=True)

True

## Read the Dataset and create Vector Store

In [6]:
loader = DirectoryLoader('../data/external/olm/docs/', glob="**/*.md", loader_cls=TextLoader)
documents = loader.load()

In [7]:
# documents

In [8]:
## Verification
'../data/external/olm/docs/slack_samples.md' in [i.metadata['source'] for i in documents]

True

In [9]:
## Split the documents into chunks. Is there a better way than hardcoding size as 1000?
text_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

In [10]:
embeddings = OpenAIEmbeddings()

In [11]:
docsearch = Chroma.from_documents(texts, embeddings, persist_directory='database')

In [12]:
qa = pd.read_csv("../data/test/sample_qna.csv")

In [13]:
qa.head()

Unnamed: 0,Question,Answer
0,"I have an operator catalog image with me, `qua...",You can make the operators available for insta...
1,"I added a catalog source to my cluster, but I ...","Once you add a catalog source to your cluster,..."
2,How can I see what operators are available for...,You can see what operators are available for i...
3,How can I install an operator on my cluster fr...,You can see what operators are available for i...
4,"Hey, looking for guidance on how to do Tier 2 ...",You're going to want to avoid a channel called...


In [14]:
questions = qa['Question'].tolist()
real_answers = qa['Answer'].tolist()
generated_answers = list()

In [15]:
selected_questions = questions[0:2]

## Without RAG - Basic Prompt

In [16]:
template = """Question: {question}

Answer:"""

prompt = PromptTemplate(template=template, input_variables=["question"])

In [17]:
runnable_chain = prompt | ChatOpenAI() | StrOutputParser()
runnable_chain.invoke({"question": "Hi what is OLM?"})

'OLM stands for Outlook for Mac. It is a software application developed by Microsoft for Mac users to manage their email, contacts, calendar, and other personal information. OLM is specifically designed for Mac devices and provides similar functionality to Microsoft Outlook for Windows.'

In [18]:
def answer_question(query, runnable):
    """
    Takes in query, and llm chain to generate answer
    """
    ## Generate answer
    # answer = chain({"question": query}, return_only_outputs=True)
    answer = runnable.invoke({"question": query})
    return answer

In [19]:
generated_answers_1 = list()
i=1
for query in selected_questions:

    answer = answer_question(query, runnable_chain)

    generated_answers_1.append(answer)
    print(f"Q: {query}\nA: {answer}\n")

    ## Add delay to avoid rate limit error
    time.sleep(1)
    print(f"{i}/{len(selected_questions)} done")
    i+=1

Q: I have an operator catalog image with me, `quay.io/operator-framework/upstream-community-operators:latest`, how do I make the operators included in the catalog image, available for installation on my cluster? 


A: To make the operators included in the catalog image available for installation on your cluster, you need to perform a few steps:

1. Pull the catalog image to your local machine:
   ```
   docker pull quay.io/operator-framework/upstream-community-operators:latest
   ```

2. Tag the pulled image with a local repository name:
   ```
   docker tag quay.io/operator-framework/upstream-community-operators:latest my-catalog-image:latest
   ```

3. Push the tagged image to a container registry accessible to your cluster:
   ```
   docker push my-registry/my-catalog-image:latest
   ```

4. Create a CatalogSource object in your cluster, specifying the image location and registry details. For example, create a YAML file `catalog-source.yaml` with the following content:
   ```yaml
  

### Without RAG - Advanced Prompt

In [20]:
template = """You are a support engineer who is trying to generate answers for questions around the Operator Frameworks product. Your goal is to answer questions that OLM customers and users would find relevant, informative, and useful. You should be descriptive and provide links to support your answer.

Here is a description of the product:
Operator Lifecycle Manager (OLM) project is a component of the Operator Frameworkan open source toolkit to manage Kubernetes native applications, called Operators, in a streamlined and scalable way.

Question: {question}

Answer:"""

prompt = PromptTemplate(template=template, input_variables=["question"])

In [21]:
runnable_chain = prompt | ChatOpenAI() | StrOutputParser()

In [22]:
generated_answers_2 = list()
i=1
for query in selected_questions:

    answer = answer_question(query, runnable_chain)

    generated_answers_2.append(answer)
    print(f"Q: {query}\nA: {answer}\n")

    ## Add delay to avoid rate limit error
    time.sleep(1)
    print(f"{i}/{len(selected_questions)} done")
    i+=1

Q: I have an operator catalog image with me, `quay.io/operator-framework/upstream-community-operators:latest`, how do I make the operators included in the catalog image, available for installation on my cluster? 


A: To make the operators included in the catalog image available for installation on your cluster, you can use the Operator Lifecycle Manager (OLM) to create a catalog source that points to the catalog image.

Here are the steps to follow:

1. Install OLM on your cluster by following the installation instructions provided in the OLM documentation: [OLM Installation Guide](https://olm.operatorframework.io/docs/installation/)

2. Once OLM is installed, create a catalog source YAML file, for example, `my-catalog-source.yaml`, with the following content:

```yaml
apiVersion: operators.coreos.com/v1alpha1
kind: CatalogSource
metadata:
  name: my-catalog
  namespace: my-namespace
spec:
  displayName: My Catalog
  sourceType: grpc
  image: quay.io/operator-framework/upstream-commun

### With RAG

In [23]:
template = """
You are a support engineer who is trying to generate answers for questions around the Operator Frameworks product. Your goal is to answer questions that OLM customers and users would find relevant, informative, and useful. You should be descriptive and provide links to support your answer.

Here is a description of the product:
Operator Lifecycle Manager (OLM) project is a component of the Operator Framework open source toolkit to manage Kubernetes native applications, called Operators, in a streamlined and scalable way.

Question: {question}

Here are a few input and output pairs examples to guide the model:

Input: "What is OLM?"
Output: "Operator Lifecycle Manager (OLM) project is a component of the Operator Frameworkan open source toolkit to manage Kubernetes native applications, called Operators, in a streamlined and scalable way."

Input: "What are OLM features?"
Output: "OLM provides rich update mechanisms to keep Kubernetes native applications up to date automatically. With OLMs packaging format Operators can express dependencies on the platform and on other Operators."

If you don't know the answer, please respond with: "I'm sorry, I don't have enough information to generate that content."

Please help this OLM customer to find the answer to this {question} given the relevant content around the question {context}:
"""

prompt = PromptTemplate(
    input_variables=["question", "context"],
    template=template,
)

In [24]:
def answer_question(query, index, chain):
    """
    Takes in query, index to search from, and llm chain to generate answer
    """
    ## Retrieve docs
    docs = index.similarity_search(query)
    print(docs)
    print(len(docs))
    ## Generate answer
    answer = chain.invoke({"question": query, "context": docs})
    return answer

In [26]:
generated_answers_3 = list()
i=1
for query in selected_questions:
    answer = answer_question(query, docsearch, runnable_chain)

    generated_answers_3.append(answer)
    print(f"Q: {query}\nA: {answer}\n")

    ## Add delay to avoid rate limit error
    time.sleep(1)
    print(f"{i}/{len(selected_questions)} done")
    i+=1

[Document(page_content='There are many possible ways to build a catalog, but an extremely simple approach would be to:\n\n- Maintain a single configuration file containing image references for each operator in the catalog\n   ```yaml\n   name: community-operators\n   repo: quay.io/community-operators/catalog\n   tag: latest\n   references:\n   - name: etcd-operator\n     image: quay.io/etcd-operator/catalog@sha256:5891b5b522d5df086d0ff0b110fbd9d21bb4fc7163af34d08286a2e846f6be03\n   - name: prometheus-operator\n     image: quay.io/prometheus-operator/catalog@sha256:e258d248fda94c63753607f7c4494ee0fcbe92f1a76bfdac795c9d84101eb317', metadata={'source': '../data/external/olm/docs/Reference/file-based-catalogs.md'}), Document(page_content='As a cluster administrator, you can install an Operator from the OperatorHub using the OpenShift Container Platform web console or the CLI. You can then subscribe the Operator to one or more namespaces to make it available for developers on your cluster.\

# Evaluate the generated answers using a metric

In [28]:
real_answers

['You can make the operators available for installation on your cluster by creating a `CatalogSource` CR: \n\n```\napiVersion: operators.coreos.com/v1alpha1\nkind: CatalogSource\nmetadata:\n  name: my-catalog\n  namespace: olm\nspec:\n  sourceType: grpc\n  image: quay.io/operator-framework/upstream-community-operators:latest\n  displayName: Community Operators\n  publisher: OperatorHub.io\n```',
 "Once you add a catalog source to your cluster, you can check the status of the catalog source by inspecting the CR's status.\n\nIf you have the `yq` tool available locally, you can use `yq`:\n\n```\n$ kubectl get CatalogSource  <catalog-source-name> -n <catalog-source-namespace> -o yaml | yq e '.status' -\n```\n\nIf you don't have the `yq` tool available, you can use `grep` instead \n\n```\n$ kubectl get CatalogSource  <catalog-source-name> -n <catalog-source-namespace> -o yaml | grep status -A 4\n```\n\nThe status should indicate what exactly has gone wrong in your attempt to add the catalog

In [31]:
def calc_bleu(reference, candidate):

    for i in range(len(reference)):
        print('BLEU score -> {}'.format(sentence_bleu(reference[i].split(), candidate[i].split(), smoothing_function=SmoothingFunction().method4)))

In [32]:
calc_bleu(real_answers[0:2], generated_answers_1)

BLEU score -> 0.002764112903140091
BLEU score -> 0.0017733527914034046


In [33]:
calc_bleu(real_answers[0:2], generated_answers_2)

BLEU score -> 0.0021981422129838907
BLEU score -> 0.0019948205939890667


In [34]:
calc_bleu(real_answers[0:2], generated_answers_3)

BLEU score -> 0.001881855501244099
BLEU score -> 0.0019698557937030976


# Post processing or Sanity Checks

In [36]:
def check_links(text):
    urls = re.findall(r'\((https?:\/\/[^\s\/$.?#].[^\s()]*\/[^\s\/$.?#]*(?:\.html)?)\)', text)
    for url in urls:
        try:
            response = requests.get(url)
            if response.status_code == 200:
                print(f"The link {url} is valid.")
            else:
                print(f"The link {url} returned a status code of {response.status_code}.")
        except requests.exceptions.RequestException as e:
            print(f"The link {url} could not be reached. Error: {e}")

print("generated_answers_1 returned")
for text in generated_answers_1:
    check_links(text)

print("generated_answers_2 returned")
for text in generated_answers_2:
    check_links(text)

print("generated_answers_3 returned")
for text in generated_answers_3:
    check_links(text)

generated_answers_1 returned
generated_answers_2 returned
The link https://olm.operatorframework.io/docs/installation/ returned a status code of 404.
The link https://olm.operatorframework.io/docs/ is valid.
The link https://operatorframework.io/ is valid.
The link https://operatorframework.io/community/slack/ returned a status code of 404.
generated_answers_3 returned
The link https://olm.operatorframework.io/docs/troubleshooting/ is valid.


# Appendix

In [109]:
# __import__('pysqlite3')
# import sys
# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
# from typing import Optional

# import chromadb
# from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
# from dotenv import load_dotenv
# # from pydantic_settings import BaseSettings

# from genai import Client, Credentials
# from genai.schema import TextEmbeddingParameters

### Use IBM BAM for creating vector store

In [None]:
# make sure you have a .env file in the root folder with genaikey and genaiapi
load_dotenv()
api_key = os.getenv("GENAI_KEY", None)
api_endpoint = os.getenv("GENAI_API", None)

In [None]:
loader = DirectoryLoader('../data/external/olm/docs/', glob="**/*.md", loader_cls=TextLoader)
documents = loader.load()

In [None]:
# documents

In [None]:
## Verification
'../data/external/olm/docs/getting-started/_index.md' in [i.metadata['source'] for i in documents]

True

In [None]:
## Split the documents into chunks. Is there a better way than hardcoding size as 1000?
text_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

In [None]:
len(texts)

424

In [None]:
texts[0].page_content

'---\ntitle: Operator Lifecycle Manager(OLM)\nlinkTitle: "Documentation"\nmenu:\n  main:\n    weight: 10\n---\n\n[OLM](https://github.com/operator-framework/operator-lifecycle-manager) is a component of the [Operator Framework](https://github.com/operator-framework), an open source toolkit to manage Kubernetes native applications, called Operators, in an effective, automated, and scalable way. OLM extends Kubernetes to provide a declarative way to install, manage, and upgrade Operators and their dependencies in a cluster.\n\nRead more in the [introduction blog post](https://operatorhub.io/what-is-an-operator).\n\n## Features provided by OLM\n\n### Over-the-Air Updates and Catalogs'

In [12]:
# make sure you have a .env file under genai root with
GENAI_KEY=api_key
GENAI_API=api_endpoint

In [13]:
class ChromaEmbeddingFunction(EmbeddingFunction):
    def __init__(self, *, model_id: str, client: Client, parameters: Optional[TextEmbeddingParameters] = None):
        self._model_id = model_id
        self._parameters = parameters
        self._client = client

    def __call__(self, inputs: Documents) -> Embeddings:
        embeddings: Embeddings = []
        for response in self._client.text.embedding.create(
            model_id=self._model_id, inputs=inputs, parameters=self._parameters
        ):
            embeddings.extend(response.results)

        return embeddings

In [14]:
id_doc = {"ids": [], "documents": []}

for idx, text in enumerate(texts):
    id_doc["ids"].append(str(idx + 1))  # Index starts from 1
    id_doc["documents"].append(text.page_content)

In [15]:
credentials = Credentials.from_env()
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("collection_olm")
collection.add(ids=id_doc['ids'], documents=id_doc['documents'])

embedding_fn = ChromaEmbeddingFunction(model_id="sentence-transformers/all-minilm-l6-v2", client=persistent_client)

In [35]:
langchain_chroma = Chroma(
    client=persistent_client,
    collection_name="collection_olm",
    embedding_function=embedding_fn,
)

In [40]:
print("There are", langchain_chroma._collection.count(), "in the collection")

There are 424 in the collection
