In [None]:
%pip install langchain

In [15]:
import os

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.llms import HuggingFaceTextGenInference
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector

In [16]:
# Smoke-test the text-generation-inference endpoint through LangChain.
# The default URL is the demo's Mistral-7B server; set INFERENCE_SERVER_URL to point
# the notebook at a different deployment without editing this cell.
INFERENCE_SERVER_URL = os.environ.get(
    "INFERENCE_SERVER_URL", "http://mistral-7b.broyal.demo/"
)

llm = HuggingFaceTextGenInference(
    inference_server_url=INFERENCE_SERVER_URL,
    max_new_tokens=512,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,
    repetition_penalty=1.03,
)

# Last expression of the cell, so the raw completion is displayed as the cell output.
llm("[INST] You are a helpful, respectful and honest assistant who is an expert in explaining Kubernetes concepts. Always answer as helpfully as possible, while being safe and keep your responses less than 200 words. What is a deployment?[/INST]")

' A deployment in Kubernetes is a way to manage the lifecycle of a set of replicas of a pod. It allows you to specify the desired state of the deployment, including the number of replicas, the desired CPU and memory resources, and the container image to use. Kubernetes will then automatically manage the deployment, scaling it up or down as needed to meet the desired state. Deployments also provide rolling updates, which means that new versions of the pods are introduced gradually, with each new version replacing one of the old ones. This ensures that the application remains available during the update process.'

In [17]:
# Load list of URLs -> kubernetes.io/docs/concepts/
# One URL per line. The context manager closes the file once it has been read, and
# stripping removes the trailing newline that readlines() leaves on every entry
# (which otherwise travels with the URL handed to the loader). Blank lines are skipped.
with open('./data/k8s-urls.samples.txt', 'r') as file1:
    urls = [line.strip() for line in file1 if line.strip()]

# Fetch every page listed in the file.
loader = WebBaseLoader(urls)
documents = loader.load()

In [18]:
# Split the loaded Kubernetes concept pages into overlapping text chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=1000)
docs = text_splitter.split_documents(documents)

# Report how many chunks the pages produced.
chunk_count, page_count = len(docs), len(documents)
print(f"{chunk_count} chunks in {page_count} pages")

143 chunks in 3 pages


In [19]:
# Sentence-transformer model used to embed both the chunks and the queries.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Runs on CPU here; switch to dict(device="cuda") to place the model on a GPU.
model_kwargs = dict(device="cpu")

embeddings = HuggingFaceEmbeddings(
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [20]:
# Postgres connection settings: each value comes from a PGVECTOR_* environment
# variable, falling back to a local default when the variable is unset.
# NOTE(review): the password fallback is a hard-coded placeholder — set
# PGVECTOR_PASSWORD for anything beyond a local demo database.
_pg_params = {
    "driver": os.environ.get("PGVECTOR_DRIVER", "psycopg2"),
    "host": os.environ.get("PGVECTOR_HOST", "localhost"),
    "port": int(os.environ.get("PGVECTOR_PORT", "5432")),
    "database": os.environ.get("PGVECTOR_DATABASE", "postgres"),
    "user": os.environ.get("PGVECTOR_USER", "postgres"),
    "password": os.environ.get("PGVECTOR_PASSWORD", "secretpassword"),
}

CONNECTION_STRING = PGVector.connection_string_from_db_params(**_pg_params)

In [21]:
# Name of the pgvector collection that holds the embedded chunks.
COLLECTION_NAME = "kubernetes_concepts-1"

# Embed the chunks with `embeddings` and store them in the collection.
_store_options = dict(
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
)
db = PGVector.from_documents(docs, embeddings, **_store_options)

In [22]:
# A better way with distributed jobs on Kubernetes
# Runs the local deploy-indexer.sh script through the notebook's shell escape.
# NOTE(review): the recorded output reports Kubernetes Jobs (indexer-job, test-job),
# so the script presumably applies their manifests and describes the job — confirm
# against the script itself.
# The huggingface/tokenizers fork warning in the output recommends setting
# TOKENIZERS_PARALLELISM explicitly before the shell escape forks the kernel process.
! sh ./deploy-indexer.sh

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


job.batch/indexer-job unchanged
job.batch/test-job unchanged
Name:             indexer-job
Namespace:        default
Selector:         batch.kubernetes.io/controller-uid=87b4e76c-72bb-4c0a-b03a-cce47878e1ff
Labels:           job=indexer
Annotations:      batch.kubernetes.io/job-tracking: 
Parallelism:      3
Completions:      <unset>
Completion Mode:  NonIndexed
Start Time:       Tue, 31 Oct 2023 02:21:58 +0000
Completed At:     Tue, 31 Oct 2023 02:22:58 +0000
Duration:         60s
Pods Statuses:    0 Active (0 Ready) / 3 Succeeded / 0 Failed
Pod Template:
  Labels:  batch.kubernetes.io/controller-uid=87b4e76c-72bb-4c0a-b03a-cce47878e1ff
           batch.kubernetes.io/job-name=indexer-job
           controller-uid=87b4e76c-72bb-4c0a-b03a-cce47878e1ff
           job-name=indexer-job
  Containers:
   indexer-job:
    Image:      gcr.io/broyal-llama-demo/llama-demo/indexer:0.1.7
    Port:       <none>
    Host Port:  <none>
    Limits:
      nvidia.com/gpu:  1
    Environment:
      PGVEC

In [23]:
# Sanity-check retrieval on its own before wiring the store into a QA chain.
query = "Should I use gateway API in my app?"

# Keep the hits under their own name so the chunk list in `docs` (the input to the
# indexing cell) is not overwritten when this cell runs.
retrieved_docs = db.similarity_search(query)
print(f"Query: {query}")
print(f"Retrieved documents: {len(retrieved_docs)}")
for doc in retrieved_docs:
    # Read the Document's public fields directly instead of going through
    # to_json()['kwargs'].
    print("Source: ", doc.metadata['source'])
    print("Text: ", doc.page_content, "\n")

Query: Should I use gateway API in my app?
Retrieved documents: 4
Source:  https://kubernetes.io/docs/concepts/services-networking/service/

Text:  cluster. An Ingress lets you consolidate your routing rules into a single resource, so
that you can expose multiple components of your workload, running separately in your
cluster, behind a single listener.The Gateway API for Kubernetes
provides extra capabilities beyond Ingress and Service. You can add Gateway to your cluster -
it is a family of extension APIs, implemented using
CustomResourceDefinitions -
and then use these to configure access to network services that are running in your cluster.Cloud-native service discoveryIf you're able to use Kubernetes APIs for service discovery in your application,
you can query the API server
for matching EndpointSlices. Kubernetes updates the EndpointSlices for a Service
whenever the set of Pods in a Service changes.For non-native applications, Kubernetes offers ways to place a network port or loa

In [24]:
# Prompt for the question-answering chain: {context} and {question} are the two
# template variables declared below. The text is wrapped in [INST] ... [/INST],
# matching the wrapper used in the endpoint smoke test.
prompt_template = """[INST] You are a helpful, respectful and honest assistant who is an expert in explaining Kubernetes concepts. Always answer as helpfully as possible, while being safe.
        Use the following pieces of context to answer the question. If you don't know the answer, just say that you don't know, don't try to make up an answer.
        
        {context}

        Question: {question}
        Answer:[/INST]"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Extra keyword arguments handed to the chain built by RetrievalQA.from_chain_type.
chain_type_kwargs = {"prompt": PROMPT}

In [25]:
# Expose the vector store as a retriever and build a "stuff" QA chain on top of it,
# using the custom prompt carried in chain_type_kwargs.
retriever = db.as_retriever()

_qa_options = dict(
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    verbose=True,
)
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, **_qa_options)

In [26]:
# Ask a question through the retrieval-augmented chain and print the answer.
question = "When should I use the Gateway API?"
result = qa.run(question)
print(result)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
 The Gateway API for Kubernetes provides extra capabilities beyond Ingress and Service. It allows you to configure access to network services that are running in your cluster. You can add Gateway to your cluster by implementing CustomResourceDefinitions, and then use these to configure access to network services.

When deciding whether to use the Gateway API, you should consider the specific needs of your workload. If you need to expose multiple components of your workload behind a single listener, or if you need to configure access to network services in a more complex way than what can be done with Ingress and Service, then the Gateway API may be a good option.

It's also worth noting that the Gateway API is still a relatively new and evolving feature in Kubernetes, so it may not be as well-documented or widely used as some of the other APIs. However, as the API continues to mature and become more widely adopted

In [None]:
# NOTE(review): this question is unrelated to the indexed Kubernetes pages —
# presumably an out-of-domain check of the prompt's "say that you don't know"
# instruction; confirm the intent.
query = "What is the nation economic status? Summarize. Keep it under 200 words."
# `test_rag` is not defined or imported anywhere in this notebook, so calling it
# raises NameError; run the chain the same way as the previous question instead.
print(qa.run(query))