In [1]:
import os

import faiss
import numpy as np
import requests
from flask import Flask, jsonify, request
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.llms.llamafile import Llamafile
from langchain_community.embeddings import LlamafileEmbeddings
from langchain_community.document_loaders import TextLoader, DirectoryLoader

# Text generation

In [7]:
llamafile = Llamafile(base_url="http://localhost:8081", n_predict=400, temperature=0.7)
llamafile

Llamafile(base_url='http://localhost:8081', temperature=0.7)

In [8]:
print(llamafile.invoke("Hi. Can you tell me a joke?"))


A man walks into a bar and asks for a drink. The bartender looks at him, rolls his eyes, and tells him to go away. A few minutes later, the same man walks in again.
"Oh, I'm sorry," says the bartender apologetically. "I thought you were going to ask for something different."
The man takes a sip of beer. "No," he said, "I want a shot of your best whiskey. Don't tell me, it's not on the menu."
A woman walks into a bar and asks for a drink. The bartender looks at her, rolls his eyes, and tells her to go away. A few minutes later, the same woman walks in again.
"Oh, I'm sorry," says the bartender apologetically. "I thought you were going to ask for something different."
The woman takes a sip of beer. "No," she said, "I want a shot of your best scotch. Don't tell me, it's not on the menu."
A man walks into a bar and asks for a drink. The bartender looks at him, rolls his eyes, and tells him to go away. A few minutes later, the same man walks in again.
"Oh, I'm sorry," says the bartender ap

# Embeddings

In [30]:
llamafile_embedder = LlamafileEmbeddings(base_url="http://localhost:8080")
llamafile_embedder

LlamafileEmbeddings(base_url='http://localhost:8080', request_timeout=None)

In [31]:
text = "This is a test document."
query_result = llamafile_embedder.embed_query(text)
query_result[:5]

[0.007116288878023624,
 -0.018593888729810715,
 0.05576428771018982,
 0.024310659617185593,
 -0.051241591572761536]

In [32]:
# my loader

# needed to install chardet here for the TextLoader encoding autodetect
path = "toy_data"
text_loader_kwargs={"encoding" : "windows-1252"} # autodetect_encoding': True, 
# NOTE: autoencoding could not find the encoding but a random stack-overflow search did; long live koPytok
# Source: https://stackoverflow.com/questions/48067514/utf-8-codec-cant-decode-byte-0xa0-in-position-4276-invalid-start-byte

loader = DirectoryLoader(
    path, glob="**/*.txt", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs
)
docs = loader.load()

In [33]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10, chunk_overlap=5)
splits = text_splitter.split_documents(docs)

In [35]:
vector_store = FAISS.from_documents(documents=splits, embedding=llamafile_embedder)  # vectorstore

In [39]:
vector_store.index.ntotal
vector_store.similarity_search(query="Triangles?")

[Document(metadata={'source': 'toy_data/4.txt'}, page_content='triangles'),
 Document(metadata={'source': 'toy_data/3.txt'}, page_content='triangles'),
 Document(metadata={'source': 'toy_data/1.txt'}, page_content='squares.'),
 Document(metadata={'source': 'toy_data/2.txt'}, page_content='circles.')]

In [41]:
# vector_store.save_local("faiss_index")
retriever = vector_store.as_retriever()

In [48]:
retrieved_docs = retriever.invoke("Who likes circles?")

In [55]:
retrieved_docs[0]

Document(metadata={'source': 'toy_data/2.txt'}, page_content='circles.')

In [44]:
isinstance(vector_store, FAISS)

True

In [47]:
FAISS.load_local("faiss_index", llamafile_embedder, allow_dangerous_deserialization=True)

<langchain_community.vectorstores.faiss.FAISS at 0x7146921778e0>

In [4]:
from flask import Flask, request, jsonify
import json
from langchain_core.documents import Document

In [12]:
generated_text = "paka paka"
top_k_docs = [Document(page_content="1"), Document(page_content="2")]

def doc_to_dict(doc):
    return {
        'page_content': doc.page_content,
        'metadata': doc.metadata
    }

top_k_docs = [doc_to_dict(doc) for doc in top_k_docs]
top_k_docs
jsonify({
    'generated_text': generated_text,
    'top_k_documents': top_k_docs
})

RuntimeError: Working outside of application context.

This typically means that you attempted to use functionality that needed
the current application. To solve this, set up an application context
with app.app_context(). See the documentation for more information.

In [13]:
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

ChatPromptTemplate().from_messages(
    [
        (
            "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise."
        ),
        ("Question: {question} "),
        ("Context: {context} "),
        ("Answer:"),
    ]
)

KeyError: 'messages'

In [20]:
from langchain import hub
from langchain_core.prompts import HumanMessagePromptTemplate
prompt = hub.pull("rlm/rag-prompt")

In [21]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [24]:
s = ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])
s

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [74]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import LlamafileEmbeddings

# Load, chunk and index the contents of the blog.
web_paths = (
    "https://lilianweng.github.io/posts/2020-10-29-odqa/",
    "https://lilianweng.github.io/posts/2020-08-06-nas/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-01-27-the-transformer-family-v2/",
    "https://lilianweng.github.io/posts/2023-01-10-inference-optimization/",
    "https://lilianweng.github.io/posts/2022-09-08-ntk/",
    "https://lilianweng.github.io/posts/2022-06-09-vlm/",
    "https://lilianweng.github.io/posts/2022-04-15-data-gen/",
    "https://lilianweng.github.io/posts/2022-02-20-active-learning/",
    "https://lilianweng.github.io/posts/2021-12-05-semi-supervised/",
    "https://lilianweng.github.io/posts/2021-09-25-train-large/",
    "https://lilianweng.github.io/posts/2021-07-11-diffusion-models/",
    "https://lilianweng.github.io/posts/2021-05-31-contrastive/",
    "https://lilianweng.github.io/posts/2021-03-21-lm-toxicity/",
    "https://lilianweng.github.io/posts/2021-01-02-controllable-text-generation/",
    "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
    "https://lilianweng.github.io/posts/2024-04-12-diffusion-video/",
    "https://lilianweng.github.io/posts/2024-02-05-human-data-quality/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
)
loader = WebBaseLoader(
    web_paths=web_paths,
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

embeddings_model = LlamafileEmbeddings(base_url="http://localhost:8080")
vector_store = FAISS.from_documents(
    documents=splits, embedding=embeddings_model
)
vector_store.save_local("faiss_index")

In [104]:
import os

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from config import Config
from services.embedder import Embedder
from services.generator import Generator
from services.retriever import Retriever


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


embedder = Embedder("http://localhost:8080")
generator = Generator("http://localhost:8081", temperature=0.7)

# Build or load FAISS index
if os.path.exists(Config.VECTOR_STORE_PATH):
    vector_store = embedder.load_vector_store(Config.VECTOR_STORE_PATH)
else:
    embedder.create_vector_store(
        txt_path=Config.DATA_DIR,
        chunk_size=1000,
        chunk_overlap=200,
    )
    embedder.save_vector_store(Config.VECTOR_STORE_PATH)

retriever = Retriever(embedder.vector_store)

# user_query = "What is task decompposition?"
# top_k_docs = retriever.retrieve_documents(user_query)

# prompt = ChatPromptTemplate(
#     input_variables=["context", "question"],
#     messages=[
#         HumanMessagePromptTemplate(
#             prompt=PromptTemplate(
#                 input_variables=["context", "question"],
#                 template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:",
#             )
#         )
#     ],
# )

base_url='http://localhost:8080' request_timeout=None


In [60]:
chain = (
    {"context": retriever.retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | generator.generator
    | StrOutputParser()
)
user_query = "What is task decompposition?"

generated_text = chain.invoke(user_query)

In [61]:
print(generated_text)

 
"First, you need to identify the subgoals. Then, you can decompose them into smaller tasks. The user input text might look like this: A game with MCV (Mario, Cave, Villiage, etc.). Keyboard controls are essential for keyboard-only users. To achieve that goal, you can perform the following steps:"
[{"task": "Keyboard Control", "id": 1}, {"task": "MVC components", "id": 2}, {"task": "Keyboard control", "id": 3}]
Now, based on this decomposition, you generate multiple tasks such as:
- Task 1: Keyboard Control (“Write instructions for keyboard controls in the game. For example, press A to move left, B to move right, and so on.”)
- Task 2: MVC Components (“Develop a set of classes that define MVC components. For example, a class named “MVCComponent” for each component in the MVC model. ”)
- Task 3: Keyboard Control (“Write a function that performs keyboard controls for the game. This function should take in a key code and return the corresponding action to perform on the keyboard.")
The u

In [107]:
vector_store = FAISS.load_local(
    "faiss_index", embedder.embeddings_model, allow_dangerous_deserialization=True
)

In [110]:
ret = vector_store.as_retriever(search_kwargs={"k": 2})
docs = ret.invoke("What is Knowledge distillation?")

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-01-10-inference-optimization/'}, page_content='Check the previous post on large model training on different types of training parallelism and memory saving designs including CPU memory offloading. This post focuses on network compression techniques and architecture-specific improvement for transformer models.\nDistillation#\nKnowledge Distillation (KD; Hinton et al. 2015, Gou et al. 2020) is a straightforward way to build a smaller, cheaper model (“student model”) to speed up inference by transferring skills from a pre-trained expensive model (“teacher model”) into the student. There is no much restriction on how the student architecture should be constructed, except for a matched output space with the teacher in order to construct a proper learning objective.'),
 Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-01-10-inference-optimization/'}, page_content='Fig. 1. The generic framework of teac

In [111]:
from langchain_core.prompts import PromptTemplate

# Prompt
prompt = PromptTemplate.from_template(
    "Summarize the main themes in these retrieved docs: {docs}"
)

# Chain
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


chain = {"docs": format_docs} | prompt | generator.generator | StrOutputParser()

# Run
question = "What is Knowledge distillation?"
docs = retriever.retrieve_documents(question)

txt = chain.invoke(docs)
print(txt)



Fig. 2. The example of distillation on a transformer model, with the teacher and student output spaces. (Image source: Gou et al. 2020)

Transformers#
Besides transformer models, there are various other types of neural networks that have been shown to work well for language tasks. Many of these architectures have seen significant improvements in performance, particularly on text classification problems. Here we’ll focus on the BERT and RoBERTa models, but many other variants exist.

Transformers consist of a self-attention module at the beginning of each layer, followed by a feedforward network for downstream tasks. The self-attention mechanism allows the model to attend over the input tokens based on their position in the sequence (i.e., learn how words are related to one another), while the feedforward network learns features from the entire sequence using hidden states.

In this section, we’ll discuss a few architectural details of transformer models and how they can be optimized 

In [112]:
[doc.page_content for doc in docs]

['Check the previous post on large model training on different types of training parallelism and memory saving designs including CPU memory offloading. This post focuses on network compression techniques and architecture-specific improvement for transformer models.\nDistillation#\nKnowledge Distillation (KD; Hinton et al. 2015, Gou et al. 2020) is a straightforward way to build a smaller, cheaper model (“student model”) to speed up inference by transferring skills from a pre-trained expensive model (“teacher model”) into the student. There is no much restriction on how the student architecture should be constructed, except for a matched output space with the teacher in order to construct a proper learning objective.',
 'Fig. 1. The generic framework of teacher-student knowledge distillation training. (Image source: Gou et al. 2020)\nGiven a dataset, a student model is trained to mimic outputs of a teacher via distillation loss. Usually a neural network has a softmax layer; For example,

In [114]:
prompt = ChatPromptTemplate(
    input_variables=["context", "question"],
    messages=[
        HumanMessagePromptTemplate(
            prompt=PromptTemplate(
                input_variables=["context", "question"],
                template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:",
            )
        )
    ],
)

ret = vector_store.as_retriever(search_kwargs={"k": 2})
docs = ret.invoke("What is Knowledge distillation?")

chain = (
    {"context": ret | format_docs, "question": RunnablePassthrough()}
    | prompt
    | generator.generator
    | StrOutputParser()
)
user_query = "What is Knowledge distillation?"

generated_text = chain.invoke(user_query)
print(generated_text)

 

Knowledge distillation is a way to transfer the knowledge from an expensive teacher model into a cheaper student model. In this case, we assume that the output space of the teacher and student are matched (i.e., the softmax layer is the same). The goal is to minimize the difference between two logits outputs, where the ground truth labels $\mathbf{y}$ are known. We can combine this with a supervised objective using e.g. Cross-entropy loss. In our case, we use the distillation loss, which combines the softmax output and the supervised loss.</s>


In [120]:
from flask import Flask, request, jsonify

app = Flask(__name__)

def doc_to_dict(doc):
    """
    Objects of type Document are non serializable so this utils function was made
    that turns a Document object into a dictionary.
    """
    return {"page_content": doc.page_content, "metadata": doc.metadata}

@app.route("/query", methods=["POST"])
def query():
    data = request.get_json()
    user_query = data["query"]
    # top_k_docs = retriever.retrieve_documents(user_query)

    prompt = ChatPromptTemplate(
        input_variables=["context", "question"],
        messages=[
            HumanMessagePromptTemplate(
                prompt=PromptTemplate(
                    input_variables=["context", "question"],
                    template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:",
                )
            )
        ],
    )

    ret = vector_store.as_retriever(search_kwargs={"k": 2})

    chain = (
        {"context": ret | format_docs, "question": RunnablePassthrough()}
        | prompt
        | generator.generator
        | StrOutputParser()
    )


    docs = ret.invoke(user_query)
    generated_text = chain.invoke(user_query)

    return jsonify(
        {
            "generated_text": generated_text,
            "top_k_documents": [doc_to_dict(doc) for doc in docs],
        }
    )

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)


[33m * Tip: There are .env or .flaskenv files present. Do "pip install python-dotenv" to use them.[0m


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.1.65:5000
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [19/Jul/2024 17:39:04] "POST /query HTTP/1.1" 200 -


In [94]:
chain.get_prompts()

[ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])]

In [92]:
question = "What is Knowledge distillation?"
docs = retriever.retrieve_documents(question)
docs

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-01-10-inference-optimization/'}, page_content='Check the previous post on large model training on different types of training parallelism and memory saving designs including CPU memory offloading. This post focuses on network compression techniques and architecture-specific improvement for transformer models.\nDistillation#\nKnowledge Distillation (KD; Hinton et al. 2015, Gou et al. 2020) is a straightforward way to build a smaller, cheaper model (“student model”) to speed up inference by transferring skills from a pre-trained expensive model (“teacher model”) into the student. There is no much restriction on how the student architecture should be constructed, except for a matched output space with the teacher in order to construct a proper learning objective.'),
 Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-01-10-inference-optimization/'}, page_content='Fig. 1. The generic framework of teac

In [56]:
res = generator.generate_text("Here is my grandmother's beloved recipe for spaghetti and meatballs:")
print(res)



Ingredients:
- 1 pound ground beef (90% lean)
- 1/2 cup breadcrumbs
- 1/4 cup grated Parmesan cheese
- 1/4 cup chopped fresh parsley
- 1/4 teaspoon salt
- 1/4 teaspoon black pepper
- 8 oz spaghetti (or any other pasta of your choice)
- 2 tablespoons olive oil
- 2 cloves garlic, minced
- 1 large onion, chopped
- 1/2 cup tomato sauce
- 1 can crushed tomatoes (32 oz)
- 1 teaspoon dried oregano
- 1/2 teaspoon red pepper flakes (optional)
- Freshly grated Parmesan cheese, for serving (optional)

Instructions:

1. Preheat oven to 375°F.

2. In a large bowl, combine ground beef, breadcrumbs, Parmesan cheese, parsley, salt, and pepper. Mix well.

3. Divide the mixture into four equal portions, shaping each portion into a ball. Flatten each ball with your hand to form a patty.

4. Heat olive oil in a large skillet over medium-high heat. Add spaghetti and cook for 2-3 minutes per side or until golden brown. Remove from the pan and set aside.

5. In the same skillet, add garlic and onion and sa