# Create AI-Tutor vector database


In [1]:
from dotenv import load_dotenv

load_dotenv("../.env")

True

In [2]:
import nest_asyncio

nest_asyncio.apply()

### Clean scraped data
- Removes sections with 7 or fewer tokens, and any 92-token section named "Transformers"

In [3]:
import json
import uuid
import tiktoken
from collections import OrderedDict


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` produces under a named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to `tiktoken.get_encoding`.

    Returns:
        The length of the encoded token sequence.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    # Every special token except "<|endoftext|>" is passed as disallowed_special.
    blocked_specials = tokenizer.special_tokens_set - {"<|endoftext|>"}
    token_ids = tokenizer.encode(string, disallowed_special=blocked_specials)
    return len(token_ids)


def clean_jsonl_file(input_filepath, output_filepath):
    """Filter a JSONL file of sections and write the kept records to a new file.

    Each non-blank input line is parsed as a JSON object. A record is kept when
    its "content" has more than 7 tokens (cl100k_base) and it is not a 92-token
    record whose "name" is "Transformers". Kept records are written with two
    leading keys, "tokens" and a freshly generated "doc_id", followed by the
    original key/value pairs.

    Args:
        input_filepath: Path of the JSONL file to read.
        output_filepath: Path of the JSONL file to write (overwritten).

    Returns:
        None. The input line count and the kept record count are printed.
    """
    cleaned_data = []
    original_line_count = 0

    with open(input_filepath, "r", encoding="utf-8") as file:
        for line in file:
            # Count while reading so the input is opened (and closed) only once.
            original_line_count += 1
            if not line.strip():
                # A blank line is not a JSON record; skip it instead of crashing.
                continue

            json_obj = json.loads(line)
            # Treat a missing or null "content" as empty text.
            content = json_obj.get("content") or ""
            token_count = num_tokens_from_string(content, "cl100k_base")

            # Check conditions for keeping the line
            if token_count > 7 and not (
                token_count == 92 and json_obj.get("name") == "Transformers"
            ):
                # Create a new OrderedDict with 'tokens' as the first key
                new_obj = OrderedDict(
                    [("tokens", token_count), ("doc_id", str(uuid.uuid4()))]
                )
                # Add the rest of the key-value pairs from the original object.
                # On a key collision the value from the original object wins.
                new_obj.update(json_obj)
                cleaned_data.append(new_obj)

    with open(output_filepath, "w", encoding="utf-8") as file:
        for item in cleaned_data:
            json.dump(item, file)
            file.write("\n")

    print(f"Original number of lines: {original_line_count}")
    print(f"Cleaned number of lines: {len(cleaned_data)}")


# Usage
# Filter the scraped JSONL file and write the annotated copy next to it.
input_filepath = "../hf_transformers_v4_42_0.jsonl"
output_filepath = "../hf_transformers_v4_42_0_cleaned.jsonl"
clean_jsonl_file(input_filepath, output_filepath)

Original number of lines: 10413
Cleaned number of lines: 4123


### Merge sections by URL

- Sections whose URL contains "model_doc", "internal", or "main_classes" are kept as separate entries instead of being merged


In [4]:
import json
from collections import defaultdict
import tiktoken


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return how many tokens `string` encodes to under `encoding_name`.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to `tiktoken.get_encoding`.

    Returns:
        Number of token ids in the encoded text.
    """
    enc = tiktoken.get_encoding(encoding_name)
    # All special tokens other than "<|endoftext|>" go into disallowed_special.
    disallowed = enc.special_tokens_set - {"<|endoftext|>"}
    return len(enc.encode(string, disallowed_special=disallowed))


def should_not_merge(url):
    """Return True when `url` contains one of the excluded path fragments."""
    for fragment in ("model_doc", "internal", "main_classes"):
        if fragment in url:
            return True
    return False


def merge_jsonl(input_file, output_file):
    """Group JSONL records by "url" and write one merged record per group.

    A group is written entry by entry, each tagged "retrieve_doc": False, when
    it holds a single record or when `should_not_merge(url)` is true. Any other
    group is collapsed into a copy of its first record whose "content" is the
    group's contents joined by blank lines, whose "tokens" is recomputed for
    that joined text, and whose "retrieve_doc" is True.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write (overwritten).
    """
    # Bucket records by URL; dict ordering keeps first-seen URL order.
    sections_by_url = defaultdict(list)
    with open(input_file, "r") as src:
        for raw_line in src:
            record = json.loads(raw_line)
            sections_by_url[record["url"]].append(record)

    with open(output_file, "w") as dst:
        for url, group in sections_by_url.items():
            keep_separate = len(group) == 1 or should_not_merge(url)
            if keep_separate:
                outgoing = [{**record, "retrieve_doc": False} for record in group]
            else:
                combined_text = "\n\n".join(record["content"] for record in group)
                outgoing = [
                    {
                        **group[0],
                        "content": combined_text,
                        "tokens": num_tokens_from_string(
                            combined_text, "cl100k_base"
                        ),
                        "retrieve_doc": True,
                    }
                ]
            for record in outgoing:
                dst.write(json.dumps(record) + "\n")


# Usage
# Merge the cleaned sections per URL and write the result to a new JSONL file.
input_file = "../hf_transformers_v4_42_0_cleaned.jsonl"
output_file = "../hf_transformers_v4_42_0_merged.jsonl"
merge_jsonl(input_file, output_file)

### Create one LlamaIndex Document per section in the JSONL file


In [3]:
from llama_index.core import Document
from llama_index.core.schema import MetadataMode
import json


def create_docs(input_file):
    """Build one `Document` per line of a JSONL file.

    Each line must be a JSON object with "doc_id", "content", "url", "name",
    "tokens" and "retrieve_doc" keys. All metadata keys are listed as excluded
    for both the LLM and the embedding views of the document.

    Args:
        input_file: Path of the JSONL file to read.

    Returns:
        A list of `Document` objects in file order.
    """
    hidden_keys = ["url", "title", "tokens", "retrieve_doc", "source"]

    def to_document(record):
        # Each Document gets its own copy of the exclusion lists.
        return Document(
            doc_id=record["doc_id"],
            text=record["content"],
            metadata={
                "url": record["url"],
                "title": record["name"],
                "tokens": record["tokens"],
                "retrieve_doc": record["retrieve_doc"],
                "source": "HF_Transformers",
            },
            excluded_llm_metadata_keys=list(hidden_keys),
            excluded_embed_metadata_keys=list(hidden_keys),
        )

    with open(input_file, "r") as f:
        return [to_document(json.loads(line)) for line in f]


# Build the Document list from the merged JSONL file and inspect the first one.
documents = create_docs("../hf_transformers_v4_42_0_merged.jsonl")
print(documents[0])
print(documents[0].metadata)

# Index the documents by doc_id so a document can be looked up by its id later.
document_dict = {doc.doc_id: doc for doc in documents}
# save dict to disk, as .pkl file
import pickle

with open("document_dict.pkl", "wb") as f:
    pickle.dump(document_dict, f)

# load dict from disk
# NOTE(review): this immediately reloads the file written above; only unpickle
# files you created yourself, since pickle.load can execute arbitrary code.
with open("document_dict.pkl", "rb") as f:
    document_dict = pickle.load(f)

Doc ID: 85b2c5b6-ce24-4e4e-8a2d-d6557c917012
Text: DeepSpeed is a PyTorch optimization library that makes
distributed training memory-efficient and fast. At it’s core is the
Zero Redundancy Optimizer (ZeRO) which enables training large models
at scale. ZeRO works in several stages: ZeRO-1, optimizer state
partioning across GPUs ZeRO-2, gradient partitioning across GPUs
ZeRO-3, parameteter partit...
{'url': 'https://huggingface.co/docs/transformers/deepspeed', 'title': 'DeepSpeed', 'tokens': 8483, 'retrieve_doc': True, 'source': 'HF_Transformers'}


In [4]:
import chromadb

# create client and a new collection
# chromadb.EphemeralClient saves data in-memory.
# PersistentClient is given an on-disk path instead.
chroma_client = chromadb.PersistentClient(path="./ai-tutor-vector-db")
# NOTE(review): create_collection presumably raises if "ai-tutor-vector-db"
# already exists under that path, so re-running this cell may require removing
# the directory first — confirm against the Chroma client documentation.
chroma_collection = chroma_client.create_collection("ai-tutor-vector-db")

from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# Define a storage context object using the created vector database.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [5]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

# Build index / generate embeddings using OpenAI embedding model
# Documents are split with SentenceSplitter (chunk_size=800, chunk_overlap=400)
# and the resulting index is backed by the Chroma storage context.
index = VectorStoreIndex.from_documents(
    documents,
    # embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
    embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="similarity"),
    transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=400)],
    show_progress=True,
    use_async=True,
    storage_context=storage_context,
)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 3374/3374 [00:36<00:00, 93.13it/s] 
Generating embeddings: 100%|██████████| 21/21 [00:03<00:00,  5.60it/s]
Generating embeddings: 100%|██████████| 21/21 [00:04<00:00,  5.00it/s]
Generating embeddings: 100%|██████████| 21/21 [00:04<00:00,  4.78it/s]
Generating embeddings: 100%|██████████| 21/21 [00:22<00:00,  1.09s/it]
Generating embeddings: 100%|██████████| 21/21 [00:05<00:00,  3.63it/s]
Generating embeddings: 100%|██████████| 21/21 [00:03<00:00,  6.82it/s]
Generating embeddings: 100%|██████████| 21/21 [00:04<00:00,  5.18it/s]
Generating embeddings: 100%|██████████| 21/21 [00:03<00:00,  5.58it/s]
Generating embeddings: 100%|██████████| 21/21 [00:03<00:00,  6.36it/s]
Generating embeddings: 100%|██████████| 21/21 [00:03<00:00,  6.34it/s]
Generating embeddings: 100%|██████████| 21/21 [00:03<00:00,  6.80it/s]
Generating embeddings: 100%|██████████| 21/21 [00:03<00:00,  5.59it/s]
Generating embeddings: 100%|██

In [6]:
# Retriever over the index returning the 10 most similar chunks per query.
# The embed_model passed here uses the same model name and mode as the one
# used when the index was built.
retriever = index.as_retriever(
    similarity_top_k=10,
    use_async=True,
    embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="similarity"),
    # embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="text_search"),
)

In [7]:
from llama_index.core.data_structs import Node
from llama_index.core.schema import NodeWithScore, BaseNode, TextNode


# Alternative test queries, kept for quick switching:
# query = "fine-tune a pretrained model"
# query = "fine-tune an llm"
query = "how to fine-tune an llm?"

# nodes_context collects the nodes that will be handed to the response
# synthesizer; nodes holds the raw retrieval results for the query.
nodes_context = []
nodes = retriever.retrieve(query)

# Filter nodes with the same ref_doc_id
def filter_nodes_by_unique_doc_id(nodes):
    """Keep only the first node seen for each distinct `node.ref_doc_id`.

    Nodes whose `ref_doc_id` is None are dropped. The result preserves the
    order in which each id was first encountered.
    """
    first_by_doc = {}
    for candidate in nodes:
        source_id = candidate.node.ref_doc_id
        if source_id is None:
            continue
        # setdefault leaves an existing entry untouched, so the first node wins.
        first_by_doc.setdefault(source_id, candidate)
    return list(first_by_doc.values())


# Keep a single retrieved chunk per source document.
nodes = filter_nodes_by_unique_doc_id(nodes)
print(len(nodes))

# Build the synthesizer context: chunks flagged retrieve_doc are swapped for
# the full text of their source document, all others are passed through as is.
for node in nodes:
    print("Node ID\t", node.node_id)
    print("Title\t", node.metadata["title"])
    print("Text\t", node.text)
    print("Score\t", node.score)
    print("Metadata\t", node.metadata)
    print("-_" * 20)
    if node.metadata["retrieve_doc"]:
        print("This node will be replaced by the document")
        doc = document_dict[node.node.ref_doc_id]
        # print(doc.text)
        new_node = NodeWithScore(
            node=TextNode(
                text=doc.text,
                metadata=node.metadata,
                # Carry over the document's metadata exclusions. Without them
                # the replacement node would expose url/title/tokens metadata
                # to the LLM, unlike the chunk nodes it stands in for.
                excluded_llm_metadata_keys=doc.excluded_llm_metadata_keys,
                excluded_embed_metadata_keys=doc.excluded_embed_metadata_keys,
            ),
            score=node.score,
        )
        print(new_node.text)
        nodes_context.append(new_node)
    else:
        nodes_context.append(node)

print(len(nodes_context))

9
Node ID	 df8090eb-b13b-4f61-b94b-5489a43acfad
Title	 Generation with LLMs
Text	 That is why we have a GenerationConfig file associated with each model, which contains a good default generative parameterization and is loaded alongside your model.
Let’s talk code!
If you’re interested in basic LLM usage, our high-level Pipeline interface is a great starting point. However, LLMs often require advanced features like quantization and fine control of the token selection step, which is best done through generate() . Autoregressive generation with LLMs is also resource-intensive and should be executed on a GPU for adequate throughput.
First, you need to load the model.
Copied >>> from transformers import AutoModelForCausalLM >>> model = AutoModelForCausalLM.from_pretrained( ... "mistralai/Mistral-7B-v0.1" , device_map= "auto" , load_in_4bit= True ... )
You’ll notice two flags in the from_pretrained call:
device_map ensures the model is moved to your GPU(s) load_in_4bit applies 4-bit dynamic 

In [8]:
from llama_index.core import ChatPromptTemplate
from llama_index.core.llms import ChatMessage, MessageRole
from pydantic import BaseModel, Field

# System prompt for the QA template. The pieces below are joined by implicit
# string concatenation, so every piece must end with its own separator (a
# space or a newline); otherwise two sentences are fused together.
# "{context_str}" is a template placeholder, not an f-string field.
system_prompt = (
    "You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine-tuning models, giving 'memory' to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, Llama-Index, LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context. "
    "You are provided information found in Hugging Face's documentation and the RAG course. "
    "Only some information might be relevant to the question, so ignore the irrelevant part and use the relevant part to answer the question. "
    "Only respond with information given to you documentation. DO NOT use additional information, even if you know the answer. "
    "If the answer is somewhere in the documentation, answer the question (depending on the questions and the variety of relevant information in the documentation, give complete and helpful answers. "
    "Here is the information you can use, the order is not important: \n\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n\n"
    "REMEMBER:\n"
    "You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context. "
    "You are provided information found in Hugging Face's documentation and the RAG course. "
    "Here are the rules you must follow:\n"
    "* Only respond with information inside the documentation. DO NOT provide additional information, even if you know the answer. "
    "* If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation. Your answer needs to be pertinent and not redundant giving a clear explanation as if you were a teacher. "
    "* Only use information summarized from the documentation, do not respond otherwise. "
    "* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
    "* Do not reference any links, urls or hyperlinks in your answers.\n"
    "* Make sure to format your answers in Markdown format, including code block and snippets.\n"
    "Now answer the following question: \n"
)

# Two-message chat template: the system message carries the prompt text built
# above, and the user message is the "{query_str}" placeholder.
chat_text_qa_msgs: list[ChatMessage] = [
    ChatMessage(role=MessageRole.SYSTEM, content=system_prompt),
    ChatMessage(
        role=MessageRole.USER,
        content="{query_str}",
    ),
]

TEXT_QA_TEMPLATE = ChatPromptTemplate(chat_text_qa_msgs)

In [9]:
from IPython.display import Markdown
from llama_index.core.data_structs import Node
from llama_index.core.schema import NodeWithScore
from llama_index.core import get_response_synthesizer
from llama_index.llms.gemini import Gemini
from llama_index.llms.openai import OpenAI

# Pick the LLM used for answering; the commented lines are alternatives.
# llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=None)
llm = Gemini(model="models/gemini-1.5-pro", temperature=1, max_tokens=None)
# llm = OpenAI(temperature=1, model="gpt-3.5-turbo", max_tokens=None)
# llm = OpenAI(temperature=1, model="gpt-4o", max_tokens=None)

# Synthesizer in "simple_summarize" mode using the custom QA chat template.
response_synthesizer = get_response_synthesizer(
    llm=llm, response_mode="simple_summarize", text_qa_template=TEXT_QA_TEMPLATE
)

# Answer the query from the prepared context nodes and render it as Markdown.
response = response_synthesizer.synthesize(query, nodes=nodes_context)
# print(response.response)
display(Markdown(response.response))

# Optional: inspect the source nodes attached to the response.
# for src in response.source_nodes:
#     print(src.node.ref_doc_id)
#     print("Node ID\t", src.node_id)
#     print("Title\t", src.metadata["title"])
#     print("Text\t", src.text)
#     print("Score\t", src.score)
#     print("Metadata\t", src.metadata)
#     print("-_" * 20)

This is a very broad question. There are many ways to fine-tune a large language model, and the best approach will depend on the specific model and the desired outcome. Generally, the process involves taking a pre-trained language model and further training it on a dataset specific to the task you are interested in. For example, you can fine-tune a large language model on a dataset of code to make it better at generating code. Also, you could fine-tune it on a dataset of dialogue to make it better at generating more engaging and human-like dialogue. 

Here is an example of fine-tuning a [google/gemma-2b](https://huggingface.co/google/gemma-2b) model on the IMDB dataset using the `trl.SFTTrainer` and the AdaLomo optimizer:

```python
import torch
import datasets
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
import trl

train_dataset = datasets.load_dataset('imdb', split='train')

args = TrainingArguments(
    output_dir= "./test-lomo",
    max_steps= 1000,
    per_device_train_batch_size= 4,
    optim= "adalomo",
    gradient_checkpointing= True,
    logging_strategy= "steps",
    logging_steps= 1,
    learning_rate= 2e-6,
    save_strategy= "no",
    run_name= "lomo-imdb",
)

model_id = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage= True).to(0)

trainer = trl.SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    dataset_text_field= 'text',
    max_seq_length= 1024,
)

trainer.train()
```

This is just one example of fine-tuning a large language model. There are many other methods and techniques available. I recommend checking out the Hugging Face documentation and exploring the different options to find the best approach for your needs. 


In [None]:
# import chromadb

# # create client and a new collection
# # chromadb.EphemeralClient saves data in-memory.
# chroma_client = chromadb.PersistentClient(path="./ai-tutor-db")
# chroma_collection = chroma_client.create_collection("ai-tutor-db")

In [None]:
# from llama_index.vector_stores.chroma import ChromaVectorStore
# from llama_index.core import StorageContext

# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# # Define a storage context object using the created vector store.
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# import json
# from llama_index.core.schema import TextNode


# def load_jsonl_create_nodes(filepath):
#     nodes = []  # List to hold the created node objects
#     with open(filepath, "r") as file:
#         for line in file:
#             # Load each line as a JSON object
#             json_obj = json.loads(line)
#             # Extract required information
#             title = json_obj.get("title")
#             url = json_obj.get("url")
#             content = json_obj.get("content")
#             source = json_obj.get("source")
#             # Create a TextNode object and append to the list
#             node = TextNode(
#                 text=content,
#                 metadata={"title": title, "url": url, "source": source},
#                 excluded_embed_metadata_keys=["title", "url", "source"],
#                 excluded_llm_metadata_keys=["title", "url", "source"],
#             )
#             nodes.append(node)
#     return nodes

In [None]:
# filepath = "../combined_data.jsonl"
# nodes = load_jsonl_create_nodes(filepath)

# print(f"Loaded {len(nodes)} nodes/chunks from the JSONL file\n ")

# node = nodes[0]
# print(f"ID: {node.id_} \nText: {node.text}, \nMetadata: {node.metadata}")

# print("\n")

# node = nodes[-10000]
# print(f"ID: {node.id_} \nText: {node.text}, \nMetadata: {node.metadata}")

In [None]:
# # Create the pipeline to apply the transformation on each chunk,
# # and store the transformed text in the chroma vector store.
# pipeline = IngestionPipeline(
#     transformations=[
#         text_splitter,
#         QuestionsAnsweredExtractor(questions=3, llm=llm),
#         SummaryExtractor(summaries=["prev", "self"], llm=llm),
#         KeywordExtractor(keywords=10, llm=llm),
#         OpenAIEmbedding(),
#     ],
#     vector_store=vector_store
# )

# nodes = pipeline.run(documents=documents, show_progress=True);

In [None]:
# from llama_index.embeddings.openai import OpenAIEmbedding
# from llama_index.core import VectorStoreIndex

# # embeds = OpenAIEmbedding(model="text-embedding-3-small", mode="similarity")
# # embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="similarity")
# embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="text_search")
# # embeds = OpenAIEmbedding(model="text-embedding-ada-002", mode="similarity")

# # Build index / generate embeddings using OpenAI.
# index = VectorStoreIndex(
#     nodes=nodes,
#     show_progress=True,
#     use_async=True,
#     storage_context=storage_context,
#     embed_model=embeds,
#     insert_batch_size=3000,
# )

In [None]:
# from llama_index.llms.openai import OpenAI

# llm = OpenAI(temperature=0, model="gpt-3.5-turbo", max_tokens=None)
# query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, embed_model=embeds)

In [None]:
# res = query_engine.query("What is the LLaMa model?")

In [None]:
# res.response

In [None]:
# for src in res.source_nodes:
#     print("Node ID\t", src.node_id)
#     print("Title\t", src.metadata["title"])
#     print("Text\t", src.text)
#     print("Score\t", src.score)
#     print("Metadata\t", src.metadata)
#     print("-_" * 20)

# Load DB from disk


In [None]:
# import logging

# logger = logging.getLogger(__name__)
# logging.basicConfig(level=logging.INFO)


# import chromadb
# from llama_index.vector_stores.chroma import ChromaVectorStore

# # Create your index
# db2 = chromadb.PersistentClient(path="./ai-tutor-dataset")
# chroma_collection = db2.get_or_create_collection("ai-tutor-dataset")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [None]:
# # Create your index
# from llama_index.core import VectorStoreIndex

# index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [None]:
# from llama_index.embeddings.openai import OpenAIEmbedding
# from llama_index.llms.openai import OpenAI
# from llama_index.core.vector_stores import (
#     ExactMatchFilter,
#     MetadataFilters,
#     MetadataFilter,
#     FilterOperator,
#     FilterCondition,
# )

# filters = MetadataFilters(
#     filters=[
#         MetadataFilter(key="source", value="lanchain_course"),
#         MetadataFilter(key="source", value="langchain_docs"),
#     ],
#     condition=FilterCondition.OR,
# )

# llm = OpenAI(temperature=0, model="gpt-3.5-turbo", max_tokens=None)
# embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="text_search")
# # query_engine = index.as_query_engine(
# #     llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True, streaming=True, filters=filters
# # )
# query_engine = index.as_query_engine(
#     llm=llm,
#     similarity_top_k=5,
#     embed_model=embeds,
#     verbose=True,
# )

In [None]:
# res = query_engine.query("What is the LLama model?")

# # history = ""
# # for token in res.response_gen:
# #     history += token
# #     print(history)

In [None]:
# res.response

In [None]:
# for src in res.source_nodes:
#     print("Node ID\t", src.node_id)
#     print("Source\t", src.metadata["source"])
#     print("Title\t", src.metadata["title"])
#     print("Text\t", src.text)
#     print("Score\t", src.score)
#     print("-_" * 20)

In [None]:
# from IPython.display import Markdown, display


# # define prompt viewing function
# def display_prompt_dict(prompts_dict):
#     for k, p in prompts_dict.items():
#         text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
#         display(Markdown(text_md))
#         print(p.get_template())
#         display(Markdown("<br><br>"))

In [None]:
# prompts_dict = query_engine.get_prompts()

In [None]:
# display_prompt_dict(prompts_dict)