# LlamaIndex Dataset Creation

In [93]:
from pymupdf4llm import LlamaMarkdownReader
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from openai import OpenAI
from llm_finetuning.llm import LLM
from llm_finetuning.config import RAW_DATA_DIR, EMBEDDING_METHOD

# Build the PDF reader (LlamaMarkdownReader is imported from pymupdf4llm above).
llama_reader = LlamaMarkdownReader()

# Read "sample.pdf" from the project's RAW_DATA_DIR.
# NOTE(review): presumably returns a list of document objects exposing `.text`
# (the LLM class defined later in this notebook reads `chunk.text`) — confirm.
indexed_chunks = llama_reader.load_data(RAW_DATA_DIR / "sample.pdf")

# OpenAI-compatible client pointed at a fixed host/port.
# NOTE(review): the api_key value "lm-studio" suggests a local LM Studio server — confirm.
# The same base_url is repeated in a later cell; a shared config constant would avoid drift.
client = OpenAI(base_url="http://172.29.3.249:1234/v1", api_key="lm-studio")

# Wrap the client in the LLM helper imported from llm_finetuning.llm.
# NOTE(review): the LLM class defined later in this notebook does not accept
# `embedding_method`; this call only works with the imported project class.
llm = LLM(client=client, embedding_method=EMBEDDING_METHOD)

# Run dataset generation over the loaded chunks.
response = llm.generate_dataset(indexed_chunks)

# TODO: save the dataset to a file — no code follows here, so `response` is never written by this cell.

ModuleNotFoundError: No module named 'langchain_chroma'

# LLM Finetuning Dataset Creation

In [91]:
from openai import OpenAI


class LLM:
    """Helper around an OpenAI-compatible chat client for building a Q&A dataset.

    The class sends each text chunk to a chat-completions endpoint to obtain
    questions and answers, asks the model a second time to convert that text to
    JSON, and can also answer a free-form prompt against the first chunk.

    Attributes:
        client: Object exposing ``chat.completions.create(...)``.
        n_questions: Number of Q&A pairs requested per chunk.
        model: Model identifier sent with every request.
    """

    # Model identifier used when the caller does not pass ``model``.
    DEFAULT_MODEL = "bartowski/Llama-3-8B-Instruct-Gradient-1048k-GGUF"
    # End-of-sequence marker that may be attached to the generated text.
    EOS_TOKEN = "</s>"

    def __init__(
        self,
        client: "OpenAI",
        n_questions: int = 3,
        model: str = DEFAULT_MODEL,
    ):
        """Store the client and generation settings.

        Args:
            client: OpenAI-compatible client used for all requests. The
                annotation is a string so the class can be defined even when
                the ``openai`` package is not importable at definition time.
            n_questions: How many Q&A pairs to request per chunk.
            model: Model identifier; defaults to ``DEFAULT_MODEL``.
        """
        self.client = client
        self.n_questions = n_questions
        self.model = model

    @staticmethod
    def _strip_eos(text, token: str = "</s>") -> str:
        """Remove exact leading/trailing ``token`` occurrences from ``text``.

        ``str.strip("</s>")`` treats its argument as a *set of characters* and
        would also eat legitimate leading/trailing ``<``, ``/``, ``s`` and ``>``
        (e.g. ``"sales numbers"`` -> ``"ales number"``), so the token is matched
        as a whole string instead. ``None`` is mapped to an empty string.
        """
        if text is None:
            return ""
        if not token:
            return text
        while text.startswith(token):
            text = text[len(token):]
        while text.endswith(token):
            text = text[: -len(token)]
        return text

    def _chat(self, messages, temperature: float, response_type: str) -> str:
        """Send one chat-completion request and return the first choice's text."""
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=temperature,
            response_format={"type": response_type},
        )
        return self._strip_eos(completion.choices[0].message.content, self.EOS_TOKEN)

    def generate_questions_and_answers(self, indexed_chunks):
        """Ask the model for ``n_questions`` Q&A pairs for every chunk.

        Args:
            indexed_chunks: Iterable of objects exposing a ``text`` attribute.

        Returns:
            list[str]: One raw model response per processed chunk. If the loop
            is interrupted with ``KeyboardInterrupt`` the responses collected
            so far are returned instead of propagating the interrupt.
        """
        import random

        responses = []

        try:
            for chunk in indexed_chunks:
                # Select a random temperature between 0.0 and 0.2
                temperature = round(random.uniform(0.0, 0.2), 2)

                messages = [
                    {
                        "role": "system",
                        "content": "You are a helpful assistant that can generate questions and answers from the provided text.",
                    },
                    {
                        "role": "user",
                        "content": f"Generate {self.n_questions} questions and answers from the provided text. Output should only include the questions and answers. There must not be any other text, dash, line, or output.",
                    },
                    {"role": "user", "content": chunk.text},
                ]
                responses.append(self._chat(messages, temperature, "json_object"))
        except KeyboardInterrupt:
            # Deliberate best-effort: keep whatever was generated before the interrupt.
            pass

        return responses

    def convert_to_json(self, questions_and_answers):
        """Ask the model to convert each raw response to JSON and parse it.

        Args:
            questions_and_answers: Iterable of raw response strings.

        Returns:
            list: One entry per input; the parsed JSON value, or an empty list
            when the model output is not valid JSON.
        """
        import json

        responses = []

        for raw_text in questions_and_answers:
            messages = [
                {
                    "role": "system",
                    "content": "You are a helpful assistant that convert the provided text to valid JSON.",
                },
                {
                    "role": "user",
                    "content": "Convert the provided text to valid JSON. Output should only be the valid JSON. For example: {[{'question': question1, 'answer': answer1}, {'question': question2, 'answer': answer2}, {'question': question3, 'answer': answer3}]}. There must not be any other text, dash, line, or output.",
                },
                {"role": "user", "content": raw_text},
            ]
            response = self._chat(messages, 0.0, "json_object")

            # If the response is not a valid JSON document, fall back to an empty list
            try:
                response_json = json.loads(response)
            except json.JSONDecodeError:
                response_json = []

            responses.append(response_json)

        return responses

    def generate_dataset(self, indexed_chunks):
        """Generate Q&A text for every chunk, then convert each result to JSON.

        Returns:
            list: Output of ``convert_to_json`` applied to the output of
            ``generate_questions_and_answers``.
        """
        raw_responses = self.generate_questions_and_answers(indexed_chunks)
        return self.convert_to_json(raw_responses)

    def query_dataset(self, prompt: str, indexed_chunks):
        """Answer ``prompt`` using the text of the first chunk as the dataset.

        Args:
            prompt: The user's question.
            indexed_chunks: Sequence of objects exposing ``text``; only element
                0 is sent to the model.

        Returns:
            str: The model's plain-text answer.

        Raises:
            IndexError: If ``indexed_chunks`` is empty.
        """
        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant that answer the question based on the provided dataset.",
            },
            {
                "role": "user",
                "content": f"Dataset: {indexed_chunks[0].text}",
            },
            {"role": "user", "content": prompt},
        ]
        return self._chat(messages, 0.0, "text")

In [92]:
import json

# OpenAI-compatible client pointed at a fixed host/port (same URL as the first cell).
# NOTE(review): the api_key value "lm-studio" suggests a local LM Studio server — confirm.
client = OpenAI(base_url="http://172.29.3.249:1234/v1", api_key="lm-studio")

# Instantiate the LLM class with its default settings.
llm = LLM(client=client)

# Generate the dataset.
# NOTE(review): `indexed_chunks` is not defined in this cell; it must already exist
# in the kernel from another cell, so this cell fails on a fresh kernel if run first.
response = llm.generate_dataset(indexed_chunks)

# Write the generated dataset as JSON, overwriting any existing file.
# The path is relative to the notebook's working directory; the target directory must already exist.
with open("../data/processed/finetuning_dataset.json", "w") as f:
    json.dump(response, f)

# RAG (Retrieval-Augmented Generation)

In [None]:
from pymupdf4llm import LlamaMarkdownReader

# Build the PDF reader (rebinds `llama_reader` from the first cell).
llama_reader = LlamaMarkdownReader()

# Read the sample PDF and rebind `indexed_chunks`.
# NOTE(review): this uses a literal relative path, while the first cell builds the
# path from RAW_DATA_DIR — confirm both point at the same file.
indexed_chunks = llama_reader.load_data("../data/raw/sample.pdf")