In [None]:
# !pip install -q openai
# !pip install -q langchain
# !pip install -q guardrails-ai
# !pip install -q faiss-cpu
# !pip install -q pypdf
# !pip install -q python-dotenv
# !pip install -q datasets
# !pip install -q huggingface_hub

In [None]:
import ast
import json
import os

from dotenv import load_dotenv

from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, AIMessage,  SystemMessage

#Guardrails
import openai
from rich import print
from langchain.output_parsers import GuardrailsOutputParser
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

## Load Environment Variables

In [None]:
# Pull the API credentials from the process environment after load_dotenv()
# has run; either value is None when its variable is not set.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
huggingface_api_key = os.getenv("HUGGINGFACE_API_KEY")

In [None]:
from huggingface_hub import notebook_login
from huggingface_hub import HfApi
from datasets import load_dataset
# Hub client built with the token read from HUGGINGFACE_API_KEY; the
# upload_file cells further down go through this object.
api = HfApi(token=huggingface_api_key)

## Loading the Document

In [None]:
# loader = PyPDFDirectoryLoader("/content/sample_data/Data/")
# loader = PyPDFDirectoryLoader("../cyber")
# Load from the ../data directory. `data` is indexed and iterated by later
# cells as a sequence of items exposing `.page_content`.
loader = PyPDFDirectoryLoader("../data")
data = loader.load()

In [None]:
# Sanity check: show the first 100 characters of the first loaded item.
print(data[0].page_content[:100])

In [None]:
file_name = "Extracted_data.txt"

# Dump the text of every loaded item into a single UTF-8 file, one write per
# item, each terminated by a newline.
with open(file_name, "w", encoding="utf-8") as out_file:
    out_file.writelines(page.page_content + "\n" for page in data)
print(f"Data has been written to {file_name}")

Chunking the Documents

In [None]:
# Step 05: split the extracted data into overlapping text chunks.
splitter_options = {"chunk_size": 1500, "chunk_overlap": 500}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
text_chunks = text_splitter.split_documents(data)
# The number reported here is the count of chunks produced by the splitter.
print("Lenth of the whole documentation is:", len(text_chunks))

## Prompt template

[INPUT TEXT]

[CONTEXT ]

[REQUEST FOR Q+A]

[RESPONSE SAMPLE]


Getting Context of the Data

In [None]:
chat = ChatOpenAI(temperature=0.6) # for synthetic data generation
model = OpenAI(temperature=0) # for parsing the output

In [None]:
# Ask the chat model for a one-line, keyword-led summary of the first loaded
# item and show the reply text.
summary_prompt = f"{data[0].page_content}give the above information give me a single line rich summary with keywords in the beginning which can be used to descibe the entiner inforamtion"
messages = [HumanMessage(content=summary_prompt)]
response = chat(messages)

print(response.content)

## Generating QnA 

- *prompt*: Provide {questions_per_chunk} question and answer pairs based on the text above. The question must begin with \n"In the context of ...\". The answer should borrow, verbatim, from the text above. In providing each question, consider that the reader does not see or have access to any of the other questions for context. Vary the style and format of the questions. Respond in plain text on a new line for each question and answer. Do not include question numbers. Here is an example of two question and answer pairs:\n\n {train_sample}

In [None]:
# File that accumulates the generated question/answer records.
dataset_name = "TaoGPT-v1.json"

# Sample reply shape embedded in the generation prompt: three independent
# placeholder question/answer pairs.
json_response_format = [
    {"question": "In the context of ...", "answer": "..."}
    for _ in range(3)
]

# Rail specification handed to GuardrailsOutputParser.from_rail_string: the
# output is a list named "data" of objects with string fields "question" and
# "answer"; the prompt receives the raw reply as ${generated_data_json}.
rail_spec = """
<rail version="0.1">
<output>
<list name="data" description="list of question answer pairs">
    <object>
        <string name="question" description="the question"/>
        <string name="answer"  description="the answer"/>
    </object>
</list>
</output>
<prompt>

Given the following list of json question and answer paids , please extract it in a proper JSON formate 
${generated_data_json}

${gr.complete_json_suffix_v2}
</prompt>
</rail>
"""
def write_to_json_file(json_data, json_file_name):
    """Append records to a JSON array file, creating the file on first use.

    When ``json_file_name`` already exists its content is loaded, every element
    of ``json_data`` is appended, and the result is written back. When it does
    not exist, ``json_data`` itself becomes the file content.

    The merged content is serialised *before* the file is reopened for writing:
    opening in ``"w"`` mode truncates immediately, so serialising afterwards
    would wipe all previously stored records whenever one new record is not
    JSON-serialisable.

    Args:
        json_data: Iterable of JSON-serialisable records to add.
        json_file_name: Path of the JSON file to create or extend.

    Returns:
        None. Failures are reported with ``print`` instead of being raised, so
        a long-running caller keeps going; on failure the existing file is left
        untouched.
    """
    try:
        try:
            with open(json_file_name, "r") as infile:
                merged = json.load(infile)
        except FileNotFoundError:
            # First write: the file will contain exactly the new records.
            merged = json_data
        else:
            merged.extend(json_data)

        # Serialise up front; a TypeError here must not cost us the old file.
        serialized = json.dumps(merged)
        with open(json_file_name, "w") as outfile:
            outfile.write(serialized)
    except Exception as e:
        print("Error in write_to_json_file", e)

In [None]:
def _parse_qa_pairs(raw_text):
    """Parse the model's reply into a non-empty list of question/answer records.

    Strict JSON is tried first. ``ast.literal_eval`` is the fallback because the
    prompt shows the expected format as a Python literal, so replies may use
    single quotes. Neither parser executes code, which matters because the text
    comes straight from a language model (the previous ``eval`` call would run
    whatever expression the reply contained).

    Raises:
        ValueError: the reply parsed but is not a non-empty list, or is not a
            valid literal.
        SyntaxError: the reply is not parseable as a Python literal.
    """
    try:
        parsed = json.loads(raw_text)
    except ValueError:
        parsed = ast.literal_eval(raw_text)
    if not isinstance(parsed, list) or not parsed:
        raise ValueError("Data generated not in the right format")
    return parsed


# For every chunk: (1) get a one-line summary to use as context, (2) ask for
# five question/answer pairs, (3) parse the reply (falling back to a
# Guardrails-driven re-extraction when it is not a usable literal) and append
# the pairs to the dataset file. Failures are reported per chunk and the loop
# moves on; `except Exception` (not a bare `except`) keeps a kernel interrupt
# able to stop the run.
total_chunks = len(text_chunks)
for chunk_number, text in enumerate(text_chunks, start=1):
    try:
        context_messages = [
            HumanMessage(
                content=f"{text.page_content}give the above information give me a single line rich summary with keywords in the beginning which can be used to descibe the entiner inforamtion"
            ),
        ]
        context_response = chat(context_messages)
        print(context_response.content)
        try:
            # Interpolate the summary text itself, not the message object's repr.
            generated_messages = [
                HumanMessage(
                    content=f"""given the context:{context_response.content} 
                    and the information {text.page_content} provide 5 question and answer pairs base on the text above , 
                    The Question must begin with 
                    "In the context of ...\".The answer borrow, verbatim, from the text above. In providing each question consider that the reader does not see of have access to any of the other questions from context. Vary the style and formate fo quesitons. 
                    Respond in only JSON following this formate
                    {json_response_format} and nothing else"""
                ),
            ]
            generated_response = chat(generated_messages)
            try:
                qa_pairs = _parse_qa_pairs(generated_response.content)
            except Exception:
                # Reply was not a usable list literal: have a second model
                # re-emit it in the structure described by rail_spec.
                output_parser = GuardrailsOutputParser.from_rail_string(rail_spec, api=openai.ChatCompletion.create)
                prompt = PromptTemplate(
                    template=output_parser.guard.base_prompt,
                    input_variables=output_parser.guard.prompt.variable_names,
                )

                model = OpenAI(temperature=0) # type: ignore
                repaired_reply = model(prompt.format_prompt(generated_data_json=generated_response.content).to_string())
                qa_pairs = output_parser.parse(repaired_reply)["data"]
            print(qa_pairs)
            write_to_json_file(qa_pairs, dataset_name)
        except Exception as generation_error:
            print("Error in generating the data", generation_error)
    except Exception as context_error:
        print("Failed to get context of the text", context_error)
    # Share of chunks processed so far (reaches 100 after the last chunk).
    completion_precentage = chunk_number / total_chunks * 100
    print(f"\n---------------------------------------------------{completion_precentage}\n")
    


## HuggingFace

In [None]:
# Load the generated JSON file with the `json` builder; the bare `dataset`
# expression on the last line displays the result as the cell output.
dataset = load_dataset('json', data_files="./TaoGPT-v1.json")
dataset

In [None]:
# dataset.push_to_hub("Dataset name")
# Publish the loaded dataset to the agency888/TaoGPT-v1 repository.
# NOTE(review): no token is passed here (unlike `api`); presumably this relies
# on an already-cached Hub login — confirm.
dataset.push_to_hub("agency888/TaoGPT-v1")

In [None]:
# Pushing Dataset.json
api.upload_file(
    path_or_fileobj="./TaoGPT-v1.json",
    path_in_repo="TaoGPT-v1.json",
    repo_id="agency888/TaoGPT-v1",
    repo_type="dataset",
)

In [None]:
# Pushing Dataset Readme
api.upload_file(
    path_or_fileobj="./README.md",
    path_in_repo="README.md",
    repo_id="agency888/TaoGPT-v1",
    repo_type="dataset",
)

Formatting Data

In [None]:
import json

# Load the generated question/answer records from the JSON file.
with open("./TaoGPT-v1.json", "r") as file:
    data = json.load(file)

formatted_file_name = "TaoGPT-v1-formatted.json"
error = 0
formatted_data = []
# Build a new record per item with three extra prompt-formatted fields. The
# loaded records are left untouched so re-running the cell starts from the
# same input.
for index, item in enumerate(data):
    try:
        question = item["question"]
        answer = item["answer"]
    except (KeyError, TypeError):
        # Record is missing a field or is not a mapping: count it, show it,
        # and leave it out of the formatted output. `index` is the record's
        # real position, which stays correct when duplicates are present.
        error += 1
        print(f"Error in formatting the data at {index} index \n\n Take a look at this: \n{item}")
        continue

    formatted_data.append(
        {
            **item,
            # Plain concatenation of question and answer
            "text": f"{question} {answer}",
            # Instruction-style template with explicit question/answer markers
            "text_finetuning": f"Here is a question based on Taoscience. ### Question : {question} answer in detail ### Answer : <s> {answer} </s>",
            # [INST]-style template
            "text_mistral": f"<s>[INST] {question}[/INST]{answer}</s>",
        }
    )

# Save the formatted records to a separate JSON file.
with open(formatted_file_name, "w") as file:
    json.dump(formatted_data, file, indent=4)

print(f"Formatted data has been saved to {formatted_file_name}")
print("Number of errors:", error)


In [None]:
# Rebind `dataset` to the formatted file (replacing the earlier raw dataset
# object); the bare `dataset` expression displays it as the cell output.
dataset = load_dataset('json', data_files="./TaoGPT-v1-formatted.json")
dataset

In [None]:
# dataset.push_to_hub("Dataset name")
# Publish the formatted dataset to the same repository id used for the raw
# dataset earlier in the notebook.
# NOTE(review): presumably this replaces the previously pushed data — confirm
# that is intended.
dataset.push_to_hub("agency888/TaoGPT-v1")

In [None]:
# Pushing Dataset.json
api.upload_file(
    path_or_fileobj="./TaoGPT-v1-formatted.json",
    path_in_repo="TaoGPT-v1-formatted.json",
    repo_id="agency888/TaoGPT-v1",
    repo_type="dataset",
)