In [None]:
# !pip install -q "openai>=1.0.0"  # the notebook uses the v1 client (`from openai import OpenAI`), which 0.28 does not provide
# !pip install -q langchain
# !pip install -q guardrails-ai
# !pip install -q faiss-cpu
# !pip install -q pypdf
# !pip install -q python-dotenv
# !pip install -q datasets
# !pip install -q huggingface_hub
# !pip install tqdm

In [1]:
import os
from dotenv import load_dotenv
import json

from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, AIMessage,  SystemMessage
from langchain.document_loaders import YoutubeLoader


from rich import print

#Guardrails
# from langchain.output_parsers import GuardrailsOutputParser
# from langchain.prompts import PromptTemplate
# from langchain.llms import OpenAI

## Name of Dataset

In [2]:
# Base name of the generated dataset file: save_to_jsonl appends the
# question/answer pairs to ../data/<dataset_name>.jsonl.
dataset_name = "Dataset name"

## Load Environment Variables

In [3]:
import os

from dotenv import load_dotenv
from openai import OpenAI

# Load the variables from a local .env file *before* building the client.
# The client takes its key from the environment by default (see the note in
# the constructor call below), so constructing it first fails on a fresh
# kernel whenever OPENAI_API_KEY is defined only in .env.
load_dotenv()

# Kept as a module-level name for cells that want to pass the key explicitly.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Single shared client instance; process_text sends every request through it.
client = OpenAI(
    # This is the default and can be omitted
    # api_key = openai_api_key,
)

In [4]:
# Hugging Face access token taken from the environment; this is None when
# HUGGINGFACE_API_KEY is not set, because .get() is used without a default.
huggingface_api_key = os.environ.get("HUGGINGFACE_API_KEY")

In [5]:
from huggingface_hub import HfApi
from datasets import load_dataset
# Hub client built with the token read above; the last cells of the notebook
# use it to upload individual files to the dataset repository.
api = HfApi(token=huggingface_api_key)

  from .autonotebook import tqdm as notebook_tqdm


## Loading the Document

In [None]:
# Alternative source directories (disabled):
# loader = PyPDFDirectoryLoader("/content/sample_data/Data/")
# loader = PyPDFDirectoryLoader("../cyber")

# Load the PDFs found in ../data. Later cells index `data`, append to it and
# read `.page_content` from each element, so it is treated as a list of
# document objects.
loader = PyPDFDirectoryLoader("../data")
data = loader.load()

In [None]:
# Sanity check: show the first loaded document.
# NOTE(review): data[0] presumably raises IndexError when no document was
# loaded - confirm ../data is populated before running.
print(data[0])

# Number of documents loaded so far.
print(len(data))

## Loading Transcript from Youtube

In [None]:
# YouTube video URLs whose transcripts should be added to `data` by the next
# cell; leave the list empty to use only the PDF documents.
list_of_urls = []

In [None]:
from rich import print

# Fetch the transcript of every URL in list_of_urls and add the resulting
# documents to the ones loaded from the PDFs.
# NOTE(review): this cell mutates `data` in place, so running it twice adds
# every transcript twice - re-run the PDF loading cell before re-running it.
for url in list_of_urls:
    loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
    youtube_data = loader.load()
    if not youtube_data:
        # The loader can come back empty (e.g. no transcript available);
        # indexing [0] would then abort the whole loop with an IndexError.
        print(f"No transcript returned for {url}; skipping.")
        continue
    # extend() keeps every document the loader produced, not only the first.
    data.extend(youtube_data)
print(len(data))

In [None]:
file_name = "Extracted_data.txt"

# Dump the text of every loaded document into one UTF-8 file, each document's
# page_content followed by a newline; the file is overwritten on every run.
with open(file_name, "w", encoding="utf-8") as file:
    file.writelines(item.page_content + "\n" for item in data)
print(f"Data has been written to {file_name}")

## Chunking the Documents

In [None]:
# Step 05: Split the extracted data into text chunks.
# Every element of `text_chunks` is later submitted to process_text as one
# unit, so chunk_size bounds how much text each prompt carries and
# chunk_overlap repeats part of the previous chunk at the start of the next.
# NOTE(review): both sizes are presumably measured in characters - confirm
# against the splitter's documentation.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=500)
text_chunks = text_splitter.split_documents(data)
# The number printed here is the count of chunks, i.e. the number of
# process_text calls the generation cell will make.
print("Lenth of the whole documentation is:",len(text_chunks))

## Prompt template

[INPUT TEXT]

[CONTEXT ]

[REQUEST FOR Q+A]

[RESPONSE SAMPLE]


Getting Context of the Data

## Generating QnA 

- *prompt*: Provide {questions_per_chunk} question and answer pairs based on the text above. The question must begin with "In the context of ...". The answer must borrow, verbatim, from the text above. In providing each question, consider that the reader does not see or have access to any of the other questions for context. Vary the style and format of the questions. Respond in plain text on a new line for each question and answer. Do not include question numbers. Here is an example of two question and answer pairs:\n\n {train_sample}

In [None]:
# Example of the reply structure that is interpolated into the system prompt:
# three identical placeholder question/answer entries.
json_response_format = [
    {"question": "In the context of ...", "answer": "..."}
    for _ in range(3)
]

import json

import threading

# process_text is executed from a thread pool, so several threads append to
# the same JSONL file at once; this lock keeps each batch of lines contiguous
# and prevents partially written lines from interleaving.
_JSONL_WRITE_LOCK = threading.Lock()


def save_to_jsonl(dataset_name, question_answer_list):
    """
    Append question-answer pairs to ../data/<dataset_name>.jsonl.

    Each entry is written as one JSON object per line with exactly three keys:
    "question", "answer" and "content". Any other key on the incoming
    dictionaries (for example "metadata") is not written.

    The whole batch is serialised before the file is touched, so a malformed
    entry leaves the file unchanged instead of half-written. The target
    directory is created when it does not exist yet, and the append itself is
    guarded by a module-level lock so concurrent callers cannot interleave
    their lines.

    Parameters:
        - dataset_name (str): The name of the dataset; used as the file stem.
        - question_answer_list (list): Dictionaries with a "question" and an
          "answer" key and an optional "content" key (defaults to "").

    Raises:
        KeyError: If an entry lacks "question" or "answer".
    """
    file_name = f"../data/{dataset_name}.jsonl"

    # Serialise first: a KeyError here aborts before anything is written.
    lines = [
        json.dumps(
            {
                "question": qa["question"],
                "answer": qa["answer"],
                "content": qa.get("content", ""),
            },
            ensure_ascii=False,
        )
        + "\n"
        for qa in question_answer_list
    ]

    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    with _JSONL_WRITE_LOCK:
        with open(file_name, "a", encoding="utf-8") as file:
            file.writelines(lines)

In [None]:
def _request_qa_pairs(text, user_prompt, min_pairs, label, max_attempts):
    """Ask the chat model for question/answer pairs and validate the reply.

    Parameters:
        - text: The chunk being processed; its ``page_content`` and
          ``metadata`` are attached to every accepted pair.
        - user_prompt (str): Fully formatted user message for this request.
        - min_pairs (int): Minimum number of pairs for a reply to be accepted.
        - label (str): "short" or "long"; only used in the log messages.
        - max_attempts (int): Total number of requests that may be sent
          (values below 1 are treated as 1).

    Returns:
        list: The validated question/answer dictionaries, or an empty list
        when no attempt produced a usable reply.
    """
    system_prompt = f"""You are an expert at qurating/generating questions and answers from a given piece of text.
                            The questions and answers you generate are unique from one another and are not repeated.
                            You always respond in the following json format ```question_answer:{json_response_format}```"""

    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-1106",
            temperature=0.3,
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )

        # The reply is JSON, so parse it as JSON. eval() would execute
        # model-generated text and fails on the JSON literals true/false/null.
        try:
            parsed = json.loads(response.choices[0].message.content)
        except (TypeError, ValueError) as e:
            print(f"Error in {label}_response_output", e)
            parsed = None

        qa_pairs = parsed.get("question_answer") if isinstance(parsed, dict) else None

        # Check if the format is correct and meets the criteria
        if (
            isinstance(qa_pairs, list)
            and len(qa_pairs) >= min_pairs
            and all(
                isinstance(qa, dict) and "question" in qa and "answer" in qa
                for qa in qa_pairs
            )
        ):
            # Add the source text and its metadata to each question-answer pair
            for qa in qa_pairs:
                qa["content"] = f"{text.page_content}"
                qa["metadata"] = text.metadata
            return qa_pairs

        if attempt < attempts:
            print(f"{label.capitalize()} response format is incorrect. Running the query again.")
        else:
            print(
                f"{label.capitalize()} response format is incorrect. "
                f"Giving up on this chunk after {attempts} attempt(s)."
            )

    return []


def process_text(text, max_attempts=2):
    """Generate short and long question/answer pairs for one chunk and save them.

    Two requests are made per chunk: a "short" one asking for 10 pairs (at
    least 9 must come back) and a "long" one asking for 5 pairs (at least 4
    must come back). A reply that cannot be parsed or fails validation is
    retried up to ``max_attempts`` times in total; if it still fails, that
    part is skipped instead of crashing the call. Each validated batch is
    appended to the dataset file via ``save_to_jsonl`` as soon as it is
    available, so a failure of the long request does not discard the short
    results.

    Parameters:
        - text: A chunk exposing ``page_content`` and ``metadata``.
        - max_attempts (int): Total requests allowed per batch (default 2).
    """
    # Short Response
    # you should add the summary and modify the prompt to your liking
    short_prompt = f"""given the context which is about *summary of the document* 
                \n
                {text.page_content} 
                \n
                provide 10 question and answer pairs base on the text above , 
                The Question must begin with "In the context of ...\".The answer borrow, verbatim, from the text above. 
                In providing each question consider that the reader does not see or have access to any of the other questions from context. 
                Vary the style and format of questions. Let the answers be descriptive around 100 to 200 words
                """
    short_pairs = _request_qa_pairs(text, short_prompt, 9, "short", max_attempts)
    if short_pairs:
        save_to_jsonl(dataset_name, short_pairs)

    # Long Response
    # NOTE(review): this prompt hard-codes a specific document description,
    # while the short prompt above still carries a placeholder summary; adapt
    # both to the documents actually being processed.
    long_prompt = f"""given the context which is about Tao Science which is written by "Rulin Xiu" and "Zhi Gang Sha"
                \n
                {text.page_content} 
                \n
                provide 5 question and answer pairs base on the text above , 
                The Question must begin with "In the context of...\".The answer borrow, verbatim, from the text above. 
                In providing each question consider that the reader does not see or have access to any of the other questions from context. 
                Vary the style and format of questions. Let the answers be descriptive and lengthy.
                The answer should at least be 1000 words
                """
    long_pairs = _request_qa_pairs(text, long_prompt, 4, "long", max_attempts)
    if long_pairs:
        save_to_jsonl(dataset_name, long_pairs)

## Let's start synthetically generating the prompts

In [None]:
import concurrent.futures
from tqdm import tqdm

# Run process_text for every chunk on a thread pool and show a progress bar.
# Each future's result is retrieved so that an exception raised inside a
# worker is reported instead of disappearing with the discarded future.
failed_chunks = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit each text for processing
    futures = [executor.submit(process_text, text) for text in text_chunks]
    chunk_index_by_future = {future: index for index, future in enumerate(futures)}

    # Use tqdm to create a progress bar
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing"):
        try:
            future.result()
        except Exception as e:
            # Collect instead of printing here so the progress bar stays intact.
            failed_chunks.append((chunk_index_by_future[future], e))

if failed_chunks:
    print(f"{len(failed_chunks)} of {len(futures)} chunks failed:")
    for chunk_index, error in sorted(failed_chunks, key=lambda item: item[0]):
        print(f"  chunk {chunk_index}: {error!r}")


## HuggingFace

In [19]:
# Load the generated JSONL file as a dataset object for publishing.
# NOTE(review): this path is hard-coded and differs from the
# ../data/<dataset_name>.jsonl file that save_to_jsonl writes; presumably it
# is a post-processed copy produced outside the visible cells - confirm.
dataset = load_dataset('json', data_files="./stringified_TaoGPT-v2.jsonl")
# Bare expression so the notebook displays the dataset summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'metadata', 'content'],
        num_rows: 6569
    })
})

In [20]:
# Publish the loaded dataset to the Hugging Face Hub. The argument is the
# target repository id; replace the placeholder with your own
# "<account>/<dataset name>".
# dataset.push_to_hub("Dataset name")
dataset.push_to_hub("account/datastname")

Creating parquet from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 143.84ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.60s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/agency888/TaoGPT-v2/commit/fd5e95ec764d1e04b8eacfc1d42bc9217d2956a3', commit_message='Upload dataset', commit_description='', oid='fd5e95ec764d1e04b8eacfc1d42bc9217d2956a3', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Pushing Dataset.json
api.upload_file(
    path_or_fileobj="./dataset.json",
    path_in_repo="dataset.json",
    repo_id="account/datastname",
    repo_type="dataset",
)

In [None]:
# Pushing Dataset Readme
api.upload_file(
    path_or_fileobj="./README.md",
    path_in_repo="README.md",
    repo_id="account/datastname",
    repo_type="dataset",
)