In [76]:
!pip install datasets -q


In [77]:
import os
import json
from datasets import Dataset, DatasetDict
import yaml
from tqdm import tqdm as tqdm_bar

In [78]:
def process_folder(folder_path,filename , prompts):
    """Collect table / prompt / label columns from the JSON files under a folder.

    Every entry of ``folder_path`` is treated as a sub-folder that may contain
    a file called ``filename``. Each such file is parsed as JSON and read
    through its ``"Data"`` and ``"Insights"`` keys. Only the first element of
    ``"Insights"`` is used: each of its values becomes one row, paired by
    position with the prompt template at the same index in ``prompts``.

    Args:
        folder_path: Directory whose entries are scanned.
        filename: Name of the JSON file looked up inside each entry.
        prompts: Sequence of prompt strings; the literal ``{table_data}``
            marker in each one is replaced with ``str`` of the table.

    Returns:
        A dict with the keys ``"table"``, ``"prompt"`` and ``"label"``, each
        mapping to a list with one element per generated row.

    Raises:
        IndexError: If a file holds more insights than there are prompts.
    """
    dataset = {"table": [], "prompt": [], "label": []}

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible from one run to the next.
    for foldername in tqdm_bar(sorted(os.listdir(folder_path)), desc=filename):
        file_path = os.path.join(folder_path, foldername, filename)
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as json_file:
            file_content = json.load(json_file)

        # Extract data and insights
        table_data = file_content.get("Data", [])
        insights = file_content.get("Insights", [])
        if not insights:
            # Nothing to label for this table; skipping avoids indexing an
            # empty list below.
            continue

        # The table text is identical for every insight of this file.
        table_text = str(table_data)

        # Repeat the table for each insight
        for idx, insight in enumerate(insights[0].values()):
            if idx >= len(prompts):
                raise IndexError(
                    f"{file_path} has more insights than the {len(prompts)} prompts provided"
                )
            dataset["table"].append(table_data)
            dataset["prompt"].append(prompts[idx].replace("{table_data}", table_text))
            dataset["label"].append(insight)

    return dataset

In [79]:
def create_and_save_dataset(input_folder, output_file , prompts):
    """Build a train/validation ``DatasetDict`` from the processed data folders.

    Reads ``Train.json`` files under ``<input_folder>/Train_data`` and
    ``Test.json`` files under ``<input_folder>/Test_data`` via
    ``process_folder``. The rows are merged into a single ``Dataset`` and then
    cut back into two splits at the train/test boundary.

    Args:
        input_folder: Directory containing the ``Train_data`` and ``Test_data``
            sub-folders.
        output_file: Accepted but not used by this function; nothing is
            written to disk here.
        prompts: Prompt templates forwarded to ``process_folder``.

    Returns:
        A ``DatasetDict`` with a ``"train"`` split (rows from ``Train_data``)
        and a ``"validation"`` split (rows from ``Test_data``).
    """
    train_columns = process_folder(os.path.join(input_folder, "Train_data"), "Train.json", prompts)
    test_columns = process_folder(os.path.join(input_folder, "Test_data"), "Test.json", prompts)

    # Train rows come first in the merged data, so their count is the boundary.
    n_train = len(train_columns["table"])

    # Concatenate the two sources column by column.
    merged_columns = {
        column: train_columns[column] + test_columns[column]
        for column in ("table", "prompt", "label")
    }

    # One Dataset is built from all rows before it is split again.
    merged = Dataset.from_dict(merged_columns)

    train_rows = list(range(0, n_train))
    validation_rows = list(range(n_train, len(merged)))

    return DatasetDict({
        "train": merged.select(indices=train_rows),
        "validation": merged.select(indices=validation_rows),
    })

In [80]:
# Path of the data dictionary that describes each table column
dict_file_path = "/content/drive/MyDrive/Colab Notebooks/Table to insights/Config/data_dictionary.yaml"

# Read the YAML file
with open(dict_file_path, 'r', encoding='utf-8') as yaml_file:
    yaml_content = yaml.safe_load(yaml_file)

# Extract the data under the "columns" key. safe_load returns None for an
# empty document and the key itself may be null, so both cases fall back to
# an empty list instead of raising.
extracted_data = (yaml_content or {}).get("columns") or []

# Print the extracted data, one entry per line
for column_entry in extracted_data:
    print(column_entry)

{'name': 'cpu_usage', 'description': 'CPU usage in %'}
{'name': 'ram_usage', 'description': 'RAM usage in %'}
{'name': 'diskio_usage', 'description': 'Disk usage in %'}
{'name': 'date', 'description': 'Date on which the machine logs are recorded'}


In [81]:
# One query per insight to be generated; each query becomes its own prompt.
query1 = "What does the data contains and say about. Respond within 2 Lines"
query2 = "What are the top 3 most important insights"
query3 = "What are the top 3 abberations present in the dataset"

# Shared prompt layout: instruction, table, data dictionary, then the query.
prompt_template = (
    "Answer the below query based on the following table , there is a Data Dictionary provided to give context for each column\n "
    "\n"
    "Table:\n"
    "{table_data}"
    "\n"
    "Data Dictionary:\n"
    "{data_dictionary}\n"
    "\n"
    "Query:\n"
    "{query}"
)

# Fill in the data dictionary and the query now. The table slot is formatted
# with the literal text "{table_data}" so the marker survives in each prompt
# and can be replaced with an actual table afterwards.
prompts = [
    prompt_template.format(table_data="{table_data}", data_dictionary=extracted_data, query=query)
    for query in (query1, query2, query3)
]

# Process both Train_data and Test_data folders
analytical_dataset = create_and_save_dataset(
    "/content/drive/MyDrive/Colab Notebooks/Table to insights/Data/Processed_Data",
    "/content/drive/MyDrive/Colab Notebooks/Table to insights/Data/Analytical Datset/Hf_compatible_master_data",
    prompts,
)

Train.json: 100%|██████████| 172/172 [00:00<00:00, 484.21it/s]
Test.json: 100%|██████████| 43/43 [00:00<00:00, 461.65it/s]


In [82]:
# Display the DatasetDict summary (splits, feature names and row counts)
analytical_dataset

DatasetDict({
    train: Dataset({
        features: ['table', 'prompt', 'label'],
        num_rows: 516
    })
    validation: Dataset({
        features: ['table', 'prompt', 'label'],
        num_rows: 129
    })
})

In [83]:
# Show the first training row: its full prompt followed by its label
train_example = analytical_dataset['train'][0]
print(f"The example for a prompt is:\n{train_example['prompt']}\n{'-'*100}\nThe response generated is:\n{train_example['label']}")

The example for a prompt is:
Answer the below query based on the following table , there is a Data Dictionary provided to give context for each column
 
Table:
[{'date': '2022-08-02', 'cpu_usage': 7.8451691553, 'ram_usage': 52.6620808196, 'diskio_usage': None}, {'date': '2022-08-03', 'cpu_usage': 5.0294155659, 'ram_usage': 53.5197459169, 'diskio_usage': None}, {'date': '2022-08-04', 'cpu_usage': 4.8550193826, 'ram_usage': 53.6561216662, 'diskio_usage': None}, {'date': '2022-08-05', 'cpu_usage': 8.0148437177, 'ram_usage': 55.2451243255, 'diskio_usage': None}, {'date': '2022-08-07', 'cpu_usage': 16.9099194876, 'ram_usage': 54.216115395, 'diskio_usage': None}, {'date': '2022-08-08', 'cpu_usage': 6.9691407072, 'ram_usage': 59.4594977502, 'diskio_usage': None}, {'date': '2022-08-09', 'cpu_usage': 6.9463629233, 'ram_usage': 62.009759781, 'diskio_usage': None}, {'date': '2022-08-10', 'cpu_usage': 6.9103194256, 'ram_usage': 62.3286668314, 'diskio_usage': None}, {'date': '2022-08-11', 'cpu_usag

In [84]:
# Show the first validation row: its full prompt followed by its label
validation_example = analytical_dataset['validation'][0]
print(f"The example for a prompt is:\n{validation_example['prompt']}\n{'-'*100}\nThe response generated is:\n{validation_example['label']}")

The example for a prompt is:
Answer the below query based on the following table , there is a Data Dictionary provided to give context for each column
 
Table:
[{'date': '2022-05-24', 'cpu_usage': 14.41051994, 'ram_usage': None, 'diskio_usage': None}, {'date': '2022-05-27', 'cpu_usage': 15.08522954, 'ram_usage': None, 'diskio_usage': None}, {'date': '2022-05-30', 'cpu_usage': 16.92334479, 'ram_usage': None, 'diskio_usage': None}, {'date': '2022-06-02', 'cpu_usage': 17.34803182, 'ram_usage': None, 'diskio_usage': None}, {'date': '2022-06-05', 'cpu_usage': 18.77696087, 'ram_usage': None, 'diskio_usage': None}, {'date': '2022-06-08', 'cpu_usage': 21.13115076, 'ram_usage': None, 'diskio_usage': None}, {'date': '2022-06-11', 'cpu_usage': 22.68988893, 'ram_usage': None, 'diskio_usage': None}, {'date': '2022-06-14', 'cpu_usage': 22.47543662, 'ram_usage': None, 'diskio_usage': None}, {'date': '2022-06-17', 'cpu_usage': 25.33039605, 'ram_usage': None, 'diskio_usage': None}, {'date': '2022-06-20

In [85]:
# Write the DatasetDict (train and validation splits) to the directory below
analytical_dataset.save_to_disk("/content/drive/MyDrive/Colab Notebooks/Table to insights/Data/Analytical Datset")


Saving the dataset (0/1 shards):   0%|          | 0/516 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/129 [00:00<?, ? examples/s]