In [1]:
"""This example shows how to automatically build an 'instruct' dataset from prompt state history"""

"This example shows how to automatically build an 'instruct' dataset from prompt state history"

In [3]:
import json
import os
from llmware.prompts import Prompt
from llmware.dataset_tools import Datasets
from llmware.configs import LLMWareConfig

In [5]:
# Use prompt history to easily create model-ready fine-tuning datasets
def _build_and_show_dataset(datasets, prompt_wrapper, label):
    """Build one dataset from the prompt history and print it with a sample.

    Args:
        datasets: object exposing ``build_gen_ds_from_prompt_history``,
            ``get_dataset_sample`` and ``current_ds_name`` (a llmware
            ``Datasets`` instance in this example).
        prompt_wrapper: wrapper name passed through to
            ``build_gen_ds_from_prompt_history`` (e.g. ``"alpaca"``).
        label: human-readable format name used only in the progress message.
    """
    print(f"\n > Creating a dataset from prompt history in {label} format...")
    dataset = datasets.build_gen_ds_from_prompt_history(prompt_wrapper=prompt_wrapper)
    print(f"\nThe dataset dict:\n{json.dumps(dataset, indent=2)}")

    # current_ds_name is read only after the build call, so the sample is
    # requested for whatever name the Datasets object reports at that point.
    sample = datasets.get_dataset_sample(datasets.current_ds_name)
    print(f"\nRandom sample from the dataset:\n{json.dumps(sample, indent=2)}")


def create_datasets_from_prompt_history(model_name):
    """Run a few prompts, save the prompt state, and build datasets from it.

    Three questions are asked against a fixed context passage using a
    ``Prompt`` created with ``save_state=True``. The resulting interaction
    history is printed and saved, and then one dataset per wrapper
    ("alpaca", "chat_gpt", "human_bot") is built and printed together with a
    sample.

    Args:
        model_name: model identifier handed to ``Prompt.load_model``.

    Returns:
        Always ``0``.
    """
    context = "Joe Biden is the 46th President of the United States.  He was born in Scranton, " \
              "Pennsylvania.  He served as Vice President from 2008 through 2016."

    questions = (
        "Who was the 46th president?",
        "What year did Joe Biden start as vice president?",
        "Who is Joe Biden?",
    )

    # (prompt_wrapper passed to llmware, label shown in the progress message)
    wrapper_formats = (
        ("alpaca", "ALPACA"),
        ("chat_gpt", "CHAT GPT"),
        ("human_bot", "HUMAN BOT"),
    )

    # Create a Prompt that keeps its state so it can be saved below
    prompter = Prompt(save_state=True)
    prompter.load_model(model_name)

    # Perform several prompts
    print("\n > Performing several prompts to populate the prompt state...")

    for question in questions:
        # The responses themselves are not needed here; only the history is used.
        # NOTE(review): register_trx=True presumably records each call in the
        # prompt's interaction history - confirm against the llmware docs.
        prompter.prompt_main(question, context=context, register_trx=True)

    for i, entries in enumerate(prompter.interaction_history):
        print("update: interaction prompt history created: ", i, entries)

    prompter.save_state()

    # Create a Datasets object
    # NOTE(review): 0.0 testing/validation splits presumably keep every sample
    # in the training portion - confirm.
    datasets = Datasets(testing_split=0.0, validation_split=0.0)

    for prompt_wrapper, label in wrapper_formats:
        _build_and_show_dataset(datasets, prompt_wrapper, label)

    return 0

In [7]:
# Entry point for the example: configures llmware and runs the dataset-building
# function defined in the previous cell.
if __name__ == "__main__":

    # Select "sqlite" as llmware's active database before any prompt state is used.
    # NOTE(review): presumably chosen so the example needs no separate database
    # server - confirm against the llmware configuration docs.
    LLMWareConfig().set_active_db("sqlite")

    # Model identifier handed to Prompt.load_model inside the example function.
    model_name = "llmware/bling-1b-0.1"

    create_datasets_from_prompt_history(model_name)




 > Performing several prompts to populate the prompt state...
update: interaction prompt history created:  0 {'llm_response': ' Joe Biden', 'prompt': 'Who was the 46th president?', 'evidence': 'Joe Biden is the 46th President of the United States.  He was born in Scranton, Pennsylvania.  He served as Vice President from 2008 through 2016.', 'instruction': 'default_with_context', 'model': 'llmware/bling-1b-0.1', 'usage': {'input': 49, 'output': 3, 'total': 52, 'metric': 'tokens', 'processing_time': 1.197199821472168}, 'time_stamp': '2024-05-15_212742', 'calling_app_ID': '', 'rating': '', 'account_name': 'llmware', 'prompt_id': '0057a3c0-3609-4d9e-b833-1ad2545ec720', 'batch_id': 0, 'evidence_metadata': [{'evidence_start_char': 0, 'evidence_stop_char': 146, 'page_num': 'NA', 'source_name': 'NA', 'doc_id': 'NA', 'block_id': 'NA'}], 'event_type': 'inference', 'human_feedback': '', 'human_assessed_accuracy': ''}
update: interaction prompt history created:  1 {'llm_response': ' 2008', 'promp

In [8]:
import torch

# Describe the CUDA device PyTorch can use, or state that the CPU will be used.
if not torch.cuda.is_available():
    print("No GPU available. Using CPU instead.")
else:
    device = torch.device("cuda")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1024**3 before being shown with a "GB" label.
    print(f"Total Memory: {torch.cuda.get_device_properties(device).total_memory / 1024**3:.2f} GB")
    print(f"CUDA Version: {torch.version.cuda}")


No GPU available. Using CPU instead.


In [9]:
import torch
# Show the installed PyTorch version string; the recorded output for this cell
# is "2.3.0+cpu".
# NOTE(review): the "+cpu" suffix presumably marks a CPU-only build - confirm.
print(torch.__version__)

2.3.0+cpu


In [10]:
import torch

# Report how many CUDA devices PyTorch can use, or note the CPU fallback.
if not torch.cuda.is_available():
    print("CUDA is not available. Using CPU.")
else:
    # Number of GPUs visible to this PyTorch installation
    gpu_count = torch.cuda.device_count()
    print(f"Found {gpu_count} GPU(s) available.")

CUDA is not available. Using CPU.


In [5]:
!conda install anaconda

^C


In [3]:
!conda update --name base conda

Channels:
 - conda-forge
 - defaults
 - pytorch
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [4]:
# Ask the NVIDIA driver which GPUs are installed (driver/CUDA version, memory
# use and utilisation are listed in the output table).
!nvidia-smi

Thu May 16 00:07:20 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.78                 Driver Version: 551.78         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   54C    P4             11W /   30W |       0MiB /   6141MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [6]:
# Print the version of the CUDA compiler (nvcc) installed on this machine.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Mar_28_02:30:10_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.4, V12.4.131
Build cuda_12.4.r12.4/compiler.34097967_0


In [7]:
!pip install --upgrade torch torchvision torchaudio

Collecting torchaudio
  Using cached torchaudio-2.3.0-cp311-cp311-win_amd64.whl.metadata (6.4 kB)
Downloading torchaudio-2.3.0-cp311-cp311-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/2.4 MB 4.2 MB/s eta 0:00:01
   ------------------------- -------------- 1.5/2.4 MB 12.0 MB/s eta 0:00:01
   ---------------------------------------- 2.4/2.4 MB 15.1 MB/s eta 0:00:00
Installing collected packages: torchaudio
  Attempting uninstall: torchaudio
    Found existing installation: torchaudio 2.0.2
    Uninstalling torchaudio-2.0.2:
      Successfully uninstalled torchaudio-2.0.2
Successfully installed torchaudio-2.3.0




In [11]:
!pip install nomic

Collecting nomic
  Downloading nomic-3.0.28.tar.gz (44 kB)
     ---------------------------------------- 0.0/44.2 kB ? eta -:--:--
     ----------------- -------------------- 20.5/44.2 kB 682.7 kB/s eta 0:00:01
     -------------------------------------- 44.2/44.2 kB 547.7 kB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting click (from nomic)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting jsonlines (from nomic)
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting loguru (from nomic)
  Downloading loguru-0.7.2-py3-none-any.whl.metadat

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorboard 2.13.0 requires grpcio>=1.48.2, which is not installed.
tensorflow-intel 2.13.0 requires grpcio<2.0,>=1.24.3, which is not installed.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.11.0 which is incompatible.


In [12]:
!pip install grpcio>=1.48.2

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.11.0 which is incompatible.
