In [2]:
# https://huggingface.co/datasets/tatsu-lab/alpaca?row=4

In [4]:
%%capture
#hide
# -O pins the output filename: re-running this cell overwrites alpaca_data.json
# instead of saving a second copy as alpaca_data.json.1 that later cells never read.
!wget -O alpaca_data.json https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json

In [5]:
import json

def preprocess_alpaca_json_to_text_array(alpaca_dataset_path: str):
    """
    Creates an array of text representations given the Alpaca JSON dataset.

    Every record is rendered as three sections ("### Instruction:", "### Input:"
    and "### Response:"), each one followed by a blank line.

    :param alpaca_dataset_path: path of the Alpaca dataset, a JSON list of objects
        with 'instruction', 'input' and 'output' keys
    :return: array of formatted text instructions with input and response
    :raises KeyError: if a record lacks one of the three keys
    """
    # Decode explicitly as UTF-8 so non-ASCII text in the dataset does not
    # depend on the platform's default encoding.
    with open(alpaca_dataset_path, 'r', encoding='utf-8') as f:
        alpaca_data = json.load(f)

    # Adjacent f-strings are concatenated; each section ends with "\n\n",
    # i.e. its own line break plus one blank separator line.
    return [
        f"### Instruction: {data['instruction']}\n\n"
        f"### Input: {data['input']}\n\n"
        f"### Response: {data['output']}\n\n"
        for data in alpaca_data
    ]

In [6]:
import csv

def create_csv_from_text_array(text_array, csv_output_path: str):
    """
    Write formatted text entries to a CSV file holding a single 'text' column.

    The file starts with a 'text' header row, followed by one row per entry.

    :param text_array: formatted text entries to store, one per CSV row
    :param csv_output_path: destination path of the CSV file
    """
    with open(csv_output_path, mode='w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['text'])
        # Wrap each entry in a one-element row so it lands in the 'text' column.
        csv_writer.writerows([entry] for entry in text_array)

In [7]:
# Example usage: build the formatted prompts from the downloaded dataset.
alpaca_dataset_path = "alpaca_data.json"
text_array = preprocess_alpaca_json_to_text_array(alpaca_dataset_path=alpaca_dataset_path)

# 'text_array' now holds one formatted string per dataset record.
# Print the entry at index 1 (the second record) as a quick sanity check:
print(text_array[1])

# Persist every formatted entry to 'test.csv'.
create_csv_from_text_array(text_array=text_array, csv_output_path='test.csv')

### Instruction: What are the three primary colors?

### Input: 

### Response: The three primary colors are red, blue, and yellow.




In [None]:
import csv
import json

def preprocess_alpaca_json_to_csv(alpaca_dataset_path: str, csv_output_path: str):
    """
    Creates a CSV file from the Alpaca JSON dataset with a single 'text' column.

    Every record becomes one CSV row whose text consists of three lines:
    "### Instruction: ...", "### Input: ..." and "### Response: ...".

    :param alpaca_dataset_path: path of the Alpaca dataset, a JSON list of objects
        with 'instruction', 'input' and 'output' keys
    :param csv_output_path: path to save the CSV file
    :raises KeyError: if a record lacks one of the three keys
    """
    # Decode explicitly as UTF-8 so non-ASCII text in the dataset does not
    # depend on the platform's default encoding.
    with open(alpaca_dataset_path, 'r', encoding='utf-8') as f:
        alpaca_data = json.load(f)

    # newline='' hands line-ending control to the csv module, which matters
    # here because every field contains embedded line breaks.
    with open(csv_output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(['text'])

        # Write the formatted text to the CSV
        for data in alpaca_data:
            # The response is emitted verbatim: appending an extra "." would
            # duplicate the punctuation the dataset outputs already carry and
            # corrupt outputs that are not sentences.
            formatted_text = (
                f"### Instruction: {data['instruction']}\n"
                f"### Input: {data['input']}\n"
                f"### Response: {data['output']}\n"
            )
            writer.writerow([formatted_text])

# Usage: convert the downloaded Alpaca JSON into a single-column CSV.
alpaca_dataset_path = "alpaca_data.json"
csv_output_path = "test.csv"
preprocess_alpaca_json_to_csv(
    alpaca_dataset_path=alpaca_dataset_path,
    csv_output_path=csv_output_path,
)


In [9]:
import pandas as pd
# Preview the first five rows of the generated CSV to check the 'text' column.
pd.read_csv('test.csv').head(n=5)

Unnamed: 0,text
0,### Instruction: Give three tips for staying h...
1,### Instruction: What are the three primary co...
2,### Instruction: Describe the structure of an ...
3,### Instruction: How can we reduce air polluti...
4,### Instruction: Describe a time when you had ...


In [10]:
# Refresh the apt package index. Needs root privileges: the recorded run of
# this cell failed with "Permission denied".
!apt-get update

Reading package lists... Done
E: List directory /var/lib/apt/lists/partial is missing. - Acquire (13: Permission denied)


In [12]:
# Install the JPEG/PNG development headers.
# -y pre-answers apt's confirmation prompt, which cannot be typed into a
# notebook shell command.
# Needs root privileges: the recorded run of this cell could not acquire the
# dpkg lock ("are you root?").
!apt-get install -y libjpeg-dev libpng-dev


E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?


In [13]:
#@title 🤗 AutoTrain LLM
#@markdown In order to use this colab
#@markdown - place train.csv in the directory passed as `--data-path` in the training cell (currently `./`, the working directory)
#@markdown - train.csv must contain a `text` column
#@markdown - choose a project name if you wish
#@markdown - change model if you wish, you can use most of the text-generation models from Hugging Face Hub
#@markdown - add Hugging Face information (token and repo_id) if you wish to push the trained model to the Hugging Face Hub
#@markdown - update hyperparameters if you wish
#@markdown - click `Runtime > Run all` or run each cell individually

# NOTE(review): the CSV cells earlier in this notebook write `test.csv`, not
# `train.csv` - confirm which file name the training run should pick up.

import os
# %pip installs into the environment of the running kernel, which a plain
# `!pip` does not guarantee. Output is redirected to keep the cell quiet.
%pip install -U autotrain-advanced > install_logs.txt
!autotrain setup --colab > setup_logs.txt

Traceback (most recent call last):
  File "/usr/local/python/3.10.8/lib/python3.10/site-packages/diffusers/utils/import_utils.py", line 684, in _get_module
    return importlib.import_module("." + module_name, self.__name__)
  File "/usr/local/python/3.10.8/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/usr/local/python/3.10.8/lib/python3.10/site-packages/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py", line 50, in <module>
    from .watermark i

In [None]:
#@markdown ---
#@markdown #### Project Config
#@markdown Note: if you are using a restricted/private model, you need to enter your Hugging Face token in the next step.
project_name = 'my_autotrain_llm' # @param {type:"string"}
model_name = 'abhishek/llama-2-7b-hf-small-shards' # @param {type:"string"}

#@markdown ---
#@markdown #### Push to Hub?
#@markdown Use these only if you want to push your trained model to a private repo in your Hugging Face Account
#@markdown If you don't use these, the model will be saved in Google Colab and you are required to download it manually.
#@markdown Please enter your Hugging Face write token. The trained model will be saved to your Hugging Face account.
#@markdown You can find your token here: https://huggingface.co/settings/tokens
push_to_hub = False # @param ["False", "True"] {type:"raw"}
hf_token = "" #@param {type:"string"}
repo_id = "" #@param {type:"string"}

#@markdown ---
#@markdown #### Hyperparameters
learning_rate = 2e-4 # @param {type:"number"}
num_epochs = 1 #@param {type:"number"}
batch_size = 1 # @param {type:"slider", min:1, max:32, step:1}
block_size = 1024 # @param {type:"number"}
trainer = "sft" # @param ["default", "sft"] {type:"raw"}
warmup_ratio = 0.1 # @param {type:"number"}
weight_decay = 0.01 # @param {type:"number"}
gradient_accumulation = 4 # @param {type:"number"}
use_fp16 = True # @param ["False", "True"] {type:"raw"}
use_peft = True # @param ["False", "True"] {type:"raw"}
use_int4 = True # @param ["False", "True"] {type:"raw"}
lora_r = 16 #@param {type:"number"}
lora_alpha = 32 #@param {type:"number"}
lora_dropout = 0.05 #@param {type:"number"}

# Export every form value as an environment variable so that shell cells can
# read it as ${NAME}. Environment values must be strings, hence the str() calls
# on the non-string settings.
os.environ["PROJECT_NAME"] = project_name
os.environ["MODEL_NAME"] = model_name
os.environ["PUSH_TO_HUB"] = str(push_to_hub)
os.environ["HF_TOKEN"] = hf_token
os.environ["REPO_ID"] = repo_id
os.environ["LEARNING_RATE"] = str(learning_rate)
os.environ["NUM_EPOCHS"] = str(num_epochs)
os.environ["BATCH_SIZE"] = str(batch_size)
os.environ["BLOCK_SIZE"] = str(block_size)
# The trainer choice was the only form field left unexported, so shell cells
# had no way to see it.
os.environ["TRAINER"] = trainer
os.environ["WARMUP_RATIO"] = str(warmup_ratio)
os.environ["WEIGHT_DECAY"] = str(weight_decay)
os.environ["GRADIENT_ACCUMULATION"] = str(gradient_accumulation)
os.environ["USE_FP16"] = str(use_fp16)
os.environ["USE_PEFT"] = str(use_peft)
os.environ["USE_INT4"] = str(use_int4)
os.environ["LORA_R"] = str(lora_r)
os.environ["LORA_ALPHA"] = str(lora_alpha)
os.environ["LORA_DROPOUT"] = str(lora_dropout)


In [None]:
# Launch AutoTrain with the settings exported as environment variables.
# Each $( [[ ... ]] && echo "..." ) line expands to its flag(s) only when the
# condition holds, and to nothing otherwise:
#   - --trainer is passed only when TRAINER is set and non-empty;
#   - the push-to-hub flags are passed only when PUSH_TO_HUB is "True"; without
#     them the exported PUSH_TO_HUB / HF_TOKEN / REPO_ID values were ignored.
# NOTE(review): the form text asks for `train.csv`, while the CSV cells of this
# notebook write `test.csv` - confirm the file under --data-path before running.
!autotrain llm \
--train \
--model ${MODEL_NAME} \
--project-name ${PROJECT_NAME} \
--data-path ./ \
--text-column text \
--lr ${LEARNING_RATE} \
--batch-size ${BATCH_SIZE} \
--epochs ${NUM_EPOCHS} \
--block-size ${BLOCK_SIZE} \
--warmup-ratio ${WARMUP_RATIO} \
--lora-r ${LORA_R} \
--lora-alpha ${LORA_ALPHA} \
--lora-dropout ${LORA_DROPOUT} \
--weight-decay ${WEIGHT_DECAY} \
--gradient-accumulation ${GRADIENT_ACCUMULATION} \
$( [[ "$USE_FP16" == "True" ]] && echo "--fp16" ) \
$( [[ "$USE_PEFT" == "True" ]] && echo "--use-peft" ) \
$( [[ "$USE_INT4" == "True" ]] && echo "--use-int4" ) \
$( [[ -n "$TRAINER" ]] && echo "--trainer ${TRAINER}" ) \
$( [[ "$PUSH_TO_HUB" == "True" ]] && echo "--push-to-hub --token ${HF_TOKEN} --repo-id ${REPO_ID}" )