In [6]:
from flask import Flask, jsonify
import os
import subprocess

for key, value in [
    ("PROJECT_NAME", 'my_autotrain_llm'),
    ("MODEL_NAME", 'abhishek/llama-2-7b-hf-small-shards'),
    ("PUSH_TO_HUB", False), 
    ("LEARNING_RATE", 2e-4),
    ("NUM_EPOCHS", 1),
    ("BATCH_SIZE", 1),
    ("BLOCK_SIZE", 1024),
    ("WARMUP_RATIO", 0.1),
    ("WEIGHT_DECAY", 0.01),
    ("GRADIENT_ACCUMULATION", 4),
    ("MIXED_PRECISION", 'fp16'),
    ("PEFT", True),
    ("QUANTIZATION", 'int4'),
    ("LORA_R", 16),
    ("LORA_ALPHA", 32),
    ("LORA_DROPOUT", 0.05)
    ]:
        if not os.environ.get(key):
            os.environ[key] = str(value)

In [10]:
print('start1')
command = [
    "autotrain", "llm", "--train",
    "--model", os.environ.get("MODEL_NAME"),
    "--project-name", os.environ.get("PROJECT_NAME"),
    "--data-path", "./../train/image_o/data/train.csv",
    "--text-column", "text",
    "--lr", os.environ.get("LEARNING_RATE"),
    "--batch-size", os.environ.get("BATCH_SIZE"),
    "--epochs", os.environ.get("NUM_EPOCHS"),
    "--block-size", os.environ.get("BLOCK_SIZE"),
    "--warmup-ratio", os.environ.get("WARMUP_RATIO"),
    "--lora-r", os.environ.get("LORA_R"),
    "--lora-alpha", os.environ.get("LORA_ALPHA"),
    "--lora-dropout", os.environ.get("LORA_DROPOUT"),
    "--weight-decay", os.environ.get("WEIGHT_DECAY"),
    "--gradient-accumulation", os.environ.get("GRADIENT_ACCUMULATION"),
    "--quantization", os.environ.get("QUANTIZATION"),
    "--mixed-precision", os.environ.get("MIXED_PRECISION"),
]

# Conditional flags
if os.environ.get("PEFT") == "True": command.append("--PEFT")
if os.environ.get("PUSH_TO_HUB") == "True": command.append("--push-to-hub --token ${HF_TOKEN} --repo-id ${REPO_ID}") 

# Execute the command
print("Command: ")
print(command)
result = subprocess.run(command, capture_output=True, text=True)
jsonify({"output": result.stdout, "error": result.stderr})

start1
Command: 
['autotrain', 'llm', '--train', '--model', 'abhishek/llama-2-7b-hf-small-shards', '--project-name', 'my_autotrain_llm', '--data-path', './../train/image_o/data/train.csv', '--text-column', 'text', '--lr', '0.0002', '--batch-size', '1', '--epochs', '1', '--block-size', '1024', '--warmup-ratio', '0.1', '--lora-r', '16', '--lora-alpha', '32', '--lora-dropout', '0.05', '--weight-decay', '0.01', '--gradient-accumulation', '4', '--quantization', 'int4', '--mixed-precision', 'fp16', '--PEFT']


FileNotFoundError: [WinError 2] The system cannot find the file specified

def main():
    parser = argparse.ArgumentParser(
        "AutoTrain advanced CLI",
        usage="autotrain <command> [<args>]",
        epilog="For more information about a command, run: `autotrain <command> --help`",
    )
    parser.add_argument("--version", "-v", help="Display AutoTrain version", action="store_true")
    commands_parser = parser.add_subparsers(help="commands")

    # Register commands
    RunAutoTrainAppCommand.register_subcommand(commands_parser)
    RunAutoTrainLLMCommand.register_subcommand(commands_parser)
    RunSetupCommand.register_subcommand(commands_parser)
    RunAutoTrainDreamboothCommand.register_subcommand(commands_parser)
    RunAutoTrainAPICommand.register_subcommand(commands_parser)
    RunAutoTrainTextClassificationCommand.register_subcommand(commands_parser)
    RunAutoTrainImageClassificationCommand.register_subcommand(commands_parser)
    RunAutoTrainTabularCommand.register_subcommand(commands_parser)
    RunAutoTrainSpaceRunnerCommand.register_subcommand(commands_parser)
    RunAutoTrainSeq2SeqCommand.register_subcommand(commands_parser)

    args = parser.parse_args()

    if args.version:
        print(__version__)
        exit(0)

    if not hasattr(args, "func"):
        parser.print_help()
        exit(1)

    command = args.func(args)
    command.run()


class RunAutoTrainLLMCommand(BaseAutoTrainCommand):
    def run(self):
        from autotrain.backend import EndpointsRunner, SpaceRunner
        from autotrain.trainers.clm.__main__ import train as train_llm
        from autotrain.trainers.clm.params import LLMTrainingParams

        logger.info("Running LLM")
        logger.info(f"Params: {self.args}")
        if self.args.train:
            params = LLMTrainingParams(
                model=self.args.model,
                data_path=self.args.data_path,
                train_split=self.args.train_split,
                valid_split=self.args.valid_split,
                text_column=self.args.text_column,
                lr=self.args.learning_rate,
                epochs=self.args.num_train_epochs,
                batch_size=self.args.train_batch_size,
                warmup_ratio=self.args.warmup_ratio,
                gradient_accumulation=self.args.gradient_accumulation_steps,
                optimizer=self.args.optimizer,
                scheduler=self.args.scheduler,
                weight_decay=self.args.weight_decay,
                max_grad_norm=self.args.max_grad_norm,
                seed=self.args.seed,
                add_eos_token=self.args.add_eos_token,
                block_size=self.args.block_size,
                use_peft=self.args.use_peft,
                lora_r=self.args.lora_r,
                lora_alpha=self.args.lora_alpha,
                lora_dropout=self.args.lora_dropout,
                logging_steps=self.args.logging_steps,
                project_name=self.args.project_name,
                evaluation_strategy=self.args.evaluation_strategy,
                save_total_limit=self.args.save_total_limit,
                save_strategy=self.args.save_strategy,
                auto_find_batch_size=self.args.auto_find_batch_size,
                fp16=self.args.fp16,
                push_to_hub=self.args.push_to_hub,
                use_int8=self.args.use_int8,
                model_max_length=self.args.model_max_length,
                repo_id=self.args.repo_id,
                use_int4=self.args.use_int4,
                trainer=self.args.trainer,
                target_modules=self.args.target_modules,
                token=self.args.token,
                merge_adapter=self.args.merge_adapter,
                username=self.args.username,
                use_flash_attention_2=self.args.use_flash_attention_2,
                log=self.args.log,
                rejected_text_column=self.args.rejected_text_column,
                disable_gradient_checkpointing=self.args.disable_gradient_checkpointing,
                model_ref=self.args.model_ref,
                dpo_beta=self.args.dpo_beta,
                prompt_text_column=self.args.prompt_text_column,
            )


# Welcome

## Notes to Michael - 5:08pm 11/7/2023

### This Doc

I put notes in this notebook about auto-training using xturing. 

It covers generating datasets using a prepped file, auto-creating it using jsonl, or generating from files in a directory. The 'CHATGPT' api call is depricated in the library but if you want to fix it and get the 'generate from directory' example working that would be DOPE.

### In Colab

I have been working in a google colab enviornment connected to a cloud instance for training here:
https://colab.research.google.com/drive/1hjUMbrJhS92a9tEzVKpgEZx8vU4QxBtx?usp=sharing

The colab notebook is a cleaner notebook where I am also developing the docker/deploy stuff.

### Huggingface

Huggingface provides an autoTrain API that operates similarly. 

Weather or not we train our models using this api, is less of a concern at the moment. 

At the moment we really just want to create code which uses a users yaml doc to auto-deploy a localmodel to the huggingface hub and getting back an endpoint. For testing purposes the model does not have to be a trained and you can test deploying a base model from xturing demonstrated below. 

#### HF Autotrain notes

The AutoTrain Api has code for deploying models to huggingface hubs but the api docs are lacking.

Here is a google colab of someone training using the service via CLI

https://colab.research.google.com/drive/1ufB53v_ptm6NJYeemCgYAIGWGUIjO8yw#scrollTo=g3cd_ED_yXXt

Forum:
https://discuss.huggingface.co/c/autotrain/16

- To let AutoTrain choose the best models for your task, you can use the “AutoTrain” in the “Model Choice” section. Once you choose AutoTrain mode, you no longer need to worry about model and parameter selection. AutoTrain will automatically select the best models (and parameters) for your task.


Documentation:
https://huggingface.co/docs/autotrain/llm_finetuning

Useful things I found in the codebase:

class AutoTrainDataset: 
https://github.com/huggingface/autotrain-advanced/blob/main/src/autotrain/dataset.py

autotrain api.py
https://github.com/huggingface/autotrain-advanced/blob/main/src/autotrain/api.py

autotrain app.py
https://github.com/huggingface/autotrain-advanced/blob/main/src/autotrain/app.py

autotrain config
https://github.com/huggingface/autotrain-advanced/blob/main/src/autotrain/config.py

In [None]:
import xturing

In [2]:
help(xturing.datasets)

Help on package xturing.datasets in xturing:

NAME
    xturing.datasets

PACKAGE CONTENTS
    base
    instruction_dataset
    text2image_dataset
    text_dataset

FILE
    /home/carlos/.local/lib/python3.10/site-packages/xturing/datasets/__init__.py




## Initial LLM Test

In [None]:
from xturing.models import BaseModel
model = BaseModel.create('llama')

In [None]:
outputs = model.generate(texts=['Hi How are you?'])

In [None]:
outputs

["\nI’m good how are you?\nNot too bad. Just watching tv and eating a bowl of ice cream.\nSounds good what are you watching?\nA show called The Good Place. It's on Netflix if you want to check it out.\nI have it on my netflix but I haven’t watched it yet. I should do that.\nYeah it's good. I'm a fan of Parks and Rec so I was excited to see it on Netflix.\nParks and Rec is my favorite show of all time. I watched it for the first time last year and fell in love with it.\nSame here. The first time I watched it I was like wtf is going on? But I kept watching and it grew on me.\nI think it's one of those shows that you need to watch multiple times to get the full experience.\nDefinitely. I've rewatched it a few times and each time I notice something new that I didn't the first time."]

## TRAINING 

The model weights will be saved into 2 files. The whole model weights including based model parameters and LoRA parameters are stored in pytorch_model.bin file and only LoRA parameters are stored in adapter_model.bin file.

### Retrieve Training Data

In [3]:
import json

from datasets import Dataset, DatasetDict

# Convert the alpaca JSON dataset to HF format

# Right now only the HuggingFace datasets are supported, that's why the JSON Alpaca dataset
# needs to be converted to the HuggingFace format. In addition, this HF dataset should have 3 columns for instruction finetuning: instruction, text and target.
def preprocess_alpaca_json_data(alpaca_dataset_path: str):
    """Creates a dataset given the alpaca JSON dataset. You can download it here: https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json

    :param alpaca_dataset_path: path of the Alpaca dataset
    """
    alpaca_data = json.load(open(alpaca_dataset_path))
    instructions = []
    inputs = []
    outputs = []

    for data in alpaca_data:
        instructions.append(data["instruction"])
        inputs.append(data["input"])
        outputs.append(data["output"])

    data_dict = {
        "train": {"instruction": instructions, "text": inputs, "target": outputs}
    }

    dataset = DatasetDict()
    # using your `Dict` object
    for k, v in data_dict.items():
        dataset[k] = Dataset.from_dict(v)

    dataset.save_to_disk(str("./data/alpaca_data"))

In [4]:
preprocess_alpaca_json_data("./data/alpaca_data.json")

Saving the dataset (0/1 shards):   0%|          | 0/52002 [00:00<?, ? examples/s]

### Auto Generate Training Data

In [None]:
from xturing.datasets import InstructionDataset
from xturing.model_apis.openai import ChatGPT

#### Autogen from folder

https://github.com/stochasticai/xTuring/blob/55eda97e51e6b04c6796ae12104dd11cda362a47/examples/datasets/create_instruction_dataset_from_files.ipynb

Some finance news articles are stored in sample_finance_data folder. In this tutorial, we are going to generate InstructionDataset from a data folder (finance news articles) and perform instruction fine-tuning on the generated dataset.

In [2]:
from xturing.datasets import InstructionDataset
from xturing.model_apis.openai import ChatGPT

In [None]:
engine = ChatGPT("your-api-key")
dataset = InstructionDataset.generate_dataset_from_dir(engine=engine, path="./sample_finance_data")
dataset.save("./output_dataset")

In [None]:
from xturing.models import BaseModel
model = BaseModel.create("gpt2_lora")
# Finetune the model on generated dataset
model.finetune(dataset=dataset)

#### Autogen From Seed Tasks

https://xturing.stochastic.ai/advanced/generate

In [None]:
%%writefile data/seed_tasks.jsonl
{"id": "seed_task_0", "name": "breakfast_suggestion", "instruction": "Is there anything I can eat for a breakfast that doesn't include eggs, yet includes protein, and has roughly 700-1000 calories?", "instances": [{"input": "", "output": "Yes, you can have 1 oatmeal banana protein shake and 4 strips of bacon. The oatmeal banana protein shake may contain 1/2 cup oatmeal, 60 grams whey protein powder, 1/2 medium banana, 1tbsp flaxseed oil and 1/2 cup watter, totalling about 550 calories. The 4 strips of bacon contains about 200 calories."}], "is_classification": false}
{"id": "seed_task_1", "name": "antonym_relation", "instruction": "What is the relation between the given pairs?", "instances": [{"input": "Night : Day :: Right : Left", "output": "The relation between the given pairs is that they are opposites."}], "is_classification": false}

In [1]:
from xturing.model_apis.openai import ChatGPT
from xturing.datasets import InstructionDataset
engine = ChatGPT("") ## Generate the dataset 

[2023-11-06 22:53:26,156] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
/home/carlos/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [None]:
dataset = InstructionDataset.generate_dataset(engine=engine, path="./data/seed_tasks.jsonl")
dataset.save("./output_dataset")

In [None]:
from xturing.models import BaseModel
model = BaseModel.create("gpt2_lora")
# Finetune the model on generated dataset
model.finetune(dataset=dataset)

## Test

#### Example Llama Model Test

In [None]:
from xturing.datasets.instruction_dataset import InstructionDataset
from xturing.models import BaseModel

instruction_dataset = InstructionDataset("./data/alpaca_data")
# Initializes the model
model = BaseModel.create("llama")
# Finetuned the model
model.finetune(dataset=instruction_dataset)
# Once the model has been finetuned, you can start doing inferences
output = model.generate(texts=["Why LLM models are becoming so important?"])
print("Generated output by the model: {}".format(output))
# Save the model
model.save("./llama_weights")

# If you want to load the model just do BaseModel.load("./llama_weights")

#### Finetune Playground Example

In [5]:
from xturing.datasets import InstructionDataset
from xturing.model_apis.openai import ChatGPT
from xturing.datasets.instruction_dataset import InstructionDataset
from xturing.models.base import BaseModel
from xturing.ui.playground import Playground

In [6]:
# Initializes the model
model = BaseModel.create("gpt2_lora")

trainable params: 294912 || all params: 124734720 || trainable%: 0.23643136409814364


In [7]:
sameThingAs = """
    InstructionDataset({
        "text": ["first text", "second text"],
        "target": ["first text", "second text"],
        "instruction": ["first instruction", "second instruction"]
    })
    """
instruction_dataset = InstructionDataset("./data/alpaca_data")

In [8]:
finetuning_config = model.finetuning_config()
finetuning_config.batch_size = 64 # 16
finetuning_config.num_train_epochs = 1 # 3
finetuning_config.learning_rate = 1e-5 # 0.003
finetuning_config.weight_decay = 0.01
finetuning_config.optimizer_name = "adamw"
finetuning_config.output_dir = "./data/training_dir/" 

https://xturing.stochastic.ai/configuration/finetune_configure

In [9]:
finetuning_config 

FinetuningConfig(learning_rate=1e-05, gradient_accumulation_steps=1, batch_size=64, weight_decay=0.01, warmup_steps=50, eval_steps=5000, save_steps=5000, max_length=512, num_train_epochs=1, logging_steps=10, max_grad_norm=2.0, save_total_limit=4, optimizer_name='adamw', output_dir='./data/training_dir/')

In [10]:
# Finetuned the model
model.finetune(dataset=instruction_dataset)

trainable params: 294912 || all params: 124734720 || trainable%: 0.23643136409814364


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

: 

In [None]:
# Model path
model_path = "./gpt2_weights"
# Save the model
model.save(model_path)
# launch the playground
Playground(model_path).launch()

#### Using Fine Tuned Model

#### ChatGPT AutoGen Fix

In [2]:
engine.get_completion("How do you do?")

ERRORR
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content="Hello! It seems like you're trying to start a conversation, but I'm not sure what you're looking for. Can you please provide more information or ask a specific question?", role='assistant', function_call=None, tool_calls=None))


Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content="Hello! It seems like you're trying to start a conversation, but I'm not sure what you're looking for. Can you please provide more information or ask a specific question?", role='assistant', function_call=None, tool_calls=None))

In [None]:
openai.OpenAI().chat.completions.create("How do you do")

In [4]:
from openai import OpenAI
client = OpenAI()

response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"},
    {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
    {"role": "user", "content": "Where was it played?"}
  ]
)
response

ChatCompletion(id='chatcmpl-8I7SlKlDwi2KViuEeKICoEZ76Lgmy', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='The World Series in 2020 was played at the Globe Life Field in Arlington, Texas.', role='assistant', function_call=None, tool_calls=None))], created=1699329283, model='gpt-3.5-turbo-0613', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=19, prompt_tokens=53, total_tokens=72))

In [None]:
dataset = InstructionDataset.generate_dataset(path="./tasks.jsonl", engine=engine)
# dataset.save('testgptdataset')

https://xturing.stochastic.ai/overview/quickstart/prepare/#save-a-dataset

InstructionDataset - You want the model to generate text based on an instruction/task.

In [21]:
from xturing.model_apis.openai import ChatGPT
from xturing.datasets import InstructionDataset
dataset = InstructionDataset({
    "text": ["first text", "second text"],
    "target": ["first text", "second text"],
    "instruction": ["first instruction", "second instruction"]
})
dataset.save('testinstructdataset')

Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]

## Env Versioning Tests

In [1]:
# program
!python --version

Python 3.11.5


In [2]:
# program
!python3 --version

Python 3.11.6


In [1]:
# windows kernal
!python3 --version

Python 3.11.6


In [2]:
# windows kernal
!python --version

Python 3.11.6


In [1]:
# WSL
!python --version

/bin/bash: line 1: python: command not found


In [2]:
# WSL
!python3 --version

Python 3.10.12
