In [1]:
!pip install huggingface
!pip install datasets

[0m

In [2]:
from datasets import load_dataset

# Download the Open-Platypus dataset; the result is indexed by split name below.
ds = load_dataset(path="garage-bAInd/Open-Platypus")

Downloading readme:   0%|          | 0.00/5.34k [00:00<?, ?B/s]

In [3]:
def create_text_column(example):
    """Add a single ``text`` field combining instruction, input and output.

    Args:
        example: Mapping with ``'instruction'``, ``'input'`` and ``'output'`` keys.

    Returns:
        The same ``example`` object, with a ``'text'`` key holding the three
        fields as labelled lines separated by newlines.
    """
    labelled_lines = (
        f"Instruction: {example['instruction']}",
        f"Input: {example['input']}",
        f"Output: {example['output']}",
    )
    example['text'] = "\n".join(labelled_lines)
    return example

In [4]:
# Build the combined `text` column and drop the three columns it was built from.
ds['train'] = ds['train'].map(
    function=create_text_column,
    remove_columns=['instruction', 'input', 'output'],
)


# Show the first record as a sanity check
print(ds['train'][0])

{'data_source': 'MATH/PRM-800K', 'text': 'Instruction: A board game spinner is divided into three parts labeled $A$, $B$  and $C$. The probability of the spinner landing on $A$ is $\\frac{1}{3}$ and the probability of the spinner landing on $B$ is $\\frac{5}{12}$.  What is the probability of the spinner landing on $C$? Express your answer as a common fraction.\nInput: \nOutput: To find the probability of the spinner landing on $C$, I need to subtract the probabilities of the spinner landing on $A$ and $B$ from $1$, since the sum of the probabilities of all possible outcomes is $1$. I can write this as an equation: $P(C) = 1 - P(A) - P(B)$. I know that $P(A) = \\frac{1}{3}$ and $P(B) = \\frac{5}{12}$, so I can plug those values into the equation and simplify. I get: $P(C) = 1 - \\frac{1}{3} - \\frac{5}{12} = \\frac{12}{12} - \\frac{4}{12} - \\frac{5}{12} = \\frac{3}{12}$. I can reduce this fraction by dividing the numerator and denominator by $3$, and I get: $P(C) = \\frac{1}{4}$. '}


In [5]:
def sample_and_shuffle(dataset, split, fraction=0.5, seed=42):
    """Return a reproducible random subset of one split.

    Args:
        dataset: Mapping from split name to a dataset object that supports
            ``shuffle(seed=...)``, ``select(indices)`` and ``len()``.
        split: Key of the split to sample from.
        fraction: Share of rows to keep, between 0 and 1 inclusive.
        seed: Seed passed to ``shuffle`` so the same rows are picked on every run.

    Returns:
        The shuffled split truncated to its first ``int(len * fraction)`` rows.

    Raises:
        ValueError: If ``fraction`` is outside the range [0, 1].
    """
    # Fail early with a clear message instead of silently returning an empty
    # sample (negative fraction) or failing inside `select` (fraction > 1).
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction!r}")

    shuffled_dataset = dataset[split].shuffle(seed=seed)
    # int() truncates, so the sample never exceeds the requested share.
    num_to_keep = int(len(shuffled_dataset) * fraction)
    return shuffled_dataset.select(range(num_to_keep))

# Replace the train split with a shuffled 40% sample of itself
ds['train'] = sample_and_shuffle(dataset=ds, split='train', fraction=0.40)

In [6]:
# Write the train split to train.csv in the current working directory
ds['train'].to_csv(path_or_buf='train.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

4682179

In [7]:
#@title 🤗 AutoTrain LLM
#@markdown In order to use this colab
#@markdown - upload train.csv to a folder named `data/`
#@markdown - train.csv must contain a `text` column
#@markdown - choose a project name if you wish
#@markdown - change model if you wish, you can use most of the text-generation models from Hugging Face Hub
#@markdown - add huggingface information (token) if you wish to push trained model to huggingface hub
#@markdown - update hyperparameters if you wish
#@markdown - click `Runtime > Run all` or run each cell individually
#@markdown - report issues / feature requests here: https://github.com/huggingface/autotrain-advanced/issues


import os
!pip install -U autotrain-advanced > install_logs.txt 2>&1
!autotrain setup --colab > setup_logs.txt
from autotrain import __version__
print(f'AutoTrain version: {__version__}')

AutoTrain version: 0.8.12


In [8]:


!pip install --force-reinstall torch==2.3.0

from torch import __version__
print(f'Torch version: {__version__}')


[0mCollecting torch==2.3.0
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting filelock (from torch==2.3.0)
  Using cached filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.8.0 (from torch==2.3.0)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy (from torch==2.3.0)
  Using cached sympy-1.13.2-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch==2.3.0)
  Using cached networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting jinja2 (from torch==2.3.0)
  Using cached jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch==2.3.0)
  Using cached fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Using cached nvidia_cuda

Torch version: 2.2.0+cu121


In [12]:
# Imports are repeated here so this cell does not depend on an earlier cell having run.
import os
from getpass import getpass

#@markdown ---
#@markdown #### Project Config
#@markdown Note: if you are using a restricted/private model, you need to provide your Hugging Face token (see "Push to Hub?" below).
project_name = 'Llama2STEM' # @param {type:"string"}
model_name = 'TinyPixel/Llama-2-7B-bf16-sharded' # @param {type:"string"}

#@markdown ---
#@markdown #### Push to Hub?
#@markdown Use these only if you want to push your trained model to a private repo in your Hugging Face Account
#@markdown If you dont use these, the model will be saved in Google Colab and you are required to download it manually.
#@markdown Your Hugging Face write token is read from the `HF_TOKEN` environment variable, or prompted for when this cell runs. Never paste it into the notebook.
#@markdown You can find your token here: https://huggingface.co/settings/tokens
push_to_hub = True # @param ["False", "True"] {type:"raw"}
hf_username = "AzmeerFaisal" #@param {type:"string"}

# The token is deliberately not a form field or a literal: anything typed into the
# cell is saved with the notebook and leaks when the notebook is shared.
# NOTE(review): a token was previously hard-coded in this cell; revoke it at
# https://huggingface.co/settings/tokens and create a new one.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token (leave blank to skip): ")

#@markdown ---
#@markdown #### Hyperparameters
unsloth = False # @param ["False", "True"] {type:"raw"}
learning_rate = 2e-4 # @param {type:"number"}
num_epochs = 1 #@param {type:"number"}
batch_size = 1 # @param {type:"slider", min:1, max:32, step:1}
block_size = 1024 # @param {type:"number"}
trainer = "sft" # @param ["generic", "sft"] {type:"string"}
warmup_ratio = 0.1 # @param {type:"number"}
weight_decay = 0.01 # @param {type:"number"}
gradient_accumulation = 4 # @param {type:"number"}
mixed_precision = "fp16" # @param ["fp16", "bf16", "none"] {type:"string"}
peft = True # @param ["False", "True"] {type:"raw"}
quantization = "int4" # @param ["int4", "int8", "none"] {type:"string"}
lora_r = 16 #@param {type:"number"}
lora_alpha = 32 #@param {type:"number"}
lora_dropout = 0.05 #@param {type:"number"}

# Credentials are handed over through the environment; the YAML below only
# contains the `${HF_USERNAME}` / `${HF_TOKEN}` placeholders, never the values.
os.environ["HF_TOKEN"] = hf_token
os.environ["HF_USERNAME"] = hf_username

# Doubled braces (`${{...}}`) render as literal `${...}` in the f-string output.
conf = f"""
task: llm-{trainer}
base_model: {model_name}
project_name: {project_name}
log: tensorboard
backend: local

data:
  path: /content
  train_split: train
  valid_split: null
  chat_template: null
  column_mapping:
    text_column: text

params:
  block_size: {block_size}
  lr: {learning_rate}
  warmup_ratio: {warmup_ratio}
  weight_decay: {weight_decay}
  epochs: {num_epochs}
  batch_size: {batch_size}
  gradient_accumulation: {gradient_accumulation}
  mixed_precision: {mixed_precision}
  peft: {peft}
  quantization: {quantization}
  lora_r: {lora_r}
  lora_alpha: {lora_alpha}
  lora_dropout: {lora_dropout}
  unsloth: {unsloth}

hub:
  username: ${{HF_USERNAME}}
  token: ${{HF_TOKEN}}
  push_to_hub: {push_to_hub}
"""

# Write the rendered configuration next to the notebook.
with open("conf.yaml", "w") as f:
  f.write(conf)

In [None]:
# Start AutoTrain using the conf.yaml file written by the config cell.
!autotrain --config conf.yaml

[1mINFO    [0m | [32m2024-08-21 18:36:43[0m | [36mautotrain.cli.autotrain[0m:[36mmain[0m:[36m60[0m - [1mUsing AutoTrain configuration: conf.yaml[0m
[1mINFO    [0m | [32m2024-08-21 18:36:44[0m | [36mautotrain.parser[0m:[36m__post_init__[0m:[36m147[0m - [1mRunning task: lm_training[0m
[1mINFO    [0m | [32m2024-08-21 18:36:44[0m | [36mautotrain.parser[0m:[36m__post_init__[0m:[36m148[0m - [1mUsing backend: local[0m
[1mINFO    [0m | [32m2024-08-21 18:36:44[0m | [36mautotrain.parser[0m:[36mrun[0m:[36m211[0m - [1m{'model': 'TinyPixel/Llama-2-7B-bf16-sharded', 'project_name': 'Llama2STEM', 'data_path': '/content', 'train_split': 'train', 'valid_split': None, 'add_eos_token': True, 'block_size': 1024, 'model_max_length': 2048, 'padding': 'right', 'trainer': 'sft', 'use_flash_attention_2': False, 'log': 'tensorboard', 'disable_gradient_checkpointing': False, 'logging_steps': -1, 'eval_strategy': 'epoch', 'save_total_limit': 1, 'auto_find_batch_size'