<a href="https://colab.research.google.com/github/bupt-Yy-young/colabdataupload/blob/collect/unsloth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Delete everything under /content/.cache before installing and downloading anything.
!rm -rf /content/.cache/*

In [3]:
%%capture
# Install the released Unsloth package from PyPI.
# NOTE(review): neither install below is version-pinned, so a re-run may pick up different versions.
!pip install unsloth
# Then replace it with the latest nightly build from GitHub; --no-deps leaves the
# already-installed dependencies untouched.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [4]:
from unsloth import FastLanguageModel
import torch

# ---- Loading configuration (these names are reused by later cells) ----
# Maximum sequence length passed to the loader and, later, to the trainer.
max_seq_length = 2048
# None lets the loader pick the dtype (Float16 on Tesla T4 / V100, Bfloat16 on Ampere+).
dtype = None
# Load 4-bit quantized weights to reduce memory usage; may be set to False.
load_in_4bit = True

# Reference list of pre-quantized 4-bit checkpoints (more at https://huggingface.co/unsloth).
# It is informational only: the load call below passes its own model name and never
# reads this list.
fourbit_models = [
    # Llama 3.1
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral Nemo (12b)
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    # Mistral v0.3
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

# Load the base model together with its tokenizer.
# For gated models (e.g. meta-llama/Llama-2-7b-hf) also pass token = "hf_...".
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.9: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [5]:
# Modules that receive LoRA adapters: the four attention projections
# followed by the three MLP projections.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters.
# NOTE(review): this rebinds `model` to the wrapped model, so re-running the cell
# without reloading would wrap an already-wrapped model — confirm before re-running.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # LoRA rank; any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # any value is supported, 0 is the optimized path
    bias="none",                           # any value is supported, "none" is the optimized path
    use_gradient_checkpointing="unsloth",  # True or "unsloth" (lower VRAM, for very long context)
    random_state=3407,
    use_rslora=False,                      # rank-stabilized LoRA is available but not used
    loftq_config=None,                     # LoftQ is available but not used
)


Unsloth 2024.11.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
import pandas as pd
from json import loads, dumps

# Prompt template with two slots: the question (with its options) and the response.
# The evaluation cell reuses it with an empty response slot.
alpaca_prompt = """You are a cybersecurity expert specializing in cyber threat intelligence. You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.

### Question:
{}

### Response:
{}"""

# The end-of-sequence token is appended to every training example.
EOS_TOKEN = tokenizer.eos_token

# Read the generated training TSV and round-trip it through JSON so that each row
# becomes a plain dict of JSON-compatible values.
training_frame = pd.read_csv('cti_mcq_generated.tsv', sep='\t', encoding='cp1252')
dataset = loads(training_frame.to_json(orient="records"))

# Add the fully rendered training text and a running integer label to every record.
for position, record in enumerate(dataset):
    full_question = record['Question'] + f" Options:  A) {record['Option A']}  B) {record['Option B']}  C) {record['Option C']}  D) {record['Option D']}"
    answer_letter = record['GT']
    record.update({
        'text': alpaca_prompt.format(full_question, answer_letter) + EOS_TOKEN,
        'label': position,
    })

In [8]:

from datasets import Dataset

# Convert the list of record dicts into a `datasets.Dataset` for the trainer.
# Note: this rebinds `dataset` from a list to a Dataset object.
dataset = Dataset.from_list(dataset)
# Bare expression so the notebook displays the dataset summary (features / num_rows).
dataset

Dataset({
    features: ['URL', 'Question', 'Option A', 'Option B', 'Option C', 'Option D', 'GT', 'Prompt', 'text', 'label'],
    num_rows: 4793
})

In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode once: bf16 where supported, otherwise fp16.
bf16_available = is_bfloat16_supported()

# Optimisation settings for a short run.
# NOTE(review): `report_to` is not set, and the recorded run stopped at an interactive
# W&B API-key prompt; consider report_to = "none" for unattended "Run All" executions.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps=60,
    learning_rate=2e-4,
    fp16=not bf16_available,
    bf16=bf16_available,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning trainer over the rendered "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # packing can make training faster for short sequences; disabled here
    args=training_args,
)


Map (num_proc=2):   0%|          | 0/4793 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [10]:

# Run the fine-tuning loop and keep the value returned by `train()` in `trainer_stats`.
# NOTE(review): in the recorded run this cell paused for an interactive W&B API-key prompt.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 4,793 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,2.1896
2,2.2719
3,2.2384
4,2.2122
5,2.0932
6,1.7819
7,1.3622
8,1.2728
9,1.046
10,0.9571


In [11]:
# Load the validation questions from a tab-separated file into a DataFrame.
# NOTE(review): the training TSV is read with encoding='cp1252' but this read uses the
# pandas default — confirm both files really use different encodings.
dataset_val = pd.read_csv('cti-mcq.tsv', sep='\t')

In [12]:
# Evaluate the fine-tuned model on the validation questions.
# Uses `alpaca_prompt` from the training-data cell, with the response slot left empty
# so the model has to generate the answer letter.

# Switch the model to inference mode once; this does not depend on the row,
# so it does not belong inside the loop.
FastLanguageModel.for_inference(model)

VALID_CHOICES = ('A', 'B', 'C', 'D')

count = 0    # questions for which a valid answer letter was produced
correct = 0  # valid answers that match the ground truth
skipped = 0  # questions whose generation did not start with A/B/C/D

for index, row in dataset_val.iterrows():
    question = row['Question'] + f" Options:  A) {row['Option A']}  B) {row['Option B']}  C) {row['Option C']}  D) {row['Option D']}"
    inputs = tokenizer(
        [alpaca_prompt.format(question, "")],
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # The generated sequence starts with the prompt tokens, so decode only what
    # follows them and drop special tokens. This replaces the fixed `[-16]`
    # character offset, which was only right when the decoded text ended in a
    # 15-character EOS marker.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()
    response = completion[:1]

    if response not in VALID_CHOICES:
        print("problem", repr(completion))
        skipped += 1
        continue

    count += 1

    if response == row['GT']:
        correct += 1

# As before, accuracy is computed over questions with a valid answer only;
# skipped questions are reported separately instead of being silently dropped.
if count:
    print((correct/count)*100,"% of correct answers")
else:
    print("No valid answers were produced; accuracy is undefined.")
if skipped:
    print(skipped, "questions skipped (no valid answer letter)")

55.67999999999999 % of correct answers


In [13]:
# Change the notebook's working directory to /content and create a Git repository there.
%cd /content
!git init

/content
[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/


In [14]:
# Register the GitHub repository as the "origin" remote.
!git remote add origin https://github.com/bupt-Yy-young/colabdataupload.git

In [18]:
# Stage the trainer's outputs/ folder and commit it.
# The recorded commit includes optimizer, scheduler and RNG state alongside the adapter.
!git add outputs/
!git commit -m "Add outputs folder"

[master (root-commit) 8b41bed] Add outputs folder
 12 files changed, 1253779 insertions(+)
 create mode 100644 outputs/checkpoint-60/README.md
 create mode 100644 outputs/checkpoint-60/adapter_config.json
 create mode 100644 outputs/checkpoint-60/adapter_model.safetensors
 create mode 100644 outputs/checkpoint-60/optimizer.pt
 create mode 100644 outputs/checkpoint-60/rng_state.pth
 create mode 100644 outputs/checkpoint-60/scheduler.pt
 create mode 100644 outputs/checkpoint-60/special_tokens_map.json
 create mode 100644 outputs/checkpoint-60/tokenizer.json
 create mode 100644 outputs/checkpoint-60/tokenizer_config.json
 create mode 100644 outputs/checkpoint-60/trainer_state.json
 create mode 100644 outputs/checkpoint-60/training_args.bin
 create mode 100644 outputs/runs/Nov26_11-28-11_c271a51138a3/events.out.tfevents.1732620502.c271a51138a3.770.0


In [22]:
# List local branches; the current one is marked with '*'.
!git branch

* [32mmaster[m


In [23]:
# Push master to origin and set it as the upstream branch.
# NOTE(review): the recorded run failed with "could not read Username" — no credentials
# are configured for the HTTPS remote.
!git push -u origin master

fatal: could not read Username for 'https://github.com': No such device or address


In [16]:
# Set the global Git commit e-mail.
# NOTE(review): this hardcodes a personal address, and the cell's execution count (16)
# is out of order relative to the cells around it.
!git config --global user.email "1787246528@qq.com"

In [24]:
# 1. ËøõÂÖ• outputs Êñá‰ª∂Â§πÊâÄÂú®ÁöÑÁõÆÂΩï
%cd /content

# 2. ÂàùÂßãÂåñ Git ‰ªìÂ∫ìÔºàÂ¶ÇÊûúÂ∞öÊú™ÂàùÂßãÂåñÔºâ
!git init

# 3. ÈÖçÁΩÆ Git Áî®Êà∑‰ø°ÊÅØ
!git config --global user.email "1787246528@qq.com"
!git config --global user.name "bupt-Yy-young"

# 4. Ê∑ªÂä†ËøúÁ®ã‰ªìÂ∫ì
!git remote add origin https://github.com/bupt-Yy-young/colabdataupload.git

# 5. Ê∑ªÂä† outputs Êñá‰ª∂Â§πÂà∞ÊöÇÂ≠òÂå∫
!git add outputs/

# 6. Êèê‰∫§Êõ¥Êîπ
!git commit -m "Upload outputs folder to collect branch"

# 7. ÂàáÊç¢Âà∞ collect ÂàÜÊîØÔºàÂ¶ÇÊûú‰∏çÂ≠òÂú®ÂàôÂàõÂª∫Ôºâ
!git checkout -b collect

# 8. Â∞ÜÊú¨Âú∞ collect ÂàÜÊîØÂÖ≥ËÅîÂà∞ËøúÁ®ã collect ÂàÜÊîØ
!git branch --set-upstream-to=origin/collect collect


# 9. Êé®ÈÄÅÂà∞ËøúÁ®ã‰ªìÂ∫ìÁöÑ collect ÂàÜÊîØ
!git push -u origin collect

/content
Reinitialized existing Git repository in /content/.git/
error: remote origin already exists.
On branch master
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.config/[m
	[31mcti-mcq.tsv[m
	[31mcti_mcq_generated.tsv[m
	[31mdrive/[m
	[31mhuggingface_tokenizers_cache/[m
	[31msample_data/[m
	[31mwandb/[m

nothing added to commit but untracked files present (use "git add" to track)
Switched to a new branch 'collect'
error: the requested upstream branch 'origin/collect' does not exist
[33mhint: [m
[33mhint: If you are planning on basing your work on an upstream[m
[33mhint: branch that already exists at the remote, you may need to[m
[33mhint: run "git fetch" to retrieve it.[m
[33mhint: [m
[33mhint: If you are planning to push out a new local branch that[m
[33mhint: will track its remote counterpart, you may want to use[m
[33mhint: "git push -u" to set the upstream config as you push.[m
fatal: could not read Username

In [17]:
# Set the global Git commit author name.
# NOTE(review): this cell's execution count (17) is out of order relative to the cells above it.
!git config --global user.name "bupt-Yy-young"