In [1]:
!nvidia-smi

Thu Aug  8 21:19:56 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A6000               Off | 00000000:01:00.0 Off |                  Off |
| 58%   82C    P2             299W / 300W |   9489MiB / 49140MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000               Off | 00000000:21:0

In [2]:
!gpustat --debug


[1m[37mcil-hydra          [m  Thu Aug  8 21:20:07 2024  [1m[30m535.183.01[m
[36m[0][m [34mNVIDIA RTX A6000[m |[1m[31m 82°C[m, [1m[32m100 %[m | [36m[1m[33m 9489[m / [33m49140[m MB | [1m[30mongzhiyang[m([33m4740M[m) [1m[30mongzhiyang[m([33m4740M[m)
[36m[1][m [34mNVIDIA RTX A6000[m |[1m[31m 79°C[m, [1m[32m 99 %[m | [36m[1m[33m10758[m / [33m49140[m MB | [1m[30mongzhiyang[m([33m3582M[m) [1m[30mongzhiyang[m([33m3582M[m) [1m[30mongzhiyang[m([33m3582M[m)
[36m[2][m [34mNVIDIA RTX A6000[m |[1m[31m 72°C[m, [1m[32m 56 %[m | [36m[1m[33m 4746[m / [33m49140[m MB | [1m[30mongzhiyang[m([33m4740M[m)
[36m[3][m [34mNVIDIA RTX A6000[m |[1m[31m 51°C[m, [1m[32m 32 %[m | [36m[1m[33m 3588[m / [33m49140[m MB | [1m[30mongzhiyang[m([33m3582M[m)


In [3]:
import os

# Restrict CUDA to device index 1 for this kernel process; set it before any
# CUDA-using library is imported so the restriction takes effect.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

# Generate subset of training data

In [1]:
import random

# Input corpus, fraction of lines to keep, and destination of the sampled subset.
dataset_path = "/home/FYP/on0008an/bias-bench-main/data/wikipedia-10.txt"
sampling_ratio = 0.25
sampled_dataset_path = "/home/FYP/on0008an/bias-bench-main/data/wikipedia-10_sample.txt"

random.seed(42)  # Fixed seed so the same subset is drawn on every run

# Read the whole corpus into memory, one list entry per line.
# The encoding is explicit so the result does not depend on the platform's
# default locale encoding (the corpus is Wikipedia text with non-ASCII characters).
with open(dataset_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# The final line of a file may lack a trailing newline; normalise it so sampled
# lines can never be glued together when they are written back out.
lines = [line if line.endswith("\n") else line + "\n" for line in lines]

# Draw the subset without replacement; the result is in selection order,
# not in the original file order.
sampled_lines = random.sample(lines, int(len(lines) * sampling_ratio))

# Write sampled dataset
with open(sampled_dataset_path, "w", encoding="utf-8") as f:
    f.writelines(sampled_lines)

# Train models with CDA data

## All possible arguments:
- https://huggingface.co/docs/transformers/v4.41.3/en/main_classes/trainer#transformers.TrainingArguments
- ModelArguments (In the script)
- DataTrainingArguments (In the script)

## Lora explanation

- Instead of updating the pre-trained weights directly, LoRA keeps them frozen and learns the weight changes separately
- The changes are stored in 2 smaller (low-rank) matrices whose product has the same shape as the model's original weight matrix

Parameters:
- Rank: The shared inner dimension of the 2 smaller matrices (number of columns of one, number of rows of the other)
    - As rank increases, more parameters are fine-tuned
    - Downstream tasks intrinsically work well with low rank
    - However, complex tasks/behavior that contradicts the pre-training dataset may require higher rank
- Training all layers of the network is essential to match full-training performance
- Alpha: Scaling factor that is applied to the weight changes when adding to original weights, scale factor = Alpha/Rank
    - Microsoft Lora paper sets Alpha = 2 * rank
    - QLora paper sets Alpha = 1/4 * rank
- Dropout: Randomly set a fraction of the weight changes to zero to prevent overfitting
    - QLora paper sets dropout = 0.1 for 7B, 13B models, 0.05 for 33B, 65B models

# To get the most out of LoRA fine-tuning, we must train all layers of the network

Llama 2:
- Linear Layers: gate_proj, down_proj, up_proj, q_proj, v_proj, k_proj, and o_proj

```python
# If only targeting attention blocks of the model
target_modules = ["q_proj", "v_proj"]

# If targeting all linear layers
target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
```

# Following parameters from the batch script

- Cache takes around 3 runs to be built

## For GPT2

In [None]:
# GPT-2, gender CDA: train with the --lora flag on wikipedia-10.txt for 2000 steps
# (batch size 1 x 32 gradient-accumulation steps), saving a checkpoint every 100 steps
# to ../results/CDA_FT/gpt2/gender.
# NOTE(review): in Hugging Face TrainingArguments a positive --max_steps takes precedence
# over --num_train_epochs, so the epochs flag is presumably inert here -- confirm.
!python run_clm.py --model_name_or_path "gpt2" --tokenizer_name "gpt2" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "gender" --seed 42 --output_dir "../results/CDA_FT/gpt2/gender" --persistent_dir "/home/ongzhiyang/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora

Padding token: <|endoftext|>
Quantization enabled
Training target modules: ['query', 'value']
Applying counterfactual augmentation:  25%|▏| 1322000/5349612 [34:58<1:50:10, 60

In [None]:
# GPT-2, race CDA: same settings as the other GPT-2 LoRA runs (2000 steps, batch size 1 x 32
# gradient-accumulation steps, checkpoint every 100 steps); only the augmentation type
# and the output directory (../results/CDA_FT/gpt2/race) differ.
!python run_clm.py --model_name_or_path "gpt2" --tokenizer_name "gpt2" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "race" --seed 42 --output_dir "../results/CDA_FT/gpt2/race" --persistent_dir "/home/ongzhiyang/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora

In [None]:
!python run_clm.py --model_name_or_path "gpt2" --tokenizer_name "gpt2" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "religion" --seed 42 --output_dir "../results/CDA_FT/gpt2/religion" --persistent_dir "/home/ongzhiyang/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora

In [12]:
# GPT-2, gender CDA on the wikipedia-10_tiny.txt subset, without the --lora flag.
# --max_steps is capped at 500 so the run fits into the 6-hour training time frame
# (batch size 8 x 32 gradient-accumulation steps, checkpoint every 100 steps).
!python run_clm.py --model_name_or_path "gpt2" --do_train --train_file "../data/wikipedia-10_tiny.txt" --max_steps 500 --per_device_train_batch_size 8 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "gender" --seed 0 --output_dir "../results/CDA_FT/gpt2" --persistent_dir "/home/FYP/on0008an/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024

{'loss': 6.7748, 'grad_norm': 104.59912872314453, 'learning_rate': 4.99e-05, 'epoch': 0.01}
  0%|                                         | 1/500 [00:30<4:12:31, 30.36s/it]^C
Traceback (most recent call last):
  File "/home/FYP/on0008an/bias-bench-main/experiments/run_clm.py", line 790, in <module>
    main()
  File "/home/FYP/on0008an/bias-bench-main/experiments/run_clm.py", line 729, in main
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
  File "/home/FYP/on0008an/.conda/envs/RunJupyter/lib/python3.10/site-packages/transformers/trainer.py", line 1885, in train
    return inner_training_loop(
  File "/home/FYP/on0008an/.conda/envs/RunJupyter/lib/python3.10/site-packages/transformers/trainer.py", line 2216, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/FYP/on0008an/.conda/envs/RunJupyter/lib/python3.10/site-packages/transformers/trainer.py", line 3241, in training_step
    torch.cuda.empty_cache()
  File "/h

## To continue training from a checkpoint, set `--max_steps` to the number of steps already completed plus the number of additional steps you want to run

In [1]:
# Continue from checkpoint: resume the GPT-2 gender run from checkpoint-500 and train up to
# 1000 total steps (--max_steps includes the 500 steps already stored in the checkpoint).
# New checkpoints go to a separate directory (gpt2_CDA_training_seed0_1kstep), not the
# directory the checkpoint is loaded from.
!python run_clm.py --model_name_or_path "gpt2" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 1000 --per_device_train_batch_size 8 --gradient_accumulation_steps 32 --save_steps 500 --preprocessing_num_workers 1 --counterfactual_augmentation "gender" --seed 0 --output_dir "../results/gpt2_CDA_training_seed0_1kstep" --persistent_dir "/home/FYP/on0008an/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --resume_from_checkpoint "../results/gpt2_CDA_training_seed0/checkpoint-500"

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 67.07it/s]
{'loss': 0.6557, 'grad_norm': 0.2115873247385025, 'learning_rate': 2.495e-05, 'epoch': 0.12}
{'loss': 0.6238, 'grad_norm': 0.19264155626296997, 'learning_rate': 2.4900000000000002e-05, 'epoch': 0.12}
{'loss': 0.636, 'grad_norm': 0.3111065924167633, 'learning_rate': 2.485e-05, 'epoch': 0.12}
{'loss': 0.6642, 'grad_norm': 0.3562127351760864, 'learning_rate': 2.48e-05, 'epoch': 0.12}
{'loss': 0.6896, 'grad_norm': 0.30045801401138306, 'learning_rate': 2.4750000000000002e-05, 'epoch': 0.12}
{'loss': 0.6368, 'grad_norm': 0.35026121139526367, 'learning_rate': 2.47e-05, 'epoch': 0.12}
{'loss': 0.6551, 'grad_norm': 0.606468141078949, 'learning_rate': 2.465e-05, 'epoch': 0.12}
{'loss': 0.6384, 'grad_norm': 0.4467180371284485, 'learning_rate': 2.46e-05, 'epoch': 0.12}
{'loss': 0.6552, 'grad_norm': 0.3130481243133545, 'learning_rate': 2.455e-05, 'epoch': 0.12}
{'loss': 0.6287, 'grad_norm': 0.4406903684139251

## Phi 2.0

## Gender

In [None]:
# Phi-2, gender CDA: train with the --lora flag on wikipedia-10.txt for 2000 steps
# (batch size 1 x 32 gradient-accumulation steps, block size 1024), saving a checkpoint
# every 100 steps to ../results/CDA_FT/phi2/gender.
!python run_clm.py --model_name_or_path "microsoft/phi-2" --tokenizer_name "microsoft/phi-2" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "gender" --seed 42 --output_dir "../results/CDA_FT/phi2/gender" --persistent_dir "/home/ongzhiyang/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora

## Religion

In [None]:
# Phi-2, religion CDA (foreground run): 2000 steps with the --lora flag, batch size 1 x 32
# gradient-accumulation steps, checkpoint every 100 steps to ../results/CDA_FT/phi2/religion.
!python run_clm.py --model_name_or_path "microsoft/phi-2" --tokenizer_name "microsoft/phi-2" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "religion" --seed 42 --output_dir "../results/CDA_FT/phi2/religion" --persistent_dir "/home/ongzhiyang/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora

In [None]:
# Phi-2, religion CDA, pinned to GPU 2 for this command only and launched in the background
# with nohup ... & so that it is not tied to the notebook cell.
# NOTE(review): no output redirection is given, so the training log presumably ends up in
# nohup's default output file in the working directory -- confirm where it is written.
!CUDA_VISIBLE_DEVICES=2 nohup python run_clm.py --model_name_or_path "microsoft/phi-2" --tokenizer_name "microsoft/phi-2" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "religion" --seed 42 --output_dir "../results/CDA_FT/phi2/religion" --persistent_dir "/home/ongzhiyang/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora &

## Race

In [None]:
# Phi-2, race CDA, pinned to GPU 2 for this command only and launched in the background
# with nohup ... &; checkpoints every 100 steps go to ../results/CDA_FT/phi2/race.
# NOTE(review): no output redirection is given, so the training log presumably ends up in
# nohup's default output file in the working directory -- confirm where it is written.
!CUDA_VISIBLE_DEVICES=2 nohup python run_clm.py --model_name_or_path "microsoft/phi-2" --tokenizer_name "microsoft/phi-2" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "race" --seed 42 --output_dir "../results/CDA_FT/phi2/race" --persistent_dir "/home/ongzhiyang/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora &

## For Llama 2, the maximum block size (sequence length) is 1024; larger values cause an out-of-memory (OOM) error

## Gender

In [None]:
# Llama-2-7B, gender CDA: train with the --lora flag on wikipedia-10.txt for 2000 steps
# (batch size 1 x 32 gradient-accumulation steps, block size 1024), saving a checkpoint
# every 100 steps to ../results/CDA_FT/llama2/gender.
!python run_clm.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --tokenizer_name "meta-llama/Llama-2-7b-hf" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "gender" --seed 42 --output_dir "../results/CDA_FT/llama2/gender" --persistent_dir "/home/FYP/on0008an/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora



In [None]:
# Continue from checkpoint: resume the Llama-2 gender run from checkpoint-900 in the same
# output directory. --max_steps stays at 2000 because it is the total step count,
# including the 900 steps already stored in the checkpoint.
!python run_clm.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --tokenizer_name "meta-llama/Llama-2-7b-hf" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "gender" --seed 42 --output_dir "../results/CDA_FT/llama2/gender" --persistent_dir "/home/FYP/on0008an/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora --resume_from_checkpoint "../results/CDA_FT/llama2/gender/checkpoint-900"

Padding token: </s>
Quantization enabled
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:06<00:00,  3.45s/it]

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
{'loss': 0.3438, 'grad_norm': 0.05251160264015198, 'learning_rate': 2.7475e-05, 'epoch': 0.02}
 45%|██████████████████                      | 901/2000 [01:33<01:53,  9.65it/s]

## Race

In [None]:
# Llama-2-7B, race CDA: train with the --lora flag on wikipedia-10.txt for 2000 steps
# (batch size 1 x 32 gradient-accumulation steps, block size 1024), saving a checkpoint
# every 100 steps to ../results/CDA_FT/llama2/race.
!python run_clm.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --tokenizer_name "meta-llama/Llama-2-7b-hf" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "race" --seed 42 --output_dir "../results/CDA_FT/llama2/race" --persistent_dir "/home/ongzhiyang/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora

In [None]:
# Continue the Llama-2 race run from a checkpoint.
# This cell belongs to the Race section, so the augmentation type, output directory and
# checkpoint path all point at the race run. Pointing them at another run's directory
# together with --overwrite_output_dir would retrain and overwrite that run's checkpoints.
# Set the checkpoint number to the latest one saved under ../results/CDA_FT/llama2/race;
# --max_steps is the total step count, including the steps already in the checkpoint.
!python run_clm.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --tokenizer_name "meta-llama/Llama-2-7b-hf" --do_train --train_file "../data/wikipedia-10.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "race" --seed 42 --output_dir "../results/CDA_FT/llama2/race" --persistent_dir "/home/FYP/on0008an/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora --resume_from_checkpoint "../results/CDA_FT/llama2/race/checkpoint-200"



## Religion

In [None]:
# Llama-2-7B, religion CDA: unlike the gender and race runs, this one trains on
# wikipedia-10_sample.txt (the file name written by the 25% sampling cell) rather than the
# full wikipedia-10.txt. Other settings match: 2000 steps with the --lora flag, batch size
# 1 x 32 gradient-accumulation steps, checkpoint every 100 steps to ../results/CDA_FT/llama2/religion.
!python run_clm.py --model_name_or_path "meta-llama/Llama-2-7b-hf" --tokenizer_name "meta-llama/Llama-2-7b-hf" --do_train --train_file "../data/wikipedia-10_sample.txt" --max_steps 2000 --per_device_train_batch_size 1 --gradient_accumulation_steps 32 --save_steps 100 --preprocessing_num_workers 1 --counterfactual_augmentation "religion" --seed 42 --output_dir "../results/CDA_FT/llama2/religion" --persistent_dir "/home/FYP/on0008an/bias-bench-main" --num_train_epochs 3 --overwrite_output_dir --logging_strategy "steps" --logging_steps 1 --block_size 1024 --lora