## Imports

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install groq

In [None]:
from groq import Groq
import pandas as pd
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import FastLanguageModel
import torch
from google.colab import drive
import copy
import time
import os
import matplotlib.pyplot as plt
import gc
from google.colab import userdata

# Read the Groq API key from Colab's `userdata` store (entry 'groq_api_2') instead of hard-coding it.
groq_api_key = userdata.get('groq_api_2')
groq_client = Groq(api_key=groq_api_key)
# Model id passed as `model` to ask_llm when scoring baseline vs finetuned completions.
llama70b = "llama3-70b-8192"

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


## Load finetune data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths below live under '/content/drive/My Drive/...'.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the finetuning CSV on the mounted Drive.
finetune_data_path = '/content/drive/My Drive/synthetic_dataset/sales_dataset/finetune_dataset.csv'

In [None]:
# Full finetuning set. Later cells read the columns 'system_prompt', 'user_prompt', 'completion',
# 'post_tone', 'post_length' and 'post_type' from this frame.
finetune_data = pd.read_csv(finetune_data_path)

In [None]:
# #sanity
# print(finetune_data.head(1))

In [None]:
# Takes the system prompt from the first row; assumes every row shares the same one — TODO confirm.
# `system_prompt` is read as a module-level global by create_scoring_prompts,
# format_prompt_for_finetune and the experiment loop, so this cell must run before them.
system_prompt = finetune_data['system_prompt'][0]
print(system_prompt)

You are an expert sales copywriting assistant for social media who understands the attention economy and modern digital advertisement.


### Load baseline data

In [None]:
# Location of the baseline CSV on the mounted Drive.
baseline_path = '/content/drive/My Drive/synthetic_dataset/sales_dataset/baseline_dataset.csv'

In [None]:
# Baseline prompts and completions. Later cells read the 'user_prompt' and 'completion' columns
# and copy this frame once per experiment before adding result columns.
baseline_df = pd.read_csv(baseline_path)

### Create scoring prompts & configs for scoring baseline responses against finetuned responses

In [None]:
def create_scoring_prompts(baseline_df):
    """Build one judge prompt per row comparing the baseline and finetuned completions.

    Args:
        baseline_df: DataFrame with 'user_prompt', 'completion' (baseline) and
            'finetune_completion' columns.

    Returns:
        list[str]: one scoring prompt per row, in row order. Each prompt embeds the
        module-level `system_prompt`, the row's user prompt, and both completions
        (completion 1 = baseline, completion 2 = finetune).
    """

    def _render(user_prompt, completion1, completion2):
        # The template text is part of the experiment protocol; keep it verbatim.
        return f"""
You are tasked with evaluating the quality of two text-only social media sales posts based on the system's prompt, user's prompt, and the completions provided. Please use the following criteria to determine which post is overall better and provide a concise explanation of your choice.

### System Prompt:
"{system_prompt}"

### User Prompt:
"{user_prompt}"

### Completion 1:
"{completion1}"

### Completion 2:
"{completion2}"

Evaluate each completion based on these criteria:

1. **Message Clarity**: Core message clarity
2. **Brevity**: Conciseness
3. **Language Simplicity**: Ease of understanding
4. **Call to Action (CTA)**: Compelling CTA
5. **Emotional Appeal**: Emotional resonance
6. **Target Audience Fit**: Relevance to audience
7. **Timeliness**: Alignment with trends/events
8. **Tone of Voice Consistency**: Consistency with brand voice
9. **Benefit Highlighting**: Communication of benefits
10. **Unique Selling Point (USP)**: Effective communication of USP
11. **Creativity and Originality**: Uniqueness/creativity
12. **Social Proof**: Inclusion of testimonials/reviews
13. **Hashtag and Keyword Usage**: Effective use of hashtags/keywords
14. **Transparency**: Disclosure of partnerships/sponsorships
15. **Honesty**: Honesty in claims

### FORMAT RULES
1. Use special tokens [START] and [END] as instructed. Responses should be between these tokens.
2. Replace variables in **** with your outputs, excluding the ****.
3. Output only the best completion and a concise explanation. No extra text or labels.

### REQUIRED OUTPUT
[START_PICK]
'Baseline completion' or 'Finetune completion' (completion 1 = baseline, completion 2 = finetune)
[END_PICK]

[START_EXPLANATION]
Explanation here (focus on key reasons for your choice).
[END_EXPLANATION]
"""

    prompt_columns = (
        baseline_df['user_prompt'],
        baseline_df['completion'],
        baseline_df['finetune_completion'],
    )
    return [_render(*row_values) for row_values in zip(*prompt_columns)]

In [None]:
# we will be using llama3 70b to score the baseline prompts vs finetuned prompts
def ask_llm(prompts, client, model):
    """Send each prompt to a chat-completions client and collect the replies.

    Args:
        prompts: iterable of prompt strings; each is sent as a single user message.
        client: object exposing `chat.completions.create(...)` (here, a Groq client).
        model: model id forwarded to the API.

    Returns:
        list[str]: one reply per prompt, in order, with all newline characters removed.
        A reply whose message content is missing (None) is returned as an empty string.
    """
    responses = []
    for prompt in prompts:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=2000
        )

        # The SDK types `message.content` as optional; calling .replace on None would
        # abort the whole scoring pass, so treat a missing body as an empty reply.
        content = response.choices[0].message.content or ""
        # Newlines are dropped (not replaced by spaces) so each reply is a single line.
        responses.append(content.replace('\n', ''))

    return responses

In [None]:
# (start, end) marker pairs that the scoring prompt asks the judge model to emit.
# Order matters: the experiment loop treats index 0 as the pick and index 1 as the explanation.
split_tokens = [
    ("[START_PICK]", "[END_PICK]"),
    ("[START_EXPLANATION]", "[END_EXPLANATION]")
]


### Experiment configs

In [None]:
# (experiment name, fraction of finetune_data to keep). None means "use the full dataset".
_subset_plan = [
    ("100%_finetune_data", None),
    ("50%_finetune_data", .5),
    ("10%_finetune_data", .1),
    ("1%_finetune_data", .01),
    ("0.5%_finetune_data", .005),
]

# One config dict per experiment; "num_rows" is the truncated (int) expected row count.
experiments = [
    {
        "experiment_name" : name,
        "subset_required" : fraction is not None,
        "subset_%" : fraction,
        "num_rows" : int(len(finetune_data)) if fraction is None else int(len(finetune_data) * fraction),
    }
    for name, fraction in _subset_plan
]

### Create experiment datasets

In [None]:
from sklearn.model_selection import train_test_split

# Attach an "experiment_dataset" DataFrame to every experiment config.
for experiment in experiments:
  experiment_data = finetune_data.copy()

  # The 100% experiment trains on an untouched copy of the full dataset.
  if not experiment["subset_required"]:
    experiment["experiment_dataset"] = experiment_data
    continue

  # Join tone, length and type into one label so the split preserves their joint distribution.
  experiment_data['stratify_col'] = experiment_data['post_tone'].astype(str).str.cat(
      [experiment_data['post_length'].astype(str), experiment_data['post_type'].astype(str)],
      sep='_',
  )

  # train_test_split keeps the "train" part, so the share to throw away is the test size.
  # NOTE(review): a stratified split needs at least one kept row per distinct stratify label;
  # confirm the smallest subset (0.5%) satisfies that for this dataset.
  test_size = 1 - experiment["subset_%"]
  kept_rows, _ = train_test_split(
      experiment_data,
      test_size=test_size,
      stratify=experiment_data['stratify_col'],
      random_state=42,
  )

  # Remove the helper column so the subset has the same columns as the original data.
  subset_df = kept_rows.drop(columns=['stratify_col'])
  experiment["experiment_dataset"] = subset_df

In [None]:
# Report how many rows each experiment will train on.
for experiment in experiments:
  dataset_size = len(experiment['experiment_dataset'])
  print(f"Name : {experiment['experiment_name']} : size {dataset_size}")

Name : 50%_finetune_data : size 8800
Name : 10%_finetune_data : size 1760
Name : 1%_finetune_data : size 176


In [None]:
# Double check the subset data maintains the same distribution as the full dataset.
# Counts are matched by category label: the two value_counts() results are each sorted by
# their own frequencies, so pairing them positionally can line up different categories when
# counts tie, and would silently skip a category that is absent from the subset.
for experiment in experiments:
  print(f"EXPERIMENT: {experiment['experiment_name']} \n\n")
  for column in experiment["experiment_dataset"].columns:
    # Free-text columns have no meaningful category distribution to compare.
    if column in ["system_prompt", "user_prompt", "completion", "finetune_text"]:
      continue

    print(f"COLUMN: {column}")
    full_dataset_value_counts = finetune_data[column].value_counts()
    # Re-index the subset counts onto the full dataset's categories; missing ones count as 0.
    subset_value_counts = (
        experiment["experiment_dataset"][column]
        .value_counts()
        .reindex(full_dataset_value_counts.index, fill_value=0)
    )
    for key, full_value, subset_value in zip(full_dataset_value_counts.index, full_dataset_value_counts, subset_value_counts):
      print(f"value: {key} | Full finetune count = {full_value} | Subset count = {subset_value} | Subset % = {float(int(subset_value) / int(full_value))}")
    print("\n")


EXPERIMENT: 50%_finetune_data 


COLUMN: post_tone
value: Witty | Full finetune count = 3520 | Subset count = 1760 | Subset % = 0.5
value: Professional | Full finetune count = 3520 | Subset count = 1760 | Subset % = 0.5
value: Trustworthy | Full finetune count = 3520 | Subset count = 1760 | Subset % = 0.5
value: Inspirational | Full finetune count = 3520 | Subset count = 1760 | Subset % = 0.5
value: Enthusiastic | Full finetune count = 3520 | Subset count = 1760 | Subset % = 0.5


COLUMN: post_length
value: 1-3 sentences | Full finetune count = 8000 | Subset count = 4000 | Subset % = 0.5
value: 4-6 sentences | Full finetune count = 6400 | Subset count = 3200 | Subset % = 0.5
value: 7+ sentences | Full finetune count = 3200 | Subset count = 1600 | Subset % = 0.5


COLUMN: post_type
value: Interactive Posts | Full finetune count = 1600 | Subset count = 800 | Subset % = 0.5
value: Comparison Posts | Full finetune count = 1600 | Subset count = 800 | Subset % = 0.5
value: User-Generated Con

### Helper functions for running experiments



In [None]:
def format_prompt_for_finetune(row):
  """Render one training example as a chat-formatted string.

  Args:
      row: mapping/Series with 'user_prompt' and 'completion' entries.

  Returns:
      Whatever the module-level `tokenizer.apply_chat_template` returns for the
      system/user/assistant conversation, called with tokenize=False and
      add_generation_prompt=False. Relies on the module-level `system_prompt`.
  """
  user_text = row['user_prompt']
  assistant_text = row['completion']

  turns = (
      ("system", system_prompt),
      ("user", user_text),
      ("assistant", assistant_text),
  )
  conversation = [{"role" : speaker, "content" : text} for speaker, text in turns]

  return tokenizer.apply_chat_template(conversation, add_generation_prompt = False, tokenize = False)

In [None]:
def save_results_model(save_path, results_df, model, tokenizer, meta_results_df):
    """Persist one experiment's outputs under `save_path`.

    Writes `results.csv` and `meta_results.csv` (both without the index) into
    `save_path`, then calls `save_pretrained` on the model and the tokenizer with
    the `model` sub-directory. Directories are created if they do not exist.

    Args:
        save_path: target directory for this experiment.
        results_df: per-prompt results frame.
        model: object exposing `save_pretrained(path)`.
        tokenizer: object exposing `save_pretrained(path)`.
        meta_results_df: single-experiment summary frame.
    """
    os.makedirs(save_path, exist_ok=True)

    # CSV outputs: (file name, frame) pairs written side by side in save_path.
    csv_outputs = (
        ("results.csv", results_df),
        ("meta_results.csv", meta_results_df),
    )
    for filename, frame in csv_outputs:
        frame.to_csv(os.path.join(save_path, filename), index=False)

    # Model weights and tokenizer share one sub-directory.
    model_save_path = os.path.join(save_path, "model")
    os.makedirs(model_save_path, exist_ok=True)
    for artifact in (model, tokenizer):
        artifact.save_pretrained(model_save_path)


In [None]:
def load_model_tokenizer(model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", load_in_4bit = True, max_seq_length = 2048):
    """Load a base model + tokenizer and wrap the model with LoRA adapters.

    Args:
        model_name: model id passed to `FastLanguageModel.from_pretrained`.
        load_in_4bit: forwarded to `from_pretrained`.
        max_seq_length: forwarded to `from_pretrained`.

    Returns:
        (model, tokenizer): the result of `FastLanguageModel.get_peft_model` and the
        tokenizer returned by `from_pretrained`.
    """
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = None,  # None lets the library pick the dtype
        load_in_4bit = load_in_4bit,
    )

    # LoRA settings, fixed for every experiment so runs stay comparable.
    lora_settings = dict(
        r = 16,
        target_modules = [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha = 16,
        lora_dropout = 0,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        random_state = 3407,
        use_rslora = False,
        loftq_config = None,
    )
    peft_model = FastLanguageModel.get_peft_model(base_model, **lora_settings)

    return peft_model, tokenizer

## Run Experiments

In [None]:
def _extract_between(text, start_token, end_token):
    """Return the stripped text found between `start_token` and `end_token`.

    If `start_token` is absent the search for `end_token` starts from the whole text;
    if `end_token` is absent everything after the start is kept. The result therefore
    always derives from `text` itself and never from a previously parsed response.
    """
    extracted = text
    start_idx = text.find(start_token)
    if start_idx != -1:
        extracted = text[start_idx + len(start_token):].strip()

    end_idx = extracted.find(end_token)
    if end_idx != -1:
        extracted = extracted[:end_idx].strip()

    return extracted


# For every (training budget, dataset size) pair: load a fresh model, finetune it, generate
# completions for the baseline prompts, have the judge model pick between baseline and
# finetuned completions, save everything to Drive, then release GPU memory.
for num_steps in [50, 150, "full_finetune"]:
    for experiment in experiments:
        init_time = time.time()
        max_seq_length = 2048
        model, tokenizer = load_model_tokenizer(max_seq_length = max_seq_length)
        experiment_name = experiment['experiment_name'] + "_" + str(num_steps) + "_finetune_steps"
        save_path = f"/content/drive/My Drive/synthetic_dataset/sales_dataset/experiments/{experiment_name}"

        # copy the baseline df for each experiment so we can compare the baseline response with the finetuned response
        experiment_baseline_df = baseline_df.copy()

        # Work on a copy so the dataset stored in the experiment config is not mutated between runs.
        experiment_data = experiment["experiment_dataset"].copy()

        # process the data for finetuning
        experiment_data["finetune_text"] = experiment_data.apply(format_prompt_for_finetune, axis = 1)

        # convert the dataset from pandas dataframe to Dataset object
        dataset = Dataset.from_pandas(experiment_data)

        # "full_finetune" means one pass over the data; otherwise train for a fixed number of steps.
        if num_steps == "full_finetune":
            schedule_kwargs = {"num_train_epochs": 1}
        else:
            schedule_kwargs = {"max_steps": num_steps}

        training_args = TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_steps=5,
            learning_rate=2e-4,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3407,
            output_dir="outputs",
            **schedule_kwargs,
        )

        # init the model trainer
        trainer = SFTTrainer(
            model = model,
            tokenizer = tokenizer,
            train_dataset = dataset,
            dataset_text_field = "finetune_text",
            max_seq_length = max_seq_length,
            dataset_num_proc = 2,
            packing = False,
            args = training_args,
        )

        print("\n\nTRAINING MODEL\n\n")
        # train the model and time only the training call
        start_time = time.time()
        trainer.train()
        end_time = time.time()
        finetune_time = round(end_time - start_time, 2)

        finetune_steps = trainer.state.global_step
        print(f"Completed finetune steps: {finetune_steps}")

        # Extract training loss values and steps (only log entries that carry a 'loss')
        loss_logs = [log for log in trainer.state.log_history if 'loss' in log]
        loss_values = [log['loss'] for log in loss_logs]
        steps = [log['step'] for log in loss_logs]

        # The chat template adds the begin-of-text token and it is added again when the prompt
        # is tokenized for inference, so strip it here to avoid a duplicate. Only strip when the
        # prompt really starts with it, instead of cutting a fixed number of characters.
        bos_token = tokenizer.bos_token
        baseline_prompts = []
        for user_prompt in experiment_baseline_df["user_prompt"]:
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]

            prompt = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )

            if bos_token and prompt.startswith(bos_token):
                prompt = prompt[len(bos_token):]

            baseline_prompts.append(prompt)

        print("GENERATING FINETUNED RESPONSES FOR BASELINE PROMPTS\n\n")
        # generate finetuned responses for all of the baseline prompts
        FastLanguageModel.for_inference(model)
        inputs = tokenizer(baseline_prompts, return_tensors = "pt", padding = True).to("cuda")
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens = 500, use_cache = True)

        finetuned_responses = tokenizer.batch_decode(outputs)

        # keep only the assistant turn: text after the assistant header and before the end-of-turn token
        finetune_completions = [
            _extract_between(response, "<|start_header_id|>assistant<|end_header_id|>", "<|eot_id|>")
            for response in finetuned_responses
        ]

        # save the finetuned responses to the experiment baseline dataframe for comparison
        experiment_baseline_df["finetune_completion"] = finetune_completions

        scoring_prompts = create_scoring_prompts(experiment_baseline_df)

        print("SCORING FINETUNED RESPONSES TO BASELINE PROMPTS\n\n")
        # score the baseline responses vs the finetuned responses
        scoring_responses = ask_llm(scoring_prompts, groq_client, llama70b)

        # parse the scoring responses: split_tokens[0] delimits the pick, split_tokens[1] the explanation
        pick_start, pick_end = split_tokens[0]
        explanation_start, explanation_end = split_tokens[1]
        best_completions = [
            _extract_between(response, pick_start, pick_end) for response in scoring_responses
        ]
        explanations = [
            _extract_between(response, explanation_start, explanation_end) for response in scoring_responses
        ]

        # add the experiment results
        experiment_baseline_df["best_completion"] = best_completions
        experiment_baseline_df['explanation'] = explanations

        best_completion_counts = experiment_baseline_df["best_completion"].value_counts()
        raw_counts = best_completion_counts.to_dict()
        finetune_count = best_completion_counts.get("Finetune completion", 0)
        baseline_count = best_completion_counts.get("Baseline completion", 0)
        # ratio is undefined when the judge never picked the baseline
        ratio = finetune_count / baseline_count if baseline_count > 0 else None

        ending_time = time.time()
        total_runtime = round(ending_time - init_time, 2)

        # one-row summary of this experiment; raw_counts is stored as its string form
        meta_results = pd.DataFrame([{
            "loss_values": loss_values,
            "training_steps": steps,
            "time_to_finetune": finetune_time,
            "finetune_count": finetune_count,
            "baseline_count": baseline_count,
            "raw_counts": str(raw_counts),
            "finetune_over_baseline_ratio": ratio,
            "experiment_name": experiment_name,
            "subset_size": experiment['subset_%'],
            "num_rows": experiment['num_rows'],
            "total_runtime": total_runtime,
            "finetune_steps": finetune_steps,
        }])

        print("SAVING RESULTS & MODEL\n\n")
        save_results_model(save_path, experiment_baseline_df, model, tokenizer, meta_results)

        # CLEAR GPU RAM FOR NEXT RUN
        for param in model.parameters():
            param.grad = None

        model.to('cpu')

        # Drop the trainer's references to optimizer state, scheduler, logs, callbacks and data.
        for attr_name in ("optimizer", "lr_scheduler", "state", "callback_handler", "train_dataset"):
            if hasattr(trainer, attr_name):
                delattr(trainer, attr_name)

        # Now delete the trainer itself and every other per-run object
        del model, tokenizer, trainer, inputs, outputs, experiment_baseline_df
        del finetune_completions, finetuned_responses, baseline_prompts, dataset
        del scoring_prompts, scoring_responses, best_completions, explanations
        del loss_values, loss_logs, steps, meta_results, experiment_data, training_args

        # Collect garbage first so freed tensors can actually be returned by empty_cache().
        gc.collect()
        torch.cuda.empty_cache()

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Map (num_proc=2):   0%|          | 0/8800 [00:00<?, ? examples/s]



TRAINING MODEL




==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 8,800 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 5,500
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.4111
2,2.6786
3,2.3936
4,2.3016
5,2.2699
6,1.9396
7,1.7245
8,1.5364
9,1.3852
10,1.2429


5500
<class 'int'>
GENERATING FINETUNED RESPONSES FOR BASELINE PROMPTS


SCORING FINETUNED RESPONSES TO BASELINE PROMPTS


SAVING RESULTS & MODEL


==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map (num_proc=2):   0%|          | 0/1760 [00:00<?, ? examples/s]



TRAINING MODEL




==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,760 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,100
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.6117
2,2.4299
3,2.5911
4,2.3783
5,2.1722
6,1.9385
7,1.6772
8,1.6436
9,1.3727
10,1.2574


1100
<class 'int'>
GENERATING FINETUNED RESPONSES FOR BASELINE PROMPTS


SCORING FINETUNED RESPONSES TO BASELINE PROMPTS


SAVING RESULTS & MODEL


==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map (num_proc=2):   0%|          | 0/176 [00:00<?, ? examples/s]



TRAINING MODEL




==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 176 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 110
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.4943
2,2.3837
3,2.4531
4,2.4983
5,2.1812
6,1.8243
7,1.8984
8,1.6128
9,1.4619
10,1.1662


110
<class 'int'>
GENERATING FINETUNED RESPONSES FOR BASELINE PROMPTS


SCORING FINETUNED RESPONSES TO BASELINE PROMPTS


SAVING RESULTS & MODEL


