## Imports & Configs

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in this notebook live under that mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
# Location of the fine-tuning dataset CSV on the mounted Google Drive.
finetune_dataset_path = "/content/drive/My Drive/synthetic_dataset/sales_dataset/finetune_dataset.csv"
# Source frame for the whole notebook; later cells read its post_tone, post_length,
# post_type, user_prompt and system_prompt columns.
finetune_dataset_df = pd.read_csv(finetune_dataset_path)

## Create Dataset Used To Generate Baseline Responses

In [None]:
# Seed handed to every DataFrame.sample call in this section so the sampled rows are repeatable.
random_state = 7

In [None]:
# Collects the index labels of every sampled row. The sampling loops below extend this list,
# so re-run this cell before re-running them; otherwise labels accumulate across runs.
baseline_indexes = []

In [None]:
# Distinct tone labels to sample from. Missing values are dropped first: a NaN label never
# satisfies the `== tone` filter used by the sampling loop, so it would yield an empty
# subset and make `sample(5)` raise.
post_tones = finetune_dataset_df["post_tone"].dropna().unique()

In [None]:
# Draw up to 5 rows for each tone and remember their index labels.
for tone in post_tones:
  subset_df = finetune_dataset_df[finetune_dataset_df["post_tone"] == tone]
  # Cap the draw at the subset size: sampling without replacement raises ValueError
  # when more rows are requested than the subset contains (e.g. a rare tone).
  sample_size = min(5, len(subset_df))
  subset_random = subset_df.sample(sample_size, random_state = random_state).index
  baseline_indexes.extend(subset_random)

In [None]:
# Distinct length labels to sample from. Missing values are dropped first: a NaN label never
# satisfies the `== length` filter used by the sampling loop, so it would yield an empty
# subset and make `sample(5)` raise.
post_lengths = finetune_dataset_df["post_length"].dropna().unique()

In [None]:
# Draw up to 5 rows for each post length and remember their index labels.
for length in post_lengths:
  subset_df = finetune_dataset_df[finetune_dataset_df["post_length"] == length]
  # Cap the draw at the subset size: sampling without replacement raises ValueError
  # when more rows are requested than the subset contains (e.g. a rare length).
  sample_size = min(5, len(subset_df))
  subset_random = subset_df.sample(sample_size, random_state = random_state).index
  baseline_indexes.extend(subset_random)

In [None]:
# Distinct type labels to sample from. Missing values are dropped first: a NaN label never
# satisfies the `== _type` filter used by the sampling loop, so it would yield an empty
# subset and make `sample(5)` raise.
post_types = finetune_dataset_df["post_type"].dropna().unique()

In [None]:
# Draw up to 5 rows for each post type and remember their index labels.
for _type in post_types:
  subset_df = finetune_dataset_df[finetune_dataset_df["post_type"] == _type]
  # Cap the draw at the subset size: sampling without replacement raises ValueError
  # when more rows are requested than the subset contains (e.g. a rare type).
  sample_size = min(5, len(subset_df))
  subset_random = subset_df.sample(sample_size, random_state = random_state).index
  baseline_indexes.extend(subset_random)

In [None]:
# baseline_indexes holds index *labels* (gathered through `.sample(...).index`), so the rows
# must be selected with label-based `.loc`. Positional `.iloc` only returns the same rows
# while the frame keeps a default 0..n-1 index.
# A row drawn under more than one attribute appears once per draw.
baseline_df = finetune_dataset_df.loc[baseline_indexes].copy()

In [None]:
# System prompt reused for every baseline prompt, taken from the first row by position.
# `.iloc[0]` is used because `series[0]` is a label lookup and only means "first row"
# when the index starts at 0.
# NOTE(review): assumes every row carries the same system prompt — confirm against the dataset.
system_prompt = finetune_dataset_df["system_prompt"].iloc[0]

You are an expert sales copywriting assistant for social media who understands the attention economy and modern digital advertisment.
<class 'str'>


## Load Llama 3 8b Instruct 4 Bit & Generate Baseline Responses

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# NOTE(review): Unsloth is installed straight from its git repository with no tag or commit
# pinned, so a later run may resolve to a different version than the one logged below.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install only the listed packages, without resolving their own dependencies.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch
# Settings forwarded to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the Llama 3 8B Instruct checkpoint and its tokenizer; both are used by the
# prompt-building and generation cells further down.
# NOTE(review): the checkpoint name suggests a pre-quantized bitsandbytes 4-bit build — confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Load & Process Baseline Prompts For Baseline Response Generation

In [None]:
# Build one chat-formatted prompt string per baseline row.
# batch_prompts  -> prompt text handed to the tokenizer in the generation step
# prompt_lengths -> character length of each rendered prompt, measured before the BOS strip
bos_token = tokenizer.bos_token or ""
batch_prompts = []
prompt_lengths = []
for user_prompt in baseline_df["user_prompt"]:
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
    )

    prompt_lengths.append(len(prompt))

    # apply_chat_template prepends the beginning-of-text special token (<|begin_of_text|>).
    # Per the original author's note the token is added a second time at inference, so it
    # is removed here to avoid a duplicate. It is stripped only when actually present,
    # instead of blindly dropping the first 17 characters, so a template that stops
    # emitting it cannot truncate real prompt text.
    if bos_token and prompt.startswith(bos_token):
        prompt = prompt[len(bos_token):]
    batch_prompts.append(prompt)


## Generate Baseline Responses

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Seed torch before generating so the baseline completions can be regenerated.
# NOTE(review): this only matters when the model's generation config samples tokens
# (do_sample=True); it is harmless under greedy decoding.
torch.manual_seed(random_state)
# Tokenize all prompts as one padded batch and move the tensors to the GPU.
inputs = tokenizer(batch_prompts, return_tensors = "pt", padding = True).to("cuda")

# Generate at most 500 new tokens per prompt.
outputs = model.generate(**inputs, max_new_tokens = 500, use_cache = True)
# Special tokens are kept in the decoded text on purpose: the post-processing cell
# locates the assistant header and <|eot_id|> markers to cut out the completion.
responses = tokenizer.batch_decode(outputs)

In [None]:
# Sanity check: number of decoded responses (one per entry in batch_prompts).
print(len(responses))

95


## Process Baseline Responses To Remove Special Tokens

In [None]:
# Marker that precedes the assistant's reply in a decoded response.
ASSISTANT_HEADER = "<|start_header_id|>assistant<|end_header_id|>"
# Marker that terminates a turn.
END_OF_TURN = "<|eot_id|>"


def extract_completion(response):
  """Return the assistant's reply from one decoded response string.

  The reply is the text after the first assistant header and before the next
  <|eot_id|> marker, with surrounding whitespace removed. If no <|eot_id|>
  follows the header, everything after the header is returned.

  If the assistant header is missing, the whole response (stripped) is
  returned, so the anomaly stays visible in the saved dataset. Previously this
  case either raised NameError (first row) or silently reused the previous
  row's completion.
  """
  _, header, completion = response.partition(ASSISTANT_HEADER)
  if not header:
    return response.strip()
  completion, _, _ = completion.strip().partition(END_OF_TURN)
  return completion.strip()


# One completion per decoded response, in the same order as `responses`.
baseline_completions = [extract_completion(response) for response in responses]



In [None]:
# Attach the parsed completions as a new column. The list is in row order because the
# prompts were built by iterating baseline_df["user_prompt"] and never reordered.
baseline_df["completion"] = baseline_completions

In [None]:
# Print the first five stored completions, one per line, as a quick visual check.
preview = baseline_df["completion"].head(5)
for text in preview:
  print(text)

"Discover the future of science education with VirtualLab Science Simulations! Unlike traditional textbooks, our simulations bring the lab to life, allowing students to conduct virtual experiments and explore complex concepts in a safe and engaging environment. Compare our simulations to traditional teaching methods and experience the difference for yourself! #EduFuture #VirtualLab #ScienceSimulations"
"Take your audio experience to the next level with SoundSync Wireless Transmitter from PureSound Audio! As a trusted name in the industry since 2012, we've crafted a wireless transmitter that delivers superior sound quality and seamless connectivity for professionals and audiophiles alike. Say goodbye to tangled cords and hello to freedom with our reliable and durable wireless solution. Upgrade your audio setup today and experience the difference PureSound Audio brings to the table."
"Stay cool and comfortable all day, every day! Introducing our CoolDry T-Shirts, featuring advanced moist

## Save Baseline Dataset

In [None]:
# Destination CSV for the baseline rows (sampled prompts plus generated completions),
# stored in the same Drive folder as the fine-tuning dataset.
baseline_dataset_path = "/content/drive/My Drive/synthetic_dataset/sales_dataset/baseline_dataset.csv"
# index = False: the DataFrame index is not written out as a column.
baseline_df.to_csv(baseline_dataset_path, index = False)