## Install and Import Packages

In [1]:
from transformers import AutoTokenizer, PreTrainedTokenizer
from typing import Dict

from predibase import PredibaseClient
from datasets import load_dataset

import pandas as pd
import numpy as np

import json
import pprint

  from .autonotebook import tqdm as notebook_tqdm


# Download Base Dataset From HuggingFace


In [2]:
base_dataset: pd.DataFrame = load_dataset("gbharti/finance-alpaca", split="train").to_pandas()
# base_dataset = base_dataset[["instruction", "category", "intent", "response"]]

In [3]:
base_dataset.columns

Index(['instruction', 'output', 'text', 'input'], dtype='object')

In [4]:
base_dataset.head(n=10)

Unnamed: 0,instruction,output,text,input
0,"For a car, what scams can be plotted with 0% f...",The car deal makes money 3 ways. If you pay in...,,
1,Why does it matter if a Central Bank has a neg...,"That is kind of the point, one of the hopes is...",,
2,Where should I be investing my money?,"Pay off your debt. As you witnessed, no ""inve...",,
3,Specifically when do options expire?,"Equity options, at least those traded in the A...",,
4,Negative Balance from Automatic Options Exerci...,"Automatic exercisions can be extremely risky, ...",,
5,Approximation of equity value for company in d...,"Generally ""default"" means that the company can...",,
6,Is it true that 90% of investors lose their mo...,The game is not zero sum. When a friend and I ...,,
7,Can a company charge you for services never re...,"In general, you can only be charged for servic...",,
8,Working out if I should be registered as self-...,Being self employed just means you fill out so...,,
9,About eToro investments,"For eToro, just like any other brokerage firm,...",,


In [5]:
print(base_dataset.iloc[0].instruction)
print(base_dataset.iloc[0].input)
print(base_dataset.iloc[0].output)

For a car, what scams can be plotted with 0% financing vs rebate?

The car deal makes money 3 ways. If you pay in one lump payment. If the payment is greater than what they paid for the car, plus their expenses, they make a profit. They loan you the money. You make payments over months or years, if the total amount you pay is greater than what they paid for the car, plus their expenses, plus their finance expenses they make money. Of course the money takes years to come in, or they sell your loan to another business to get the money faster but in a smaller amount. You trade in a car and they sell it at a profit. Of course that new transaction could be a lump sum or a loan on the used car... They or course make money if you bring the car back for maintenance, or you buy lots of expensive dealer options. Some dealers wave two deals in front of you: get a 0% interest loan. These tend to be shorter 12 months vs 36,48,60 or even 72 months. The shorter length makes it harder for many to affo

### Dataset Characteristics


In [6]:
base_dataset.describe() # -> Num rows and number of unique values per column

Unnamed: 0,instruction,output,text,input
count,68912,68912,68912.0,68912.0
unique,58482,67853,1.0,18882.0
top,"What options do I have at 26 years old, with 1...",Negative,,
freq,23,44,68912.0,49184.0


Drop the unneeded columns

In [7]:
def get_dataset_with_split(df: pd.DataFrame, validation_frac: float = 0.20) -> pd.DataFrame:
  """
    Adds a split column to the dataframe with two values:
    - 0 to indicate the train set
    - 1 to indicate the validation set

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - validation_frac (float): The fraction of the data to be used for validation.

    Returns:
    - pd.DataFrame: The DataFrame with the 'split' column added.
  """
  df["split"] = 0
  sample_indices = df.sample(frac=validation_frac).index
  df.loc[sample_indices, "split"] = 1
  df = df.sample(frac=1) # Shuffle
  print(df['split'].value_counts(normalize=True))
  return df


split_dataset = get_dataset_with_split(base_dataset)

split
0    0.800006
1    0.199994
Name: proportion, dtype: float64


### See Final Prepared Dataset

# Understanding Token Distributions In Each Dataset

Another important aspect to optimize training is to get an understand of how the dataset looks like once it is tokenized. This is useful for two reasons:
1. The sequence lengths determine memory requirements and how we configure some optimizations. Longer sequences require more memory and make require more specialized hardware, but we can always make some special tradeoffs to make it work on cheaper hardware but train more slowly.
2. We may also find that we can ignore some outliers (say the 5% of the longest sequences if they skew far from the general distribution) to boost training speed because longer sequences train more slowly.

In the case below, the sequence lengths are typically quite short, so we won't have to worry about either of these things, but is always very useful to inspect so you can make the right tradeoffs.


In [8]:
BASE_MODEL: str = "mistralai/Mistral-7B-v0.1"
tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

In [9]:
def get_token_distribution(df: pd.DataFrame, tokenizer: PreTrainedTokenizer) -> Dict[str, int]:
    """
    Calculate the token distribution for each column in the DataFrame after tokenization.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with text columns.
    - tokenizer (PreTrainedTokenizer): The tokenizer to use for tokenization.

    Returns:
    - Dict[str, int]: A dictionary containing token counts for each column and the total.
      Keys are column names, and values are lists of token counts.
    """
    cols = list(set(df.columns) - {"split"})

    def tokenize_and_count(text):
        tokens = tokenizer.tokenize(text)
        return len(tokens)

    token_counts = {}
    for col in cols:
        token_counts[col] = df[col].apply(tokenize_and_count).tolist()

    # Calculate total token counts for each column
    total_counts = [sum(col_counts) for col_counts in zip(*token_counts.values())]
    token_counts['total'] = total_counts

    return token_counts


def calculate_distribution(df: pd.DataFrame, tokenizer: PreTrainedTokenizer) -> pd.DataFrame:
    """
    Calculate statistical distribution metrics for token counts in each column after tokenization.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with text columns.
    - tokenizer (PreTrainedTokenizer): The tokenizer to use for tokenization.

    Returns:
    - pd.DataFrame: A DataFrame containing statistical distribution metrics:
        - 'average': Average token count
        - 'min': Minimum token count
        - 'max': Maximum token count
        - 'median': Median token count
        - '75th_percentile': 75th percentile token count
        - '90th_percentile': 90th percentile token count
        - '95th_percentile': 95th percentile token count
        - '99th_percentile': 99th percentile token count
      Columns represent different columns in the input DataFrame.
    """
    token_counts = get_token_distribution(df, tokenizer)
    result = {}

    for key, values in token_counts.items():
        values = np.array(values)
        result[key] = {
            'average': int(np.mean(values)),
            'min': np.min(values),
            'max': np.max(values),
            'median': np.median(values),
            '75th_percentile': int(np.percentile(values, 75)),
            '90th_percentile': int(np.percentile(values, 90)),
            '95th_percentile': int(np.percentile(values, 95)),
            '99th_percentile': int(np.percentile(values, 99))
        }

    return pd.DataFrame(result)

In [13]:
split_dataset['instruction'] = split_dataset['instruction'] + split_dataset['input']

In [14]:
res = calculate_distribution(split_dataset, tokenizer)
res.to_latex()

'\\begin{tabular}{lrrrrr}\n\\toprule\n & input & output & text & instruction & total \\\\\n\\midrule\naverage & 4.000000 & 106.000000 & 0.000000 & 17.000000 & 129.000000 \\\\\nmin & 0.000000 & 0.000000 & 0.000000 & 2.000000 & 5.000000 \\\\\nmax & 618.000000 & 3688.000000 & 0.000000 & 628.000000 & 3712.000000 \\\\\nmedian & 0.000000 & 68.000000 & 0.000000 & 15.000000 & 90.000000 \\\\\n75th_percentile & 4.000000 & 128.000000 & 0.000000 & 20.000000 & 151.000000 \\\\\n90th_percentile & 14.000000 & 249.000000 & 0.000000 & 29.000000 & 271.000000 \\\\\n95th_percentile & 22.000000 & 362.000000 & 0.000000 & 37.000000 & 383.000000 \\\\\n99th_percentile & 57.000000 & 694.000000 & 0.000000 & 71.000000 & 714.000000 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [20]:
# res.drop(columns=['text'], inplace=True)
# res.drop(columns=['total'], inplace=True)
# res.drop(columns=['input'], inplace=True)
# reaarange so that it goes instruction, output, total
res = res[['instruction', 'output']]
res

Unnamed: 0,instruction,output
average,17.0,106.0
min,2.0,0.0
max,628.0,3688.0
median,15.0,68.0
75th_percentile,20.0,128.0
90th_percentile,29.0,249.0
95th_percentile,37.0,362.0
99th_percentile,71.0,694.0


In [21]:
print(res.to_latex())

\begin{tabular}{lrr}
\toprule
 & instruction & output \\
\midrule
average & 17.000000 & 106.000000 \\
min & 2.000000 & 0.000000 \\
max & 628.000000 & 3688.000000 \\
median & 15.000000 & 68.000000 \\
75th_percentile & 20.000000 & 128.000000 \\
90th_percentile & 29.000000 & 249.000000 \\
95th_percentile & 37.000000 & 362.000000 \\
99th_percentile & 71.000000 & 694.000000 \\
\bottomrule
\end{tabular}



In [23]:
# remove the rows with entries above the 99th percentile
def tokenize_and_count(text):
    tokens = tokenizer.tokenize(text)
    return len(tokens)

# split_dataset['tokenized_in'] = split_dataset['instruction'].apply(tokenize_and_count)
split_dataset['tokenized_out'] = split_dataset['output'].apply(tokenize_and_count)

# split_dataset = split_dataset[split_dataset['tokenized_in'] < 10000]
trimmed_dataset = split_dataset[split_dataset['tokenized_out'] < 540]

In [24]:
split_dataset.columns

Index(['instruction', 'output', 'split', 'tokenized_out'], dtype='object')

In [25]:
split_dataset.describe()

Unnamed: 0,split,tokenized_out
count,57425.0,57425.0
mean,0.2,84.362647
std,0.400003,110.363098
min,0.0,0.0
25%,0.0,18.0
50%,0.0,57.0
75%,0.0,107.0
max,1.0,2336.0


In [26]:
trimmed_dataset.describe()

Unnamed: 0,split,tokenized_out
count,56852.0,56852.0
mean,0.199835,77.358527
std,0.399879,81.338109
min,0.0,0.0
25%,0.0,18.0
50%,0.0,56.0
75%,0.0,105.0
max,1.0,539.0


In [28]:
# save the data, dont include any sort of index or the like
trimmed_dataset.drop(columns=['tokenized_out'], inplace=True)
# rename the remaining columns
trimmed_dataset.rename(columns={'instruction': 'question', 'output': 'answer'}, inplace=True)
trimmed_dataset.to_csv("cleaned_finance.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trimmed_dataset.drop(columns=['tokenized_out'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trimmed_dataset.rename(columns={'instruction': 'question', 'output': 'answer'}, inplace=True)


In [29]:
# read it back in and print head to check the round trip
trimmed_dataset = pd.read_csv("cleaned_finance.csv")
# trimmed_dataset.drop(columns=['Unnamed: 0'], inplace=True)
trimmed_dataset.head(20)

Unnamed: 0,question,answer,split
0,Provide a list of items that can be reused or ...,1. Plastic bottles 2. Cardboard boxes 3. Alumi...,1
1,Convert the given Celsius temperature to Fahre...,21.16 Fahrenheit.,0
2,Create a timeline of the life of Muhammad Ali.,"1942: Born in Louisville, Kentucky.\n1960: Won...",0
3,Construct a game involving the given settings.,The game involves a player taking on the role ...,0
4,Online brokers with a minimum stock purchase l...,With InteractiveBrokers there is no minimum tr...,0
5,Is playing sports an important part of your da...,"Yes, playing sports is a very important part o...",0
6,Summarize the key features of the given produc...,This phone from ABC Company is a powerful devi...,0
7,Name two vegetables that start with the letter A,"Artichoke, Asparagus",0
8,Should I put more money down on one property a...,I would go with the 2nd option (put down as li...,0
9,Describe a vegetation layer of a tropical rain...,The vegetation layer of a tropical rainforest ...,0


# Fine-Tuning with Predibase

As we discussed, there are 4 steps to kicking off a fine-tuning job using the Predibase SDK:

1. Prepare your dataset (done above)

In this section we will cover the following:
2. Upload your dataset to Predibase - can be done either via importing a dataset already in memory like we have, or via file upload.
3. Setup the fine-tuning prompt
4. Kick-off fine-tuning and monitor the job

Beneath the surface, Predibase uses **LoRA adapters** for fine-tuning with a quantized base model. LoRA adapters allow for efficient, light-weight learning by injecting a small set of learnable weights into the base model that are trainable. Not only does it have very comparable performance to full fine-tuning, but it is faster to train and also allows dynamically swapping models at inference time for cost-savings. There will be more about this after we finish fine-tuning.

## Setup the Prompt Template

In [36]:
# Define the template used to prompt the model for each example
# Note the 4-space indentation, which is necessary for the YAML templating.
base_prompt_template: str = """
    You are a friendly financial advisor. Your job is to help people answer financial and related question.
    Make sure when you answer questions you show the steps you took to arrive at your answer. Do your
    best to reason step by step. If you don't know the answer, say that you don't know.

    Question: {question}

    Answer:
"""

llm = pc.LLM(f"hf://{BASE_MODEL}")

Note: We don't need to actually provide 1 or more examples in the prompt template for the model to do well. It may be required sometimes for very complex tasks, but as we will see here, we can get away without it which saves us a lot of money on inference tokens and inference query latency.

## Payments Dataset: Upload dataset + Start Fine-Tuning

In [None]:
trimmed_dataset.columns

Index(['question', 'answer', 'split'], dtype='object')

In [35]:
# Upload your dataframe directly to Predibase
# Note: You can also use pc.upload_dataset("/local/path/to/dataset") if you've already preprocessed your dataset
finance_dataset_handle = pc.create_dataset_from_df(trimmed_dataset, name="finance_training_dataset")

ServerResponseError: Error 400: uploaded dataset with name already exists: finance_training_dataset. Trace ID: 64b4403d8ae3a54403cc17249962ae91

In [None]:
print(finance_dataset_handle)

Dataset(id=12980, name=finance_training_dataset, object_name=90f4137c400f44bc9971395aa14bbc19, connection_id=11129, author=ckniffin6@gatech.edu, created=2024-04-08T16:15:52.451465Z, updated=2024-04-08T16:15:52.451465Z)


In [40]:
job = llm.finetune(
    prompt_template=base_prompt_template,
    target="answer",
    dataset="file_uploads/finance_training_dataset", # https://docs.predibase.com/user-guide/fine-tuning/create-finetuned-model
    epochs=5,
    lora_rank=16
)

In [42]:
job = llm.finetune(
    repo="finace_rank_8",
    prompt_template=base_prompt_template,
    target="answer",
    dataset="file_uploads/finance_training_dataset", # https://docs.predibase.com/user-guide/fine-tuning/create-finetuned-model
    epochs=5,
    lora_rank=8
)

In [44]:
job = llm.finetune(
    repo="finace_rank_4",
    prompt_template=base_prompt_template,
    target="answer",
    dataset="file_uploads/finance_training_dataset", # https://docs.predibase.com/user-guide/fine-tuning/create-finetuned-model
    epochs=5,
    lora_rank=4
)

ValueError: LoRA rank must be one of 8, 16, 32, or 64

In [None]:
finance_model = job.get()

NameError: name 'job' is not defined

# Fine-Tuning Inference Performance

We can use [LoRAX](https://predibase.github.io/lorax/) for multi-LoRA adapter inference.

## What is LoRAX?

LoRAX (LoRA eXchange) is a framework built by Predibase that allows users to serve thousands of fine-tuned models on a single GPU, dramatically reducing the cost of serving without compromising on throughput or latency.

## How does LoRAX work?

At inference time, your adapter is downloaded/loaded on top of the base model and used for inference. Since each fine-tuned model at Predibase is an adapter, it means we can load in multiple adapters simultaneously over the same base model and run inference against any of the adapters using LoRAX. All downloaded adapters are kept in memory until some predifined memory threshold is hit, after which they are dynamically swapped out when requests with new adapters come in.

## Other LoRAX features:
1. Structured Generation with schema enforcement: https://predibase.github.io/lorax/guides/structured_output/
2. Dynamic adapter merging: https://predibase.github.io/lorax/guides/merging_adapters/

In [30]:
# Get a reference to the base mistral-7b model we fine-tuned our datasets on
base_model = pc.LLM("pb://deployments/mistral-7b")
chat_model = pc.LLM("pb://deployments/mistral-7b-instruct")
moe_model = pc.LLM("pb://deployments/mixtral-8x7b-instruct-v0-1")
llama_model = pc.LLM("pb://deployments/llama-2-7b-chat")
finance_model = pc.get_model("Mistral-7B-v0.1-finance_training_dataset")

# Create LoRAX adapter handles for each model on top of the base model
ft_finance_llm = base_model.with_adapter(finance_model)

options = {
    "max_new_tokens": 2048, # fine-tuned LLMs actually know how to stop early, so it will not hit the 2048 token limit set here
    "temperature": 0.1
}

Now we can prompt all of these models using the same `generate` method! For now, we'll just spot check them to make sure they have learned something reasonable!

In [33]:
# Prompt the base finance model 3 times
prompt = """
You are a friendly financial advisor. Your job is to help people answer financial and related question.
Make sure when you answer questions you show the steps you took to arrive at your answer. Do your
best to reason step by step. If you don't know the answer, say that you don't know.

Question: If I a $35000 loan with a 5% APR how many $500 payments would it take to pay it back?

Answer:
"""

fin_result1 = ft_finance_llm.prompt(prompt, max_new_tokens=2048)
fin_result2 = ft_finance_llm.prompt(prompt, max_new_tokens=2048)
fin_result3 = ft_finance_llm.prompt(prompt, max_new_tokens=2048)

print(fin_result1)
print(fin_result2)
print(fin_result3)

GeneratedResponse
	prompt: 
You are a friendly financial advisor. Your job is to help people answer financial and related question.
Make sure when you answer questions you show the steps you took to arrive at your answer. Do your
best to reason step by step. If you don't know the answer, say that you don't know.

Question: 
You are a friendly financial advisor. Your job is to help people answer financial and related question.
Make sure when you answer questions you show the steps you took to arrive at your answer. Do your
best to reason step by step. If you don't know the answer, say that you don't know.

Question: If I a $35000 loan with a 5% APR how many $500 payments would it take to pay it back?

Answer:


Answer:

	response: 70 payments.

GeneratedResponse
	prompt: 
You are a friendly financial advisor. Your job is to help people answer financial and related question.
Make sure when you answer questions you show the steps you took to arrive at your answer. Do your
best to reason s

In [34]:
moe_result1 = moe_model.prompt(prompt, max_new_tokens=2048)
moe_result2 = moe_model.prompt(prompt, max_new_tokens=2048)
moe_result3 = moe_model.prompt(prompt, max_new_tokens=2048)

print(moe_result1)
print(moe_result2)
print(moe_result3)

GeneratedResponse
	prompt: <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

[INST] 
You are a friendly financial advisor. Your job is to help people answer financial and related question.
Make sure when you answer questions you show the steps you took to arrive at your answer. Do your
best to reason step by step. If you don't know the answer, say that you don't know.

Question: If I a $35000 loan with a 5% APR how many $500 payments would it take to pay it back?

Answer:
 [/INST]

	response: Sure, I'd be happy to help