In [23]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer



# Citations
- I used the following to help learn how to fine tune Llama2
  - https://www.databricks.com/blog/efficient-fine-tuning-lora-guide-llms
  - https://deci.ai/blog/fine-tune-llama-2-with-lora-for-question-answering/
  - https://huggingface.co/docs/trl/main/en/sft_trainer
  - https://huggingface.co/blog/llama2

# Import/Prepare the Data

In [24]:
# Import the data
df = pd.read_csv('data/data_filtered.csv').drop(columns=['Unnamed: 0', 'id', 'label'])
df.head()

Unnamed: 0,question,answer
0,What is the full name of Virginia Tech?,Virginia Polytechnic Institute and State Unive...
1,Where is the main campus of Virginia Tech loca...,"Blacksburg, Virginia"
2,How many students does Virginia Tech have?,37000
3,What is the classification of Virginia Tech am...,R1: Doctoral Universities - Very high research...
4,What are the athletic teams of Virginia Tech c...,Virginia Tech Hokies


In [25]:
# Generate the template string for fine-tuning
template = '''<s>[INST] <<SYS>>
You are an expert on Virginia Tech or Virginia Polytechnic Institute and State University. Always answer in a helpful way. If you do not know the answer, simply response with "I do not know".
<</SYS>>

{} [/INST] '''

In [26]:
df['prompt'] = df['question'].apply(lambda x: template.format(x))
df = df.rename(columns={'answer': 'response'})
df['response'] = df['response'] + ' </s>'
df_train = df[['prompt', 'response']]
df_train['text'] = df_train['prompt'] + df_train['response']
df_train = df_train.drop(columns=['prompt', 'response'])
df_train.head()

Unnamed: 0,text
0,<s>[INST] <<SYS>>\nYou are an expert on Virgin...
1,<s>[INST] <<SYS>>\nYou are an expert on Virgin...
2,<s>[INST] <<SYS>>\nYou are an expert on Virgin...
3,<s>[INST] <<SYS>>\nYou are an expert on Virgin...
4,<s>[INST] <<SYS>>\nYou are an expert on Virgin...


In [27]:
df_train['text'][0]

'<s>[INST] <<SYS>>\nYou are an expert on Virginia Tech or Virginia Polytechnic Institute and State University. Always answer in a helpful way. If you do not know the answer, simply response with "I do not know".\n<</SYS>>\n\nWhat is the full name of Virginia Tech? [/INST] Virginia Polytechnic Institute and State University (VPI) </s>'

In [28]:
# df_train.to_csv('data/llama2_data.csv', index=False)

# Load the dataset

In [29]:
from datasets import load_dataset

In [30]:
dataset = load_dataset('data', data_files='llama2_data.csv', split='train')

In [31]:
dataset['text'][0]

'<s>[INST] <<SYS>>\nYou are an expert on Virginia Tech or Virginia Polytechnic Institute and State University. Always answer in a helpful way. If you do not know the answer, simply response with "I do not know".\n<</SYS>>\n\nWhat is the full name of Virginia Tech? [/INST] Virginia Polytechnic Institute and State University (VPI) </s>'

# Load models

In [32]:
# Model Names
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
fine_tuned_model_name = "llama2-7b-hokiehelper"

# Tokenizer Names
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = 'right'

# Quanization Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [33]:
# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
)

# Trainer
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params
)



In [35]:
# Training
fine_tuning.train()

# Save Model
fine_tuning.model.save_pretrained(fine_tuned_model_name)

  0%|          | 0/1495 [00:00<?, ?it/s]

{'loss': 2.1228, 'learning_rate': 0.0002, 'epoch': 0.08}
{'loss': 0.6047, 'learning_rate': 0.0002, 'epoch': 0.17}
{'loss': 0.73, 'learning_rate': 0.0002, 'epoch': 0.25}
{'loss': 0.4708, 'learning_rate': 0.0002, 'epoch': 0.33}
{'loss': 0.6721, 'learning_rate': 0.0002, 'epoch': 0.42}
{'loss': 0.4559, 'learning_rate': 0.0002, 'epoch': 0.5}
{'loss': 0.6879, 'learning_rate': 0.0002, 'epoch': 0.59}
{'loss': 0.3956, 'learning_rate': 0.0002, 'epoch': 0.67}
{'loss': 0.6422, 'learning_rate': 0.0002, 'epoch': 0.75}
{'loss': 0.4291, 'learning_rate': 0.0002, 'epoch': 0.84}
{'loss': 0.6908, 'learning_rate': 0.0002, 'epoch': 0.92}
{'loss': 0.4114, 'learning_rate': 0.0002, 'epoch': 1.0}
{'loss': 0.5467, 'learning_rate': 0.0002, 'epoch': 1.09}
{'loss': 0.384, 'learning_rate': 0.0002, 'epoch': 1.17}
{'loss': 0.4959, 'learning_rate': 0.0002, 'epoch': 1.25}
{'loss': 0.3535, 'learning_rate': 0.0002, 'epoch': 1.34}
{'loss': 0.4701, 'learning_rate': 0.0002, 'epoch': 1.42}
{'loss': 0.3636, 'learning_rate': 0.

# Test

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

In [4]:
# Model Names
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
fine_tuned_model_name = "llama2-7b-hokiehelper"

# Tokenizer Names
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = 'right'

# Quanization Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

## Base Model

In [38]:
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [54]:
query = "What is the admission rate of Virginia Tech?"
text_gen = pipeline('text-generation', model=base_model, tokenizer=llama_tokenizer, max_length=200)
output = text_gen(f'<s>[INST] <<SYS>> You are an expert on Virginia Tech or Virginia Polytechnic Institute and State University. Always answer in a helpful way. If you do not know the answer, simply response with "I do not know". <</SYS>> {query} [/INST] ')
print(output[0]['generated_text'].split('[/INST]')[1])

  Great question! The admission rate for Virginia Tech, also known as Virginia Polytechnic Institute and State University, varies depending on the program or major to which you are applying.

For the class of 2024, Virginia Tech accepted 53.6% of applicants. However, this number can vary from year to year, so it's important to check the most recent admission statistics on the university's website.

Here are the admission rates for the past few years:

* Class of 2024: 53.6%
* Class


## Fine Tuned Model

In [36]:
finetuned_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
finetuned_model.config.use_cache = False
finetuned_model.config.pretraining_tp = 1
finetuned_model.load_adapter(fine_tuned_model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [55]:
query = "What is the admission rate of Virginia Tech?"
text_gen = pipeline('text-generation', model=finetuned_model, tokenizer=llama_tokenizer, max_length=200)
output = text_gen(f'<s>[INST] <<SYS>> You are an expert on Virginia Tech or Virginia Polytechnic Institute and State University. Always answer in a helpful way. If you do not know the answer, simply response with "I do not know". <</SYS>> {query} [/INST] ')
print(output[0]['generated_text'].split('[/INST]')[1])

 65.8% in 2019 and 66.8% in 2018. 


# Batch Running

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

In [2]:
df = pd.read_csv('results.csv')
df.head()

Unnamed: 0,question,7b_base_answer,7b_ft_answer,13b_base_answer,13b_ft_answer,verbatim?,truth
0,Who is a former NASA engineer and a Virginia T...,,,"As an expert on Virginia Tech, I can tell yo...",1964 Homer Hadley 'Sonny' Hickam 2007 2012 20...,True,Homer Hickham
1,What is the Upper Quad at Virginia Tech?,,,"Ah, you must be referring to the iconic Uppe...",1876 Commencement Quadrangle 2.168 km2 of the...,True,an area on the north of the Drillfield that is...
2,Where is the Virginia Tech campus located?,,,Hello! Virginia Tech's main campus is locate...,"2601 Wright St. SE, Blacksburg, VA 24061 2601...",True,"Blacksburg, Virginia"
3,Who is the current president of Virginia Tech?,,,The current president of Virginia Tech is Dr...,Timothy Sands 6 7 8 9 10 11 12-13 14 15 16 1...,True,Timothy Sands
4,What is the Drillfield?,,,"Ah, you must be referring to the Drillfield,...",526 acres of open field 1400 feet from end to...,True,a large oval field in the center of the Blacks...


In [3]:
# Model Names
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
fine_tuned_model_name = "llama2-7b-hokiehelper"

# Tokenizer Names
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = 'right'

# Quanization Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

finetuned_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
finetuned_model.config.use_cache = False
finetuned_model.config.pretraining_tp = 1
finetuned_model.load_adapter(fine_tuned_model_name)

text_gen_base = pipeline('text-generation', model=base_model, tokenizer=llama_tokenizer, max_length=200)
text_gen_finetuned = pipeline('text-generation', model=finetuned_model, tokenizer=llama_tokenizer, max_length=200)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
for idx, row in df.iterrows():
    query = row['question']
    print(query)
    output_base = text_gen_base(f'<s>[INST] <<SYS>> You are an expert on Virginia Tech or Virginia Polytechnic Institute and State University. Always answer in a helpful way. If you do not know the answer, simply response with "I do not know". <</SYS>> {query} [/INST] ')
    output_finetuned = text_gen_finetuned(f'<s>[INST] <<SYS>> You are an expert on Virginia Tech or Virginia Polytechnic Institute and State University. Always answer in a helpful way. If you do not know the answer, simply response with "I do not know". <</SYS>> {query} [/INST] ')
    df.loc[idx, '7b_base_answer'] = output_base[0]['generated_text'].split('[/INST]')[1]
    df.loc[idx, '7b_ft_answer'] = output_finetuned[0]['generated_text'].split('[/INST]')[1]

Who is a former NASA engineer and a Virginia Tech alumni?



After conducting a quick search, I found that there are several former NASA engineers who are also Virginia Tech alumni. However, I couldn't find a specific individual who fits your criteria.

Virginia Tech has a strong reputation for producing talented engineers and scientists, and many of its graduates have gone on to work at NASA and other prestigious institutions. Some notable Virginia Tech alumni who have worked at NASA include:

1. Dr. Mae Jemison - Dr.' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[idx, '7b_base_answer'] = output_base[0]['generated_text'].split('[/INST]')[1]
  df.loc[idx, '7b_ft_answer'] = output_finetuned[0]['generated_text'].split('[/INST]')[1]


What is the Upper Quad at Virginia Tech?
Where is the Virginia Tech campus located?
Who is the current president of Virginia Tech?
What is the Drillfield?
What waterway runs beneath the Drillfield?
How many alumni does Virginia Tech have internationally and from all 50 states?
How many generals and admirals has Virginia Tech produced?
How many Virginia Tech alumni have been awarded the Medal of Honor?
When was the word 'Hokie' first used?
Who came up with the spirit cheer 'Old Hokie'?




What was the original spirit cheer?
What was the original nickname for Hokies?
When do students who dress as the HokieBird reveal their secret identity?
What is the requirement for new central campus buildings at Virginia Tech?
What do The Pylons represent from left to right?
Who transformed VPI into a major research university?
What did Herbert Thomas do to receive the Medal of Honor?
Who was the first to register at Virginia Tech?
When did the display of the Confederate flag at Virginia Tech end?
What is the GPA requirement for students in the Honors College at Virginia Tech?
What is the average SAT score for admitted students at Virginia Tech?
Which Virginia Tech golfer won three PGA Tour wins?
What is the chorus of the Alma Mater?
Who wrote the lyrics for the Alma Mater?


In [5]:
df.head()

Unnamed: 0,question,7b_base_answer,7b_ft_answer,13b_base_answer,13b_ft_answer,verbatim?,truth
0,Who is a former NASA engineer and a Virginia T...,Great question! I'm happy to help.\n\nAfter ...,2017 National Book Award finalist for his mem...,"As an expert on Virginia Tech, I can tell yo...",1964 Homer Hadley 'Sonny' Hickam 2007 2012 20...,True,Homer Hickham
1,What is the Upper Quad at Virginia Tech?,Great question! The Upper Quad at Virginia T...,1965 Veterans Memorial Building and the Corps...,"Ah, you must be referring to the iconic Uppe...",1876 Commencement Quadrangle 2.168 km2 of the...,True,an area on the north of the Drillfield that is...
2,Where is the Virginia Tech campus located?,Great question! Virginia Tech is located in ...,15 miles south of Roanoke and 75 miles southw...,Hello! Virginia Tech's main campus is locate...,"2601 Wright St. SE, Blacksburg, VA 24061 2601...",True,"Blacksburg, Virginia"
3,Who is the current president of Virginia Tech?,I'm happy to help! The current president of ...,Timothy Sands. http://www.vtnews.org/stories/...,The current president of Virginia Tech is Dr...,Timothy Sands 6 7 8 9 10 11 12-13 14 15 16 1...,True,Timothy Sands
4,What is the Drillfield?,"Ah, a question about my alma mater! The Dril...",a large oval field in the center of the Blac...,"Ah, you must be referring to the Drillfield,...",526 acres of open field 1400 feet from end to...,True,a large oval field in the center of the Blacks...


In [None]:
# save the dataframe
df.to_csv('results.csv', index=False)