In [2]:
# This script fine-tunes Llama 2 models to classify the provided headlines.
# The fine-tuned models are then evaluated on the test set and the results are stored.
# The script requires a significant amount of RAM and GPU memory to run
# and is intended to run in Google Colab (on an A100 instance).

In [13]:
# Mount Google Drive at /content/drive so that files stored on Drive
# can be accessed through the Colab file system.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd drive/MyDrive/your_path/Refined_Applied_Project/llama

In [None]:
# Install the libraries used for loading, quantizing and fine-tuning the model.
# NOTE(review): no versions are pinned, so results depend on whatever is current
# at install time - consider pinning for reproducibility.
!pip install transformers datasets peft accelerate bitsandbytes trl safetensors


In [None]:
# Install flash-attn with the MAX_JOBS=4 environment variable set for the pip
# process and with pip's build isolation disabled.
# NOTE(review): MAX_JOBS presumably caps the number of parallel compile jobs -
# confirm against the flash-attn installation notes.
!MAX_JOBS=4 pip install flash-attn --no-build-isolation

In [None]:
# Install accelerate directly from its GitHub repository; this runs after the
# PyPI install of accelerate in the earlier install cell.
!pip install git+https://github.com/huggingface/accelerate

In [15]:
import pandas as pd
import torch
import time
import tqdm
import transformers
import configparser
import datetime
import pytz
import gc
import os

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from datasets import load_dataset
from random import randrange
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from huggingface_hub.hf_api import HfFolder
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
from trl import SFTTrainer
from huggingface_hub import notebook_login
from tqdm import tqdm
# This script is imported from https://github.com/philschmid/deep-learning-pytorch-huggingface/tree/main/training/utils
# and is primarily used to manipulate flash attention
from llama_patch import replace_attn_with_flash_attn
from llama_patch import upcast_layer_for_flash_attention
from llama_patch import unplace_flash_attn_with_attn



In [16]:
# Locate and read the config file, which lives one level above the current
# working directory.
script_dir = os.getcwd()
abs_config_path = os.path.join(script_dir, '../config.ini')
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so fail here with a clear message instead of with a
# confusing missing-option error further down.
if not config.read(abs_config_path):
    raise FileNotFoundError(f"Config file not found or unreadable: {abs_config_path}")

# Set the variables from the config file
hf_token = config.get('DEFAULT', 'hf_token') # HuggingFace token for the API
HfFolder.save_token(hf_token) # Save the HuggingFace token
model_path = config.get('llama2_finetuning', 'model')
train_size = config.getfloat('DEFAULT', 'train_size')

# The train/test CSV file names encode the split fractions,
# e.g. train_<train_size>.csv and test_<1 - train_size>.csv
train_df_path = '../data/train_' + str(train_size) + '.csv'
train_df_path = os.path.join(script_dir, train_df_path)
test_df_path = '../data/test_' + str(round(1-train_size,2)) + '.csv'
test_df_path = os.path.join(script_dir, test_df_path)

# Output directory for the fine-tuned model, named after the last component of
# the model path. Taking the last component (rather than index 1) gives the same
# result for "<org>/<name>" ids and also works for paths with no or several slashes.
output_dir_path = model_path.rstrip('/').split('/')[-1] + '-int4-ft-' + str(train_size)

# Read in the training set and convert it to a list of dictionaries (one per row).
# NOTE: despite its name, `df` is a list of dicts from here on, not a DataFrame.
df = pd.read_csv(train_df_path).to_dict('records')


In [17]:
# Format the prompt for finetuning

def format_instruction(sample):
  """Build the fine-tuning prompt for a single training record.

  Args:
    sample: Mapping with a 'Headline' entry (the headline text) and a
      'True_Label' entry (the label appended to the response line).

  Returns:
    The prompt as one string: instruction, headline, answer-format reminder
    and the response sentence ending with the true label.
  """
  headline = sample['Headline']
  label = sample['True_Label']
  # All lines after the first keep a four-space indent; it is part of the
  # prompt text itself.
  prompt_lines = [
      "### Instruction:",
      "    As a retail investor, you are presented with a financial headline. Your task is to classify the sentiment expressed in the headline using one of the following labels: [NEGATIVE, POSITIVE, NEUTRAL].",
      "",
      "    ### Headline:",
      f"    {headline}",
      "",
      "    ### Please respond with only one of the following labels: NEGATIVE, POSITIVE, or NEUTRAL.",
      "",
      f"    ### Response: The sentiment expressed in the headline is {label}",
  ]
  return "\n".join(prompt_lines)

# Show the formatted prompt for one randomly chosen training record
random_index = randrange(len(df))
print(format_instruction(df[random_index]))

### Instruction:
    As a retail investor, you are presented with a financial headline. Your task is to classify the sentiment expressed in the headline using one of the following labels: [NEGATIVE, POSITIVE, NEUTRAL].

    ### Headline:
    Its market share widened to 48.51 percent from 48.31 percent a year earlier .

    ### Please respond with only one of the following labels: NEGATIVE, POSITIVE, or NEUTRAL.

    ### Response: The sentiment expressed in the headline is POSITIVE


In [19]:
# Patch in flash attention only when a CUDA GPU with compute capability >= 8 is
# available. The flag starts as False so that on other hardware the flash
# attention assertion below is skipped and the smaller training batch size is used.
use_flash_attention = False
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    print("Using flash attention")
    replace_attn_with_flash_attn()
    use_flash_attention = True

model_id = model_path


# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the 4-bit quantized model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False, device_map="auto",
    use_auth_token=True
    )
model.config.pretraining_tp = 1

# Validate that the model is using flash attention, by comparing doc strings
if use_flash_attention:
    from llama_patch import forward
    assert model.model.layers[0].self_attn.forward.__doc__ == forward.__doc__, "Model is not using flash attention"

# Load the tokenizer and reuse the EOS token as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token
# The 7b chat model is padded on the right, every other model on the left
if model_path == 'meta-llama/Llama-2-7b-chat-hf':
  tokenizer.padding_side = "right"
else:
  tokenizer.padding_side = "left"

Using flash attention




Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [20]:
# LoRA adapter hyperparameters (based on the QLoRA paper)
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# Prepare the quantized model for k-bit training, wrap it with the LoRA
# adapters, then upcast layers to bfloat16 for flash attention.
model = upcast_layer_for_flash_attention(
    get_peft_model(prepare_model_for_kbit_training(model), peft_config),
    torch.bfloat16,
)

In [21]:
# Hyperparameters for the fine-tuning run
args = TrainingArguments(
    output_dir=output_dir_path,
    # --- schedule and optimisation ---
    num_train_epochs=3,
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    optim="paged_adamw_32bit",
    max_grad_norm=0.3,
    # --- batching and memory ---
    # a larger per-device batch is used when flash attention is enabled
    per_device_train_batch_size=(6 if use_flash_attention else 4),
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    # --- numeric precision ---
    bf16=True,
    tf32=True,
    # --- logging and checkpointing ---
    logging_steps=10,
    save_strategy="epoch",
    disable_tqdm=True,  # disable tqdm since with packing its values are incorrect
)

In [22]:
# Maximum sequence length for the model and for packing of the dataset
max_seq_length = 2048

# Trainer over the formatted training records; `packing=True` together with
# `formatting_func` turns each record into a prompt and packs the prompts into
# sequences of up to `max_seq_length`.
trainer = SFTTrainer(
    model=model,
    args=args,
    peft_config=peft_config,
    tokenizer=tokenizer,
    train_dataset=df,
    formatting_func=format_instruction,
    packing=True,
    max_seq_length=max_seq_length,
)

In [23]:
# Run the fine-tuning and measure how long it takes (wall-clock).
# No progress bar is shown because tqdm is disabled in the training arguments.
start_time = time.time()
trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time

print(f"The function took {elapsed_time} seconds to complete.")

# Persist the fine-tuned model to the trainer's output directory
trainer.save_model()


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.8918, 'learning_rate': 0.0002, 'epoch': 0.03}
{'loss': 0.7504, 'learning_rate': 0.0002, 'epoch': 0.07}
{'loss': 0.6601, 'learning_rate': 0.0002, 'epoch': 1.02}
{'loss': 0.5788, 'learning_rate': 0.0002, 'epoch': 1.06}
{'loss': 0.5394, 'learning_rate': 0.0002, 'epoch': 2.02}
{'loss': 0.5308, 'learning_rate': 0.0002, 'epoch': 2.05}
{'train_runtime': 480.5783, 'train_samples_per_second': 22.679, 'train_steps_per_second': 1.891, 'train_loss': 0.6489712551458559, 'epoch': 2.07}
The function took 480.8720507621765 seconds to complete.


## Use the fine-tuned model to predict the labels

In [24]:
# Undo the flash attention patch before the model is reloaded for inference
if use_flash_attention:
    unplace_flash_attn_with_attn()

args.output_dir = output_dir_path

# Reload the fine-tuned model (saved in the output directory) in 4-bit,
# together with the tokenizer stored alongside it
model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(args.output_dir)


Reloading llama model, unpatching flash attention


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
# Set the tokenizer padding side
# llama-2-7b does not appear to work with left padding
if model_path == 'meta-llama/Llama-2-7b-chat-hf':
  tokenizer.padding_side = "right"
else:
  tokenizer.padding_side = "left"

# Reuse the EOS token as the padding token. This has to happen BEFORE the
# pipeline is built, because `pad_token_id` is read from the tokenizer when the
# pipeline is constructed below.
tokenizer.pad_token = tokenizer.eos_token

# Define the text-generation pipeline used to classify the test headlines.
# NOTE(review): generation samples (do_sample=True, top_k=10) and no seed is set
# in this notebook, so predictions may differ between runs.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype='auto',
    device_map="auto",
    # max_length=250,
    max_new_tokens=10,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id = tokenizer.eos_token_id,
    pad_token_id = tokenizer.pad_token_id
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'Refor

In [None]:
# Load the held-out test set
test_df = pd.read_csv(test_df_path)

# Inference prompt: same wording as the fine-tuning prompt, but the response
# sentence is left open so the model completes it with the label.
prompt_template = """### Instruction:
    As a retail investor, you are presented with a financial headline. Your task is to classify the sentiment expressed in the headline using one of the following labels: [NEGATIVE, POSITIVE, NEUTRAL].

    ### Headline:
    {headline}

    ### Please respond with only one of the following labels: NEGATIVE, POSITIVE, or NEUTRAL.

    ### Response: The sentiment expressed in the headline is"""

# One prompt per test headline, in the same order as the rows of test_df
prompts = [
    prompt_template.format(headline=text)
    for text in test_df['Headline'].tolist()
]

# Run the fine-tuned model over all prompts and time the run
start_time = time.time()
raw_responses = pipeline(prompts, batch_size=16)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"The function took {elapsed_time} seconds to complete.")

# Release cached GPU memory and collect garbage
torch.cuda.empty_cache()
gc.collect()

In [50]:
# Turn the raw generations into one label per test headline, then score the
# predictions against the true labels.

VALID_LABELS = ('NEGATIVE', 'NEUTRAL', 'POSITIVE')
RESPONSE_MARKER = "The sentiment expressed in the headline is"


def extract_label(generated_text):
    """Extract the sentiment label from one generated text.

    Args:
        generated_text: The text produced by the pipeline for one prompt.

    Returns:
        The first word following the first occurrence of the response marker
        that equals NEGATIVE, NEUTRAL or POSITIVE (compared case-insensitively,
        with surrounding '.', ',' and ';' removed), in upper case.
        'Empty Response' if the marker does not occur in the text, 'UNKNOWN' if
        nothing follows the marker, and otherwise the stripped text after the
        marker when it contains no label word.
    """
    parts = generated_text.split(RESPONSE_MARKER)
    if len(parts) < 2:
        return 'Empty Response'

    answer = parts[1].strip()
    if not answer:
        return 'UNKNOWN'

    # Take the FIRST label word; anything the model generates after its answer
    # must not override it.
    for word in answer.split():
        candidate = word.strip('.,;').upper()
        if candidate in VALID_LABELS:
            return candidate
    return answer


responses = []
unknown_responses = []
unknown_count = 0

for item in raw_responses:
    for sub_item in item:
        generated_text = sub_item['generated_text']
        response = extract_label(generated_text)

        if response not in VALID_LABELS:
            # Keep each unrecognised output exactly once so it can be examined
            # later; when the marker is missing the whole generated text is kept.
            if response == 'Empty Response':
                unknown_responses.append(generated_text)
            else:
                unknown_responses.append(response)
            unknown_count += 1
            # This is consistent with Zhang, Yang & Liu (2023)
            response = 'NEUTRAL'

        responses.append(response)

# Add the classified labels to the df
test_df['Predicted_Label'] = responses

# Output the number of rows where NEUTRAL was substituted due to errors/unrecognised output
# This is consistent with Zhang, Yang & Liu (2023)
print(f"Number of rows with substituted 'NEUTRAL' in the Predicted_Label column: {unknown_count}.")

# Define a dictionary to map the text labels to numeric codes
mapping = {'NEGATIVE': -1, 'NEUTRAL': 0, 'POSITIVE': 1}

# Map the true labels to numeric codes. Values that are already numeric codes
# are kept as they are, so re-running this cell does not turn the whole
# True_Label column into NaN.
test_df['True_Label'] = test_df['True_Label'].map(lambda value: mapping.get(value, value))
if not test_df['True_Label'].isin(list(mapping.values())).all():
    raise ValueError("True_Label contains values other than NEGATIVE, NEUTRAL or POSITIVE")
test_df['Predicted_Label'] = test_df['Predicted_Label'].map(mapping)

# Drop rows where 'Predicted_Label' is NaN
test_df = test_df.dropna(subset=['Predicted_Label'])

# Calculate accuracy
accuracy = accuracy_score(test_df['True_Label'], test_df['Predicted_Label'])
print(f"Accuracy: {accuracy}")
# Calculate precision
precision = precision_score(test_df['True_Label'], test_df['Predicted_Label'], average='weighted')
print(f"Precision: {precision}")
# Calculate recall
recall = recall_score(test_df['True_Label'], test_df['Predicted_Label'], average='weighted')
print(f"Recall: {recall}")
# Calculate F1 score
f1 = f1_score(test_df['True_Label'], test_df['Predicted_Label'], average='weighted')
print(f"F1 score: {f1}")

Number of rows with substituted 'NEUTRAL' in the Predicted_Label column: 5.
Number of rows with NaN in the Predicted_Label column: 0
Accuracy: 0.6507018992568125
Precision: 0.6777359239203035
Recall: 0.6507018992568125
F1 score: 0.5903959520245827


In [36]:
# Append the metrics of this run to the shared results table on disk
df_results_path = os.path.join(script_dir, '../results/df_results.csv')
df_results = pd.read_csv(df_results_path)

ft_model_name = output_dir_path
error_count = unknown_count

# Timestamp of this run in the Europe/London time zone
bst = pytz.timezone('Europe/London')
now = datetime.datetime.now(bst)
formatted_time = now.strftime('%d/%m/%Y/%H:%M')

# Build a one-row frame for this run and append it; ignore_index=True renumbers
# the rows of the combined table consecutively.
df_results = pd.concat(
    [
        df_results,
        pd.DataFrame([{
            'Model': ft_model_name,
            'Test_Size': round(1-train_size,2),
            'Accuracy': accuracy,
            'F1': f1,
            'Precision': precision,
            'Recall': recall,
            'Prompt': prompt_template,
            'Error Count': error_count,
            'DateTime': formatted_time,
            'Few_Shot': False
        }]),
    ],
    ignore_index=True,
)

# Write the updated table back to the same file
df_results.to_csv(df_results_path, index=False)