In [6]:
## This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Phi-2 is a Transformer with 2.7 billion parameters. It was trained using the same data sources as Phi-1.5, augmented with a new data source that consists of various NLP synthetic texts and filtered websites (for safety and educational value). When assessed against benchmarks testing common sense, language understanding, and logical reasoning, Phi-2 showcased a nearly state-of-the-art performance among models with less than 13 billion parameters.

- Model Link - https://huggingface.co/microsoft/phi-2

In [2]:
!pip install -U trl accelerate peft -i https://pypi.org/simple/ bitsandbytes transformers trl huggingface_hub

Looking in indexes: https://pypi.org/simple/
Collecting trl
  Downloading trl-0.7.11-py3-none-any.whl (155 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.3/155.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [3]:
%pip install -U datasets



#### Loading the Libraries

In [4]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

- **accelerate** is a distributed training library for PyTorch by HuggingFace. It allows you to train your models on multiple GPUs or CPUs in parallel (distributed configurations), which can significantly speed up training in presence of multiple GPUs (we won't use it in our example).
- **peft** is a Python library by HuggingFace for efficient adaptation of pre-trained language models (PLMs) to various downstream applications without fine-tuning all the model's parameters. PEFT methods only fine-tune a small number of (extra) model parameters, thereby greatly decreasing the computational and storage costs.
- **bitsandbytes** by Tim Dettmers, is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions. It allows to run models stored in 4-bit precision: while 4-bit bitsandbytes stores weights in 4-bits, the computation still happens in 16 or 32-bit and here any combination can be chosen (float16, bfloat16, float32, and so on).
- **transformers** is a Python library for natural language processing (NLP). It provides a number of pre-trained models for NLP tasks such as text classification, question answering, and machine translation.
- **trl** is a full stack library by HuggingFace providing a set of tools to train transformer language models with Reinforcement Learning, from the Supervised Fine-tuning step (SFT), Reward Modeling step (RM) to the Proximal Policy Optimization (PPO) step.

#### Loading the Data
Dataset Link - https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news/data

The FinancialPhraseBank dataset is a comprehensive collection that captures the sentiments of financial news headlines from the viewpoint of a retail investor. Comprising two key columns, namely "Sentiment" and "News Headline," the dataset effectively classifies sentiments as either negative, neutral, or positive.

In [11]:
from google.colab import files


uploaded = files.upload()

Saving all-data.csv to all-data.csv


In [12]:
# filename = "/kaggle/input/sentiment-analysis-for-financial-news/all-data.csv"
filename="/content/all-data.csv"

df = pd.read_csv(filename,
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")
df.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


#### Data Pre-Processing

- Splits the dataset into training and test sets, with 300 samples in each set. The split is stratified by sentiment, so that each set contains a representative sample of positive, neutral, and negative sentiments.


In [13]:
X_train = list()
X_test = list()
for sentiment in ["positive", "neutral", "negative"]:
    train, test  = train_test_split(df[df.sentiment==sentiment],
                                    train_size=300,
                                    test_size=300,
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train[:1]

[     sentiment                                               text
 228   positive  ( ADP News ) - Feb 12 , 2009 - Finnish IT solu...
 835   positive  Finnish consulting and engineering group Poyry...
 991   positive  These companies will be able to keep their mar...
 2241  positive  The sale will lead to a pretax capital gain of...
 13    positive  Finnish Talentum reports its operating profit ...
 ...        ...                                                ...
 1761  positive  SysOpen Digia Plc , Press release , 7 February...
 234   positive  Neste Oil Corp. has signed long-term procureme...
 2294  positive  Outokumpu of Finland , stainless steel manufac...
 55    positive  Shares of Nokia Corp. rose Thursday after the ...
 745   positive  `` In terms of profitability and earnings 2007...
 
 [300 rows x 2 columns]]

- Shuffles the train data in a replicable order (random_state=10)

In [14]:
X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)
X_train[:1]

Unnamed: 0,sentiment,text
3683,neutral,Mr Jortikka is president of the base metal div...


### Evaluation or Validation Data

In [15]:
eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
eval_idx[:5]

[0, 1, 3, 4, 5]

In [16]:
X_eval = df[df.index.isin(eval_idx)]
X_eval[:5]

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
5,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [17]:
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

### Transform Data into LLM Training Form using Dataset Library

In [18]:
def generate_prompt(data_point):
    return f"""The sentiment of the following phrase: '{data_point["text"]}' is
            \n\n Positive
            \n Negative
            \n Neutral
            \n Cannot be determined
            \n\nSolution: The correct option is {data_point["sentiment"]}""".strip()

def generate_test_prompt(data_point):
    return f"""The sentiment of the following phrase: '{data_point["text"]}' is
            \n\n Positive
            \n Negative
            \n Neutral
            \n Cannot be determined
            \n\nSolution: The correct option is""".strip()

In [19]:
X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1),
                       columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1),
                      columns=["text"])

X_train.head()

Unnamed: 0,text
0,The sentiment of the following phrase: 'Mr Jor...
1,The sentiment of the following phrase: 'Both o...
2,The sentiment of the following phrase: 'Finnis...
3,The sentiment of the following phrase: 'Renzo ...
4,The sentiment of the following phrase: '`` We ...


In [20]:
y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [21]:
train_data['text'][1]

"The sentiment of the following phrase: 'Both operating profit and net sales for the 12-month period increased , respectively from EUR21 .5 m and EUR196 .1 m , as compared to 2005 .' is \n            \n\n Positive\n            \n Negative\n            \n Neutral\n            \n Cannot be determined\n            \n\nSolution: The correct option is positive"

Next we create a function to evaluate the results from our fine-tuned sentiment model. The function performs the following steps:

 - Maps the sentiment labels to a numerical representation, where 2 represents positive, 1 represents neutral, and 0 represents negative.
- Calculates the accuracy of the model on the test data.
- Generates an accuracy report for each sentiment label.
- Generates a classification report for the model.
- Generates a confusion matrix for the model.

In [22]:
def evaluate(y_true, y_pred):
    labels = ['positive', 'neutral', 'negative']
    mapping = {'positive': 2, 'neutral': 1, 'none':1, 'negative': 0}
    def map_func(x):
        return mapping.get(x, 1)

    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true))
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

### Loading the Model

Model loading and quantization:

- First the code loads the Phi-2 language model from the Hugging Face Hub.
- Then the code gets the float16 data type from the torch library. This is the data type that will be used for the computations.
- Next, it creates a BitsAndBytesConfig object with the following settings:
- load_in_4bit: Load the model weights in 4-bit format.
- bnb_4bit_quant_type: Use the "nf4" quantization type. 4-bit NormalFloat (NF4), is a new data type that is information theoretically optimal for normally distributed weights.
- bnb_4bit_compute_dtype: Use the float16 data type for computations.
- bnb_4bit_use_double_quant: Do not use double quantization (reduces the average memory footprint by quantizing also the quantization constants and saves an additional 0.4 bits per parameter.).

Then the code creates a AutoModelForCausalLM object from the pre-trained Phi-2 language model, using the BitsAndBytesConfig object for quantization.
- After that, the code disables caching for the model.
- Finally the code sets the pre-training token probability to 1.

In [23]:
model_name = "microsoft/phi-2"

compute_dtype = getattr(torch, "float16")

# Model Configurations
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Loading the Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

# Loading the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

configuration_phi.py:   0%|          | 0.00/9.26k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi.py:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In the next cell, we set a function for predicting the sentiment of a news headline using the Phi-2 language model. The function takes three arguments:

- test: A Pandas DataFrame containing the news headlines to be predicted.
- model: The pre-trained Phi-2 language model.
- tokenizer: The tokenizer for the Phi-2 language model.

The pipeline() function from the Hugging Face Transformers library is used to generate text from the language model. The task argument specifies that the task is text generation. The model and tokenizer arguments specify the pre-trained Phi-2 language model and the tokenizer for the language model. The max_new_tokens argument specifies the maximum number of new tokens to generate. The temperature argument controls the randomness of the generated text. A lower temperature will produce more predictable text, while a higher temperature will produce more creative and unexpected text.

In [24]:
def predict(X_test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens = 3,
                        temperature = 0.0,
                       )
        result = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
        answer = result[0]['generated_text'].split("The correct option is")[-1].lower()
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        elif "neutral" in answer:
            y_pred.append("neutral")
        else:
            y_pred.append("none")
    return y_pred

In [25]:
y_pred = predict(X_test, model, tokenizer)

100%|██████████| 900/900 [03:16<00:00,  4.57it/s]


#### Lets Evaluate

In [26]:
evaluate(y_true, y_pred)

Accuracy: 0.348
Accuracy for label 0: 0.050
Accuracy for label 1: 0.920
Accuracy for label 2: 0.073

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.05      0.09       300
           1       0.33      0.92      0.49       300
           2       0.44      0.07      0.13       300

    accuracy                           0.35       900
   macro avg       0.57      0.35      0.24       900
weighted avg       0.57      0.35      0.24       900


Confusion Matrix:
[[ 15 280   5]
 [  1 276  23]
 [  0 278  22]]


# Let's Fine Tune Now

- We configures and initializes a Simple Fine-tuning Trainer (SFTTrainer) for training a large language model using the Parameter-Efficient Fine-Tuning (PEFT) method.
- Which should save time as it operates on a reduced number of parameters compared to the model's overall size.
- The PEFT method focuses on refining a limited set of (additional) model parameters, while keeping the majority of the pre-trained LLM parameters fixed.
- This significantly reduces both computational and storage expenses. Additionally, this strategy addresses the challenge of catastrophic forgetting, which often occurs during the complete fine-tuning of LLMs.

PEFTConfig:

The peft_config object specifies the parameters for PEFT. The following are some of the most important parameters:

- lora_alpha: The learning rate for the LoRA update matrices. (Scaling Factor)
- lora_dropout: The dropout probability for the LoRA update matrices.
- r: The rank of the LoRA update matrices. (Dimensions of Matrix)
- bias: The type of bias to use. The possible values are none, additive, and learned.
- task_type: The type of task that the model is being trained for. The possible values are CAUSAL_LM and MASKED_LM.

**TrainingArguments:**

The training_arguments object specifies the parameters for training the model. The following are some of the most important parameters:

- output_dir: The directory where the training logs and checkpoints will be saved.
- num_train_epochs: The number of epochs to train the model for.
- per_device_train_batch_size: The number of samples in each batch on each device.
- gradient_accumulation_steps: The number of batches to accumulate gradients before updating - the model parameters.
- optim: The optimizer to use for training the model.
- save_steps: The number of steps after which to save a checkpoint.
- logging_steps: The number of steps after which to log the training metrics.
- learning_rate: The learning rate for the optimizer.
- weight_decay: The weight decay parameter for the optimizer.
- fp16: Whether to use 16-bit floating-point precision.
- bf16: Whether to use BFloat16 precision.
- max_grad_norm: The maximum gradient norm.
- max_steps: The maximum number of steps to train the model for.
- warmup_ratio: The proportion of the training steps to use for warming up the learning rate.
- group_by_length: Whether to group the training samples by length.
- lr_scheduler_type: The type of learning rate scheduler to use.
- report_to: The tools to report the training metrics to.
- evaluation_strategy: The strategy for evaluating the model during training.

**SFTTrainer:**

The SFTTrainer is a custom trainer class from the PEFT library. It is used to train large language models using the PEFT method.

The SFTTrainer object is initialized with the following arguments:

- model: The model to be trained.
- train_dataset: The training dataset.
- eval_dataset: The evaluation dataset.
- peft_config: The PEFT configuration.
- dataset_text_field: The name of the text field in the dataset.
- tokenizer: The tokenizer to use.
- args: The training arguments.
- packing: Whether to pack the training samples.
- max_seq_length: The maximum sequence length.

Once the SFTTrainer object is initialized, it can be used to train the model by calling the train() method

In [27]:
import re

def get_num_layers(model):
    numbers = set()
    for name, _ in model.named_parameters():
        for number in re.findall(r'\d+', name):
            numbers.add(int(number))
    return max(numbers)

def get_last_layer_linears(model):
    names = []

    num_layers = get_num_layers(model)
    for name, module in model.named_modules():
        if str(num_layers) in name and not "encoder" in name:
            if isinstance(module, torch.nn.Linear):
                names.append(name)
    return names

In [28]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [29]:
training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    evaluation_strategy="epoch"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=512,
)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [30]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained("micrsofot_phi2_sentiment_sim-model")

Epoch,Training Loss,Validation Loss
0,1.4172,1.227682


In [31]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

100%|██████████| 900/900 [04:41<00:00,  3.19it/s]

Accuracy: 0.791
Accuracy for label 0: 0.983
Accuracy for label 1: 0.537
Accuracy for label 2: 0.853

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.98      0.93       300
           1       0.82      0.54      0.65       300
           2       0.69      0.85      0.76       300

    accuracy                           0.79       900
   macro avg       0.80      0.79      0.78       900
weighted avg       0.80      0.79      0.78       900


Confusion Matrix:
[[295   4   1]
 [ 24 161 115]
 [ 13  31 256]]





In [None]:
evaluation = pd.DataFrame({'text': X_test["text"],
                           'y_true':y_true,
                           'y_pred': y_pred},
                         )
evaluation.to_csv("test_predictions.csv", index=False)