<a href="https://colab.research.google.com/github/bearbearyu1223/llm-fine-tuning-playground/blob/main/model_eval_finetune_falcon_7b_conversation_summarization_12_21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook evaluates a fine-tuned [Falcon-7b-sharded model](https://huggingface.co/vilsonrodrigues/falcon-7b-sharded) on the [samsum](https://huggingface.co/datasets/samsum) dataset. The model card can be found [here](https://huggingface.co/bearbearyu1223/falcon_7b_LoRA_r16_dialogue_summarization_12_13_2023).


## Install and Import Frameworks

In [None]:
!pip install huggingface_hub==0.19.4
!pip install -q -U accelerate git+https://github.com/huggingface/peft.git
!pip install transformers==4.36.0
!pip install datasets==2.15.0 Tokenizers==0.15.0
!pip install -q bitsandbytes
!pip install openai
!pip install --user -U nltk
!pip install py7zr
!pip install evaluate
!pip install rouge-score

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training
from transformers import TrainingArguments

#ignore warnings
import warnings
warnings.filterwarnings("ignore")

## Loading the Dataset from Hugging Face and Formatting the Training Dataset

---



In [None]:
# Hugging Face dataset identifier; the rest of the notebook reads the
# 'dialogue' and 'summary' fields of its samples.
dataset_name = "samsum"
dataset = load_dataset(dataset_name)

# Bind each split to its own name. Only `test_dataset` is consumed by the
# inference cells further down in this notebook.
train_dataset, eval_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Bare last expression so the notebook displays the split overview.
dataset

## Inference

In [None]:
# Load the quantized base model and attach the fine-tuned LoRA adapter.

# 4-bit NF4 weights with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE(review): the notebook introduction links the *_12_13_2023 model card
# while this cell loads the *_12_20_2023 adapter -- confirm which is intended.
PEFT_MODEL = "bearbearyu1223/falcon_7b_LoRA_r16_dialogue_summarization_12_20_2023"

# The adapter config records which base checkpoint it was trained on, so the
# base model id is read from it instead of being repeated here.
config = PeftConfig.from_pretrained(PEFT_MODEL)
peft_base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    return_dict=True,
)

# Wrap the base model with the adapter weights from the Hub.
peft_model = PeftModel.from_pretrained(peft_base_model, PEFT_MODEL)

In [None]:
# Tokenizer of the base checkpoint the adapter was trained on.
peft_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# When the tokenizer ships without a pad token, reuse EOS for padding and
# mirror that choice onto both model configs so they agree with the tokenizer.
if peft_tokenizer.pad_token_id is None:
    peft_tokenizer.pad_token = peft_tokenizer.eos_token
    for model_config in (peft_model.config, peft_base_model.config):
        model_config.pad_token_id = peft_tokenizer.eos_token_id
        model_config.pad_token = peft_tokenizer.eos_token


In [None]:
import re

# Anchored at the start of the text, this matches either
#   1. everything up to and including the LAST sentence terminator (. ? !), or
#   2. when the text has no terminator at all, the whole unterminated fragment
#      (it must start with a non-whitespace character).
# re.DOTALL lets "." cross newlines: without it, a multi-line text was cut
# down to its first line, or fell back to an unterminated fragment even though
# a complete sentence existed on a later line.
# Compiled once at definition time instead of on every call.
_COMPLETE_SENTENCE_REGEX = re.compile(r"(^.*[\.\?!]|^\S[^.\?!]*)", re.DOTALL)


def remove_incomplete_sentences(text):
  """Drop a trailing, unterminated sentence fragment from `text`.

  Args:
    text: The string to clean up (callers are expected to strip it first).

  Returns:
    The prefix of `text` that ends at its last '.', '?' or '!'. If `text`
    contains no terminator at all it is returned unchanged, provided it
    starts with a non-whitespace character. An empty string is returned
    when nothing matches (e.g. empty input).
  """
  match = _COMPLETE_SENTENCE_REGEX.match(text)
  return match.group(1) if match else ""

# Generate Summarization
def get_summary(dialogue, min_length=10, max_length=512, verbose=False):
  """Summarize one dialogue with the fine-tuned PEFT model.

  Args:
    dialogue: Raw dialogue text to summarize.
    min_length: Forwarded to GenerationConfig(min_length=...).
    max_length: Forwarded to GenerationConfig(max_length=...). Per the
      transformers documentation this limit counts prompt tokens as well
      as generated ones, so very long dialogues leave little or no room
      for the summary.
    verbose: When True, print the full prompt before generating.

  Returns:
    The generated summary with surrounding whitespace stripped and any
    trailing incomplete sentence removed.
  """
  prompt = "### Instruction:\n{instruction}\n\n### Dialogue:\n{dialogue}\n\n### Summary:\n".format(
      instruction="Summarize the Dialogue below.", dialogue=dialogue)
  if verbose:
    print(prompt)

  peft_encoding = peft_tokenizer(prompt, truncation=True, return_tensors="pt").to(torch.device("cuda:0"))

  # Decoding settings only. Sampling is enabled and no seed is set here, so
  # repeated calls may return different summaries.
  generation_config = GenerationConfig(
      do_sample=True,
      num_beams=2,
      no_repeat_ngram_size=3,
      max_length=max_length,
      min_length=min_length,
      pad_token_id=peft_tokenizer.eos_token_id,
      eos_token_id=peft_tokenizer.eos_token_id,
      temperature=0.1,
      top_p=0.9,
      repetition_penalty=30.0,
      num_return_sequences=1,
  )

  # The attention mask is a model input, not a generation setting: it has to
  # be passed to generate() directly. Stored on GenerationConfig it was never
  # forwarded to the model.
  peft_outputs = peft_model.generate(
      input_ids=peft_encoding.input_ids,
      attention_mask=peft_encoding.attention_mask,
      generation_config=generation_config,
  )

  # The returned sequence starts with the prompt tokens; decode only what was
  # generated after them. This avoids searching the decoded text for the
  # prompt's "### Summary:" marker, which raised IndexError when truncation
  # had removed the marker and picked the wrong segment when the dialogue
  # itself contained it.
  prompt_token_count = peft_encoding.input_ids.shape[1]
  generated_text = peft_tokenizer.decode(peft_outputs[0][prompt_token_count:], skip_special_tokens=True)

  # If the model starts repeating the prompt template, keep only the text
  # before the repeated marker.
  sub = "### Summary:"
  raw_summary = generated_text.split(sub)[0]
  post_processed_summary = remove_incomplete_sentences(raw_summary.strip())

  return post_processed_summary

### Model Inferences

In [None]:
from tqdm import tqdm
import nltk
# NOTE(review): nothing in this notebook calls nltk directly; the punkt data
# is presumably needed by the ROUGE scorer -- confirm before removing.
nltk.download('punkt')
# Registers DataFrame.progress_apply, which the LLM-scoring cell relies on.
tqdm.pandas()

# Spot-check one test example before running the whole test split.
test_index = 11
sample = test_dataset[test_index]
dialogue, summary = sample['dialogue'], sample['summary']
peft_output = get_summary(dialogue, verbose=True)

# Print the reference and the model output under their headings.
for heading, text in (("Human Summary:", summary), ("PEFT Summary:", peft_output)):
  print(heading)
  print(text)

In [None]:
# Parallel lists: position i holds the dialogue, the human reference summary
# and the model summary of the i-th test sample.
inputs, references, predictions = [], [], []

for sample in tqdm(test_dataset):
  # Generate first so a failure leaves the three lists the same length.
  peft_summary = get_summary(dialogue=sample['dialogue'])
  inputs.append(sample['dialogue'])
  references.append(sample['summary'])
  predictions.append(peft_summary)

In [None]:
import pandas as pd

# One column per parallel list built by the inference loop. The mapping is
# deliberately NOT named `dict`: that would shadow the builtin for every
# cell executed afterwards.
results_by_column = {
    'inputs': inputs,
    'summary_human_baseline': references,
    'summary_peft_baseline': predictions,
}
df = pd.DataFrame(results_by_column)

# Persist the raw generations so evaluation can be redone without re-running
# the inference loop.
df.to_csv('falcon_7b_LoRA_r16_dialogue_summarization_12_21_2023_results.csv')

### Evaluation

#### Evaluation via ROUGE Score

In [None]:
import evaluate

# Score the model summaries against the human references with ROUGE.
metric = evaluate.load('rouge')
results = metric.compute(predictions=predictions, references=references)

# Report each score as a percentage with two decimals. The original spec
# ":2f" meant "minimum width 2" and printed six decimals, and the first
# label was misspelled "Rogue1".
for rouge_key in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
  print(f"{rouge_key}: {results[rouge_key] * 100:.2f}%")

#### Evaluation via LLMs

In [None]:
import pandas as pd
from openai import OpenAI
import os
import re

In [None]:
# Evaluation prompt template.
# Filled in by get_geval_score() via str.format with five placeholders:
# {criteria}, {steps}, {conversation}, {summary} and {metric_name}.
# The prompt ends with "- {metric_name}" so the judge model's short reply is
# expected to carry the score for that metric.
EVALUATION_PROMPT_TEMPLATE = """
You will be given one summary written for an conversation. Your task is to rate the summary on one metric.
Please make sure you read and understand these instructions very carefully.

Evaluation Criteria:

{criteria}

Evaluation Steps:

{steps}

Example:

Source Text:

{conversation}

Summary:

{summary}

Evaluation Form (scores ONLY):

- {metric_name}
"""

# Metric 1: Relevance
# CRITERIA is the 1-5 rubric text substituted for {criteria} in the prompt
# template; STEPS is the judging procedure substituted for {steps}.
# The trailing backslashes inside the literal join the first sentences into
# one line of prompt text.

RELEVANCY_SCORE_CRITERIA = """
Relevance(1-5) - selection of important content from the source. \
The summary should include only important information from the source conversation. \
Annotators were instructed to penalize summaries which contained redundancies and excess information.
1: Poor.
  - The summary includes very little relevant content from the source conversation. It fails to capture essential details and main points. It contains a significant amount of irrelevant or redundant information.

2: Limited.
  - The summary includes some relevant content but overlooks key details and main points from the source. There may be redundancies or minor irrelevant information present. The selection of important content is inadequate.

3: Moderate.
  - The summary includes a moderate amount of relevant content from the source conversation. It captures the main points but may lack depth in some areas. There might be occasional redundancies or minor irrelevant information.

4: Good.
  - The summary includes a good amount of relevant content from the source conversation. It effectively captures and conveys the main points and key details. Redundancies and irrelevant information are minimal or non-disruptive.

5: Excellent.
  - The summary demonstrates excellent relevance by selecting and including all important content from the source conversation. It provides a clear and concise representation of the main points and key details. There are no redundancies or excess information; every sentence contributes to the overall understanding.
"""

RELEVANCY_SCORE_STEPS = """
1. Read the summary and the source conversation carefully.
2. Compare the summary to the source conversation and identify the main points of the conversation.
3. Assess how well the summary covers the main points of the conversation, and how much irrelevant or redundant information it contains.
4. Assign a relevance score from 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
"""

# Metric 2: Coherence
# CRITERIA is the 1-5 rubric text substituted for {criteria} in the prompt
# template; STEPS is the judging procedure substituted for {steps}.

COHERENCE_SCORE_CRITERIA = """
Coherence(1-5) - the collective quality of all sentences. \
The summary should be well-structured and well-organized. \
The summary should not just be a heap of related information, but should build from sentence to a coherent body of information about a topic. \
Annotators were also instructed to penalize summaries that were not logically well-organized based on the information from the conversation

1: Poor.
  - The summary is highly disorganized and lacks any logical flow. Sentences are disconnected, making it difficult to follow the conversation. Information is scattered, and the summary fails to build a coherent narrative.

2: Limited.
  - The summary has some organization, but it is still challenging to follow. There is a loose attempt to group related sentences, but transitions are weak. The summary does not effectively build a coherent body of information.

3: Moderate.
  - The summary demonstrates a moderate level of organization. Sentences are somewhat connected, making it somewhat easier to follow. There is an attempt to build a coherent narrative, but it could be improved.

4: Good.
  - The summary is well-structured and organized. Sentences are logically connected, leading to a clear and coherent narrative. Information is presented in a way that progressively builds upon the previous sentences.

5: Excellent.
  - The summary is highly organized and exceptionally coherent. Sentences flow seamlessly, creating a smooth and logical progression of information. It effectively builds a cohesive and complete narrative that is easy to follow and understand.
"""

COHERENCE_SCORE_STEPS = """
1. Read the conversation carefully and identify the main topic and key points.
2. Read the summary and compare it to the conversation. Check if the summary covers the main topic and key points of the conversation,
and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
"""

# Metric 3: Consistency
# CRITERIA is the 1-5 rubric text substituted for {criteria} in the prompt
# template; STEPS is the judging procedure substituted for {steps}.
# NOTE(review): levels 3-5 below are written "3." / "4." / "5." while levels
# 1-2 (and the other rubrics) use "1:" / "2:". This is prompt text, so it is
# left untouched here -- confirm whether the inconsistency is intended.

CONSISTENCY_SCORE_CRITERIA = """
Consistency(1-5) - the factual alignment between the summary and the summarized source. \
A factually consistent summary contains only statements that are entailed by the source conversation. \
Annotators were also asked to penalize summaries that contained made-up facts or unsupported details from the original discussion.
1: Poor.
  - The summary contains numerous factual errors that do not match the source conversation. It includes fabricated or entirely false information. There is a substantial disparity between the summary and the original conversation.

2: Limited.
  - The summary has multiple factual errors or inconsistencies with the source conversation. It may incorporate some made-up facts or unsupported details from the original discussion. The alignment between the summary and source is weak.

3. Moderate.
  - The summary shows moderate consistency with the source conversation. While it generally aligns with the original discussion, occasional factual errors or minor discrepancies may appear. Some statements in the summary may lack full support from the source.

4. Good.
  - The summary is factually consistent with the source conversation. It accurately reflects the main points and details of the original discussion. Factual inaccuracies or hallucinated facts are minimal or non-existent.

5. Excellent.
  - The summary exhibits excellent consistency with the source conversation. It precisely and faithfully represents the factual content and details of the original discussion. There are no factual inaccuracies or hallucinated facts, and the summary aligns perfectly with the source.
"""

CONSISTENCY_SCORE_STEPS = """
1. Read the conversation carefully and identify the main facts and details it presents.
2. Read the summary and compare it to the conversation. Check if the summary contains any factual errors that are not supported by the conversation.
3. Assign a score for consistency based on the Evaluation Criteria, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
"""

# Metric 4: Fluency
# CRITERIA is the 1-5 rubric text substituted for {criteria} in the prompt
# template; STEPS is the judging procedure substituted for {steps}.
# Unlike the other metrics, the steps text only asks the judge to read the
# summary, not the conversation.

FLUENCY_SCORE_CRITERIA = """
Fluency(1-5): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure. \
Annotators were also asked to penalize summaries that contained grammar, spelling, and punctuation errors.
1: Poor.
 - The summary contains numerous grammar, spelling, or punctuation errors. It may lack completeness, making it hard to understand. Word choice and sentence structure severely affect coherence.

2: Limited.
 - The summary has several grammar, spelling, or punctuation issues. These issues significantly impact comprehension. Word choice and sentence structure need improvement.

3: Moderate.
 - The summary shows moderate fluency with some noticeable language problems. Grammar, spelling, or punctuation errors don't severely hinder understanding. Word choice and sentence structure are generally acceptable but may require refinement.

4: Good.
 - The summary is fluently written with minimal language errors. Language issues have minimal impact on comprehension. Word choice and sentence structure enhance clarity.

5: Excellent.
 - The summary demonstrates excellent fluency in grammar, spelling, punctuation, word choice, and sentence structure. It is impeccably written with no language-related issues. Word choice and sentence structure greatly enhance overall quality and readability.
"""

FLUENCY_SCORE_STEPS = """
Read the summary and evaluate its fluency. Assign a fluency score from 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
"""

In [None]:
# Never paste the API key into the notebook: it would be saved with the file
# and can leak when the notebook is shared or committed. Read it from the
# OPENAI_API_KEY environment variable and, if that is unset or empty, ask for
# it interactively without echoing it.
from getpass import getpass

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "),
)

In [None]:
def get_geval_score(
    criteria: str, steps: str, conversation: str, summary: str, metric_name: str, verbose=False
):
    """Ask the judge model to rate one summary on one metric.

    The evaluation prompt template is filled with the given rubric, steps,
    conversation and summary, then sent as a single user message to
    gpt-3.5-turbo (temperature 0, at most 5 completion tokens).

    Args:
        criteria: Rubric text inserted for {criteria}.
        steps: Judging procedure inserted for {steps}.
        conversation: Source conversation the summary was written for.
        summary: Summary to be rated.
        metric_name: Metric label inserted for {metric_name} and used in
            the verbose printout.
        verbose: When True, print the metric name and the parsed score.

    Returns:
        The first run of digits in the reply converted to int, or pd.NA
        when the reply is empty or contains no digits.
    """
    filled_prompt = EVALUATION_PROMPT_TEMPLATE.format(
        criteria=criteria,
        steps=steps,
        metric_name=metric_name,
        conversation=conversation,
        summary=summary,
    )
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": filled_prompt}],
        temperature=0,
        max_tokens=5,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    reply = completion.choices[0].message.content

    # Scan the reply once; an empty or missing reply yields no digit runs.
    digit_runs = re.findall("[0-9]+", reply) if reply else []
    score_num = int(digit_runs[0]) if digit_runs else pd.NA

    if verbose:
        print('\n{0}:{1}'.format(metric_name, score_num))
    return score_num

In [None]:
# Metric name -> (rubric text, judging steps). Insertion order is the order
# in which the scoring loop evaluates the metrics.
evaluation_metrics = {
    metric_label: (criteria_text, steps_text)
    for metric_label, criteria_text, steps_text in (
        ("Relevance", RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS),
        ("Coherence", COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
        ("Consistency", CONSISTENCY_SCORE_CRITERIA, CONSISTENCY_SCORE_STEPS),
        ("Fluency", FLUENCY_SCORE_CRITERIA, FLUENCY_SCORE_STEPS),
    )
}

In [None]:
# Score the human reference summaries and the PEFT model summaries with the
# LLM judge, one new column per (metric, summary source) pair.

# (score-column suffix, source column in df, label used in the progress
#  message, label used in the result message)
SUMMARY_VARIANTS = (
    ("_human_summary", "summary_human_baseline", "human", "Human Summary"),
    ("_peft_model_summary", "summary_peft_baseline", "PEFT", "PEFT Model Summary"),
)

# Separator line of 99 dashes, built once instead of on every iteration.
dash_line = "-" * 99

# The loop variable is `metric_name`, not `metric`: reusing `metric` would
# overwrite the ROUGE evaluator object created in the ROUGE cell and break
# that cell when it is re-run.
for metric_name, (criteria, steps) in evaluation_metrics.items():
    for col_suffix, summary_col, progress_label, result_label in SUMMARY_VARIANTS:
        score_col = metric_name + col_suffix
        print(dash_line)
        print("Generating {0} baseline summarization score for {1}".format(progress_label, metric_name))

        # One judge call per row; progress_apply shows a tqdm progress bar.
        scores = df.progress_apply(
            lambda row: get_geval_score(criteria=criteria, steps=steps, conversation=row.inputs,
                                        summary=row[summary_col],
                                        metric_name=metric_name), axis=1)

        # get_geval_score returns pd.NA for unparseable replies. Use the
        # nullable integer dtype so describe() still reports numeric
        # statistics instead of object-dtype counts.
        df[score_col] = scores.astype("Int64")

        print("{0} Score for {1} : {2}".format(result_label, metric_name, df[score_col].describe()))

df.to_csv('falcon_7b_LoRA_r16_dialogue_summarization_12_21_2023_results_eval.csv')