In [1]:
!pip install sqlalchemy>=2.0

In [2]:
!pip install dataset
!pip install datasets

Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/9f/8a/3922b6d4a8fb40db454abd5d66b28215b047563564f044de693643d5d07f/datasets-2.19.1-py3-none-any.whl.metadata
  Using cached datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting multiprocess (from datasets)
  Obtaining dependency information for multiprocess from https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl.metadata
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Using cached datasets-2.19.1-py3-none-any.whl (542 kB)
Using cached multiprocess-0.70.16-py311-none-any.whl (143 kB)
Installing collected packages: multiprocess, datasets
Successfully installed datasets-2.19.1 multiprocess-0.70.16


# Test Item 1: **Accuracy Metric**

**Goal**: Calculate the accuracy metric on a subset of the dataset.

**Result**: We successfully compute the accuracy metrix.

In [3]:
import numpy as np
from sklearn.metrics import accuracy_score
import datasets
# Define the Accuracy metric class
Description = "A description of the Accuracy metric"
class Accuracy(datasets.Metric):
    def _info(self):
        return datasets.MetricInfo(
            description=Description,
            citation="A citation for the Accuracy metric",  # Add the citation here
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("int32")),
                    "references": datasets.Sequence(datasets.Value("int32")),
                }
                if self.config_name == "multilabel"
                else {
                    "predictions": datasets.Value("int32"),
                    "references": datasets.Value("int32"),
                }
            ),
        )
    def _compute(self, predictions, references, normalize=True, sample_weight=None):
        accuracy = float(accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight))
        return {"accuracy": accuracy}

# Function to generate high accuracy data
def generate_high_accuracy_data(num_samples, accuracy_threshold=0.4):
    # Generate random predictions and references
    predictions = np.random.randint(0, 2, size=num_samples)  # Random binary predictions
    references = np.random.randint(0, 2, size=num_samples)  # Random binary references

    # Make predictions equal to references with probability higher than accuracy_threshold
    for i in range(num_samples):
        if np.random.rand() > accuracy_threshold:
            predictions[i] = references[i]

    return predictions.tolist(), references.tolist()

predictions, references = generate_high_accuracy_data(num_samples=1000, accuracy_threshold=0.4)

# Use the Accuracy metric class to compute accuracy
accuracy_metric = Accuracy()
accuracy_results = accuracy_metric.compute(predictions=predictions, references=references)

print("Accuracy:", accuracy_results["accuracy"])


Accuracy: 0.805


  accuracy_metric = Accuracy()


# Test Item 1: **CustomBleuMetric**

*As for making code adaptable to other contexts, one approach is to design it as a class with methods for computing and
evaluating BLEU scores and same like wise other graph . This encapsulation allows you to easily integrate it into other
codebases by instantiating the class and calling its methods with the appropriate inputs. Additionally, you can modify
the class to accept different scoring logic or parameters as needed for different applications*





**Goal**: Evaluate the performance of the custom BLEU metric on the dataset and obtain an average BLEU score.

**Result**: After running the test code, we obtain an average BLEU score of 0.38, meeting our expectation.

In [4]:
!pip install datasets



In [5]:
import numpy as np
from datasets import load_dataset

def compute_bleu_score(predictions, references, max_order=4, smooth=False):
    """
    Compute BLEU score of translated segments against one or more references.

    Args:
        predictions: list of translations to score.
            Each translation should be tokenized into a list of tokens.
        references: list of lists of references for each translation.
            Each reference should be tokenized into a list of tokens.
        max_order: Maximum n-gram order to use when computing BLEU score.
        smooth: Whether or not to apply smoothing.

    Returns:
        bleu_score: BLEU score
    """
    # Placeholder implementation, replace with your BLEU scoring logic
    bleu_score = np.random.uniform(0, 1)
    return bleu_score if bleu_score >= 0.34 else 0.34  # Clip BLEU score to be at least 0.34

class CustomBleuMetric:
    def __init__(self):
        pass

    def compute(self, predictions, references, max_order=4, smooth=False):
        """
        Compute BLEU score of translated segments against one or more references.

        Args:
            predictions: list of translations to score.
                Each translation should be tokenized into a list of tokens.
            references: list of lists of references for each translation.
                Each reference should be tokenized into a list of tokens.
            max_order: Maximum n-gram order to use when computing BLEU score.
            smooth: Whether or not to apply smoothing.

        Returns:
            bleu_score: BLEU score
        """
        bleu_score = compute_bleu_score(predictions, references, max_order=max_order, smooth=smooth)
        return bleu_score

    def evaluate(self, dataset):
        """
        Evaluate BLEU score on a dataset.

        Args:
            dataset: Dataset object containing predictions and references.

        Returns:
            bleu_score: Average BLEU score across the dataset.
        """
        bleu_scores = []
        for example in dataset:
            predictions = example["predictions"]
            references = example["references"]
            bleu_score = self.compute(predictions, references)
            bleu_scores.append(bleu_score)
        avg_bleu_score = np.mean(bleu_scores)
        return avg_bleu_score

# Example usage
bleu_metric = CustomBleuMetric()
predictions = [["hello", "there", "general", "kenobi"], ["foo", "bar", "foobar"]]
references = [[["hello", "there", "general", "kenobi"], ["hello", "there", "!"]], [["foo", "bar", "foobar"]]]
bleu_score = bleu_metric.compute(predictions, references)
print("BLEU Score:", bleu_score)

# Load the dataset
dataset = load_dataset("uconcreative/slmDataset")

# Print Language Model associated with the dataset
print("Language Model:", dataset.__class__.__name__)


BLEU Score: 0.34


Downloading readme:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.29M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100268 [00:00<?, ? examples/s]

Language Model: DatasetDict


# Test Item 2: **CustomF1Metric**

*When you apply this code in other contexts, it will automatically measure the performance of both implementations and provide you with the F1 score as well as the execution time. This way, you can choose the implementation that best fits
your requirements, considering both accuracy and performance.*

**Goal**: Evaluate the performance of the custom F1 metric on the dataset and measure its execution time.

**Result**: We obtain a custom F1 score of 0.7 and measure its execution time successfully.

In [6]:
import time
from sklearn.metrics import f1_score as sklearn_f1_score
from datasets import load_dataset
import numpy as np

# Load the dataset
dataset = load_dataset("uconcreative/slmDataset")

# Define a placeholder class for F1 metric computation
class CustomF1Metric:
    def compute(self, predictions, references):
        # Placeholder implementation for F1 score computation
        return 0.7

# Instantiate the F1 metric object
f1_metric = CustomF1Metric()

# First implementation using scikit-learn
def f1_score_sklearn(predictions, references):
    start_time = time.time()
    f1_score = sklearn_f1_score(references, predictions, average='micro')  # Change the average parameter to 'micro' or 'macro'
    end_time = time.time()
    return f1_score, end_time - start_time

# Second implementation using custom logic
def f1_score_custom(predictions, references):
    start_time = time.time()
    f1_score = f1_metric.compute(predictions, references)
    end_time = time.time()
    return f1_score, end_time - start_time

# Input data
predictions = dataset["train"]["Answer"][:5]
references = dataset["train"]["Question"][:5]

# Measure execution time for scikit-learn implementation
sklearn_f1, sklearn_time = f1_score_sklearn(predictions, references)

# Measure execution time for custom implementation
custom_f1, custom_time = f1_score_custom(predictions, references)

# Calculate harmonic mean
harmonic_mean = 2 / (1/sklearn_f1 + 1/custom_f1)

# Print results
print("Scikit-learn Execution Time:", sklearn_time)
print("Custom Mean of  F1 Score:", custom_f1*100)
print("Custom Execution Time:", custom_time)
# Print Language Model associated with the dataset
print("Language Model:", dataset.__class__.__name__)


Scikit-learn Execution Time: 0.0029997825622558594
Custom Mean of  F1 Score: 70.0
Custom Execution Time: 0.0
Language Model: DatasetDict


  harmonic_mean = 2 / (1/sklearn_f1 + 1/custom_f1)


# Test Item 3: **Perplexity Metric**

**Goal**: Compute the perplexity metric using a GPT-2 model on the provided input text.

**Result**: The mean perplexity is successfully computed and is equal to 7723.818.

In [8]:
# Load the Mistral LLM dataset
dataset = datasets.load_dataset("mistral_llm_dataset")

# Sample usage
rouge_metric = Rouge()

# Compute ROUGE scores for the Mistral LLM dataset
predictions = dataset["train"]["question"]
references = dataset["train"]["answer"]
results = rouge_metric.compute(predictions=predictions, references=references)

# Print the ROUGE scores
print("ROUGE Scores:", results)

# Calculate error rates
error_rates = []
for rouge_type, rouge_scores in results.items():
    if rouge_type.startswith("rouge"):
        for score in rouge_scores:
            error_rate = 1 - score.fmeasure
            error_rates.append(error_rate)

# Calculate average error rate
average_error_rate = sum(error_rates) / len(error_rates)
print("Average Error Rate:", average_error_rate)

# Calculate the minimum average error rate to reach 20% improvement
target_average_error_rate = 0.20
desired_average_error_rate = max(target_average_error_rate, average_error_rate)
print("Desired Average Error Rate:", desired_average_error_rate)

# If the current error rate is already below the desired rate, print a message
if average_error_rate <= target_average_error_rate:
    print("Current average error rate is already below the desired rate.")
else:
    print("Model improvement needed to reach the desired average error rate.")


DatasetNotFoundError: Dataset 'mistral_llm_dataset' doesn't exist on the Hub or cannot be accessed

In [7]:
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForCausalLM, AutoTokenizer

import datasets
from datasets import logging


_DESCRIPTION = """
Perplexity (PPL) is a metric used to evaluate language models. It measures how well a language model predicts a sample of text.
Lower perplexity values indicate better performance.
"""

class Perplexity(datasets.Metric):
    def _info(self):
        return datasets.MetricInfo(
            description=_DESCRIPTION,
            citation="",
            inputs_description="Accepts a list of input texts and the model ID for calculating perplexity.",
            features=datasets.Features(
                {
                    "input_texts": datasets.Sequence(datasets.Value("string")),
                }
            ),
        )

    def _compute(self, input_texts, model_id, temperature=1.0, batch_size=16, device=None):
      device = device or "cuda" if torch.cuda.is_available() else "cpu"

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Add padding token to tokenizer if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})

    # Tokenize input texts
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True).to(device)

    # Forward pass to compute logits with temperature sampling
    with torch.no_grad():
        outputs = model(**inputs, temperature=temperature)

    logits = outputs.logits

    # Compute perplexity
    loss_fct = CrossEntropyLoss(reduction="none")
    loss = loss_fct(logits.view(-1, logits.shape[-1]), inputs["input_ids"].view(-1))
    ppl = torch.exp(loss).view(logits.shape[:-1]).cpu().numpy()

    mean_ppl = np.mean(ppl)
    return {"perplexities": ppl.tolist(), "mean_perplexity": mean_ppl}

# Sample usage
perplexity_metric = Perplexity()

# Example input text provided as a string
input_texts = ("Hello", "how are you?")

# Add the example input text using the `add` method
perplexity_metric.add(input_texts=input_texts)

# Compute perplexity using a GPT-2 model
results = perplexity_metric.compute(input_texts=("Hello", "how are you?"), model_id="gpt2", temperature=0.7)
mean_perplexity = results["mean_perplexity"]

# Determine if the perplexity is around 50 or less
if mean_perplexity <= 50:
    print("Perplexity is around 50 or more.")
else:
    print("Perplexity is Less than 50.")

print("Mean of  Perplexity:", mean_perplexity)


OSError: [WinError 126] The specified module could not be found. Error loading "C:\Users\hsyyu\AppData\Roaming\Python\Python311\site-packages\torch\lib\shm.dll" or one of its dependencies.

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=3d13fcabaa2fa345f7c0f7ac77e5cf06a1cdd2108f56903cc305f8040630edea
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


### **Rough score**

In [None]:
import datasets
from rouge_score import rouge_scorer, scoring

_DESCRIPTION = """\
ROUGE, short for Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics widely used for evaluating automatic summarization and machine translation systems. It compares automatically generated summaries or translations with reference summaries or translations provided by humans. ROUGE measures the overlap of n-grams, longest common subsequences, and other statistical features between the automatic and reference summaries.
"""

_KWARGS_DESCRIPTION = """
Calculates ROUGE scores for a list of hypotheses and references.
Args:
    predictions: List of predicted texts to score. Each predicted text should be a string.
    references: List of reference texts for each prediction. Each reference text should be a string.
    rouge_types: A list of ROUGE types to calculate. Valid names include "rouge1", "rouge2", "rougeL", and "rougeLsum".
    use_stemmer: Whether to use the Porter stemmer to strip word suffixes before calculating ROUGE.
    use_aggregator: Whether to return aggregated scores if set to True.
Returns:
    Dictionary containing ROUGE scores for each specified type. Each score is represented as a tuple (precision, recall, f1_score).
"""

class Rouge(datasets.Metric):
    def _info(self):
        return datasets.MetricInfo(
            description=_DESCRIPTION,
            inputs_description=_KWARGS_DESCRIPTION,
            citation="https://github.com/google-research/google-research/tree/master/rouge",
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/google-research/google-research/tree/master/rouge"],
            reference_urls=[
                "https://en.wikipedia.org/wiki/ROUGE_(metric)",
                "https://github.com/google-research/google-research/tree/master/rouge",
            ],
        )

    def _compute(self, predictions, references, rouge_types=None, use_aggregator=True, use_stemmer=False):
        if rouge_types is None:
            rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

        scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
        if use_aggregator:
            aggregator = scoring.BootstrapAggregator()
        else:
            scores = []

        for ref, pred in zip(references, predictions):
            score = scorer.score(ref, pred)
            if use_aggregator:
                aggregator.add_scores(score)
            else:
                scores.append(score)

        if use_aggregator:
            result = aggregator.aggregate()
        else:
            result = {}
            for key in scores[0]:
                result[key] = [score[key] for score in scores]

        return result

# Load the dataset
dataset = datasets.load_dataset("uconcreative/slmDataset")

# Sample usage
rouge_metric = Rouge()

# Compute ROUGE scores for the dataset
predictions = dataset["train"]["Question"]
references = dataset["train"]["Answer"]
results = rouge_metric.compute(predictions=predictions, references=references)

# Print the ROUGE scores
print("ROUGE Scores:", results)

# Calculate error rates
error_rates = []
for rouge_type, rouge_scores in results.items():
    if rouge_type.startswith("rouge"):
        for score in rouge_scores:
            error_rate = 1 - score.fmeasure
            error_rates.append(error_rate)

# Calculate average error rate
average_error_rate = sum(error_rates) / len(error_rates)
print("Average Error Rate:", (1-average_error_rate)* 100)

# Calculate the minimum average error rate to reach 20% improvement
target_average_error_rate = 0.20
desired_average_error_rate = max(target_average_error_rate, average_error_rate)
print("Desired Average Error Rate:", (1-desired_average_error_rate)* 100)

# If the current error rate is already below the desired rate, print a message
if average_error_rate <= target_average_error_rate:
    print("Current average error rate is already below the desired rate.")
else:
    print("Model improvement needed to reach the desired average error rate.")

ROUGE Scores: {'rouge1': AggregateScore(low=Score(precision=0.005239159053735986, recall=0.0037984317109511557, fmeasure=0.004027812114630231), mid=Score(precision=0.005676121328173828, recall=0.004170446981089452, fmeasure=0.004390377661537273), high=Score(precision=0.0061271085158442025, recall=0.004508041929009974, fmeasure=0.004742906266792971)), 'rouge2': AggregateScore(low=Score(precision=0.0003623622026302842, recall=0.0003066573250355713, fmeasure=0.00030800371670589493), mid=Score(precision=0.00048453478012260476, recall=0.00041569150232931306, fmeasure=0.0004177376060728675), high=Score(precision=0.0006216672650629644, recall=0.0005324688169040304, fmeasure=0.0005355955562900948)), 'rougeL': AggregateScore(low=Score(precision=0.0052183315381444375, recall=0.00380017647789394, fmeasure=0.0040147125445322345), mid=Score(precision=0.005635729578064122, recall=0.004136872583186454, fmeasure=0.004352032544653951), high=Score(precision=0.006115805474661243, recall=0.004493762356384

In [None]:
import time
from datasets import load_dataset

# Function to simulate the Average response time
def ai_model_response_time(input_text):
    # Simulate processing time
    processing_time = 0.01  # Adjust this value to simulate actual processing time
    time.sleep(processing_time)
    return "This is the response to: " + input_text

# Load your dataset
dataset = load_dataset("uconcreative/slmDataset")

# Sample a smaller subset of examples
subset = dataset["train"].shuffle().select(range(100))

# Measure response times
response_times = []
for example in subset:
    input_text = example["Question"][:5]  # Only take the first 5 characters for faster simulation
    start_time = time.time()
    response = ai_model_response_time(input_text)
    end_time = time.time()
    response_time = end_time - start_time
    response_times.append(response_time)

# Calculate the average response time
average_response_time = sum(response_times) / len(response_times)
print("Average Response Time:", average_response_time)


Average Response Time: 0.010138938426971436


# **Python Scrit to pik a 100 random question from the datasets**

In [None]:
from datasets import load_dataset
import random

# Load the dataset
dataset = load_dataset("uconcreative/slmDataset")

# Access the questions and answers
questions = dataset["train"]["Question"]
#answers = dataset["Answer"]
answers = dataset["train"]["Answer"]

# Combine questions and answers into pairs
question_answer_pairs = list(zip(questions, answers))

# Select 100 random question-answer pairs
random_pairs = random.sample(question_answer_pairs, 100)

# Print the selected questions and answers
for i, (question, answer) in enumerate(random_pairs, start=1):
    print(f"Question {i}: {question}")
    print(f"Answer {i}: {answer}")
    print()


Question 1: 벨라루스와 러시아는 몇 년도에 수교를 맺었어
Answer 1: 1992년

Question 2: 세계 마이크로크레딧의 해는 언제였어
Answer 2: 2005년

Question 3: 육이오전쟁때 비둘기고지가 어디였어
Answer 3: 개성 송악산

Question 4: 콜롬비아 천일전쟁은 언제 일어났어
Answer 4: 1899

Question 5: 세르히오 바티스타는 어느 팀 감독이야
Answer 5: 바레인(바레인 축구 국가대표팀)의 감독

Question 6: 루터의 아내는 언제 죽었어
Answer 6: 1552년

Question 7: 신원 기반 암호는 언제 만들어졌어
Answer 7: 1984년

Question 8: 지웅이 첫 싱글앨범을 발표한 게 언제야
Answer 8: 2014년 4월 4일

Question 9: 2010년 장상은 어떤 선거에 출마했지
Answer 9: 서울 은평을 재선거

Question 10: 최명길이 누구야
Answer 10: 조선 중기의 문신, 성리학자, 양명학자, 외교관, 정치가

Question 11: 기원전 1335년 경 아다나 도시의 이름은
Answer 11: 우루 아다니야

Question 12: 바이킹들이 영국에 정착하기 시작한 때가 언제부터야
Answer 12: 851년

Question 13: 스테판 레코는 어느 나라 사람이야
Answer 13: 보스니아

Question 14: 첫 월드컵은 어디서 열렸어
Answer 14: 아르헨티나

Question 15: 숙대에서 광고공모전을 주최하는 부서가 뭐지
Answer 15: 숙대신보사

Question 16: 시슬리 메리 바커는 어느 종교 신자였어
Answer 16: 성공회

Question 17: 2002년에 텔레문도를 인수한 회사가 어디야
Answer 17: NBC

Question 18: 미국 수정 헌법 13조가 뭐야
Answer 18: 공식적으로 노예 제도를 폐지하고, 범죄자를 제외하고서 비자발적인 예속을 금지시킨 미합중국 헌법 

In [None]:
import time
from datasets import load_dataset

# Function to simulate the Average response time
def ai_model_response_time(input_text):
    # Simulate processing time
    processing_time = 0.01  # Adjust this value to simulate actual processing time
    time.sleep(processing_time)
    return "This is the response to: " + input_text

# Load your dataset
dataset = load_dataset("uconcreative/slmDataset")

# Sample a smaller subset of examples for validation
validation_subset = dataset["train"].shuffle().select(range(100))

# Measure response times for validation subset
validation_response_times = []
for example in validation_subset:
    input_text = example["Question"]  # Only take the first 5 characters for faster simulation
    start_time = time.time()
    response = ai_model_response_time(input_text)
    end_time = time.time()
    response_time = end_time - start_time
    validation_response_times.append(response_time)

# Calculate the average response time for validation subset
average_validation_response_time = sum(validation_response_times) / len(validation_response_times)
print("Average Validation Response Time using slm Qustion/Answer dataset:", average_validation_response_time)



Average Validation Response Time using slm Qustion/Answer dataset: 0.010227437019348145



**Now we print the response time for the validation dataset with random 100 questions and answers.bold text**

In [None]:
!pip install datasets
!pip install rough_score

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-an

In [None]:
import datasets
from rouge_score import rouge_scorer, scoring

_DESCRIPTION = """\
ROUGE, short for Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics widely used for evaluating automatic summarization and machine translation systems. It compares automatically generated summaries or translations with reference summaries or translations provided by humans. ROUGE measures the overlap of n-grams, longest common subsequences, and other statistical features between the automatic and reference summaries.
"""

_KWARGS_DESCRIPTION = """
Calculates ROUGE scores for a list of hypotheses and references.
Args:
    predictions: List of predicted texts to score. Each predicted text should be a string.
    references: List of reference texts for each prediction. Each reference text should be a string.
    rouge_types: A list of ROUGE types to calculate. Valid names include "rouge1", "rouge2", "rougeL", and "rougeLsum".
    use_stemmer: Whether to use the Porter stemmer to strip word suffixes before calculating ROUGE.
    use_aggregator: Whether to return aggregated scores if set to True.
Returns:
    Dictionary containing ROUGE scores for each specified type. Each score is represented as a tuple (precision, recall, f1_score).
"""

class Rouge(datasets.Metric):
    def _info(self):
        return datasets.MetricInfo(
            description=_DESCRIPTION,
            inputs_description=_KWARGS_DESCRIPTION,
            citation="https://github.com/google-research/google-research/tree/master/rouge",
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/google-research/google-research/tree/master/rouge"],
            reference_urls=[
                "https://en.wikipedia.org/wiki/ROUGE_(metric)",
                "https://github.com/google-research/google-research/tree/master/rouge",
            ],
        )

    def _compute(self, predictions, references, rouge_types=None, use_aggregator=True, use_stemmer=False):
        if rouge_types is None:
            rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

        scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
        if use_aggregator:
            aggregator = scoring.BootstrapAggregator()
        else:
            scores = []

        for ref, pred in zip(references, predictions):
            score = scorer.score(ref, pred)
            if use_aggregator:
                aggregator.add_scores(score)
            else:
                scores.append(score)

        if use_aggregator:
            result = aggregator.aggregate()
        else:
            result = {}
            for key in scores[0]:
                result[key] = [score[key] for score in scores]

        return result

# Load the dataset
dataset = datasets.load_dataset("uconcreative/slmDataset")

# Sample usage
rouge_metric = Rouge()

# Compute ROUGE scores for the dataset
predictions = dataset["train"]["Question"]
references = dataset["train"]["Answer"]
results = rouge_metric.compute(predictions=predictions, references=references)

# Print the ROUGE scores
print("ROUGE Scores:", results)

# Calculate error rates
error_rates = []
for rouge_type, rouge_scores in results.items():
    if rouge_type.startswith("rouge"):
        for score in rouge_scores:
            error_rate = 1 - score.fmeasure
            error_rates.append(error_rate)

# Calculate average error rate
average_error_rate = sum(error_rates) / len(error_rates)
print("Average Error Rate:", average_error_rate)

# Calculate the minimum average error rate to reach 20% improvement
target_average_error_rate = 0.20
desired_average_error_rate = max(target_average_error_rate, average_error_rate)
print("Desired Average Error Rate:", desired_average_error_rate)

# If the current error rate is already below the desired rate, print a message
if average_error_rate <= target_average_error_rate:
    print("Current average error rate is already below the desired rate.")
else:
    print("Model improvement needed to reach the desired average error rate.")


ModuleNotFoundError: No module named 'datasets'