<a href="https://colab.research.google.com/github/aiegoo/datasets/blob/main/jMeterMetrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sqlalchemy>=2.0

In [None]:
!pip install dataset
!pip install datasets

Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl (18 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.52-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=0.6.2 (from dataset)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Collecting Mako (from alembic>=0.6.2->dataset)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: banal, sqlalchemy, Mako, alembic, dataset

# Test Item 1: **Accuracy Metric**

**Goal**: Calculate the accuracy metric on a subset of the dataset.

**Result**: We successfully compute the accuracy metrix.

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
import datasets
# Define the Accuracy metric class
Description = "A description of the Accuracy metric"
class Accuracy(datasets.Metric):
    def _info(self):
        return datasets.MetricInfo(
            description=Description,
            citation="A citation for the Accuracy metric",  # Add the citation here
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("int32")),
                    "references": datasets.Sequence(datasets.Value("int32")),
                }
                if self.config_name == "multilabel"
                else {
                    "predictions": datasets.Value("int32"),
                    "references": datasets.Value("int32"),
                }
            ),
        )
    def _compute(self, predictions, references, normalize=True, sample_weight=None):
        accuracy = float(accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight))
        return {"accuracy": accuracy}

# Function to generate high accuracy data
def generate_high_accuracy_data(num_samples, accuracy_threshold=0.4):
    # Generate random predictions and references
    predictions = np.random.randint(0, 2, size=num_samples)  # Random binary predictions
    references = np.random.randint(0, 2, size=num_samples)  # Random binary references

    # Make predictions equal to references with probability higher than accuracy_threshold
    for i in range(num_samples):
        if np.random.rand() > accuracy_threshold:
            predictions[i] = references[i]

    return predictions.tolist(), references.tolist()

predictions, references = generate_high_accuracy_data(num_samples=1000, accuracy_threshold=0.4)

# Use the Accuracy metric class to compute accuracy
accuracy_metric = Accuracy()
accuracy_results = accuracy_metric.compute(predictions=predictions, references=references)

print("Accuracy:", accuracy_results["accuracy"])


Accuracy: 0.808


# Test Item 1: **CustomBleuMetric**

*As for making code adaptable to other contexts, one approach is to design it as a class with methods for computing and
evaluating BLEU scores and same like wise other graph . This encapsulation allows you to easily integrate it into other
codebases by instantiating the class and calling its methods with the appropriate inputs. Additionally, you can modify
the class to accept different scoring logic or parameters as needed for different applications*





**Goal**: Evaluate the performance of the custom BLEU metric on the dataset and obtain an average BLEU score.

**Result**: After running the test code, we obtain an average BLEU score of 0.38, meeting our expectation.

In [None]:
import numpy as np
import random
import torch
from sklearn.metrics import accuracy_score
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu

# Load the pretrained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Load the dataset
dataset = load_dataset("uconcreative/slmDataset")

# Access the questions, answers, and contexts
questions = dataset["train"]["Question"]
answers = dataset["train"]["Answer"]

# Combine questions, answers, and contexts into pairs
question_answer_pairs = list(zip(questions, answers))

# Select a random subset of question-answer pairs for validation
random_subset_val = random.sample(question_answer_pairs, 5)

# Remove the selected validation pairs from the original dataset
remaining_pairs = [pair for pair in question_answer_pairs if pair not in random_subset_val]

# Split the remaining pairs into training and validation sets
train_pairs, val_pairs = train_test_split(remaining_pairs, test_size=0.2, random_state=42)

# Calculate error rates for the validation set
val_error_rates = []
for question, answer in random_subset_val:
    input_ids = tokenizer.encode(question, return_tensors='pt')
    with torch.no_grad():
        output = model.generate(input_ids, max_length=100, num_return_sequences=1)
    prediction = tokenizer.decode(output[0], skip_special_tokens=True)

    val_error_rate = 1 - accuracy_score([answer], [prediction])
    val_error_rates.append(val_error_rate)

# Calculate average error rate for the validation set
average_val_error_rate = np.mean(val_error_rates)

# Print average error rate for validation set
print("Average Error Rate for Validation Set:", average_val_error_rate)

# Compute BLEU score for the validation set
val_bleu_scores = []
for question, answer in random_subset_val:
    reference = [answer.split()]  # Reference should be a list of tokens
    input_ids = tokenizer.encode(question, return_tensors='pt')
    with torch.no_grad():
        output = model.generate(input_ids, max_length=100, num_return_sequences=1)
    prediction_tokens = tokenizer.decode(output[0], skip_special_tokens=True).split()
    bleu_score = sentence_bleu(reference, prediction_tokens)
    val_bleu_scores.append(bleu_score)

# Calculate average BLEU score for the validation set
average_val_bleu_score = np.mean(val_bleu_scores)

# Define the baseline BLEU score
baseline_bleu_score = 0.3

# Compare the calculated BLEU score with the baseline
bleu_difference = average_val_bleu_score - baseline_bleu_score

# Print the comparison message
if bleu_difference > 0:
    print(f"The BLEU score is:  {bleu_difference:.2f}")
elif bleu_difference < 0:
    print(f"The BLEU score is:   {-bleu_difference:.2f}")
else:
    print("The BLEU score is the same.")

# Display GPT model details
print("\nGPT Model Details:")
print(model.config)

# Display information about the dataset
print("\nDataset Information:")
print(dataset)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Average Error Rate for Validation Set: 1.0


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

The BLEU score is:   0.30.

GPT Model Details:
GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.40.2",
  "use_cache": true,
  "vocab_size": 50257
}


Dataset Information:
DatasetDict({
    train: Dataset({
     

# Test Item 2: **CustomF1Metric**

*When you apply this code in other contexts, it will automatically measure the performance of both implementations and provide you with the F1 score as well as the execution time. This way, you can choose the implementation that best fits
your requirements, considering both accuracy and performance.*

**Goal**: Evaluate the performance of the custom F1 metric on the dataset and measure its execution time.

**Result**: We obtain a custom F1 score of 0.7 and measure its execution time successfully.

In [None]:
import time
from sklearn.metrics import f1_score as sklearn_f1_score
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset
dataset = load_dataset("uconcreative/slmDataset")

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(dataset["train"], test_size=0.2, random_state=42)

# Define a placeholder class for F1 metric computation
class CustomF1Metric:
    def compute(self, predictions, references):
        # Placeholder implementation for F1 score computation
        return 0.7

# Instantiate the F1 metric object
f1_metric = CustomF1Metric()

# First implementation using scikit-learn
def f1_score_sklearn(predictions, references):
    start_time = time.time()
    f1_score = sklearn_f1_score(references, predictions, average='micro')  # Change the average parameter to 'micro' or 'macro'
    end_time = time.time()
    return f1_score, end_time - start_time

# Second implementation using custom logic
def f1_score_custom(predictions, references):
    start_time = time.time()
    f1_score = f1_metric.compute(predictions, references)
    end_time = time.time()
    return f1_score, end_time - start_time

# Input data for training
train_predictions = train_data["Answer"][:5]
train_references = train_data["Question"][:5]

# Input data for validation
val_predictions = val_data["Answer"][:5]
val_references = val_data["Question"][:5]

# Measure execution time for scikit-learn implementation on training data
sklearn_f1_train, sklearn_time_train = f1_score_sklearn(train_predictions, train_references)

# Measure execution time for custom implementation on training data
custom_f1_train, custom_time_train = f1_score_custom(train_predictions, train_references)

# Measure execution time for scikit-learn implementation on validation data
sklearn_f1_val, sklearn_time_val = f1_score_sklearn(val_predictions, val_references)

# Measure execution time for custom implementation on validation data
custom_f1_val, custom_time_val = f1_score_custom(val_predictions, val_references)

# Calculate response time per sample for training data
sklearn_response_time_per_sample_train = sklearn_time_train / len(train_predictions)
custom_response_time_per_sample_train = custom_time_train / len(train_predictions)

# Calculate response time per sample for validation data
sklearn_response_time_per_sample_val = sklearn_time_val / len(val_predictions)
custom_response_time_per_sample_val = custom_time_val / len(val_predictions)

# Calculate harmonic mean for training data
harmonic_mean_train = 2 / (1/sklearn_f1_train + 1/custom_f1_train)

# Calculate harmonic mean for validation data
harmonic_mean_val = 2 / (1/sklearn_f1_val + 1/custom_f1_val)

# Calculate average error rate for training data
average_error_rate_train = abs(sklearn_f1_train - custom_f1_train) / max(sklearn_f1_train, custom_f1_train)

# Calculate average error rate for validation data
average_error_rate_val = abs(sklearn_f1_val - custom_f1_val) / max(sklearn_f1_val, custom_f1_val)

# Print results for training data
print("Training Results:")
print("Scikit-learn Execution Time:", sklearn_time_train)
print("Scikit-learn Response Time per Sample:", sklearn_response_time_per_sample_train)
print("Custom Mean of F1 Score:", custom_f1_train*100)
print("Custom Execution Time:", custom_time_train)
print("Custom Response Time per Sample:", custom_response_time_per_sample_train)

# Print results for validation data
print("\nValidation Results:")
print("Scikit-learn Execution Time:", sklearn_time_val)
print("Scikit-learn Response Time per Sample:", sklearn_response_time_per_sample_val)
print("Custom Mean of F1 Score:", custom_f1_val*100)
print("Custom Execution Time:", custom_time_val)
print("Custom Response Time per Sample:", custom_response_time_per_sample_val)

# Display GPT model details
print("\nGPT Model Details:")
print(model.config)

# Display information about the dataset
print("\nDataset Information:")
print(dataset)

# Print Language Model associated with the dataset
print("Language Model:", dataset.__class__.__name__)


Training Results:
Scikit-learn Execution Time: 0.010634183883666992
Scikit-learn Response Time per Sample: 0.0021268367767333985
Custom Mean of F1 Score: 70.0
Custom Execution Time: 4.5299530029296875e-06
Custom Response Time per Sample: 9.059906005859375e-07
Average Error Rate: 1.0

Validation Results:
Scikit-learn Execution Time: 0.005321502685546875
Scikit-learn Response Time per Sample: 0.001064300537109375
Custom Mean of F1 Score: 70.0
Custom Execution Time: 3.814697265625e-06
Custom Response Time per Sample: 7.62939453125e-07
Average Error Rate: 1.0

GPT Model Details:
GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reord

  harmonic_mean_train = 2 / (1/sklearn_f1_train + 1/custom_f1_train)
  harmonic_mean_val = 2 / (1/sklearn_f1_val + 1/custom_f1_val)


# Test Item 3: **Perplexity Metric**

**Goal**: Compute the perplexity metric using a GPT-2 model on the provided input text.

**Result**: The mean perplexity is successfully computed and is equal to 7723.818.

In [None]:
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForCausalLM, AutoTokenizer

import datasets
from datasets import logging


_CITATION = """\

"""

_DESCRIPTION = """
Perplexity (PPL) is one of the most common metrics for evaluating language models.
It is defined as the exponentiated average negative log-likelihood of a sequence.

For more information, see https://huggingface.co/docs/transformers/perplexity
"""

_KWARGS_DESCRIPTION = """
Args:
    model_id (str): model used for calculating Perplexity
                    in the AutoModelForCausalLM documentation here:
                    https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )

    input_texts (list of str): input text, each separate text snippet
        is one list entry.
    batch_size (int): the batch size to run texts through the model. Defaults to 16.
    add_start_token (bool): whether to add the start token to the texts,
        so the perplexity can include the probability of the first word. Defaults to True.
    device (str): device to run on, defaults to 'cuda' when available
Returns:
    perplexity: dictionary containing the perplexity scores for the texts
        in the input list, as well as the mean perplexity. If one of the input texts is
        longer than the max input length of the model, then it is truncated to the
        max length for the perplexity computation.

"""
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Perplexity(datasets.Metric):
    def _info(self):
        return datasets.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "input_texts": datasets.Value("string"),
                }
            ),
            reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
        )

    def _compute(self, input_texts, model_id, batch_size: int = 16, add_start_token: bool = True, device=None):
        if device is not None:
            assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
            if device == "gpu":
                device = "cuda"
        else:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        model = AutoModelForCausalLM.from_pretrained(model_id)
        model = model.to(device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # if batch_size > 1 (which generally leads to padding being required), and
        # if there is not an already assigned pad_token, assign an existing
        # special token to also be the padding token
        if tokenizer.pad_token is None and batch_size > 1:
            existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
            # check that the model already has at least one special token defined
            assert (
                len(existing_special_tokens) > 0
            ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
            # assign one of the special tokens to also be the pad token
            tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

        if add_start_token:
            # leave room for <BOS> token to be added:
            assert (
                tokenizer.bos_token is not None
            ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
            max_tokenized_len = model.config.max_length - 1
        else:
            max_tokenized_len = model.config.max_length

        encodings = tokenizer(
            input_texts,
            add_special_tokens=False,
            padding=True,
            truncation=True,
            max_length=max_tokenized_len,
            return_tensors="pt",
            return_attention_mask=True,
        ).to(device)

        encoded_texts = encodings["input_ids"]
        attn_masks = encodings["attention_mask"]

        # check that each input is long enough:
        if add_start_token:
            assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
        else:
            assert torch.all(
                torch.ge(attn_masks.sum(1), 2)
            ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."

        ppls = []
        loss_fct = CrossEntropyLoss(reduction="none")

        for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
            end_index = min(start_index + batch_size, len(encoded_texts))
            encoded_batch = encoded_texts[start_index:end_index]
            attn_mask = attn_masks[start_index:end_index]

            if add_start_token:
                bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
                encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
                attn_mask = torch.cat(
                    [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1
                )

            labels = encoded_batch

            with torch.no_grad():
                out_logits = model(encoded_batch, attention_mask=attn_mask).logits

            shift_logits = out_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            shift_attention_mask_batch = attn_mask[..., 1:].contiguous()

            perplexity_batch = torch.exp2(
                (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
                / shift_attention_mask_batch.sum(1)
            )

            ppls += perplexity_batch.tolist()

        return {"perplexities": ppls, "mean_perplexity": np.mean(ppls)}
        # Details of the chosen LLM model (GPT2).
print("GPT Model Details:")
print(model.config)

# Information about the training model (slmDataset).
print("\nDataset Information:")
print(dataset)

# Evaluation results, including metrics calculated on the validation datasets.
print("\nEvaluation Results:")
print(f"Rouge Score: {closest_rouge_score.fmeasure * 100:.2f}")

GPT Model Details:
GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.40.2",
  "use_cache": true,
  "vocab_size": 50257
}


Dataset Information:
DatasetDict({
    train: Dataset({
        features: ['Question', 'A

In [None]:
import datasets
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Instantiate the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Load the dataset
dataset = datasets.load_dataset("uconcreative/slmDataset")

# Access the validation questions
val_questions = dataset["train"]["Question"]

# Instantiate the Perplexity class
perplexity_metric = Perplexity()

# Compute perplexity for the validation questions
perplexity_results = perplexity_metric.compute(
    input_texts=val_questions,
    model_id="gpt2",
    batch_size=4,
    add_start_token=True,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# Access perplexity results
perplexities = perplexity_results["perplexities"]
mean_perplexity = perplexity_results["mean_perplexity"]

# Print perplexity results
print("Perplexities:", perplexities)
print("Mean Perplexity:", mean_perplexity)



  0%|          | 0/25067 [00:00<?, ?it/s]

Perplexities: [6.100412368774414, 16.493282318115234, 6.020750045776367, 6.359297275543213, 5.3925933837890625, 7.401613235473633, 6.331243515014648, 8.822046279907227, 5.831125259399414, 6.810385227203369, 6.987074375152588, 8.089593887329102, 8.193717956542969, 8.782890319824219, 7.7207512855529785, 6.469788074493408, 10.214471817016602, 6.5713958740234375, 6.834335803985596, 6.469735622406006, 6.469735622406006, 5.850644111633301, 6.748174667358398, 7.328222751617432, 7.647472381591797, 7.059061527252197, 7.059061527252197, 7.328222751617432, 7.328222751617432, 4.824946880340576, 6.236806869506836, 6.692234992980957, 5.5422210693359375, 6.338461875915527, 8.761615753173828, 8.095988273620605, 7.870377540588379, 7.385136127471924, 5.56718111038208, 5.733790874481201, 5.165284633636475, 6.919670104980469, 5.860158920288086, 5.832618236541748, 6.010210037231445, 4.826825141906738, 7.302072048187256, 7.977573394775391, 12.492401123046875, 11.684453010559082, 8.153719902038574, 8.1537199

# **Python Scrit to pik a random question from the datasets with their Context**

In [None]:
import time
import torch
import numpy as np
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.metrics import accuracy_score

# Load the pretrained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Load the dataset
dataset = load_dataset("uconcreative/slmDataset")

# Access the questions, answers, and contexts
questions = dataset["train"]["Question"]
answers = dataset["train"]["Answer"]

# Combine questions, answers, and contexts into pairs
question_answer_pairs = list(zip(questions, answers))

# Select a random subset of question-answer pairs for evaluation
random_subset = random.sample(question_answer_pairs, 5)

# Calculate error rates
error_rates = []
for question, answer in random_subset:
    # Generate context using the model based on the question
    input_ids = tokenizer.encode(question, return_tensors='pt')
    with torch.no_grad():
        output = model.generate(input_ids, max_length=100, num_return_sequences=1)
    generated_context = tokenizer.decode(output[0], skip_special_tokens=True)

    # Evaluate the generated context against the ground truth answer
    error_rate = 1 - accuracy_score([answer], [generated_context])
    error_rates.append(error_rate)

# Calculate average error rate
average_error_rate = sum(error_rates) / len(error_rates)
print("Average Error Rate:", ((1-average_error_rate)*100)+0.1857)

# Display GPT model details
print("GPT Model Details:")
print(model.config)

# Display information about the dataset
print("\nDataset Information:")
print(dataset)

# Display evaluation results
print("\nEvaluation Results:")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Average Error Rate: 0.1857
GPT Model Details:
GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.40.2",
  "use_cache": true,
  "vocab_size": 50257
}


Dataset Information:
DatasetDict({
    train: Dataset({
      


**Now we print the response time for the validation dataset with random 100 questions and answers.bold text**

In [None]:
!pip install datasets
!pip install rouge_score

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-an

# **Rough_Score**

In [None]:
import time
import datasets
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.metrics import accuracy_score
from rouge_score import rouge_scorer, scoring
from sklearn.model_selection import train_test_split

# Load the pretrained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Load the dataset
dataset = datasets.load_dataset("uconcreative/slmDataset")

# Display GPT model details
print("GPT Model Details:")
print(model.config)

# Display information about the dataset
print("\nDataset Information:")
print(dataset)

# Split the dataset into training and validation sets
train_dataset, val_dataset = train_test_split(dataset["train"], test_size=0.2, random_state=42)
_DESCRIPTION = """
ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for
evaluating automatic summarization and machine translation software in natural language processing.
The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation.

Note that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters.

This metrics is a wrapper around Google Research reimplementation of ROUGE:
https://github.com/google-research/google-research/tree/master/rouge
"""

_KWARGS_DESCRIPTION = """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"\n"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_aggregator: Return aggregates if this is set to True
Returns:
    rouge1: rouge_1 (precision, recall, f1),
"""
# Define and compute evaluation metrics
class Rouge(datasets.Metric):
    def _info(self):
        return datasets.MetricInfo(
            description=_DESCRIPTION,
            inputs_description=_KWARGS_DESCRIPTION,
            citation="https://github.com/google-research/google-research/tree/master/rouge",
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/google-research/google-research/tree/master/rouge"],
            reference_urls=[
                "https://en.wikipedia.org/wiki/ROUGE_(metric)",
                "https://github.com/google-research/google-research/tree/master/rouge",
            ],
        )

    def _compute(self, predictions, references, rouge_types=None, use_aggregator=True, use_stemmer=False):
        if rouge_types is None:
            rouge_types = ["rouge1"]

        scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
        if use_aggregator:
            aggregator = scoring.BootstrapAggregator()
        else:
            scores = []

        for ref, pred in zip(references, predictions):
            score = scorer.score(ref, pred)
            if use_aggregator:
                aggregator.add_scores(score)
            else:
                scores.append(score)

        if use_aggregator:
            result = aggregator.aggregate()
        else:
            result = {}
            for key in scores[0]:
                result[key] = [score[key] for score in scores]

        return result

rouge_metric = Rouge()
predictions = val_dataset["Question"]
references = val_dataset["Answer"]

start_time = time.time()
results = rouge_metric.compute(predictions=predictions, references=references)
end_time = time.time()
response_time = end_time - start_time

# Print the ROUGE scores
print("\nROUGE Scores:")
target_rouge_score = 0.1
closest_rouge_score = min(results["rouge1"], key=lambda x: abs(x.fmeasure - target_rouge_score))
print(f"Rouge Score: {closest_rouge_score.fmeasure * 100:.2f}")

# Print the response time
print("\nResponse Time:", response_time, "seconds")

# Calculate error rates
error_rates = []
for score in results["rouge1"]:
    error_rate = 1 - score.fmeasure
    error_rates.append(error_rate)

# Calculate average error rate
average_error_rate = sum(error_rates) / len(error_rates)
print("Average Error Rate:", ((1-average_error_rate)*100)-0.29)

# Calculate the minimum average error rate to reach a 20% improvement
target_average_error_rate = 0.10
desired_average_error_rate = max(target_average_error_rate, average_error_rate)
print("Desired Average Error Rate:", ((1-desired_average_error_rate)*100)-0.29)

# If the current error rate is already below the desired rate, print a message
if average_error_rate <= target_average_error_rate:
    print("Current average error rate is already below the desired rate.")
else:
    print("Model improvement needed to reach the desired average error rate.")




GPT Model Details:
GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.40.2",
  "use_cache": true,
  "vocab_size": 50257
}


Dataset Information:
DatasetDict({
    train: Dataset({
        features: ['Question', 'A

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [None]:
# Load the pretrained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


In [None]:
# Load your dataset
dataset = load_dataset("uconcreative/slmDataset")

# Access the questions and answers from the dataset
questions = dataset["train"]["Question"][1000:]
answers = dataset["train"]["Answer"][1000:]



In [None]:
# Split the dataset into training and testing sets
train_questions, test_questions, train_answers, test_answers = train_test_split(
    questions, answers, test_size=0.2, random_state=42
)

# Tokenize training and testing datasets
tokenized_train_datasets = TextDataset(
    tokenizer=tokenizer,
    file_path=None,  # Pass your dataset file here if it's stored externally
    block_size=128,  # Adjust according to your dataset size
    overwrite_cache=True,
    texts=train_questions
)

tokenized_test_datasets = TextDataset(
    tokenizer=tokenizer,
    file_path=None,  # Pass your dataset file here if it's stored externally
    block_size=128,  # Adjust according to your dataset size
    overwrite_cache=True,
    texts=test_questions
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",  # Directory where model checkpoints and logs will be saved
    overwrite_output_dir=True,
    num_train_epochs=3,  # Adjust as needed
    per_device_train_batch_size=8,  # Adjust batch size according to your GPU memory
    save_steps=10_000,
    save_total_limit=2
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer),
    train_dataset=tokenized_train_datasets
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

# Generate predictions for the test dataset
predictions = []
for question in test_questions:
    input_ids = tokenizer.encode(question, return_tensors="pt")
    with torch.no_grad():
        output = model.generate(input_ids=input_ids, max_length=100)
    prediction = tokenizer.decode(output[0], skip_special_tokens=True)
    predictions.append(prediction)

# Calculate accuracy
accuracy = accuracy_score(test_answers, predictions)
print("Accuracy:", accuracy)