In [None]:
!pip install datasets
!pip install textstat
!pip install bitsandbytes

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [None]:
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from datasets import load_dataset
import re
import numpy as np
from tqdm import tqdm
import torch
import nltk
import textstat
import random
import pickle
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import spacy
from collections import defaultdict

In [None]:
# Download the large English spaCy pipeline that a later cell loads with
# spacy.load("en_core_web_lg").
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Module-level spaCy pipeline; Stats_Helpers.__init__ calls nlp(question) to
# parse every problem, so this cell must run before any stats are computed.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Fetch NLTK's 'punkt_tab' tokenizer data.
# NOTE(review): no visible cell calls nltk directly — presumably required by a
# dependency or a later cell; confirm before removing.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
def get_dataset():
    """Load the GSM8K "main" configuration.

    Returns:
        A ``(train, test)`` tuple holding the two splits, in that order.
    """
    return tuple(
        load_dataset("openai/gsm8k", "main", split=split_name)
        for split_name in ('train', 'test')
    )

In [None]:
class Stats_Helpers():
  """Feature extractors for a single question string.

  The question is parsed once with the module-level spaCy pipeline ``nlp``;
  each ``get_*`` method then reads either the raw text or that parsed doc.
  """

  def __init__(self, question):
    """Store the question, the lookup tables and the spaCy parse of the text."""
    self.question = question
    # Words counted by get_keyword_frequencies().
    self.keywords = ['sum', 'total', 'difference', 'product', 'per']
    # Lower-case stopwords; get_stopword_ratio() compares case-sensitively.
    self.stopwords = set([
        'the', 'is', 'in', 'at', 'of', 'a', 'an', 'and', 'to', 'for', 'on',
        'with', 'as', 'by', 'that', 'from', 'it', 'this', 'be', 'or', 'are',
        'was', 'were', 'but', 'not', 'have', 'has', 'had', 'if', 'then', 'so'
    ])
    self.doc = nlp(question)
    self.unit_pattern = re.compile(r'\b(k|kg|km|m|cm|mm|ml|l|lb|oz|g|mg|hr|min|sec|miles?|feet|inches?)\b', re.IGNORECASE)

  def get_numeric_tokens(self):
    """Return the distinct numeric values found in the question.

    Returns:
        list[float]: de-duplicated values (so "5", "5.0" and "5.00" collapse
        to one entry), in no particular order.
    """
    # NOTE: the alternation splits the pattern into two independent branches.
    # The 4-digit lookahead sits in the comma-separated branch only, where it
    # can never reject anything, so plain 4-digit numbers ARE matched by the
    # second branch. The pattern is kept as-is to preserve the feature values.
    number_pattern = r'''
        \b              # Word boundary
        (?!\d{4}\b)     # Exclude 4-digit standalone numbers (likely years)
        \d{1,3}         # 1-3 digits
        (?:,\d{3})+     # Thousands separators (e.g., 1,000)
        (?:\.\d+)?      # Optional decimal portion
        |
        \d+             # Standard integers
        (?:\.\d+)?      # Decimals without commas
        \b              # Word boundary
    '''
    numbers = re.findall(number_pattern, self.question, re.VERBOSE)

    # Convert to floats for normalization (5.0 vs 5 vs 5.00)
    normalized = set()
    for num in numbers:
        try:
            normalized.add(float(num.replace(',', '')))
        except ValueError:
            continue  # Skip malformed numbers

    return list(normalized)

  def get_keyword_frequencies(self):
    """Count whole-word, case-sensitive occurrences of each keyword.

    Returns:
        dict[str, int]: one entry per word in ``self.keywords``.
    """
    return {
        keyword: len(re.findall(rf'\b{re.escape(keyword)}\b', self.question))
        for keyword in self.keywords
    }

  def get_stopword_ratio(self):
    """Return the fraction of words that are stopwords, rounded to 4 places.

    Returns 0.0 when the question contains no words.
    """
    words = re.findall(r'\b\w+\b', self.question)
    if not words:
        return 0.0
    stop_count = sum(1 for w in words if w in self.stopwords)
    return round(stop_count / len(words), 4)

  def get_pos_distribution(self):
    """Return the share of tokens tagged NOUN, VERB, NUM and ADJ.

    Uses each token's ``pos_`` attribute; every proportion is 0 for an empty
    doc.
    """
    pos_counts = defaultdict(int)
    for token in self.doc:
        pos_counts[token.pos_] += 1

    total = len(self.doc)
    return {
        'prop_noun': pos_counts['NOUN'] / total if total else 0,
        'prop_verb': pos_counts['VERB'] / total if total else 0,
        'prop_num': pos_counts['NUM'] / total if total else 0,
        'prop_adj': pos_counts['ADJ'] / total if total else 0
    }

  def get_dependency_metrics(self):
    """Return syntactic complexity metrics derived from the dependency parse.

    Returns:
        dict with ``max_tree_depth`` (deepest token, a root counting as 1)
        and ``avg_dep_distance`` (mean index distance between a token and
        its head, roots excluded). Both are 0 for an empty doc.
    """
    def get_depth(token):
        # Walk up the head chain; a root is a token that is its own head.
        depth = 1
        while token.head != token:
            token = token.head
            depth += 1
        return depth

    depths = [get_depth(token) for token in self.doc]

    # Dependency distance calculation
    dep_distances = [abs(token.i - token.head.i) for token in self.doc
                    if token.head != token]

    return {
        # max() raises on an empty sequence, so guard the empty-doc case.
        'max_tree_depth': max(depths) if depths else 0,
        'avg_dep_distance': sum(dep_distances)/len(dep_distances) if dep_distances else 0
    }

  def get_named_entities(self):
    """Count quantity-like entities, unit mentions and person names.

    Returns:
        dict with ``quantity_count`` (entities labelled QUANTITY, CARDINAL or
        MONEY), ``unit_count`` (regex matches of ``self.unit_pattern``),
        ``person_count`` (PERSON entities) and ``numeric_entities`` (the
        quantity entities whose text parses as a float).
    """
    entities = {
        'quantity_count': 0,
        'unit_count': 0,
        'person_count': 0,
        'numeric_entities': []
    }

    # SpaCy entity detection
    for ent in self.doc.ents:
        if ent.label_ in ['QUANTITY', 'CARDINAL', 'MONEY']:
            entities['quantity_count'] += 1
            try:
                entities['numeric_entities'].append(float(ent.text.replace(',', '')))
            except ValueError:
                # Entity text such as "five" or "3 kg" is counted but has no
                # float value to record.
                continue
        elif ent.label_ == 'PERSON':
            entities['person_count'] += 1

    # Custom unit detection with regex
    units = self.unit_pattern.findall(self.doc.text)
    entities['unit_count'] = len(units)

    return entities

def get_problem_stats(problem):
  """Compute the feature dictionary for one problem.

  Args:
      problem: mapping with a 'question' string; the text is lower-cased
          before any feature is computed.

  Returns:
      dict combining readability, numeric, keyword, part-of-speech,
      dependency and named-entity features. 'keyword_frequencies' is itself
      a dict; every other value is a number or a list.
  """
  stats = {}
  question = problem['question'].lower()
  stat_helper = Stats_Helpers(question)

  # readability metrics
  sentences = [s.strip() for s in question.split('.') if s.strip()]
  words = question.split()
  stats['word_count'] = len(words)
  stats['sentence_count'] = len(sentences)
  # np.mean([]) would yield NaN (and a RuntimeWarning); report 0 instead.
  stats['avg_sentence_length'] = np.mean([len(s.split()) for s in sentences]) if sentences else 0
  stats['avg_word_length'] = sum(len(word) for word in words)/len(words) if words else 0
  stats['flesch_reading_ease'] = textstat.flesch_reading_ease(question)   # 0 to 100, higher is more readable

  # numerical metrics
  numeric_values = stat_helper.get_numeric_tokens()
  stats.update({
      'num_numeric_values': len(numeric_values),
      'min_value': min(numeric_values) if numeric_values else 0,
      'mean_value': np.mean(numeric_values) if numeric_values else 0,
      'max_value': max(numeric_values) if numeric_values else 0
  })
  # Count of operator characters. The match also picks up hyphens and
  # slashes used in ordinary text.
  stats['num_operations'] = len(re.findall(r'[+\-*/^]', question))

  # keyword metrics
  stats['keyword_frequencies'] = stat_helper.get_keyword_frequencies()
  stats['stopword_ratio'] = stat_helper.get_stopword_ratio()

  # pos metrics
  stats.update(stat_helper.get_pos_distribution())
  stats.update(stat_helper.get_dependency_metrics())

  # semantic metrics
  stats.update(stat_helper.get_named_entities())

  return stats

In [None]:
def get_model(model_name):
    """Load a 4-bit quantised causal LM and its tokenizer by short name.

    Args:
        model_name: "wizardmath" or "phi2".

    Returns:
        dict with keys 'model', 'model_name', 'tokenizer' and
        'cost_per_token'.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names
            (previously the function fell through and returned None).
    """
    model_specs = {
        "wizardmath": {
            "repo_id": "WizardLM/WizardMath-7B-V1.1",
            "cost_per_token": 0.7,
            "hub_kwargs": {},
        },
        "phi2": {
            "repo_id": "microsoft/phi-2",
            "cost_per_token": 0.13,  # Lower cost since it's a smaller model
            "hub_kwargs": {"trust_remote_code": True},
        },
    }
    # Validate before doing any expensive loading.
    if model_name not in model_specs:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(model_specs)}"
        )
    spec = model_specs[model_name]

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,  # Match your input dtype
        bnb_4bit_quant_type="nf4",  # Add quantization type
        bnb_4bit_use_double_quant=True
    )
    tokenizer = AutoTokenizer.from_pretrained(spec["repo_id"], **spec["hub_kwargs"])
    model = AutoModelForCausalLM.from_pretrained(
        spec["repo_id"],
        quantization_config=quantization_config,
        device_map={"": 0},  # place the whole model on device 0
        torch_dtype=torch.float16,
        **spec["hub_kwargs"]
    )
    return {
        'model': model,
        'model_name': model_name,
        'tokenizer': tokenizer,
        'cost_per_token': spec["cost_per_token"]
    }

In [None]:
def extract_answer(answer_text):
    """Return the final numeric answer of a GSM8K solution as a string.

    The final answer follows the '####' marker. Thousands separators are
    removed and a decimal part is kept, so "#### 1,000" yields "1000" rather
    than being cut off at the comma.

    Args:
        answer_text: full reference solution text.

    Returns:
        The number as a string accepted by ``float()``, or None when no
        '####' marker followed by a number is present.
    """
    match = re.search(r'####\s*(-?\d[\d,]*(?:\.\d+)?)', answer_text)
    if match:
        return match.group(1).replace(',', '')
    return None

In [None]:
# Unsigned number with optional thousands separators and decimal part.
_UNSIGNED_NUMBER = r'(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?'
# The same number, optionally preceded by a minus sign and/or a "$" symbol.
_SIGNED_AMOUNT = rf'(?:-?\$\s*-?|-)?{_UNSIGNED_NUMBER}'


def _normalize_number(raw):
    """Reduce a matched amount to a float()-parsable string (no '$', ',' or spaces)."""
    digits = re.search(_UNSIGNED_NUMBER, raw).group(0).replace(',', '')
    return f"-{digits}" if '-' in raw else digits


def _extract_numeric_answer(model_response):
    """Return the answer found in a model response as a string, or None."""
    # Check for #### pattern first (Phi-2 style)
    hash_match = re.search(rf'####\s*({_SIGNED_AMOUNT})', model_response)
    if hash_match:
        return _normalize_number(hash_match.group(1))

    # Check for explicit "answer is" pattern (WizardMath style)
    answer_match = re.search(
        rf'(?:final answer|the answer is)[^0-9]*?({_SIGNED_AMOUNT})',
        model_response.lower()
    )
    if answer_match:
        return _normalize_number(answer_match.group(1))

    # If all else fails, take the first number on one of the last four lines,
    # scanning bottom-up. Slicing (instead of an index range that stopped
    # before 0) makes sure a short response's first line is inspected too.
    for line in reversed(model_response.split('\n')[-4:]):
        # Skip lines that are clearly not the answer
        if len(line.strip()) < 1 or any(word in line.lower() for word in ["step", "explanation"]):
            continue
        numeric_match = re.search(_UNSIGNED_NUMBER, line)
        if numeric_match:
            return numeric_match.group(0).replace(',', '')

    return None


def process_problem(problem, model_index, models):
    """Prompt one model with a problem and extract its numeric answer.

    Args:
        problem: mapping with a 'question' string.
        model_index: index into ``models`` of the model to query.
        models: sequence of dicts, each providing 'model' and 'tokenizer'.

    Returns:
        dict with 'prompt' (text sent to the model), 'response' (the decoded
        text after the prompt, or the full decoded output when no answer
        could be extracted) and 'answer' (numeric string without '$' or
        thousands separators, or None).
    """
    prompt = f"""

Follow these instructions:
1. Work through the problem step by step
2. Calculate the numerical answer
3. On the last line, write ONLY: #### <numerical answer>. Do not add any units like "kg" or "m", or any currency symbols like "$".
4. Do not write anything after the final answer

-------------------
EXAMPLE FORMAT:
Step 1: [explanation]
Step 2: [explanation]
Final calculation: [calculation]
#### [numerical answer]
-------------------

NOW SOLVE THE PROBLEM CORRECTLY: {problem['question']}
"""
    model_obj = models[model_index]['model']
    tokenizer = models[model_index]['tokenizer']

    inputs = tokenizer(prompt, return_tensors="pt").to(model_obj.device)
    outputs = model_obj.generate(
        inputs.input_ids,
        max_new_tokens=1024,
        temperature=0.1,
        do_sample=True,
        attention_mask=inputs.attention_mask,
    )
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The decoded text echoes the prompt; keep only what follows the question.
    prompt_tail = f"NOW SOLVE THE PROBLEM CORRECTLY: {problem['question']}"
    tail_start = full_output.find(prompt_tail)
    if tail_start != -1:
        model_response = full_output[tail_start + len(prompt_tail):].strip()
    else:
        # Fallback if we can't find the exact prompt ending
        model_response = full_output

    numeric_answer = _extract_numeric_answer(model_response)
    if numeric_answer is None:
        # If we couldn't extract an answer, return the unmodified output
        return {
            'prompt': prompt,
            'response': full_output,
            'answer': None
        }
    return {
        'prompt': prompt,
        'response': model_response,
        'answer': numeric_answer
    }


In [None]:
import numpy as np
from scipy.linalg import solve_triangular
from collections import defaultdict

class LinUCB:
    """Linear upper-confidence-bound bandit with separate parameters per arm."""

    def __init__(self, n_arms, context_size, alpha=0.5):
        """Create ``n_arms`` arms, each starting from an identity matrix and zero vectors."""
        self.n_arms = n_arms
        self.context_size = context_size
        self.alpha = alpha  # Exploration parameter

        # Per-arm state: matrix A, vector b and the latest estimate theta.
        self.A, self.b, self.theta = [], [], []
        for _ in range(n_arms):
            self.A.append(np.eye(context_size))
            self.b.append(np.zeros(context_size))
            self.theta.append(np.zeros(context_size))

    def _ucb_score(self, arm, context):
        """Refresh ``theta`` for ``arm`` and return its UCB score for ``context``."""
        inverse = np.linalg.inv(self.A[arm])
        self.theta[arm] = inverse @ self.b[arm]

        exploit = self.theta[arm] @ context
        explore = self.alpha * np.sqrt(context @ inverse @ context)
        return exploit + explore

    def select_arm(self, context):
        """Return the index of the arm with the highest UCB score (first one on ties)."""
        return np.argmax([self._ucb_score(arm, context) for arm in range(self.n_arms)])

    def update(self, arm, context, reward):
        """Fold one observed (context, reward) pair into the chosen arm's state."""
        design, target = self.A[arm], self.b[arm]
        # In-place updates, so the arrays stored in self.A / self.b change.
        design += np.outer(context, context)
        target += reward * context

class ModelSwitcher:
    """Route each problem to one of several models with a LinUCB bandit.

    The bandit's context is a feature vector built from get_problem_stats();
    the reward is 1 for a correct answer and 0 otherwise.
    """

    def __init__(self, models):
        """Keep the model list and create a bandit with one arm per model."""
        self.models = models
        self.stats_helper = Stats_Helpers

        # Order in which get_problem_stats() values enter the context vector.
        # 'keyword_frequencies.<word>' entries index the nested keyword dict.
        self.feature_order = [
            'word_count', 'sentence_count', 'avg_sentence_length', 'avg_word_length',
            'flesch_reading_ease', 'num_numeric_values', 'min_value', 'mean_value',
            'max_value', 'stopword_ratio', 'prop_noun', 'prop_verb', 'prop_num',
            'prop_adj', 'max_tree_depth', 'avg_dep_distance', 'quantity_count',
            'unit_count', 'person_count', 'keyword_frequencies.sum',
            'keyword_frequencies.total', 'keyword_frequencies.difference',
            'keyword_frequencies.product'
        ]

        # Derive both sizes instead of hard-coding them so the bandit always
        # matches the model list and the feature list.
        self.bandit = LinUCB(
            n_arms=len(models),
            context_size=len(self.feature_order),
            alpha=0.25
        )

    def _get_context_vector(self, problem_stats):
        """Convert a problem-stats dict into a standardised feature vector."""
        context = []
        for feat in self.feature_order:
            if 'keyword' in feat:
                key = feat.split('.')[-1]
                context.append(problem_stats['keyword_frequencies'][key])
            else:
                context.append(problem_stats[feat])

        # Standardise using the mean and std taken over this one vector's
        # entries (i.e. across features, not across problems).
        # NOTE(review): features with large magnitudes will dominate this
        # scaling — confirm this is intended.
        context = np.array(context, dtype=float)
        context = (context - np.mean(context)) / (np.std(context) + 1e-8)
        return context

    def process_problem(self, problem, correct_answer):
        """Pick a model, query it, score the answer and update the bandit.

        Returns:
            ``(prediction, model_idx, reward)`` where ``prediction`` is the
            value returned by the module-level ``process_problem``.
        """
        # Get problem features
        stats = get_problem_stats(problem)
        context = self._get_context_vector(stats)

        # Select model using LinUCB
        model_idx = self.bandit.select_arm(context)

        # Get model prediction
        prediction = process_problem(problem, model_idx, self.models)

        # Calculate reward based on correctness
        is_correct = self._check_correctness(prediction, correct_answer)
        reward = self._calculate_reward(is_correct, model_idx)

        # Update bandit
        self.bandit.update(model_idx, context, reward)

        return prediction, model_idx, reward

    def _check_correctness(self, prediction, correct_answer):
        """Return True when the predicted answer matches the reference.

        Args:
            prediction: the dict returned by ``process_problem`` (its
                'answer' entry is used) or, for backward compatibility, raw
                text containing '#### <number>'.
            correct_answer: reference answer as a string or number.

        Returns:
            bool: True if both values parse as floats and agree within a
            relative tolerance of 1%; False otherwise (including when either
            value is missing).
        """
        if isinstance(prediction, dict):
            # The structured prediction carries the extracted answer directly.
            # Running a regex over the dict raised TypeError, which the old
            # bare `except` turned into "always incorrect".
            pred_text = prediction.get('answer')
        elif isinstance(prediction, str):
            match = re.search(r'####\s*([\d.]+)', prediction)
            pred_text = match.group(1) if match else None
        else:
            pred_text = None

        if pred_text is None or correct_answer is None:
            return False

        try:
            pred_value = float(str(pred_text).replace(',', ''))
            target_value = float(str(correct_answer).replace(',', ''))
        except ValueError:
            return False
        return bool(np.isclose(pred_value, target_value, rtol=0.01))

    def _calculate_reward(self, correct, model_idx):
        """Return 1 for a correct answer and 0 otherwise.

        ``model_idx`` is accepted for interface compatibility; model cost is
        currently not part of the reward.
        """
        return 1 if correct else 0

In [None]:
# Load both GSM8K splits and key them by split name.
temp_set = get_dataset()
gsm8k_dataset = dict(zip(('train', 'test'), temp_set))

# Arm order for the bandit: index 0 is phi2, index 1 is wizardmath.
models = [get_model(short_name) for short_name in ('phi2', 'wizardmath')]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
def calculate_cost(prediction_data, model):
    """Price one prediction by its token count.

    Args:
        prediction_data: dict with 'prompt' and 'response' strings.
        model: dict with a 'tokenizer' and a numeric 'cost_per_token'.

    Returns:
        (prompt tokens + response tokens) * cost_per_token, where each count
        is the length of the tokenizer's encoding of that text.
    """
    encoder = model['tokenizer']
    rate = model['cost_per_token']

    # Encode the two texts separately and add up their sequence lengths.
    token_total = sum(
        encoder.encode(prediction_data[field], return_tensors='pt').shape[1]
        for field in ('prompt', 'response')
    )
    return token_total * rate

In [None]:
switcher = ModelSwitcher(models)
switcher.bandit.alpha = 1

# Training loop (assuming you have problems with known answers)
total_correct = 0
total_cost = 0

num_problems = 200
cur_problem_idx = 0
subset = gsm8k_dataset['train'].select(range(num_problems))

for cur_problem_idx, problem in enumerate(tqdm(subset, desc="Processing problems"), start=1):
    # Get correct answer from dataset (None when no '####' answer is found)
    correct_answer = extract_answer(problem['answer'])

    # Process problem
    prediction, model_idx, reward = switcher.process_problem(problem, correct_answer)

    print(f"Model: {models[model_idx]['model_name']}")

    # Track performance. Both values are checked for None so one unparsable
    # reference answer cannot abort the run with a TypeError.
    predicted_ans = prediction['answer']
    if (
        predicted_ans is not None
        and correct_answer is not None
        and float(predicted_ans) == float(correct_answer)
    ):
        total_correct += 1
    total_cost += calculate_cost(prediction, models[model_idx])

    # Linearly decay exploration from 1 towards a floor of 0.1.
    new_alpha = max(0.1, 1 - (cur_problem_idx/num_problems))
    switcher.bandit.alpha = new_alpha

Processing problems:   0%|          | 0/200 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing problems:   0%|          | 1/200 [00:51<2:49:23, 51.07s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   1%|          | 2/200 [00:58<1:24:04, 25.48s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   2%|▏         | 3/200 [01:48<1:59:42, 36.46s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   2%|▏         | 4/200 [01:56<1:22:30, 25.26s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   2%|▎         | 5/200 [02:46<1:51:08, 34.20s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   3%|▎         | 6/200 [03:03<1:31:27, 28.29s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   4%|▎         | 7/200 [03:17<1:15:53, 23.60s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   4%|▍         | 8/200 [04:06<1:41:44, 31.80s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:   4%|▍         | 9/200 [04:56<1:59:07, 37.42s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   5%|▌         | 10/200 [05:20<1:45:23, 33.28s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   6%|▌         | 11/200 [05:34<1:26:05, 27.33s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   6%|▌         | 12/200 [06:02<1:26:44, 27.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   6%|▋         | 13/200 [06:52<1:47:09, 34.38s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   7%|▋         | 14/200 [07:05<1:26:32, 27.92s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   8%|▊         | 15/200 [07:17<1:11:35, 23.22s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   8%|▊         | 16/200 [07:39<1:10:09, 22.88s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   8%|▊         | 17/200 [08:29<1:34:22, 30.94s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:   9%|▉         | 18/200 [09:19<1:50:59, 36.59s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  10%|▉         | 19/200 [09:37<1:33:43, 31.07s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  10%|█         | 20/200 [10:27<1:50:20, 36.78s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  10%|█         | 21/200 [11:17<2:01:25, 40.70s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  11%|█         | 22/200 [11:31<1:37:28, 32.85s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  12%|█▏        | 23/200 [12:21<1:51:32, 37.81s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  12%|█▏        | 24/200 [13:10<2:01:19, 41.36s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  12%|█▎        | 25/200 [13:19<1:31:39, 31.43s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  13%|█▎        | 26/200 [13:29<1:12:38, 25.05s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  14%|█▎        | 27/200 [14:19<1:33:59, 32.60s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  14%|█▍        | 28/200 [14:37<1:21:04, 28.28s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  14%|█▍        | 29/200 [14:47<1:05:00, 22.81s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  15%|█▌        | 30/200 [15:38<1:28:17, 31.16s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  16%|█▌        | 31/200 [16:28<1:43:35, 36.78s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  16%|█▌        | 32/200 [17:17<1:53:49, 40.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  16%|█▋        | 33/200 [17:31<1:30:33, 32.53s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  17%|█▋        | 34/200 [17:40<1:10:15, 25.39s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  18%|█▊        | 35/200 [18:28<1:28:32, 32.20s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  18%|█▊        | 36/200 [18:40<1:11:14, 26.07s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  18%|█▊        | 37/200 [18:47<55:36, 20.47s/it]  Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  19%|█▉        | 38/200 [19:37<1:19:01, 29.27s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  20%|█▉        | 39/200 [19:53<1:07:39, 25.22s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  20%|██        | 40/200 [20:44<1:27:52, 32.96s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  20%|██        | 41/200 [21:01<1:14:43, 28.20s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  21%|██        | 42/200 [21:09<58:45, 22.31s/it]  Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  22%|██▏       | 43/200 [21:59<1:20:16, 30.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  22%|██▏       | 44/200 [22:49<1:34:45, 36.45s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  22%|██▎       | 45/200 [23:02<1:16:03, 29.44s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  23%|██▎       | 46/200 [23:21<1:07:27, 26.28s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  24%|██▎       | 47/200 [23:31<54:28, 21.36s/it]  Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  24%|██▍       | 48/200 [24:22<1:16:15, 30.10s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  24%|██▍       | 49/200 [25:07<1:27:23, 34.73s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  25%|██▌       | 50/200 [25:18<1:08:30, 27.40s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  26%|██▌       | 51/200 [26:08<1:25:31, 34.44s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  26%|██▌       | 52/200 [26:56<1:34:50, 38.45s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  26%|██▋       | 53/200 [27:47<1:42:55, 42.01s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  27%|██▋       | 54/200 [27:59<1:20:16, 32.99s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  28%|██▊       | 55/200 [28:10<1:04:13, 26.57s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  28%|██▊       | 56/200 [29:00<1:20:22, 33.49s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  28%|██▊       | 57/200 [29:50<1:31:46, 38.51s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  29%|██▉       | 58/200 [30:02<1:12:37, 30.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  30%|██▉       | 59/200 [30:53<1:26:18, 36.73s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  30%|███       | 60/200 [31:36<1:29:58, 38.56s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  30%|███       | 61/200 [31:55<1:15:39, 32.66s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  31%|███       | 62/200 [32:45<1:26:58, 37.81s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  32%|███▏      | 63/200 [33:34<1:34:25, 41.36s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  32%|███▏      | 64/200 [33:49<1:15:12, 33.18s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  32%|███▎      | 65/200 [34:09<1:05:51, 29.27s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  33%|███▎      | 66/200 [34:19<52:52, 23.67s/it]  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  34%|███▎      | 67/200 [34:39<49:39, 22.40s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  34%|███▍      | 68/200 [35:28<1:07:09, 30.52s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  34%|███▍      | 69/200 [36:19<1:19:36, 36.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  35%|███▌      | 70/200 [37:09<1:27:50, 40.54s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  36%|███▌      | 71/200 [37:16<1:06:06, 30.75s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  36%|███▌      | 72/200 [37:27<52:52, 24.78s/it]  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  36%|███▋      | 73/200 [37:42<45:56, 21.71s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  37%|███▋      | 74/200 [38:32<1:03:26, 30.21s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  38%|███▊      | 75/200 [39:22<1:15:13, 36.11s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  38%|███▊      | 76/200 [39:35<1:00:26, 29.25s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  38%|███▊      | 77/200 [39:46<48:39, 23.74s/it]  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  39%|███▉      | 78/200 [39:55<39:08, 19.25s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  40%|███▉      | 79/200 [40:45<57:23, 28.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  40%|████      | 80/200 [41:34<1:09:35, 34.79s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  40%|████      | 81/200 [41:47<55:57, 28.22s/it]  Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  41%|████      | 82/200 [42:37<1:08:05, 34.63s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  42%|████▏     | 83/200 [42:51<55:45, 28.59s/it]  Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  42%|████▏     | 84/200 [43:41<1:07:21, 34.84s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  42%|████▎     | 85/200 [44:30<1:15:07, 39.20s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  43%|████▎     | 86/200 [44:38<56:33, 29.77s/it]  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  44%|████▎     | 87/200 [44:46<44:00, 23.37s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  44%|████▍     | 88/200 [45:36<58:14, 31.20s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  44%|████▍     | 89/200 [45:45<45:23, 24.54s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  45%|████▌     | 90/200 [46:34<58:35, 31.96s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  46%|████▌     | 91/200 [47:23<1:07:27, 37.14s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  46%|████▌     | 92/200 [48:12<1:13:21, 40.76s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  46%|████▋     | 93/200 [49:02<1:17:10, 43.27s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  47%|████▋     | 94/200 [49:51<1:19:32, 45.02s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  48%|████▊     | 95/200 [50:01<1:00:51, 34.78s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  48%|████▊     | 96/200 [50:21<52:30, 30.29s/it]  Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  48%|████▊     | 97/200 [51:11<1:01:46, 35.98s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  49%|████▉     | 98/200 [51:28<51:39, 30.39s/it]  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  50%|████▉     | 99/200 [51:43<43:26, 25.81s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  50%|█████     | 100/200 [52:01<39:20, 23.61s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  50%|█████     | 101/200 [52:15<33:47, 20.48s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  51%|█████     | 102/200 [53:04<47:28, 29.07s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  52%|█████▏    | 103/200 [53:53<56:55, 35.21s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  52%|█████▏    | 104/200 [54:13<48:52, 30.54s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  52%|█████▎    | 105/200 [54:23<38:41, 24.44s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  53%|█████▎    | 106/200 [55:13<50:05, 31.97s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  54%|█████▎    | 107/200 [56:02<57:32, 37.13s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  54%|█████▍    | 108/200 [56:52<1:02:55, 41.04s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  55%|█████▍    | 109/200 [57:09<51:10, 33.74s/it]  Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  55%|█████▌    | 110/200 [57:59<58:06, 38.74s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  56%|█████▌    | 111/200 [58:20<49:30, 33.37s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  56%|█████▌    | 112/200 [58:31<38:59, 26.58s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  56%|█████▋    | 113/200 [58:41<31:35, 21.79s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  57%|█████▋    | 114/200 [59:31<43:09, 30.11s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  57%|█████▊    | 115/200 [1:00:20<50:44, 35.82s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  58%|█████▊    | 116/200 [1:00:36<41:43, 29.81s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  58%|█████▊    | 117/200 [1:01:06<41:22, 29.91s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  59%|█████▉    | 118/200 [1:01:55<48:48, 35.72s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  60%|█████▉    | 119/200 [1:02:05<37:33, 27.82s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  60%|██████    | 120/200 [1:02:54<45:39, 34.25s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  60%|██████    | 121/200 [1:03:43<51:04, 38.79s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  61%|██████    | 122/200 [1:04:04<43:32, 33.50s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  62%|██████▏   | 123/200 [1:04:54<49:09, 38.31s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  62%|██████▏   | 124/200 [1:05:12<40:49, 32.23s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  62%|██████▎   | 125/200 [1:06:02<46:53, 37.52s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  63%|██████▎   | 126/200 [1:06:14<36:44, 29.79s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  64%|██████▎   | 127/200 [1:06:20<27:32, 22.63s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  64%|██████▍   | 128/200 [1:07:09<36:56, 30.78s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  64%|██████▍   | 129/200 [1:07:20<29:25, 24.87s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  65%|██████▌   | 130/200 [1:08:10<37:41, 32.31s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  66%|██████▌   | 131/200 [1:08:59<42:59, 37.39s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  66%|██████▌   | 132/200 [1:09:00<30:03, 26.52s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  66%|██████▋   | 133/200 [1:09:50<37:18, 33.41s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  67%|██████▋   | 134/200 [1:10:05<30:31, 27.76s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  68%|██████▊   | 135/200 [1:10:54<37:06, 34.25s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  68%|██████▊   | 136/200 [1:11:06<29:22, 27.55s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  68%|██████▊   | 137/200 [1:11:55<35:40, 33.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  69%|██████▉   | 138/200 [1:12:44<39:52, 38.59s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  70%|██████▉   | 139/200 [1:13:04<33:33, 33.01s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  70%|███████   | 140/200 [1:13:10<24:55, 24.92s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  70%|███████   | 141/200 [1:13:24<21:11, 21.56s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  71%|███████   | 142/200 [1:13:34<17:36, 18.21s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  72%|███████▏  | 143/200 [1:13:49<16:20, 17.21s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  72%|███████▏  | 144/200 [1:14:29<22:26, 24.05s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  72%|███████▎  | 145/200 [1:15:18<28:55, 31.56s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  73%|███████▎  | 146/200 [1:15:38<25:06, 27.89s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  74%|███████▎  | 147/200 [1:16:28<30:28, 34.51s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  74%|███████▍  | 148/200 [1:17:17<33:47, 38.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  74%|███████▍  | 149/200 [1:18:06<35:46, 42.08s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  75%|███████▌  | 150/200 [1:18:17<27:12, 32.64s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  76%|███████▌  | 151/200 [1:19:06<30:40, 37.56s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  76%|███████▌  | 152/200 [1:19:55<32:50, 41.05s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  76%|███████▋  | 153/200 [1:20:02<24:05, 30.76s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  77%|███████▋  | 154/200 [1:20:52<28:01, 36.55s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  78%|███████▊  | 155/200 [1:21:11<23:28, 31.30s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  78%|███████▊  | 156/200 [1:21:20<18:01, 24.57s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  78%|███████▊  | 157/200 [1:22:09<22:55, 31.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  79%|███████▉  | 158/200 [1:22:58<26:00, 37.17s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  80%|███████▉  | 159/200 [1:23:48<27:54, 40.84s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  80%|████████  | 160/200 [1:24:37<28:52, 43.32s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  80%|████████  | 161/200 [1:25:26<29:19, 45.11s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  81%|████████  | 162/200 [1:25:38<22:16, 35.17s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  82%|████████▏ | 163/200 [1:25:55<18:13, 29.55s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  82%|████████▏ | 164/200 [1:26:44<21:16, 35.47s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  82%|████████▎ | 165/200 [1:26:53<16:01, 27.46s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  83%|████████▎ | 166/200 [1:27:08<13:27, 23.76s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  84%|████████▎ | 167/200 [1:27:24<11:49, 21.51s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  84%|████████▍ | 168/200 [1:27:42<10:57, 20.54s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  84%|████████▍ | 169/200 [1:28:02<10:28, 20.26s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  85%|████████▌ | 170/200 [1:28:51<14:30, 29.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  86%|████████▌ | 171/200 [1:28:58<10:49, 22.41s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  86%|████████▌ | 172/200 [1:29:06<08:27, 18.11s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  86%|████████▋ | 173/200 [1:29:56<12:20, 27.43s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  87%|████████▋ | 174/200 [1:30:13<10:33, 24.37s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  88%|████████▊ | 175/200 [1:31:02<13:16, 31.86s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  88%|████████▊ | 176/200 [1:31:52<14:52, 37.17s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  88%|████████▊ | 177/200 [1:32:04<11:23, 29.71s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  89%|████████▉ | 178/200 [1:32:53<13:00, 35.48s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  90%|████████▉ | 179/200 [1:33:43<13:53, 39.70s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  90%|█████████ | 180/200 [1:33:53<10:19, 30.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  90%|█████████ | 181/200 [1:34:43<11:33, 36.48s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  91%|█████████ | 182/200 [1:35:00<09:14, 30.83s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  92%|█████████▏| 183/200 [1:35:16<07:29, 26.42s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  92%|█████████▏| 184/200 [1:35:34<06:19, 23.71s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  92%|█████████▎| 185/200 [1:36:23<07:51, 31.40s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  93%|█████████▎| 186/200 [1:36:30<05:36, 24.04s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  94%|█████████▎| 187/200 [1:36:41<04:22, 20.16s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  94%|█████████▍| 188/200 [1:37:02<04:03, 20.29s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  94%|█████████▍| 189/200 [1:37:51<05:19, 29.08s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  95%|█████████▌| 190/200 [1:38:41<05:52, 35.22s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  96%|█████████▌| 191/200 [1:38:50<04:07, 27.55s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  96%|█████████▌| 192/200 [1:39:13<03:29, 26.15s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  96%|█████████▋| 193/200 [1:39:30<02:44, 23.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  97%|█████████▋| 194/200 [1:40:19<03:06, 31.12s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  98%|█████████▊| 195/200 [1:41:09<03:03, 36.64s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  98%|█████████▊| 196/200 [1:41:58<02:41, 40.32s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  98%|█████████▊| 197/200 [1:42:08<01:34, 31.40s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  99%|█████████▉| 198/200 [1:42:57<01:13, 36.63s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems: 100%|█████████▉| 199/200 [1:43:46<00:40, 40.36s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems: 100%|██████████| 200/200 [1:44:02<00:00, 31.21s/it]

Model: wizardmath





In [None]:
# Express the ratio total_correct / num_problems as a percentage; as the
# cell's last expression, the value is displayed as the cell output.
# NOTE(review): both names are defined in earlier cells (presumably the
# evaluation loop) — confirm they come from the same run before citing this.
(total_correct/num_problems)*100

64.0