In [1]:
!pip install datasets
!pip install textstat
!pip install bitsandbytes

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from datasets import load_dataset
import re
import numpy as np
from tqdm import tqdm
import torch
import nltk
import textstat
import random
import pickle
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import spacy
from collections import defaultdict

In [4]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
nlp = spacy.load("en_core_web_lg")

In [6]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
def get_dataset():
    train_dataset = load_dataset("openai/gsm8k", "main", split='train')
    test_dataset = load_dataset("openai/gsm8k", "main", split='test')
    return train_dataset, test_dataset

In [8]:
class Stats_Helpers():
  def __init__(self, question):
    self.question = question
    self.keywords = ['sum', 'total', 'difference', 'product', 'per']
    self.stopwords = set([
        'the', 'is', 'in', 'at', 'of', 'a', 'an', 'and', 'to', 'for', 'on',
        'with', 'as', 'by', 'that', 'from', 'it', 'this', 'be', 'or', 'are',
        'was', 'were', 'but', 'not', 'have', 'has', 'had', 'if', 'then', 'so'
    ])
    self.doc = nlp(question)
    self.unit_pattern = re.compile(r'\b(k|kg|km|m|cm|mm|ml|l|lb|oz|g|mg|hr|min|sec|miles?|feet|inches?)\b', re.IGNORECASE)

  def get_numeric_tokens(self):
    number_pattern = r'''
        \b              # Word boundary
        (?!\d{4}\b)     # Exclude 4-digit standalone numbers (likely years)
        \d{1,3}         # 1-3 digits
        (?:,\d{3})+     # Thousands separators (e.g., 1,000)
        (?:\.\d+)?      # Optional decimal portion
        |
        \d+             # Standard integers
        (?:\.\d+)?      # Decimals without commas
        \b              # Word boundary
    '''
    numbers = re.findall(number_pattern, self.question, re.VERBOSE)

    normalized = set()
    for num in numbers:
        try:
            normalized.add(float(num.replace(',', '')))
        except ValueError:
            continue

    return list(normalized)

  def get_keyword_frequencies(self):
    freq = {k: 0 for k in self.keywords}
    for keyword in self.keywords:
        freq[keyword] = len(re.findall(rf'\b{re.escape(keyword)}\b', self.question))
    return freq

  def get_stopword_ratio(self):
    words = re.findall(r'\b\w+\b', self.question)
    if not words: return 0.0
    stop_count = sum(1 for w in words if w in self.stopwords)
    return round(stop_count / len(words), 4)

  def get_pos_distribution(self):
    pos_counts = defaultdict(int)
    for token in self.doc:
        pos_counts[token.pos_] += 1

    total = len(self.doc)
    return {
        'prop_noun': pos_counts['NOUN'] / total if total else 0,
        'prop_verb': pos_counts['VERB'] / total if total else 0,
        'prop_num': pos_counts['NUM'] / total if total else 0,
        'prop_adj': pos_counts['ADJ'] / total if total else 0
    }

  def get_dependency_metrics(self):
    """Calculate syntactic complexity metrics from dependency parse"""
    def get_depth(token):
        if token.head == token:
            return 1
        return 1 + get_depth(token.head)

    depths = [get_depth(token) for token in self.doc]

    dep_distances = [abs(token.i - token.head.i) for token in self.doc
                    if token.head != token]

    return {
        'max_tree_depth': max(depths),
        'avg_dep_distance': sum(dep_distances)/len(dep_distances) if dep_distances else 0
    }

  def get_named_entities(self):
    """Extract quantities, units, and person names with spaCy NER"""
    entities = {
        'quantity_count': 0,
        'unit_count': 0,
        'person_count': 0,
        'numeric_entities': []
    }

    for ent in self.doc.ents:
        if ent.label_ in ['QUANTITY', 'CARDINAL', 'MONEY']:
            entities['quantity_count'] += 1
            try:
                entities['numeric_entities'].append(float(ent.text.replace(',', '')))
            except:
                continue
        elif ent.label_ == 'PERSON':
            entities['person_count'] += 1

    units = self.unit_pattern.findall(self.doc.text)
    entities['unit_count'] = len(units)

    return entities

def get_problem_stats(problem):
  stats = {}
  question = problem['question'].lower()
  stat_helper = Stats_Helpers(question)

  sentences = [s.strip() for s in question.split('.') if s.strip()]
  words = question.split()
  avg_word_length = sum(len(word) for word in words)/len(words) if words else 0
  stats['word_count'] = len(question.split())
  stats['sentence_count'] = len(sentences)
  stats['avg_sentence_length'] = np.mean([len(s.split()) for s in sentences])
  stats['avg_word_length'] = avg_word_length
  stats['flesch_reading_ease'] = textstat.flesch_reading_ease(question)   # 0 to 100, higher is more readable

  numeric_values = stat_helper.get_numeric_tokens()
  stats.update({
      'num_numeric_values': len(numeric_values),
      'min_value': min(numeric_values) if numeric_values else 0,
      'mean_value': np.mean(numeric_values) if numeric_values else 0,
      'max_value': max(numeric_values) if numeric_values else 0
  })
  stats['num_operations'] = re.findall(r'[+\-*/^]', question)

  stats['keyword_frequencies'] = stat_helper.get_keyword_frequencies()
  stats['stopword_ratio'] = stat_helper.get_stopword_ratio()

  stats.update(stat_helper.get_pos_distribution())
  stats.update(stat_helper.get_dependency_metrics())

  stats.update(stat_helper.get_named_entities())

  return stats

In [9]:
def get_model(model_name):
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True
    )
    if model_name == "wizardmath":
        wizardmath_tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardMath-7B-V1.1")
        wizardmath_model = AutoModelForCausalLM.from_pretrained(
            "WizardLM/WizardMath-7B-V1.1",
            quantization_config=quantization_config,
            device_map={"": 0},
            torch_dtype=torch.float16
        )
        return {
            'model': wizardmath_model,
            'model_name': "wizardmath",
            'tokenizer': wizardmath_tokenizer,
            'cost_per_token': 0.7
        }
    elif model_name == "phi2":
        phi2_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
        phi2_model = AutoModelForCausalLM.from_pretrained(
            "microsoft/phi-2",
            quantization_config=quantization_config,
            device_map={"": 0},
            torch_dtype=torch.float16,
            trust_remote_code=True
        )
        return {
            'model': phi2_model,
            'model_name': "phi2",
            'tokenizer': phi2_tokenizer,
            'cost_per_token': 0.27
        }

In [10]:
def extract_answer(answer_text):
    match = re.search(r'####\s*(-?\d+)', answer_text)
    if match:
        return match.group(1).strip()
    return None

In [11]:
def process_problem(problem, model_index, models):
    prompt = f"""

Follow these instructions:
1. Work through the problem step by step
2. Calculate the numerical answer
3. On the last line, write ONLY: #### <numerical answer>. Do not add any units like "kg" or "m", or any currency symbols like "$".
4. Do not write anything after the final answer

-------------------
EXAMPLE FORMAT:
Step 1: [explanation]
Step 2: [explanation]
Final calculation: [calculation]
#### [numerical answer]
-------------------

NOW SOLVE THE PROBLEM CORRECTLY: {problem['question']}
"""
    # print("Entered global process problem")
    model_obj = models[model_index]['model']
    tokenizer = models[model_index].get('tokenizer', None)
    if tokenizer:
        tokenizer = models[model_index]['tokenizer']

    # if models[model_index]['model_name'] == "wizardmath":
    inputs = tokenizer(prompt, return_tensors="pt").to(model_obj.device)
    outputs = model_obj.generate(
        inputs.input_ids,
        max_new_tokens=1024,
        temperature=0.1,
        do_sample=True,
        attention_mask=inputs.attention_mask,
        # pad_token_id=tokenizer.eos_token_id,
    )
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    prompt_end = full_output.find(f"NOW SOLVE THE PROBLEM CORRECTLY: {problem['question']}")
    if prompt_end != -1:
        prompt_end = prompt_end + len(f"NOW SOLVE THE PROBLEM CORRECTLY: {problem['question']}")
        model_response = full_output[prompt_end:].strip()
    else:
        model_response = full_output

    hash_match = re.search(r'####\s*([\$]?\s*\d+(?:\.\d+)?)', model_response)
    if hash_match:
        answer_text = hash_match.group(1)
        numeric_match = re.search(r'(\d+(?:\.\d+)?)', answer_text)
        if numeric_match:
            numeric_answer = numeric_match.group(1)
            # return f"{prompt}\n\n{model_response.split('####')[0].strip()}\n#### {numeric_answer}"
            return {
                'prompt': prompt,
                'response': model_response,
                'answer': numeric_answer
            }

    answer_match = re.search(r'(?:final answer|the answer is)[^0-9]*?([\$]?\s*\d+(?:\.\d+)?)',
                            model_response.lower())
    if answer_match:
        answer_text = answer_match.group(1)
        numeric_match = re.search(r'(\d+(?:\.\d+)?)', answer_text)
        if numeric_match:
            numeric_answer = numeric_match.group(1)
            answer_position = model_response.lower().find(answer_match.group(0))
            if answer_position != -1:
                # return f"{prompt}\n\n{model_response[:answer_position].strip()}\n#### {numeric_answer}"
                return {
                    'prompt': prompt,
                    'response': model_response,
                    'answer': numeric_answer
                }

    lines = model_response.split('\n')
    for i in range(len(lines)-1, max(0, len(lines)-5), -1):
        line = lines[i]
        if len(line.strip()) < 1 or any(word in line.lower() for word in ["step", "explanation"]):
            continue

        numeric_match = re.search(r'(\d+(?:\.\d+)?)', line)
        if numeric_match:
            numeric_answer = numeric_match.group(1)
            # return f"{prompt}\n\n{model_response.split(line)[0].strip()}\n#### {numeric_answer}"
            return {
                'prompt': prompt,
                'response': model_response,
                'answer': numeric_answer
            }

    # return full_output
    return {
        'prompt': prompt,
        'response': full_output,
        'answer': None
    }


In [12]:
import numpy as np
from scipy.linalg import solve_triangular
from collections import defaultdict

class LinUCB:
    def __init__(self, n_arms, context_size, alpha=0.5):
        self.n_arms = n_arms
        self.context_size = context_size
        self.alpha = alpha

        self.A = [np.eye(context_size) for _ in range(n_arms)]
        self.b = [np.zeros(context_size) for _ in range(n_arms)]
        self.theta = [np.zeros(context_size) for _ in range(n_arms)]

    def select_arm(self, context):
        scores = []
        for arm in range(self.n_arms):
            A_inv = np.linalg.inv(self.A[arm])
            self.theta[arm] = A_inv @ self.b[arm]

            score = self.theta[arm] @ context
            score += self.alpha * np.sqrt(context @ A_inv @ context)
            scores.append(score)

        return np.argmax(scores)

    def update(self, arm, context, reward):
        self.A[arm] += np.outer(context, context)
        self.b[arm] += reward * context

class ModelSwitcher:
    def __init__(self, models):
        self.models = models
        self.stats_helper = Stats_Helpers

        self.bandit = LinUCB(n_arms=2, context_size=23, alpha=0.25)

        self.feature_order = [
            'word_count', 'sentence_count', 'avg_sentence_length', 'avg_word_length',
            'flesch_reading_ease', 'num_numeric_values', 'min_value', 'mean_value',
            'max_value', 'stopword_ratio', 'prop_noun', 'prop_verb', 'prop_num',
            'prop_adj', 'max_tree_depth', 'avg_dep_distance', 'quantity_count',
            'unit_count', 'person_count', 'keyword_frequencies.sum',
            'keyword_frequencies.total', 'keyword_frequencies.difference',
            'keyword_frequencies.product'
        ]

    def _get_context_vector(self, problem_stats):
        context = []
        for feat in self.feature_order:
            if 'keyword' in feat:
                key = feat.split('.')[-1]
                context.append(problem_stats['keyword_frequencies'][key])
            else:
                context.append(problem_stats[feat])

        context = np.array(context)
        context = (context - np.mean(context)) / (np.std(context) + 1e-8)
        return context

    def process_problem(self, problem, correct_answer):
        # print("Entered model switcher process problem")
        stats = get_problem_stats(problem)
        # print("Got stats")
        context = self._get_context_vector(stats)
        # print("Got context")

        model_idx = self.bandit.select_arm(context)
        # print("Selected model")
        prediction = process_problem(problem, model_idx, self.models)
        # print(prediction)
        # print("Got prediction")

        is_correct = self._check_correctness(prediction, correct_answer)
        reward = self._calculate_reward(is_correct, model_idx)

        self.bandit.update(model_idx, context, reward)

        return prediction, model_idx, reward

    def _check_correctness(self, prediction, correct_answer):
        try:
            pred_answer = float(re.search(r'####\s*([\d.]+)', prediction).group(1))
            return np.isclose(pred_answer, correct_answer, rtol=0.01)
        except:
            return False

    def _calculate_reward(self, correct, model_idx):
        model_cost = self.models[model_idx]['cost_per_token']

        if correct:
            return 1
        else:
            return 0
        #     return 1.0 if model_idx == 0 else 0.5
        # else:
        #     return -2.0 if model_idx == 1 else -1.0

In [13]:
temp_set = get_dataset()
gsm8k_dataset = {
    'train': temp_set[0],
    'test': temp_set[1]
}
models = [get_model('phi2'), get_model('wizardmath')]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [14]:
def calculate_cost(prediction_data, model):
    """Calculate cost using structured prediction data"""
    tokenizer = model['tokenizer']
    cost_per_token = model['cost_per_token']

    input_tokens = tokenizer.encode(prediction_data['prompt'], return_tensors='pt').shape[1]
    output_tokens = tokenizer.encode(prediction_data['response'], return_tensors='pt').shape[1]

    return (input_tokens + output_tokens) * cost_per_token

In [None]:
import time
import pandas as pd

switcher = ModelSwitcher(models)
switcher.bandit.alpha = 1

total_correct = 0
total_cost = 0

num_problems = 400
start_idx = 2000
cur_problem_idx = 0
# subset = gsm8k_dataset['train'].select(range(num_problems))
subset = gsm8k_dataset['train'].select(range(start_idx, start_idx + num_problems))

phi2_preds = []
wizardmath_preds = []

start_time = time.time()
for problem in tqdm(subset, desc="Processing problems"):
    cur_problem_idx += 1
    correct_answer = extract_answer(problem['answer'])

    prediction, model_idx, reward = switcher.process_problem(problem, correct_answer)

    # print(f"Problem: {problem['question']}")
    # print(f"Prediction: {prediction}")
    # print(f"Correct Answer: {correct_answer}")
    print(f"Model: {models[model_idx]['model_name']}")

    # Track performance
    predicted_answer = prediction['answer']
    # print(f"Predicted ans: {predicted_ans}")
    # print(f"Correct Answer: {correct_answer}")
    # if predicted_ans is not None and float(predicted_ans) == float(correct_answer):
    #     total_correct += 1
    # total_correct += (1 if switcher._check_correctness(prediction, correct_answer) else 0)
    total_cost += calculate_cost(prediction, models[model_idx])

    if predicted_answer is not None and float(predicted_answer) == float(correct_answer):
        total_correct += 1
        if model_idx == 0:
            phi2_preds.append({'question': problem['question'], 'answer': problem['answer'], 'model_response': prediction['response'], 'is_correct': True})
        else:
            wizardmath_preds.append({'question': problem['question'], 'answer': problem['answer'], 'model_response': prediction['response'], 'is_correct': True})
    else:
        if model_idx == 0:
            phi2_preds.append({'question': problem['question'], 'answer': problem['answer'], 'model_response': prediction['response'], 'is_correct': False})
        else:
            wizardmath_preds.append({'question': problem['question'], 'answer': problem['answer'], 'model_response': prediction['response'], 'is_correct': False})

    # if cur_problem_idx % 50 == 0:
    #     phi2_df = pd.DataFrame(phi2_preds)
    #     phi2_df['model_name'] = 'phi-2'

    #     wizardmath_df = pd.DataFrame(wizardmath_preds)
    #     wizardmath_df['model_name'] = 'wizardmath'

    #     combined_df = pd.concat([phi2_df, wizardmath_df], ignore_index=True)
    #     interim_csv_path = f'/content/gdrive/MyDrive/ai_project/combined_predictions_interim4_{cur_problem_idx}.csv'
    #     combined_df.to_csv(interim_csv_path, index=False)
    #     print(f"Interim predictions saved to: {interim_csv_path}")

    new_alpha = 1 - (cur_problem_idx/num_problems)
    if new_alpha < 0.1:
        new_alpha = 0.1
    switcher.bandit.alpha = new_alpha

end_time = time.time()

Processing problems:   0%|          | 0/400 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing problems:   0%|          | 1/400 [00:53<5:55:56, 53.53s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   0%|          | 2/400 [01:07<3:20:44, 30.26s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   1%|          | 3/400 [01:59<4:26:15, 40.24s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   1%|          | 4/400 [02:09<3:06:27, 28.25s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   1%|▏         | 5/400 [03:01<4:03:06, 36.93s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:   2%|▏         | 6/400 [03:53<4:36:34, 42.12s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   2%|▏         | 7/400 [04:03<3:26:01, 31.45s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   2%|▏         | 8/400 [04:55<4:08:51, 38.09s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   2%|▏         | 9/400 [05:14<3:29:30, 32.15s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   2%|▎         | 10/400 [06:07<4:09:23, 38.37s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   3%|▎         | 11/400 [06:25<3:29:10, 32.26s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   3%|▎         | 12/400 [06:37<2:47:55, 25.97s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   3%|▎         | 13/400 [06:46<2:14:49, 20.90s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   4%|▎         | 14/400 [07:38<3:15:07, 30.33s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   4%|▍         | 15/400 [07:48<2:34:55, 24.14s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   4%|▍         | 16/400 [08:40<3:28:35, 32.59s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   4%|▍         | 17/400 [08:56<2:56:44, 27.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   4%|▍         | 18/400 [09:43<3:32:30, 33.38s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   5%|▍         | 19/400 [09:58<2:56:12, 27.75s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   5%|▌         | 20/400 [10:50<3:42:17, 35.10s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   5%|▌         | 21/400 [11:02<2:58:49, 28.31s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   6%|▌         | 22/400 [11:11<2:21:35, 22.47s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   6%|▌         | 23/400 [12:03<3:17:00, 31.35s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:   6%|▌         | 24/400 [12:55<3:55:32, 37.59s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:   6%|▋         | 25/400 [13:47<4:21:40, 41.87s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   6%|▋         | 26/400 [13:57<3:20:46, 32.21s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   7%|▋         | 27/400 [14:05<2:35:10, 24.96s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   7%|▋         | 28/400 [14:22<2:19:10, 22.45s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   7%|▋         | 29/400 [14:37<2:05:13, 20.25s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   8%|▊         | 30/400 [15:29<3:04:01, 29.84s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:   8%|▊         | 31/400 [16:21<3:44:56, 36.58s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:   8%|▊         | 32/400 [17:13<4:13:01, 41.25s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:   8%|▊         | 33/400 [18:05<4:31:55, 44.46s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:   8%|▊         | 34/400 [18:17<3:30:43, 34.55s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   9%|▉         | 35/400 [18:33<2:57:03, 29.11s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:   9%|▉         | 36/400 [18:43<2:21:14, 23.28s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:   9%|▉         | 37/400 [19:34<3:12:15, 31.78s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  10%|▉         | 38/400 [20:26<3:48:07, 37.81s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  10%|▉         | 39/400 [21:18<4:12:44, 42.01s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  10%|█         | 40/400 [21:37<3:29:53, 34.98s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  10%|█         | 41/400 [21:49<2:48:03, 28.09s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  10%|█         | 42/400 [22:16<2:46:56, 27.98s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  11%|█         | 43/400 [23:10<3:31:23, 35.53s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  11%|█         | 44/400 [23:26<2:57:37, 29.94s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  11%|█▏        | 45/400 [23:36<2:21:05, 23.85s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  12%|█▏        | 46/400 [23:47<1:57:10, 19.86s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  12%|█▏        | 47/400 [24:38<2:53:12, 29.44s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  12%|█▏        | 48/400 [25:31<3:32:59, 36.30s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  12%|█▏        | 49/400 [26:23<4:00:08, 41.05s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  12%|█▎        | 50/400 [27:15<4:18:26, 44.30s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  13%|█▎        | 51/400 [27:24<3:15:45, 33.65s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  13%|█▎        | 52/400 [28:23<3:59:52, 41.36s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  13%|█▎        | 53/400 [29:16<4:19:18, 44.84s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  14%|█▎        | 54/400 [29:23<3:14:14, 33.68s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  14%|█▍        | 55/400 [29:32<2:30:33, 26.18s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  14%|█▍        | 56/400 [30:24<3:14:25, 33.91s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  14%|█▍        | 57/400 [30:38<2:39:52, 27.97s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  14%|█▍        | 58/400 [30:48<2:08:08, 22.48s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  15%|█▍        | 59/400 [31:40<2:58:13, 31.36s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  15%|█▌        | 60/400 [31:54<2:27:43, 26.07s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  15%|█▌        | 61/400 [32:23<2:32:12, 26.94s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  16%|█▌        | 62/400 [32:33<2:03:26, 21.91s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  16%|█▌        | 63/400 [33:25<2:53:49, 30.95s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  16%|█▌        | 64/400 [34:18<3:29:51, 37.47s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  16%|█▋        | 65/400 [34:29<2:45:15, 29.60s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  16%|█▋        | 66/400 [34:46<2:23:34, 25.79s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  17%|█▋        | 67/400 [35:38<3:07:32, 33.79s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  17%|█▋        | 68/400 [36:32<3:40:01, 39.76s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  17%|█▋        | 69/400 [36:41<2:48:25, 30.53s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  18%|█▊        | 70/400 [36:55<2:21:14, 25.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  18%|█▊        | 71/400 [37:48<3:05:12, 33.78s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  18%|█▊        | 72/400 [38:00<2:28:33, 27.18s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  18%|█▊        | 73/400 [38:52<3:09:36, 34.79s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  18%|█▊        | 74/400 [39:45<3:37:48, 40.09s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  19%|█▉        | 75/400 [39:51<2:41:35, 29.83s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  19%|█▉        | 76/400 [40:01<2:09:58, 24.07s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  19%|█▉        | 77/400 [40:14<1:50:38, 20.55s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  20%|█▉        | 78/400 [41:07<2:43:50, 30.53s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  20%|█▉        | 79/400 [42:00<3:18:20, 37.07s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  20%|██        | 80/400 [42:52<3:42:09, 41.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  20%|██        | 81/400 [43:06<2:56:33, 33.21s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  20%|██        | 82/400 [43:57<3:25:14, 38.72s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  21%|██        | 83/400 [44:02<2:31:29, 28.67s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  21%|██        | 84/400 [44:21<2:14:48, 25.60s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  21%|██▏       | 85/400 [44:32<1:51:55, 21.32s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  22%|██▏       | 86/400 [45:25<2:41:18, 30.82s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  22%|██▏       | 87/400 [46:17<3:14:01, 37.19s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  22%|██▏       | 88/400 [46:27<2:31:20, 29.10s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  22%|██▏       | 89/400 [46:36<1:59:36, 23.08s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  22%|██▎       | 90/400 [46:53<1:49:39, 21.22s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  23%|██▎       | 91/400 [47:45<2:36:09, 30.32s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  23%|██▎       | 92/400 [48:37<3:08:29, 36.72s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  23%|██▎       | 93/400 [49:28<3:31:00, 41.24s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  24%|██▎       | 94/400 [49:35<2:37:27, 30.88s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  24%|██▍       | 95/400 [50:28<3:10:32, 37.48s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  24%|██▍       | 96/400 [51:15<3:24:29, 40.36s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  24%|██▍       | 97/400 [51:29<2:44:04, 32.49s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  24%|██▍       | 98/400 [52:21<3:12:20, 38.21s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  25%|██▍       | 99/400 [52:36<2:37:12, 31.34s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  25%|██▌       | 100/400 [52:44<2:01:31, 24.31s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  25%|██▌       | 101/400 [53:35<2:41:46, 32.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  26%|██▌       | 102/400 [54:28<3:11:32, 38.57s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  26%|██▌       | 103/400 [55:19<3:29:43, 42.37s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  26%|██▌       | 104/400 [55:30<2:41:53, 32.81s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  26%|██▋       | 105/400 [56:22<3:09:22, 38.52s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  26%|██▋       | 106/400 [56:28<2:21:07, 28.80s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  27%|██▋       | 107/400 [57:19<2:53:57, 35.62s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  27%|██▋       | 108/400 [58:11<3:16:59, 40.48s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  27%|██▋       | 109/400 [58:19<2:29:10, 30.76s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  28%|██▊       | 110/400 [58:35<2:07:16, 26.33s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  28%|██▊       | 111/400 [58:51<1:51:07, 23.07s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  28%|██▊       | 112/400 [59:42<2:31:55, 31.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  28%|██▊       | 113/400 [59:58<2:08:57, 26.96s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  28%|██▊       | 114/400 [1:00:08<1:43:23, 21.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  29%|██▉       | 115/400 [1:00:59<2:25:41, 30.67s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  29%|██▉       | 116/400 [1:01:10<1:57:05, 24.74s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  29%|██▉       | 117/400 [1:02:02<2:35:00, 32.87s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  30%|██▉       | 118/400 [1:02:55<3:02:49, 38.90s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  30%|██▉       | 119/400 [1:03:09<2:27:02, 31.40s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  30%|███       | 120/400 [1:04:02<2:56:11, 37.76s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  30%|███       | 121/400 [1:04:15<2:21:14, 30.38s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  30%|███       | 122/400 [1:04:31<2:00:29, 26.00s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  31%|███       | 123/400 [1:05:23<2:36:18, 33.86s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  31%|███       | 124/400 [1:05:39<2:10:46, 28.43s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  31%|███▏      | 125/400 [1:06:29<2:39:58, 34.90s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  32%|███▏      | 126/400 [1:06:36<2:02:03, 26.73s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  32%|███▏      | 127/400 [1:07:30<2:38:34, 34.85s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  32%|███▏      | 128/400 [1:07:41<2:05:54, 27.77s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  32%|███▏      | 129/400 [1:07:52<1:41:53, 22.56s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  32%|███▎      | 130/400 [1:08:44<2:22:10, 31.60s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  33%|███▎      | 131/400 [1:08:56<1:54:42, 25.59s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  33%|███▎      | 132/400 [1:09:48<2:30:17, 33.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  33%|███▎      | 133/400 [1:10:11<2:14:21, 30.19s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  34%|███▎      | 134/400 [1:11:03<2:43:00, 36.77s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  34%|███▍      | 135/400 [1:11:11<2:04:28, 28.18s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  34%|███▍      | 136/400 [1:11:59<2:31:03, 34.33s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  34%|███▍      | 137/400 [1:12:45<2:45:06, 37.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  34%|███▍      | 138/400 [1:13:37<3:02:59, 41.91s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  35%|███▍      | 139/400 [1:13:43<2:15:48, 31.22s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  35%|███▌      | 140/400 [1:13:51<1:45:25, 24.33s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  35%|███▌      | 141/400 [1:14:07<1:33:55, 21.76s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  36%|███▌      | 142/400 [1:14:59<2:12:29, 30.81s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  36%|███▌      | 143/400 [1:15:52<2:40:50, 37.55s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  36%|███▌      | 144/400 [1:16:44<2:58:45, 41.90s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  36%|███▋      | 145/400 [1:17:01<2:25:54, 34.33s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  36%|███▋      | 146/400 [1:17:53<2:48:06, 39.71s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  37%|███▋      | 147/400 [1:18:02<2:07:51, 30.32s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  37%|███▋      | 148/400 [1:18:54<2:35:05, 36.93s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  37%|███▋      | 149/400 [1:19:46<2:54:02, 41.60s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  38%|███▊      | 150/400 [1:19:57<2:14:32, 32.29s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  38%|███▊      | 151/400 [1:20:15<1:55:55, 27.93s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  38%|███▊      | 152/400 [1:20:23<1:30:29, 21.89s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  38%|███▊      | 153/400 [1:21:15<2:08:11, 31.14s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  38%|███▊      | 154/400 [1:22:08<2:34:04, 37.58s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  39%|███▉      | 155/400 [1:22:20<2:02:09, 29.92s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  39%|███▉      | 156/400 [1:23:12<2:29:07, 36.67s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  39%|███▉      | 157/400 [1:23:22<1:55:54, 28.62s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  40%|███▉      | 158/400 [1:23:30<1:30:16, 22.38s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  40%|███▉      | 159/400 [1:23:39<1:14:18, 18.50s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  40%|████      | 160/400 [1:24:32<1:55:06, 28.78s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


Processing problems:  40%|████      | 161/400 [1:25:25<2:22:48, 35.85s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: phi2


Processing problems:  40%|████      | 162/400 [1:25:40<1:58:00, 29.75s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  41%|████      | 163/400 [1:25:52<1:36:39, 24.47s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  41%|████      | 164/400 [1:26:11<1:29:07, 22.66s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model: wizardmath


Processing problems:  41%|████▏     | 165/400 [1:26:21<1:13:50, 18.85s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: wizardmath


Processing problems:  42%|████▏     | 166/400 [1:27:14<1:53:35, 29.13s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: phi2


In [None]:
import os
if not os.path.exists('/content/gdrive/MyDrive/ai_project'):
    os.makedirs('/content/gdrive/MyDrive/ai_project')

In [None]:
import pandas as pd
import os

os.makedirs('/content', exist_ok=True)

phi2_df = pd.DataFrame(phi2_preds)
phi2_df['model_name'] = 'phi-2'

wizardmath_df = pd.DataFrame(wizardmath_preds)
wizardmath_df['model_name'] = 'wizardmath'

combined_df = pd.concat([phi2_df, wizardmath_df], ignore_index=True)
# combined_csv_path = '/content/combined_predictions.csv'
# combined_df.to_csv(combined_csv_path, index=False)
combined_csv_path = '/content/gdrive/MyDrive/ai_project/combined_predictions5.csv'
combined_df.to_csv(combined_csv_path, index=False)

accuracy = total_correct / num_problems
time_taken = end_time - start_time

# Create metrics DataFrame
metrics_df = pd.DataFrame({
    'total_problems': [num_problems],
    'correct_answers': [total_correct],
    'accuracy': [accuracy],
    'time_seconds': [time_taken],
    'avg_time_per_problem': [time_taken/num_problems],
    'total_cost': [total_cost]
})

# metrics_csv_path = '/content/experiment_metrics.csv'
metrics_csv_path = '/content/gdrive/MyDrive/ai_project/experiment_metrics4.csv'
metrics_df.to_csv(metrics_csv_path, index=False)

fpath = '/content/gdrive/MyDrive/ai_project/accuracy_summary4.txt'
# with open('/content/accuracy_summary.txt', 'w') as f:
with open(fpath, 'w') as f:
    f.write(f"""Experiment Results:
Total Problems: {num_problems}
Correct Answers: {total_correct}
Accuracy: {accuracy:.2%}
Time Taken: {time_taken:.2f} seconds
Average Time per Problem: {time_taken/num_problems:.2f}s
Total Cost: ${total_cost:.4f}
""")

print(f"Predictions saved to: {combined_csv_path}")
print(f"Metrics saved to: {metrics_csv_path}")
