In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

project_path = '/content/drive/MyDrive/COMP545_PROJECT'
os.chdir(project_path)

In [None]:
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

In [3]:
        from huggingface_hub import login
        login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import os
import json
import argparse
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from tqdm import tqdm

from utils.utils import load_jsonl, InferenceDataset
from utils.model import BERTBinaryClassifier, MultiClassClassifier

In [5]:
def predict(
    multi_class_model_path,
    binary_model_path,
    input_file,
    model_name="google/embeddinggemma-300m",
    batch_size=16,
    max_length=2048,
    device="cuda"
):
    print(f"Loading data from {input_file}...")
    data = load_jsonl(input_file)

    # Remove duplicates based on query_id to get unique queries
    # Keep dataset information for each query
    unique_data = {}
    for sample in data:
        query_id = sample.get("query_id", "")
        if query_id not in unique_data:
            unique_data[query_id] = sample

    unique_samples = list(unique_data.values())
    print(f"Found {len(unique_samples)} unique queries")

    # Count datasets
    dataset_counts = {}
    for sample in unique_samples:
        ds = sample.get("dataset", "unknown")
        dataset_counts[ds] = dataset_counts.get(ds, 0) + 1
    print(f"Datasets: {dataset_counts}")

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    dataset = InferenceDataset(unique_samples, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    print(f"\nLoading multi-class model from {multi_class_model_path}...")
    multi_class_model = MultiClassClassifier(
        model_name=model_name,
        hidden_dim=768,
        num_classes=3,
        dropout=0.1
    ).to(device)

    multi_class_checkpoint = torch.load(multi_class_model_path, map_location=device)
    multi_class_model.load_state_dict(multi_class_checkpoint)
    multi_class_model.eval()
    print("Multi-class model loaded successfully")

    print(f"\nLoading binary model from {binary_model_path}...")
    binary_model = BERTBinaryClassifier(
        model_name=model_name,
        hidden_dim=768,
        dropout=0.1
    ).to(device)

    binary_checkpoint = torch.load(binary_model_path, map_location=device)
    binary_model.load_state_dict(binary_checkpoint)
    binary_model.eval()
    print("Binary model loaded successfully")

    print("\nPerforming inference...")
    results = []

    # Create a mapping from query_id to dataset
    query_to_dataset = {sample.get("query_id", ""): sample.get("dataset", "unknown") for sample in unique_samples}

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            queries = batch['query']
            query_ids = batch['query_id']

            multi_class_logits = multi_class_model(input_ids, attention_mask)
            multi_class_probs = torch.softmax(multi_class_logits, dim=1)  # [batch_size, 3]

            binary_logits = binary_model(input_ids, attention_mask)
            binary_probs = torch.sigmoid(binary_logits)  # [batch_size]
            binary_probs = binary_probs.unsqueeze(1)  # [batch_size, 1]

            feature_vectors = torch.cat([multi_class_probs, binary_probs], dim=1)

            feature_vectors_np = feature_vectors.cpu().numpy()

            for i in range(len(queries)):
                query_id = query_ids[i]
                result = {
                    "query_id": query_id,
                    "dataset": query_to_dataset.get(query_id, "unknown"),
                    "query": queries[i],
                    "feature_vector": feature_vectors_np[i].tolist()
                }
                results.append(result)

    return results


LOCAL ONLY(NOT FOR COLAB )

In [6]:
from pathlib import Path
base = Path(os.getcwd())


Generate query vectors

In [7]:
results = predict(
multi_class_model_path="model_checkpoints/embeddinggemma_multi_class_classifier/best_model_acc_0.9849_epoch_1.pth",
binary_model_path="model_checkpoints/embeddinggemma_difficulty_predictor/final_model.pth",
input_file="data/model_data/extracted_dataset_samples_ood.jsonl",

model_name="google/embeddinggemma-300m",
batch_size=16,
max_length=2048,
device="cuda"
)

Loading data from data/model_data/extracted_dataset_samples_ood.jsonl...
Found 1301 unique queries
Datasets: {'gpqa': 545, 'musr': 756}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]


Loading multi-class model from model_checkpoints/embeddinggemma_multi_class_classifier/best_model_acc_0.9849_epoch_1.pth...


config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

Multi-class model loaded successfully

Loading binary model from model_checkpoints/embeddinggemma_difficulty_predictor/final_model.pth...
Binary model loaded successfully

Performing inference...


Predicting: 100%|██████████| 82/82 [05:38<00:00,  4.12s/it]


In [None]:
ID_results = predict(
multi_class_model_path="model_checkpoints/embeddinggemma_multi_class_classifier/best_model_acc_0.9849_epoch_1.pth",
binary_model_path="model_checkpoints/embeddinggemma_difficulty_predictor/final_model.pth",
input_file="data/model_data/extracted_dataset_samples.jsonl",

model_name="google/embeddinggemma-300m",
batch_size=16,
max_length=2048,
device="cuda"
)

Loading data from data/model_data/extracted_dataset_samples.jsonl...
Found 18705 unique queries
Datasets: {'bbh': 5757, 'math': 1324, 'mmlu_pro': 11624}

Loading multi-class model from model_checkpoints/embeddinggemma_multi_class_classifier/best_model_acc_0.9849_epoch_1.pth...
Multi-class model loaded successfully

Loading binary model from model_checkpoints/embeddinggemma_difficulty_predictor/final_model.pth...
Binary model loaded successfully

Performing inference...


Predicting: 100%|██████████| 1170/1170 [1:20:14<00:00,  4.12s/it]


Load Model INFO

In [None]:
model_info_file_path = 'data/model_data/models_info.json'
model_info = pd.read_json(model_info_file_path).T
model_info


Unnamed: 0,bbh_acc,math_acc,mmlu_pro_acc,feature_vector,co2_cost,base_model
MaziyarPanahi/calme-3.2-instruct-78b,0.729734,0.399547,0.730303,"[0.729734421107446, 0.399546827794561, 0.73030...",66.01,Qwen/Qwen2.5-72B
Qwen/Qwen2.5-0.5B-Instruct,0.330672,0.089879,0.171958,"[0.330671758375282, 0.089879154078549, 0.17195...",1.24,Qwen/Qwen2.5-0.5B
Qwen/Qwen2.5-1.5B-Instruct,0.424753,0.204683,0.27992,"[0.42475264710987604, 0.204682779456193, 0.279...",1.37,Qwen/Qwen2.5-1.5B
Qwen/Qwen2.5-14B-Instruct,0.637389,0.537764,0.490442,"[0.637389342128102, 0.5377643504531721, 0.4904...",3.55,Qwen/Qwen2.5-14B
Qwen/Qwen2.5-32B-Instruct,0.687381,0.617069,0.566656,"[0.6873806630793261, 0.6170694864048331, 0.566...",11.5,Qwen/Qwen2.5-32B
Qwen/Qwen2.5-3B-Instruct,0.465544,0.358006,0.325465,"[0.46554417635827106, 0.358006042296072, 0.325...",2.78,Qwen/Qwen2.5-3B
Qwen/Qwen2.5-72B-Instruct,0.725568,0.589124,0.562583,"[0.725568477694844, 0.589123867069486, 0.56258...",47.65,Qwen/Qwen2.5-72B
Qwen/Qwen2.5-7B-Instruct,0.536886,0.48565,0.42869,"[0.5368859572990801, 0.48564954682779404, 0.42...",3.24,Qwen/Qwen2.5-7B
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,0.591217,0.594411,0.466672,"[0.59121680263843, 0.5944108761329301, 0.46667...",3.99,Qwen/Qwen2.5-14B
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,0.417636,0.43429,0.468667,"[0.417635827113348, 0.43429003021148005, 0.468...",47.28,Qwen/Qwen2.5-32B


In [None]:
model_info_feature_cost = model_info[['feature_vector', 'co2_cost']]
model_info_feature_cost = model_info_feature_cost.rename(columns={"feature_vector": "model_feature_vector", "co2_cost": "model_co2_cost"})
model_info_feature_cost

Unnamed: 0,model_feature_vector,model_co2_cost
MaziyarPanahi/calme-3.2-instruct-78b,"[0.729734421107446, 0.399546827794561, 0.73030...",66.01
Qwen/Qwen2.5-0.5B-Instruct,"[0.330671758375282, 0.089879154078549, 0.17195...",1.24
Qwen/Qwen2.5-1.5B-Instruct,"[0.42475264710987604, 0.204682779456193, 0.279...",1.37
Qwen/Qwen2.5-14B-Instruct,"[0.637389342128102, 0.5377643504531721, 0.4904...",3.55
Qwen/Qwen2.5-32B-Instruct,"[0.6873806630793261, 0.6170694864048331, 0.566...",11.5
Qwen/Qwen2.5-3B-Instruct,"[0.46554417635827106, 0.358006042296072, 0.325...",2.78
Qwen/Qwen2.5-72B-Instruct,"[0.725568477694844, 0.589123867069486, 0.56258...",47.65
Qwen/Qwen2.5-7B-Instruct,"[0.5368859572990801, 0.48564954682779404, 0.42...",3.24
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,"[0.59121680263843, 0.5944108761329301, 0.46667...",3.99
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,"[0.417635827113348, 0.43429003021148005, 0.468...",47.28


In [None]:
result_df = pd.DataFrame(results)
result_df

Unnamed: 0,query_id,dataset,query,feature_vector
0,gpqa_q0,gpqa,"A large gene has dozens of exons, of which the...","[1.2503788582307607e-07, 5.4703028581570834e-0..."
1,gpqa_q1,gpqa,trans-cinnamaldehyde was treated with methylma...,"[9.303199277610474e-08, 4.9024914915207773e-05..."
2,gpqa_q2,gpqa,A coating is applied to a substrate resulting ...,"[1.4215720511856489e-06, 0.0003667732526082545..."
3,gpqa_q3,gpqa,"aniline is heated with sulfuric acid, forming ...","[1.282016484083215e-07, 6.299507367657498e-05,..."
4,gpqa_q4,gpqa,In a parallel universe where a magnet can have...,"[1.0665405625331914e-07, 5.589770808001049e-05..."
...,...,...,...,...
1296,musr_q751,musr,In the high-stakes world of a prestigious tale...,"[9.818768376135267e-06, 0.0006710647139698267,..."
1297,musr_q752,musr,"In the vibrant epicenter of New York City, a p...","[9.525953828415368e-06, 0.0006735894130542874,..."
1298,musr_q753,musr,"In the pulsating heart of Silicon Valley, a bu...","[5.026178314437857e-06, 0.0004365589702501893,..."
1299,musr_q754,musr,"In the pulsating heart of Silicon Valley, a bu...","[1.5150030776567291e-05, 0.000895775796379894,..."


In [None]:
ID_results_df = pd.DataFrame(ID_results)
ID_results_df

Unnamed: 0,query_id,dataset,query,feature_vector
0,bbh_q0,bbh,not ( True ) and ( True ) is,"[0.9999936819076538, 2.6843852083402453e-06, 3..."
1,bbh_q1,bbh,not True or False or ( False ) is,"[0.9999901056289673, 4.502657247940078e-06, 5...."
2,bbh_q2,bbh,True or not False and True and False is,"[0.9999886751174927, 4.963138053426519e-06, 6...."
3,bbh_q3,bbh,not True and ( False or True ) is,"[0.9999932050704956, 2.9324012302822666e-06, 3..."
4,bbh_q4,bbh,not True or ( False and True ) is,"[0.9999934434890747, 2.847306177500286e-06, 3...."
...,...,...,...,...
18700,mmlu_pro_q11640,mmlu_pro,A hot mild steel rod is placed in a carbonaceo...,"[1.0312151488278687e-07, 5.070183033240028e-05..."
18701,mmlu_pro_q11641,mmlu_pro,Consider the evaporation of liquid ammonia int...,"[7.761179290355358e-07, 0.0002615608973428607,..."
18702,mmlu_pro_q11642,mmlu_pro,The frequency range of a commercially broadcas...,"[2.316960490134079e-07, 9.768984455149621e-05,..."
18703,mmlu_pro_q11643,mmlu_pro,Suppose there are 100 identical firms in a per...,"[8.748448721007662e-08, 4.5654796849703416e-05..."


Generate perdiction reuslt for all different alpha

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [None]:
#simple MLP should work
class RouterNN(nn.Module):
    def __init__(self, query_vector,model_vector,projection_dim =32, hidden_size=(256,128), dropout_rate=0.1, output_size=1):
        super(RouterNN, self).__init__()
        self.q_proj = nn.Sequential(
            nn.Linear((query_vector), projection_dim),
            nn.ReLU(),
        )
        self.m_proj = nn.Sequential(
            nn.Linear((model_vector), projection_dim),
            nn.ReLU(),
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(projection_dim*4 + 1, hidden_size[0]),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size[0], hidden_size[1]),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size[1], output_size)
        )


    def forward(self, query,model):
        q_emb = self.q_proj(query)
        m_emb = self.m_proj(model)

        prod = q_emb * m_emb
        diff  = torch.abs(q_emb - m_emb)
        cos = nn.functional.cosine_similarity(q_emb, m_emb, dim=1, eps=1e-8).unsqueeze(1)

        x = torch.cat((q_emb, m_emb, prod, diff, cos), dim=1)
        out = self.fc_layers(x)



        return out

In [None]:
def find_the_best_alpha(model_feature_and_cost_df, query_vector_df,accuracy_perdictor_model_pth,device):
  # model set up
  route_model = RouterNN(query_vector=4,model_vector=3, projection_dim=32, hidden_size=(256,128), dropout_rate=0.1, output_size=1)
  route_model.load_state_dict(torch.load(accuracy_perdictor_model_pth))
  route_model.to(device)
  route_model.eval()

  #query df setting
  query_id_and_vector = query_vector_df[['query_id','feature_vector']]

  #Need to nomalize co2 cost in order to avoid complete lead by co2 cost
  cost_min = model_feature_and_cost_df['model_co2_cost'].min()
  cost_max = model_feature_and_cost_df['model_co2_cost'].max()
  results = {}
  for alpha in np.arange(0,1,0.01):
    query=[]
    for query_id, input_query_vector in query_id_and_vector.itertuples(index=False):
      best_model_name = None
      best_score = -float('inf')
      print(f'query id{query_id}')
      for model_name, model_vector, co2_cost in model_feature_and_cost_df.itertuples(index=True):
        input_query_tensor = torch.tensor(input_query_vector, dtype=torch.float32).unsqueeze(0).to(device)
        model_vector_tensor = torch.tensor(model_vector, dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            logit = route_model(input_query_tensor, model_vector_tensor)
            prob_correct = torch.sigmoid(logit).item()

            #normalize co2 cost
            co2_cost_norm = (co2_cost - cost_min) / (cost_max - cost_min)
            score = (1 - alpha)  * prob_correct - alpha * co2_cost_norm

            #update best model for this query if needed
            if score > best_score:
                best_score = score
                best_model_name = model_name

      query.append((query_id, best_model_name, best_score, alpha))
    results[alpha] = query
  return results



In [None]:
router_path = 'best_router_model.pth'
device='cuda'

In [None]:
all_alpha_model_ID = find_the_best_alpha(model_info_feature_cost,ID_results_df,router_path,device)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
query idmmlu_pro_q6637
query idmmlu_pro_q6638
query idmmlu_pro_q6639
query idmmlu_pro_q6640
query idmmlu_pro_q6641
query idmmlu_pro_q6642
query idmmlu_pro_q6643
query idmmlu_pro_q6644
query idmmlu_pro_q6645
query idmmlu_pro_q6646
query idmmlu_pro_q6647
query idmmlu_pro_q6648
query idmmlu_pro_q6649
query idmmlu_pro_q6650
query idmmlu_pro_q6651
query idmmlu_pro_q6652
query idmmlu_pro_q6653
query idmmlu_pro_q6654
query idmmlu_pro_q6655
query idmmlu_pro_q6656
query idmmlu_pro_q6657
query idmmlu_pro_q6658
query idmmlu_pro_q6659
query idmmlu_pro_q6660
query idmmlu_pro_q6661
query idmmlu_pro_q6662
query idmmlu_pro_q6663
query idmmlu_pro_q6664
query idmmlu_pro_q6665
query idmmlu_pro_q6666
query idmmlu_pro_q6667
query idmmlu_pro_q6668
query idmmlu_pro_q6669
query idmmlu_pro_q6670
query idmmlu_pro_q6671
query idmmlu_pro_q6672
query idmmlu_pro_q6673
query idmmlu_pro_q6674
query idmmlu_pro_q6675
query idmmlu_pro_q6676
query idmmlu_pr

In [None]:
all_alpha_model = find_the_best_alpha(model_info_feature_cost,result_df,router_path,device)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
query idgpqa_q204
query idgpqa_q205
query idgpqa_q206
query idgpqa_q207
query idgpqa_q208
query idgpqa_q209
query idgpqa_q210
query idgpqa_q211
query idgpqa_q212
query idgpqa_q213
query idgpqa_q214
query idgpqa_q215
query idgpqa_q216
query idgpqa_q217
query idgpqa_q218
query idgpqa_q219
query idgpqa_q220
query idgpqa_q221
query idgpqa_q222
query idgpqa_q223
query idgpqa_q224
query idgpqa_q225
query idgpqa_q226
query idgpqa_q227
query idgpqa_q228
query idgpqa_q229
query idgpqa_q230
query idgpqa_q231
query idgpqa_q232
query idgpqa_q233
query idgpqa_q234
query idgpqa_q235
query idgpqa_q236
query idgpqa_q237
query idgpqa_q238
query idgpqa_q239
query idgpqa_q240
query idgpqa_q241
query idgpqa_q242
query idgpqa_q243
query idgpqa_q244
query idgpqa_q245
query idgpqa_q246
query idgpqa_q247
query idgpqa_q248
query idgpqa_q249
query idgpqa_q250
query idgpqa_q251
query idgpqa_q252
query idgpqa_q253
query idgpqa_q254
query idgpqa_q255

In [None]:
all_data =[]
for alpha, query_result in all_alpha_model.items():
  all_data.extend(query_result)


In [None]:
all_data_ID =[]
for alpha, query_result in all_alpha_model_ID.items():
  all_data_ID.extend(query_result)


In [None]:
all_alpha_model_ID_df = pd.DataFrame(all_data_ID, columns=['query_id', 'best_model', 'score', 'alpha'])
all_alpha_model_ID_df

Unnamed: 0,query_id,best_model,score,alpha
0,bbh_q0,Qwen/Qwen2.5-72B-Instruct,0.996412,0.00
1,bbh_q1,Qwen/Qwen2.5-72B-Instruct,0.965912,0.00
2,bbh_q2,Qwen/Qwen2.5-72B-Instruct,0.974069,0.00
3,bbh_q3,Qwen/Qwen2.5-72B-Instruct,0.993087,0.00
4,bbh_q4,Qwen/Qwen2.5-72B-Instruct,0.984613,0.00
...,...,...,...,...
1870495,mmlu_pro_q11640,meta-llama/Llama-3.2-1B-Instruct,0.001290,0.99
1870496,mmlu_pro_q11641,meta-llama/Llama-3.2-1B-Instruct,0.002730,0.99
1870497,mmlu_pro_q11642,meta-llama/Llama-3.2-1B-Instruct,0.000679,0.99
1870498,mmlu_pro_q11643,meta-llama/Llama-3.2-1B-Instruct,0.000421,0.99


In [None]:
all_alpha_model_ID_df.to_csv("all_alpha_model_ID.csv")

In [None]:
all_alpha_model_df = pd.DataFrame(all_data, columns=['query_id', 'best_model', 'score', 'alpha'])
all_alpha_model_df

Unnamed: 0,query_id,best_model,score,alpha
0,gpqa_q0,MaziyarPanahi/calme-3.2-instruct-78b,0.826828,0.00
1,gpqa_q1,MaziyarPanahi/calme-3.2-instruct-78b,0.644907,0.00
2,gpqa_q2,MaziyarPanahi/calme-3.2-instruct-78b,0.786405,0.00
3,gpqa_q3,MaziyarPanahi/calme-3.2-instruct-78b,0.606267,0.00
4,gpqa_q4,MaziyarPanahi/calme-3.2-instruct-78b,0.885207,0.00
...,...,...,...,...
130095,musr_q751,meta-llama/Llama-3.2-1B-Instruct,0.001927,0.99
130096,musr_q752,meta-llama/Llama-3.2-1B-Instruct,0.001705,0.99
130097,musr_q753,meta-llama/Llama-3.2-1B-Instruct,0.001777,0.99
130098,musr_q754,meta-llama/Llama-3.2-1B-Instruct,0.001881,0.99


In [None]:
all_alpha_model_df.to_json("all_alpha_model.json")

In [None]:
all_alpha_model_df.to_csv("all_alpha_model.csv")