In [None]:
import os 
os.environ["CUDA_VISIBLE_DEVICES"] = "6,7"  
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
from tqdm import tqdm
from typing import List, Tuple, Set


# Local checkpoint directory; the tokenizer and the model are both loaded from it.
model_id = "./models/gpt-oss-120b"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reuse EOS as the padding token when the tokenizer defines no pad token,
# so that tokenizing a batch with padding=True has something to pad with.
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token

# device_map / torch_dtype are left on "auto", i.e. chosen by the library.
# The model is only used for inference here, so switch it to eval mode.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
).eval()



In [None]:

@torch.no_grad()
def texts_to_last_hidden_embeddings(
    texts,
    batch_size=8,
    max_length=None,
    pooling="last_token",
):
    """Embed texts using the last hidden layer of the module-level ``model``.

    The texts are tokenized in batches with the module-level ``tokenizer``
    (with padding), run through ``model`` with ``output_hidden_states=True``,
    and the final entry of ``hidden_states`` is pooled to one vector per text.

    Args:
        texts: sequence of strings (must support ``len`` and slicing).
        batch_size: number of texts per forward pass.
        max_length: forwarded to the tokenizer as the truncation length.
        pooling: ``"last_token"`` takes the hidden state of the last
            non-padding token of each text; ``"mean"`` averages the hidden
            states over the non-padding tokens.

    Returns:
        ``np.ndarray`` of dtype float32 with one row per input text. For an
        empty ``texts`` a ``(0, hidden_size)`` array is returned
        (``hidden_size`` read from ``model.config`` when available, else 0).

    Raises:
        ValueError: if ``pooling`` is not one of the supported modes. This is
            checked before any forward pass so a typo fails immediately.
    """
    if pooling not in ("last_token", "mean"):
        raise ValueError(f"Unknown pooling={pooling}")

    if len(texts) == 0:
        # np.concatenate([]) would raise; return an explicit empty result.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0) or 0
        return np.empty((0, hidden_size), dtype=np.float32)

    device = next(model.parameters()).device
    emb_list = []

    for start in tqdm(
        range(0, len(texts), batch_size),
        desc="Encoding batches",
        total=(len(texts) + batch_size - 1) // batch_size,
    ):
        batch_texts = texts[start : start + batch_size]

        enc = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        enc = {k: v.to(device) for k, v in enc.items()}

        out = model(**enc, output_hidden_states=True, return_dict=True)
        last_layer = out.hidden_states[-1]

        # Build masks/indices on the device that actually holds the hidden
        # states, in case it differs from the first parameter's device.
        mask = enc["attention_mask"].to(last_layer.device)

        if pooling == "last_token":
            # Largest position whose mask is 1. Unlike `mask.sum(1) - 1`,
            # this is correct for both right- and left-padded batches.
            # A row with an all-zero mask falls back to position 0.
            positions = torch.arange(mask.size(1), device=mask.device)
            last_idx = (mask.long() * positions).max(dim=1).values

            rows = torch.arange(last_layer.size(0), device=last_layer.device)
            pooled = last_layer[rows, last_idx]
        else:  # pooling == "mean"
            # Accumulate in float32: summing hidden states and token counts
            # in a half-precision dtype loses precision on long sequences.
            weights = mask.unsqueeze(-1).to(torch.float32)
            summed = (last_layer.to(torch.float32) * weights).sum(dim=1)
            pooled = summed / weights.sum(dim=1).clamp(min=1)

        emb_list.append(pooled.float().cpu().numpy())

    return np.concatenate(emb_list, axis=0)


# Load the problem records from disk. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open("./putnam.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Number of loaded records (shown as the cell output).
len(data)

In [None]:

def generate_fuse_pairs(
    emb_path: str,
    minority_size: int,
    K: int = 5,
    max_iterations: int = 50,
) -> Tuple[List[Tuple[int, int]],
           List[Tuple[int, int]],
           List[Tuple[int, int]]]:
    """Greedily pair each question with its most similar neighbours.

    Rows of the embedding matrix are split into group A (the first
    ``N - minority_size`` rows) and group B (the last ``minority_size``
    rows). For similarity rank 0, 1, ... every question ``i`` that still has
    capacity proposes its rank-th nearest neighbour ``j`` (cosine
    similarity); the pair is accepted if ``j`` also has capacity and the
    pair is new.

    Args:
        emb_path: path to .npy file of shape (N, D)
        minority_size: number of B (non-solvable) samples at the bottom
        K: per-question fuse pair cap
        max_iterations: how many similarity ranks to traverse (at most
            ``N - 1`` ranks exist, since a question is never its own
            neighbour)

    Returns:
        AA_pairs: list of (i, j) both in A
        AB_pairs: list of (i, j) one in A, one in B
        final_pairs: AA_pairs followed by AB_pairs (all pairs are unique)

        Every pair is a tuple of plain Python ints with ``i < j``.

    Raises:
        ValueError: if the loaded array is not 2-D or ``minority_size`` is
            outside ``[0, N]``.
    """

    emb = np.load(emb_path)  # (N, D)
    if emb.ndim != 2:
        raise ValueError(f"expected a 2-D embedding array, got shape {emb.shape}")
    N = emb.shape[0]

    if not 0 <= minority_size <= N:
        # A negative slice end would otherwise silently mislabel the groups.
        raise ValueError(f"minority_size={minority_size} must be in [0, {N}]")

    # Split A / B
    A_end = N - minority_size
    is_A = np.zeros(N, dtype=bool)
    is_A[:A_end] = True

    # Normalize embeddings for cosine similarity; all-zero rows are left as
    # zeros (similarity 0 to everything) instead of turning into NaN.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    emb = emb / norms

    # Full cosine similarity matrix; the diagonal is pushed to -inf so that a
    # question sorts last in its own neighbour list.
    sim = emb @ emb.T
    np.fill_diagonal(sim, -np.inf)

    # Pre-sort neighbors for each question (most similar first)
    sorted_neighbors = np.argsort(-sim, axis=1)

    usage = np.zeros(N, dtype=int)
    used_pairs: Set[Tuple[int, int]] = set()

    AA_pairs: List[Tuple[int, int]] = []
    AB_pairs: List[Tuple[int, int]] = []

    # Only N - 1 real neighbours exist; going further would reach the self
    # index stored in the last column and emit (i, i) pairs.
    n_ranks = min(max_iterations, N - 1)

    for it in range(n_ranks):
        for i in range(N):
            if usage[i] >= K:
                continue

            j = int(sorted_neighbors[i, it])

            # Never pair a question with itself (defensive: can only happen
            # if the similarity row contains NaN).
            if j == i or usage[j] >= K:
                continue

            # B+B pairs are never returned, so they must not consume the
            # per-question budget of either member.
            if not (is_A[i] or is_A[j]):
                continue

            pair = (min(i, j), max(i, j))
            if pair in used_pairs:
                continue

            # Accept pair
            used_pairs.add(pair)
            usage[i] += 1
            usage[j] += 1

            if is_A[i] and is_A[j]:
                AA_pairs.append(pair)
            else:
                AB_pairs.append(pair)

    # AA and AB are disjoint and each pair was accepted once, so plain
    # concatenation is already duplicate-free and keeps acceptance order.
    final_pairs = AA_pairs + AB_pairs

    return AA_pairs, AB_pairs, final_pairs


# Build the fuse pairs from the saved question embeddings. With
# minority_size=0 every row belongs to group A.
aa, ab, final = generate_fuse_pairs(
    emb_path="./questions.npy",
    minority_size=0,
    K=5,
    max_iterations=5,
)


In [None]:
# samples = []
# c = 0
# for x in ds:
#     try:
#         if int(x["expected_answer"]):
#             if x["problem_source"] == "aops_c7_college_math":
#                 rate  = (x["pass_rate_72b_tir"])
        
#                 if 0.0 < float(rate) < 0.3:

#                     samples.append(
#                         {
#                             "rate" :  float(rate), 
#                             "question" : x["problem"] , 
#                             "solution" : x["generated_solution"] , 
#                             "value" : int(x["expected_answer"])
#                         }
#                     )
#                     c+=1
#                     if c==300:
#                         break
#         else:
#             continue

#     except Exception as e :
#         pass


In [None]:
# type(rate)

In [None]:
from datasets import load_dataset
ds = load_dataset("nvidia/OpenMathReasoning" , streaming = True , split = "tir")

# Stop once this many distinct problems have been collected.
MAX_PROBLEMS = 300

# Collect college-math problems with an integer answer and a low (but
# non-zero) pass rate, keeping only the first record of each distinct
# (whitespace-stripped) problem statement.
data = []
seen = set()

for x in ds:
    if x["problem_source"] != "aops_c7_college_math":
        continue

    # Only records whose answer parses as an integer and whose pass rate
    # parses as a float are usable. The parsed value is kept instead of
    # being tested for truthiness, so an answer of 0 is not dropped.
    # Anything other than a parse failure (e.g. a missing column) is left
    # to propagate rather than being silently swallowed.
    try:
        value = int(x["expected_answer"])
        rate = float(x["pass_rate_72b_tir"])
    except (TypeError, ValueError):
        continue

    if not 0.0 < rate < 0.3:
        continue

    q = x["problem"].strip()
    if q in seen:
        continue
    seen.add(q)

    data.append(
        {
            "rate" : rate,
            "question" : x["problem"],
            "solution" : x["generated_solution"],
            "value" : value,
        }
    )

    # Deduplication happens while streaming, so the cap counts unique problems.
    if len(data) == MAX_PROBLEMS:
        break


In [None]:
import os
import pandas as pd 

from tqdm import tqdm 

samples = []
questions = []
d = "/data/aneesh/datasets/nvidia_cot/data"
# os.listdir returns names in arbitrary order; sort so that the selected
# samples (and their order) are the same on every run.
files = sorted(os.listdir(d))

# Stop once this many samples have been collected.
MAX_SAMPLES = 20_000
ALLOWED_SOURCES = {
    "aops_c7_college_math",
    "aops_c6_high_school_olympiads",
    "c7_college_math",
    "aops_c4_high_school_math",
}
# Set mirror of `questions` for O(1) duplicate checks; `questions` itself
# stays a list so it keeps the same order as `samples`.
seen_questions = set()

for x in tqdm(files):
    # Stop reading further files once the cap is reached (a `break` inside
    # the row loop alone would still load every remaining parquet file).
    if len(samples) >= MAX_SAMPLES:
        break
    if not x.endswith(".parquet"):
        continue

    df = pd.read_parquet(os.path.join(d , x))

    # Iterate the needed columns in lockstep instead of df.iloc[i][...],
    # which rebuilds a row object on every access.
    rows = zip(
        df["problem_source"],
        df["problem"],
        df["expected_answer"],
        df["pass_rate_72b_tir"],
        df["generated_solution"],
    )
    for source, problem, ans, pass_rate, solution in rows:
        if len(samples) >= MAX_SAMPLES:
            break

        if source not in ALLOWED_SOURCES:
            continue

        # NOTE: a filter on pass_rate_72b_tir (> 0.4) used to sit here but
        # was disabled in the original cell; it is intentionally not applied.

        if problem in seen_questions:
            continue

        # Skip answers that are not integers; other errors are not hidden.
        try:
            answer = int(ans)
        except (TypeError, ValueError, OverflowError):
            continue

        if not 0 <= answer < 100_000:
            continue

        # Records without a textual solution cannot be cleaned; skip them.
        if not isinstance(solution, str):
            continue

        samples.append(
            {
                "question" : problem,
                "answer" : answer,
                "pass_rate_72b_tir" : pass_rate,
                "solution" : solution.replace("<think>" , "").replace("</think>" , ""),
            }
        )
        questions.append(problem)
        seen_questions.add(problem)
    

In [None]:
# Number of A-A pairs and A-B pairs produced by generate_fuse_pairs.
len(aa), len(ab)

In [None]:
def _answer_of(record):
    """Return a record's answer, stored under either "answer" or "value".

    The records built in this notebook are not uniform: the streamed
    dataset records keep the answer under "value", while other records use
    "answer". Reading only "answer" raises KeyError for the former.
    """
    if "answer" in record:
        return record["answer"]
    return record["value"]


# One sample per A-A pair: both questions with their answers.
# NOTE(review): assumes the indices in `aa` refer to positions in `data`,
# i.e. the embeddings file was built from this same list — confirm.
samples = []
for x in aa:
    first = int(x[0])
    second = int(x[1])
    samples.append(
        {
            "question_1" : data[first]["question"],
            "question_2" : data[second]["question"],

            "answer_1" : _answer_of(data[first]),
            "answer_2" : _answer_of(data[second]),

            "split" : "neutral"

        }
    )
    

# for x in ab:
#     first = int(x[0])
#     second = int(x[1])
#     samples.append(
#         {
#             "question_1" : data[first]["question"], 
#             "question_2" : data[second]["question"],
            
#             "answer_1" : data[first]["answer"],
#             "answer_2" : data[second]["answer"],

#             "split" : "correct_and_incorrect"
#         }
#     )

