In [1]:
# Put the directory two levels up on the import path so that the `src.*`
# imports in the next cell resolve (presumably the repository root -- confirm).
import sys

sys.path.append("../../")

# Automatically reload imported modules before running each cell, so edits to
# the project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [11]:
import hashlib
from ast import literal_eval
from functools import reduce
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

from src.utils import find_meta_category
from src.feature_extractor import sample_feature_combinations

## Download prerequisite files

Fetch all the results and feature values


In [3]:
# Set to False to (re-)fetch the prerequisite Beaker datasets into the
# current working directory.
skip_download = True
if not skip_download:
    # You can get the experiments file here: 01J6KF3JRCATRJQ9CPJTRV5VBM (https://beaker.org/ds/01J6KF3JRCATRJQ9CPJTRV5VBM/details)
    !echo "Fetching experiments list..."
    !beaker dataset fetch 01J6KF3JRCATRJQ9CPJTRV5VBM --prefix experiments.txt -q
    !echo "Fetching extracted features..."
    !mkdir -p features/
    !beaker dataset fetch 01J6KF3JRCATRJQ9CPJTRV5VBM --prefix features/ -q
    #!beaker dataset fetch 01J6KFVCRCTYHCZDR0XNK0G9HT --prefix features/
    !echo "Fetching helpsteer2 dataset..."
    !beaker dataset fetch 01J6KBM2VCM9EQ7MER26VBXCCM
    !echo "Fetching extracted subsets... (this will take ~10 minutes)"
    !beaker dataset fetch 01J6KF3JRCATRJQ9CPJTRV5VBM --prefix data/ -q
    
# This part runs regardless of `skip_download`: the script is asked to write
# the collated evaluation results to results.csv, reading experiments.txt.
!echo "Collating all evaluation results"
%run ../../scripts/fetch_evals_rewardbench.py --output_file results.csv --gpt4_threshold_score 0.658 --experiment_prefix rm-eval-helpsteer2 --experiments_file experiments.txt

Collating all evaluation results
2024-09-04 11:42:50 - INFO - root - Logged-in as ljm (ljm@allenai.org)
2024-09-04 11:42:51 - INFO - root - Found 291 experiments that match 'rm-eval-helpsteer2'


100%|██████████| 291/291 [00:36<00:00,  7.96it/s]

2024-09-04 11:43:28 - INFO - root - Computing category scores...
2024-09-04 11:43:28 - INFO - root - Deriving features from the experiments file: experiments.txt
2024-09-04 11:43:28 - INFO - root - Will attempt merge via feature hash
2024-09-04 11:43:28 - INFO - root - Creating labels in column 'label' with GPT-4 threshold '0.658'
2024-09-04 11:43:28 - INFO - root - Saving 128 results to results.csv
2024-09-04 11:43:28 - INFO - root - Saved on results.csv





In [4]:
# Sanity check: count the entries in data/ (where the extracted subsets are fetched to).
!ls data | wc -l

     330


Collate feature set for all instances


In [5]:
LEXICAL_FEATS_PATH = Path("features")
DATASET_PATH = Path("helpsteer2_human_vs_gpt4_weighted_for_llama.jsonl")


def get_dataset_features(
    feature_path=LEXICAL_FEATS_PATH, dataset_path=DATASET_PATH
) -> "pd.DataFrame":
    """Join the dataset with every lexical feature file found in ``feature_path``.

    Args:
        feature_path: Directory holding one ``<feature>.jsonl`` file per lexical
            feature. Each file must carry the columns ``id``, ``prompt``,
            ``completion_a`` and ``completion_b`` (they are the merge keys).
        dataset_path: JSON-lines dataset whose ``prompt_hash`` column is renamed
            to ``id`` and used as the join key.

    Returns:
        The dataset left-joined with all lexical features on ``id``, without the
        ``prompt`` / ``completion_a`` / ``completion_b`` columns, and with the
        ``token_len_diff`` column renamed to ``token_len_difference``.

    Raises:
        FileNotFoundError: If no lexical feature file is found in ``feature_path``.
    """
    lexical_features = [
        "rouge",
        "bertscore",
        "bertscore_length",
        "entity_sim",
        "cosine_sim",
        "prompt_len",
        "len_longer",
        "len_shorter",
        "token_len_difference",
    ]
    feature_dir = Path(feature_path)
    # The stem is matched as a *substring* of a feature name; this is what lets
    # a file named ``token_len_diff.jsonl`` be picked up for
    # ``token_len_difference`` (presumably intentional, see the rename below).
    # Sorting makes the merge order, and thus the column order, independent of
    # the order in which the filesystem lists the files.
    lexical_feature_files = sorted(
        file
        for file in feature_dir.glob("*.jsonl")
        if any(file.stem in feat for feat in lexical_features)
    )
    if not lexical_feature_files:
        # reduce() over an empty list would raise an opaque TypeError instead.
        raise FileNotFoundError(
            f"No lexical feature files (*.jsonl) found in '{feature_dir}'"
        )

    lexical_feats_df = reduce(
        lambda left, right: left.merge(
            right, on=["id", "prompt", "completion_a", "completion_b"], how="outer"
        ),
        [pd.read_json(file, lines=True) for file in lexical_feature_files],
    )

    df = pd.read_json(dataset_path, lines=True).rename(columns={"prompt_hash": "id"})
    finaldf = df.merge(lexical_feats_df, how="left", on="id").drop(
        columns=["prompt", "completion_a", "completion_b"]
    )

    # Hacky way for token_len_difference
    finaldf = finaldf.rename(columns={"token_len_diff": "token_len_difference"})
    return finaldf

In [6]:
# Collated evaluation results; any row with a missing value is dropped.
results_df = pd.read_csv("results.csv").dropna()
# Per-instance feature table, also written to CSV.
features_df = get_dataset_features()
features_df.to_csv("helpsteer2_all_features.csv", index=False)
# print(len(results_df)), print(len(features_df))

This dataframe contains the features for instances in the dataset


In [7]:
# Preview the first rows of the per-instance feature table.
features_df.head()

Unnamed: 0,id,text,response_a,response_b,pref_human,pref_gpt4,rating_human,rating_gpt4,completions,subject_of_expertise,...,format_constraints,cosine_sim,token_len_difference,rouge,bertscore,len_longer,prompt_len,bertscore_length,len_shorter,entity_sim
0,000be890ad02c46a3554c77f098f151f,In what ways can researching something be help...,Engaging in research is a valuable endeavor th...,There are several reasons why doing research c...,B-is-clearly-better,A-is-clearly-better,"[2.83, 6.28]","[6.09, 5.98]",[Engaging in research is a valuable endeavor t...,"[Education, Psychology]",...,"[#words <= 200, style = formal]",0.709485,166,0.328704,0.799727,376,25,0.435609,210,1.0
1,001458ba4f2925f2bc6ae9d8d197d543,Write an inspiring song about alena wicker for...,"Here's a song inspired by Alena Wicker, a youn...",Here's an inspiring song about Alena Wicker fo...,B-is-clearly-better,B-is-clearly-better,"[3.73, 4.77]","[1.59, 2.25]","[Here's a song inspired by Alena Wicker, a you...",[Performing arts],...,"[content = sone, style = inspiring, include: t...",0.844983,61,0.463866,0.842092,503,44,0.724686,442,0.333333
2,0014db4cd30cad106320d9043f45d607,Hi! I am a Hypnotist and transformational / mi...,They want a solution to their problems. They w...,They want a solution to lead a more fulfilling...,B-is-clearly-better,A-is-clearly-better,"[4.77, 5.95]","[5.87, 5.62]",[They want a solution to their problems. They ...,"[Business, Psychology]",...,[],0.508719,424,0.150259,0.772617,480,209,0.000456,56,0.0
3,003af4a043a0cb2186bc448a290d35f6,Australian Defence Force Military Police membe...,"Yes, members of the Australian Defence Force M...","Yes, members of the Australian Defence Force M...",Tie,B-is-clearly-better,"[6.28, 6.28]","[5.62, 6.09]","[Yes, members of the Australian Defence Force ...","[Military sciences, Law]",...,[],0.87233,121,0.510121,0.881362,217,30,0.268603,96,0.5
4,003dd8aa40b027ae755e47917c89c0a3,Heather: I want to attend the music festival t...,It is unclear if all girls have enough money t...,Yes\n\nBoth Heather and Kara state that they h...,A-is-clearly-better,A-is-clearly-better,"[6.14, 3.02]","[5.13, 0.64]",[It is unclear if all girls have enough money ...,"[Performing arts, Economics]",...,[],0.839198,16,0.474227,0.866347,68,107,0.671132,52,0.6


## Get proportion of instances that fulfill the conditions

1. For each row of the results, get the features that were activated.
2. For each activated feature, compute its proportion from the feature dataframe.
3. The proportion is computed as: `number_of_instances_that_fulfill_a_single_condition` / `total_number_of_instances`


In [8]:
# Inspect nan columns
rows_with_nan = features_df[features_df.isna().any(axis=1)]
nan_columns = rows_with_nan.columns[rows_with_nan.isna().any()]
df_nan_columns = rows_with_nan[nan_columns]
df_nan_columns

Unnamed: 0,expertise_level,format_constraints
289,,[]
1317,expert domain knowledge,
4613,basic domain knowledge,
4734,general public,


Instead, take the binary columns of the results and, for each of them, compute a "weight": the proportion of instances in the dataset that fulfill that feature.


In [9]:
def compute_instances(feat: str, features_df: "pd.DataFrame") -> float:
    """Compute the ratio of instances that fulfill a given feature 'feat' vs. the total dataset 'len(features_df)'.

    Two feature encodings are understood:

    * Lexical: ``<name>__min_val=<lo>|max_val=<hi>``. The ratio is the share of
      rows whose ``<name>`` column lies in ``[lo, hi]`` (both bounds inclusive).
    * Tagged: ``<name>=<value>``. The meta category of ``<name>`` (looked up via
      ``find_meta_category``) decides how a row counts as fulfilling it:
        - ``scalar``: the cell equals ``<value>`` (underscores read as spaces);
        - ``closed_set``: ``<value>`` (underscores read as spaces) is contained
          in the cell;
        - ``open_set``: the cell is non-empty (``<value>`` is ignored).

    Missing cells (``None`` / NaN) never count as fulfilling a feature.

    Args:
        feat: Encoded feature, in one of the two formats above.
        features_df: Per-instance feature table containing the ``<name>`` column.

    Returns:
        The fraction of rows of ``features_df`` that fulfill ``feat``.

    Raises:
        ValueError: If the meta category of a tagged feature is not one of
            ``scalar``, ``closed_set`` or ``open_set``.
    """
    total = len(features_df)
    lexical_features = [
        "rouge",
        "bertscore",
        "bertscore_length",
        "entity_sim",
        "cosine_sim",
        "prompt_len",
        "len_longer",
        "len_shorter",
        "token_len_difference",
    ]

    def _is_missing(cell) -> bool:
        # NaN is the only float that differs from itself. It is also truthy and
        # not iterable, so it must be filtered out before the checks below.
        return cell is None or (isinstance(cell, float) and cell != cell)

    if feat.split("__")[0] in lexical_features:
        feat_name, value = feat.split("__")
        min_val_str, max_val_str = value.split("|")
        min_val = float(min_val_str.split("=")[1])
        max_val = float(max_val_str.split("=")[1])
        # NOTE(review): the bounds are compared against the raw column values;
        # if they are meant as percentile ranks they need converting first --
        # confirm against the feature extractor.
        return features_df[feat_name].between(min_val, max_val).mean()

    # Parse the feature; split once so a value containing '=' stays intact.
    feat_name, value = feat.split("=", 1)
    meta_category = find_meta_category(feat_name)
    if meta_category == "scalar":
        v = value.replace("_", " ")
        # Default of 0: a value that never occurs has ratio 0 (get() would
        # otherwise return None and the division would raise TypeError).
        return features_df[feat_name].value_counts().get(v, 0) / total
    if meta_category == "closed_set":
        v = value.replace("_", " ")
        cells = features_df[feat_name].tolist()
        return sum(1 for cell in cells if not _is_missing(cell) and v in cell) / total
    if meta_category == "open_set":
        cells = features_df[feat_name].tolist()
        return sum(1 for cell in cells if not _is_missing(cell) and cell) / total

    # The category string used to be returned here, silently putting a
    # non-numeric "ratio" into the feature maps.
    raise ValueError(
        f"Unsupported meta category {meta_category!r} for feature {feat!r}"
    )


# feats = results_df.columns[results_df.isin([0, 1]).all()]  # get binary columns
# feat_map = {
#    feat: compute_instances(feat, features_df) for feat in feats if feat != "label"
# }

# ratio_df = results_df.apply(
#    lambda row: row.map(lambda x: feat_map.get(row.name, 1) if x == 1 else x)
# )

For each result, we take its `hash`, look up the matching extracted subset in `data` (each subset was randomly sampled, so the ratios differ per subset), and compute the ratios on that subset.


In [62]:
import re

get_per_hash_ratios = True


def extract_hash(string):
    """Return the text found between ``FEATS_`` and ``_SWAPS`` in ``string``.

    The shortest such span is returned; ``None`` is returned when the pattern
    does not occur at all.
    """
    found = re.search(r"FEATS_(.*?)_SWAPS", string)
    if found is None:
        return None
    return found.group(1)


result_hashes = results_df["hash"].to_list()
# Index the downloaded subset files by the feature-set hash embedded in their
# file name (see extract_hash).
subsets = {extract_hash(str(file)): file for file in Path("data").glob("*.jsonl")}
feats = results_df.columns[results_df.isin([0, 1]).all()]  # get binary columns
# Loop-invariant: the binary feature columns whose ratios are computed.
ratio_feats = [feat for feat in feats if feat != "label"]

# hash -> {feature -> ratio of instances in that hash's subset fulfilling it}
hash_ratios = {}
# tqdm.auto replaces the deprecated tqdm.tqdm_notebook.
for result_hash in tqdm(result_hashes):
    if result_hash not in subsets:
        # No subset file for this result: it gets no entry in hash_ratios.
        continue
    sampled_features_df = pd.read_json(subsets[result_hash], lines=True)
    # The subset carries raw prompts; hash them so they can be matched against
    # the `id` column of features_df.
    sampled_features_df["id"] = sampled_features_df["prompt"].apply(
        lambda x: hashlib.md5(x.encode("utf-8")).hexdigest()
    )
    # Get the features from features_df based on the existing prompt_hashes in sampled_features_df
    sdf = features_df[features_df["id"].isin(sampled_features_df["id"].to_list())]
    hash_ratios[result_hash] = {
        feat: compute_instances(feat, sdf) for feat in ratio_feats
    }


def replace_values(row):
    """Swap every activated (== 1) feature flag of ``row`` for its ratio.

    The ratios are looked up in ``hash_ratios`` under the row's ``hash``; a row
    whose hash has no entry is returned untouched. ``row`` is modified in place
    and returned.
    """
    ratios = hash_ratios.get(row["hash"], {})
    for feature, ratio in ratios.items():
        if row[feature] == 1:
            row[feature] = ratio
    return row


# Row by row, replace each activated (== 1) feature flag with the ratio
# computed on that row's own subset.
ratio_df = results_df.apply(replace_values, axis=1)


# Get feat_map with default counts
# (ratios computed over the whole features_df rather than over a sampled subset)
feats = results_df.columns[results_df.isin([0, 1]).all()]  # get binary columns
feat_map = {
    feat: compute_instances(feat, features_df) for feat in feats if feat != "label"
}

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for result_hash in tqdm_notebook(result_hashes):


  0%|          | 0/127 [00:00<?, ?it/s]

## Regressor training


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# When True, a fresh split is drawn and validation_set.jsonl is overwritten;
# when False, an existing validation_set.jsonl is reused as the test set.
force_new_split = True

if not hash_ratios:
    raise ValueError("hash_ratios is empty: no result hash matched a subset file")
# Every per-hash mapping is built from the same feature list, so the keys of
# the first one give the feature columns.
feat_names = list(next(iter(hash_ratios.values())).keys())

# Defined before branching: later cells read `X.columns`, which used to exist
# only when a new split was drawn.
X = ratio_df[feat_names]
y = ratio_df["Overall"]

if Path("validation_set.jsonl").exists() and not force_new_split:
    print("Reusing existing validation set")
    val_df = pd.read_json("validation_set.jsonl", lines=True)
    # Everything that is not in the stored validation set is training data.
    train_df = ratio_df[~ratio_df["hash"].isin(val_df["hash"])]
    X_train = train_df[feat_names]
    y_train = train_df["Overall"]
    X_test = val_df[feat_names]
    y_test = val_df["Overall"]
else:
    # The index is split alongside X and y so the held-out rows can be
    # recovered from ratio_df afterwards.
    X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(
        X, y, ratio_df.index, test_size=0.2, random_state=42
    )
    # Save the validation set
    validation_set = ratio_df.loc[test_idx]
    validation_set.to_json("validation_set.jsonl", lines=True, orient="records")

print(f"Train size: {len(X_train)}, test size: {len(X_test)}")

Train size: 101, test size: 26


### Train LinearRegressor


In [60]:
def train_linear_regressor(X_train, X_test, y_train, y_test):
    """Fit a ``LinearRegression`` on the training split and score it on the test split.

    Returns:
        A ``(model, scores)`` pair where ``scores`` holds the test-set ``"mse"``
        and ``"rmse"``.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    predictions = regressor.predict(X_test)
    metrics = {
        "mse": mean_squared_error(y_test, predictions),
        "rmse": root_mean_squared_error(y_test, predictions),
    }
    return regressor, metrics


# print(f"Feature names: {poly.get_feature_names_out(X.columns)}")
# Fit on the full training split and report the test-set errors and intercept.
model, scores = train_linear_regressor(X_train, X_test, y_train, y_test)
print(scores)
print(f"intercept: {model.intercept_}")

{'mse': 0.0011864301508304313, 'rmse': 0.03444459537910746}
intercept: 0.7142278960890482


In [52]:
# Learning curve: refit on the first 25/50/75/100% of the training rows and
# score each fit on the same test split.
pct_of_train = [0.25, 0.50, 0.75, 1]
for pct in pct_of_train:
    num_train = int(len(X_train) * pct)
    _, scores = train_linear_regressor(
        X_train[:num_train], X_test, y_train[:num_train], y_test
    )
    print(num_train, scores)

25 {'mse': 0.2141320609375028, 'rmse': 0.46274405553988784}
50 {'mse': 0.0006750773774280583, 'rmse': 0.02598225120015697}
75 {'mse': 0.0008797178968680422, 'rmse': 0.02966003871993498}
101 {'mse': 0.0011864301508304313, 'rmse': 0.03444459537910746}


### Train LightGBM


In [53]:
import lightgbm as lgb


def train_lightgbm(X_train, X_test, y_train, y_test):
    """Train a LightGBM regressor on the training split and score it on the test split.

    The test split is also handed to ``lgb.train`` as the validation set.

    Returns:
        A ``(model, scores)`` pair where ``scores`` holds the test-set ``"mse"``
        and ``"rmse"``.
    """
    train_set = lgb.Dataset(X_train, label=y_train, params=dict(verbose=-1))
    valid_set = lgb.Dataset(
        X_test, label=y_test, reference=train_set, params=dict(verbose=-1)
    )
    booster_params = dict(
        objective="regression",
        metric="rmse",
        boosting="gbdt",
        learning_rate=0.1,
        num_leaves=2,
    )
    # Fit the booster
    booster = lgb.train(booster_params, train_set, valid_sets=[valid_set])
    # Score it on the held-out rows
    predictions = booster.predict(X_test)
    metrics = {
        "mse": mean_squared_error(y_test, predictions),
        "rmse": root_mean_squared_error(y_test, predictions),
    }
    return booster, metrics


# The booster gets its own name instead of rebinding `model`: `model` is the
# linear regressor fitted earlier, and overwriting it here made any later
# `model.predict(...)` depend on which training cell happened to run last.
lgb_model, scores = train_lightgbm(X_train, X_test, y_train, y_test)
print(scores)

# One importance value per training column, most important first.
# X_train.columns is used because X_train exists on both split paths.
importances = lgb_model.feature_importance()
importance_df = pd.DataFrame(
    {"feature": X_train.columns, "importance": importances}
).sort_values(by="importance", ascending=False)
importance_df

{'mse': 0.0010180543772647782, 'rmse': 0.03190696440065677}


Unnamed: 0,feature,importance
4,bertscore_length__min_val=0.67|max_val=1.0,61
31,prompt_len__min_val=0.67|max_val=1.0,39
0,bertscore__min_val=0.33|max_val=0.67,0
42,subject_of_expertise=Earth_sciences,0
47,subject_of_expertise=Human_physical_performanc...,0
...,...,...
27,open_endedness=moderate,0
28,open_endedness=no,0
29,prompt_len__min_val=0.0|max_val=0.33,0
30,prompt_len__min_val=0.33|max_val=0.67,0


In [54]:
# Learning curve: retrain LightGBM on the first 25/50/75/100% of the training
# rows and score each model on the same test split.
pct_of_train = [0.25, 0.50, 0.75, 1]
for pct in pct_of_train:
    num_train = int(len(X_train) * pct)
    _, scores = train_lightgbm(X_train[:num_train], X_test, y_train[:num_train], y_test)
    print(num_train, scores)

25 {'mse': 0.0009778488241440854, 'rmse': 0.03127057441340158}
50 {'mse': 0.000982158756371883, 'rmse': 0.03133941218931655}
75 {'mse': 0.0009770653479119278, 'rmse': 0.03125804453115914}
101 {'mse': 0.0010180543772647782, 'rmse': 0.03190696440065677}


## Simulation


In [55]:
# Sample the candidate feature combinations that are scored in the cells
# below; the first return value is not needed here.
# NOTE(review): the exact meaning of `meta_analyzer_n_samples` and
# `max_number` is defined in src.feature_extractor -- confirm there.
_, combinations = sample_feature_combinations(
    meta_analyzer_n_samples=2000, max_number=10
)

10it [00:00, 69442.12it/s]
45it [00:00, 93530.07it/s]
120it [00:00, 73082.11it/s]
210it [00:00, 62446.21it/s]
252it [00:00, 9605.54it/s]
210it [00:00, 15881.50it/s]
120it [00:00, 32405.13it/s]
45it [00:00, 29280.74it/s]
10it [00:00, 20049.25it/s]
1it [00:00, 9986.44it/s]

2024-09-04 11:58:05 - INFO - root - Adding meta analyzer features



10it [00:00, 90982.73it/s]
45it [00:00, 83811.58it/s]
120it [00:00, 80171.47it/s]
210it [00:00, 58810.43it/s]
252it [00:00, 47871.94it/s]
210it [00:00, 42622.98it/s]
120it [00:00, 36628.81it/s]
45it [00:00, 31689.67it/s]
10it [00:00, 21194.06it/s]
1it [00:00, 11366.68it/s]


In [64]:
# One row per sampled combination, one column per model feature, all zeros.
# X_train.columns is used because X_train exists on both split paths.
sim_df = pd.DataFrame(
    0, index=np.arange(len(combinations)), columns=X_train.columns
)
# tqdm.auto replaces the deprecated tqdm.tqdm_notebook.
for idx, combination in tqdm(enumerate(combinations), total=len(combinations)):
    activated_feats = []
    for feat in combination:
        if "analyzer" in feat:
            # "<prefix>::feature_name=<name>|<key>=<value>" -> "<name>=<value>"
            feature_name_str, value_str = feat.split("::")[1].split("|")
            feature_name, value = (
                feature_name_str.split("=")[-1],
                value_str.split("=")[-1],
            )
            activated_feats.append(f"{feature_name}={value}")
        else:
            # Lexical features only differ in their separator.
            activated_feats.append(feat.replace("::", "__"))
    sim_df.loc[idx, activated_feats] = 1
# DataFrame.apply defaults to axis=0, so the callable receives one *column* at
# a time and `column.name` is the feature name used to look up the ratio.
# Columns that contain any NaN are dropped afterwards (presumably columns that
# the .loc assignment above added for labels missing from X_train -- TODO confirm).
sim_df = sim_df.apply(
    lambda column: column.map(
        lambda x: feat_map.get(column.name, 1) if x == 1 else x
    )
).dropna(axis=1, how="any")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx, combination in tqdm_notebook(enumerate(combinations), total=len(combinations)):


  0%|          | 0/4069 [00:00<?, ?it/s]

In [65]:
# Display the simulated feature matrix.
sim_df

Unnamed: 0,bertscore__min_val=0.33|max_val=0.67,bertscore__min_val=0.67|max_val=1.0,bertscore_length__min_val=0.0|max_val=0.33,bertscore_length__min_val=0.33|max_val=0.67,bertscore_length__min_val=0.67|max_val=1.0,complexity_of_intents=complex,complexity_of_intents=moderate,complexity_of_intents=simple,cosine_sim__min_val=0.0|max_val=0.33,cosine_sim__min_val=0.33|max_val=0.67,...,subject_of_expertise=Philosophy,subject_of_expertise=Political_science,subject_of_expertise=Social_work,subject_of_expertise=Sociology,subject_of_expertise=Space_sciences,subject_of_expertise=System_science,token_len_difference__min_val=0.0|max_val=0.33,token_len_difference__min_val=0.33|max_val=0.67,token_len_difference__min_val=0.67|max_val=1.0,type_of_in_context_material=1
0,0.027559,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0,0,0.0,0,0,0.000000,0.0,0.000000,0.000000
1,0.000000,0.0,0.376181,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0,0,0.0,0,0,0.000000,0.0,0.000000,0.000000
2,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.049016,0.000000,...,0.0,0,0,0.0,0,0,0.000000,0.0,0.000000,0.000000
3,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0,0,0.0,0,0,0.000000,0.0,0.000000,0.000000
4,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0,0,0.0,0,0,0.000000,0.0,0.000000,0.107874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4064,0.027559,0.0,0.000000,0.373425,0.00000,0.0,0.0,0.0,0.000000,0.219488,...,0.0,0,0,0.0,0,0,0.000000,0.0,0.007579,0.000000
4065,0.000000,0.0,0.000000,0.000000,0.24872,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0,0,0.0,0,0,0.003051,0.0,0.000000,0.000000
4066,0.027559,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0,0,0.0,0,0,0.000000,0.0,0.007579,0.000000
4067,0.000000,0.0,0.376181,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0,0,0.0,0,0,0.003051,0.0,0.000000,0.000000


In [82]:
# Score the simulated combinations with a linear regressor fitted right here.
# The global `model` is rebound by every training cell, so using it made the
# predictions depend on cell execution order.
# NOTE(review): the recorded predictions vary with features that have zero
# LightGBM importance, which points to the linear model -- confirm.
sim_model, _ = train_linear_regressor(X_train, X_test, y_train, y_test)

sim_results = sim_df.copy(deep=True)
feature_cols = list(sim_df.columns)
# A feature counts as activated when its (ratio) value is non-zero.
sim_results["activated_features"] = sim_results.apply(
    lambda row: [col for col in feature_cols if row[col] != 0], axis=1
)
sim_results["pred"] = sim_model.predict(sim_df)
sim_results = sim_results.sort_values(by="pred", ascending=False).reset_index(drop=True)
# Hash the activated feature names so duplicate combinations can be dropped.
sim_results["hash"] = sim_results["activated_features"].apply(
    lambda x: hashlib.md5("___".join(x).encode("utf-8")).hexdigest()
)
sim_results = sim_results.drop_duplicates(subset=["hash"]).reset_index(drop=True)
sim_results[["activated_features", "pred"]].head(20)

Unnamed: 0,activated_features,pred
0,"[complexity_of_intents=simple, languages=English]",0.760504
1,"[bertscore__min_val=0.67|max_val=1.0, complexi...",0.756522
2,"[complexity_of_intents=simple, languages=Engli...",0.755165
3,"[bertscore__min_val=0.67|max_val=1.0, complexi...",0.752846
4,"[complexity_of_intents=simple, cosine_sim__min...",0.751346
5,"[complexity_of_intents=simple, cosine_sim__min...",0.750481
6,[complexity_of_intents=simple],0.748798
7,"[complexity_of_intents=simple, safety_concern=...",0.748798
8,"[complexity_of_intents=simple, languages=Engli...",0.747751
9,"[bertscore__min_val=0.67|max_val=1.0, complexi...",0.746023


In [84]:
top_n = 100
# Only combinations predicted to beat this score are kept
# (presumably the score of the human-annotation baseline -- confirm).
human_score = 0.715
better_than_humans = sim_results[sim_results["pred"] > human_score]

# Keep the first occurrence of each combination among the top_n rows. The
# cells hold Python lists, which are unhashable, so de-duplication goes through
# tuple keys instead of Series.drop_duplicates().
top_combinations = []
seen_combinations = set()
for activated in better_than_humans["activated_features"].head(top_n):
    combination_key = tuple(activated)
    if combination_key not in seen_combinations:
        seen_combinations.add(combination_key)
        top_combinations.append(activated)
print(top_combinations)

[['complexity_of_intents=simple', 'languages=English'], ['bertscore__min_val=0.67|max_val=1.0', 'complexity_of_intents=simple', 'cosine_sim__min_val=0.33|max_val=0.67', 'entity_sim__min_val=0.0|max_val=0.33'], ['complexity_of_intents=simple', 'languages=English', 'safety_concern=safe'], ['bertscore__min_val=0.67|max_val=1.0', 'complexity_of_intents=simple', 'prompt_len__min_val=0.67|max_val=1.0'], ['complexity_of_intents=simple', 'cosine_sim__min_val=0.33|max_val=0.67', 'entity_sim__min_val=0.0|max_val=0.33'], ['complexity_of_intents=simple', 'cosine_sim__min_val=0.67|max_val=1.0', 'entity_sim__min_val=0.0|max_val=0.33'], ['complexity_of_intents=simple'], ['complexity_of_intents=simple', 'safety_concern=moderate'], ['complexity_of_intents=simple', 'languages=English', 'open_endedness=moderate'], ['bertscore__min_val=0.67|max_val=1.0', 'complexity_of_intents=simple', 'cosine_sim__min_val=0.67|max_val=1.0', 'languages=English', 'open_endedness=no', 'token_len_difference__min_val=0.67|max

So now you have determined 10 feature combinations that seem to work well. The next step is to train RMs and evaluate them.


In [85]:
from beaker import Beaker, ExperimentSpec
from copy import deepcopy

In [None]:
spec = ExperimentSpec.from_file("../../beaker/template.yml")
exp_spec = deepcopy(spec)
# The first task of the template is the prototype cloned for every combination.
template_task = exp_spec.tasks.pop(0)

# Name of the argument that carries the value, per meta category.
category_keys = {
    "closed_set": "constraints",
    "scalar": "value",
    "open_set": "check_for_existence",
}

new_tasks = []
for idx, combination in enumerate(top_combinations):
    feats_to_run = []
    for feat in combination:
        if "min_val" in feat:
            # Lexical feature: the column was renamed token_len_diff ->
            # token_len_difference when loading, so the rename is undone here.
            if "token_len_difference" in feat:
                feat = feat.replace("difference", "diff")
            feats_to_run.append(feat.replace("__", "::"))
        else:
            # Split once so a value containing '=' stays intact.
            feat_name, value = feat.split("=", 1)
            category = find_meta_category(feat_name)
            if category not in category_keys:
                # Previously `key` stayed unbound (NameError) or kept the value
                # of the previous feature, producing a wrong argument silently.
                raise ValueError(
                    f"Unsupported meta category {category!r} for feature {feat!r}"
                )
            key = category_keys[category]
            feats_to_run.append(f"{category}::feature_name={feat_name}|{key}={value}")
    # Create beaker task
    task = deepcopy(template_task)
    task.name = f"get-features-datamodel-{idx}"
    task.arguments.extend(["--features"] + feats_to_run)
    new_tasks.append(task)

exp_spec.tasks = new_tasks
exp_spec.validate()
exp_spec.to_file("experiments.yml")

For every finished job, download its subsets and its `experiments.txt`, then collect the experiment IDs into `top_n_subsets/top_n_subsets_experiments.txt`.


In [None]:
# experiment_id = "01J6TS47Q2KNKYRCYHC8A0DE4B"
# experiment_id = "01J6WDKDPQCM92REXJ1VCNJ0NW"
experiment_id = "01J6XJSWMSAM2ARJ1WXAP6PV8T"
top_subsets_dir = Path("top_n_subsets")
top_subsets_dir.mkdir(parents=True, exist_ok=True)
experiments_file = top_subsets_dir / "top_n_subsets_experiments.txt"
beaker = Beaker.from_env("ai2/ljm-oe-adapt")
experiment = beaker.experiment.get(experiment_id)

experiment_ids = []
# tqdm.auto replaces the deprecated tqdm.tqdm_notebook.
for job in tqdm(experiment.jobs):
    if not job.is_done:
        continue
    # Get output
    dataset_id = job.execution.result.beaker
    # Fetch the job's subsets and its experiments.txt into the same folder;
    # force=True lets each job overwrite the previous job's experiments.txt.
    for prefix in ("data/", "experiments.txt"):
        beaker.dataset.fetch(
            dataset_id,
            force=True,
            target=top_subsets_dir,
            prefix=prefix,
            quiet=True,
        )

    # The first line of the freshly fetched experiments.txt is this job's ID.
    with open(top_subsets_dir / "experiments.txt", "r") as f:
        data = f.read().splitlines()
    experiment_ids.append(data[0])

print(experiment_ids)
# Append mode keeps IDs written by earlier runs of this cell. dict.fromkeys
# removes duplicates while keeping first-seen order, so the written order is
# deterministic (a set is not). The loop variable no longer shadows the
# builtin `id`.
with open(experiments_file, "a") as f:
    for exp_id in dict.fromkeys(experiment_ids):
        f.write("\n" + exp_id)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for job in tqdm_notebook(experiment.jobs):


  0%|          | 0/65 [00:00<?, ?it/s]

['human_datamodel_7000_FEATS_818fcd7dfafaf799a556f7f3363e7765_SWAPS_1922::bertscore__min_val-0.67|max_val-1.0___scalar__feature_name-complexity_of_intents|value-simple___entity_sim__min_val-0.67|max_val-1.0___scalar__feature_name-safety_concern|value-safe', 'human_datamodel_7000_FEATS_818fcd7dfafaf799a556f7f3363e7765_SWAPS_1922::bertscore__min_val-0.67|max_val-1.0___scalar__feature_name-complexity_of_intents|value-simple___entity_sim__min_val-0.67|max_val-1.0___scalar__feature_name-safety_concern|value-safe', 'human_datamodel_7000_FEATS_818fcd7dfafaf799a556f7f3363e7765_SWAPS_1922::bertscore__min_val-0.67|max_val-1.0___scalar__feature_name-complexity_of_intents|value-simple___entity_sim__min_val-0.67|max_val-1.0___scalar__feature_name-safety_concern|value-safe', 'human_datamodel_7000_FEATS_4bf313ac33a2c5007338baf8ef25ec52_SWAPS_1658::scalar__feature_name-complexity_of_intents|value-simple___cosine_sim__min_val-0.33|max_val-0.67', 'human_datamodel_7000_FEATS_4bf313ac33a2c5007338baf8ef25e

In [None]:
# Number of distinct experiment IDs among the collected ones.
len(set(experiment_ids))

13