In [1]:
import sys

sys.path.append("../../")

%load_ext autoreload
%autoreload 2

In [2]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from ast import literal_eval
from pathlib import Path
import hashlib
from functools import reduce
from tqdm import tqdm_notebook

from src.utils import find_meta_category
from src.feature_extractor import sample_feature_combinations

## Download prerequisite files

Fetch all the results and feature values


In [89]:
skip_download = True
if not skip_download:
# You can get the experiments file here: 01J6KF3JRCATRJQ9CPJTRV5VBM (https://beaker.org/ds/01J6KF3JRCATRJQ9CPJTRV5VBM/details)
    !echo "Fetching experiments list..."
    !beaker dataset fetch 01J6KF3JRCATRJQ9CPJTRV5VBM --prefix experiments.txt -q
    !echo "Fetching extracted features..."
    !mkdir -p features/
    !beaker dataset fetch 01J6KF3JRCATRJQ9CPJTRV5VBM --prefix features/ -q
    #!beaker dataset fetch 01J6KFVCRCTYHCZDR0XNK0G9HT --prefix features/
    !echo "Fetching helpsteer2 dataset..."
    !beaker dataset fetch 01J6KBM2VCM9EQ7MER26VBXCCM
    !echo "Fetching extracted subsets... (this will take ~10 minutes)"
    !beaker dataset fetch 01J6KF3JRCATRJQ9CPJTRV5VBM --prefix data/ -q
    
!echo "Collating all evaluation results"
%run ../../scripts/fetch_evals_rewardbench.py --output_file results.csv --gpt4_threshold_score 0.658 --experiment_prefix rm-eval-helpsteer2 --experiments_file experiments.txt

Collating all evaluation results
2024-09-05 20:02:35 - INFO - root - Logged-in as ljm (ljm@allenai.org)
2024-09-05 20:02:37 - INFO - root - Found 354 experiments that match 'rm-eval-helpsteer2'


100%|██████████| 354/354 [01:06<00:00,  5.32it/s]

2024-09-05 20:03:43 - INFO - root - Computing category scores...
2024-09-05 20:03:43 - INFO - root - Deriving features from the experiments file: experiments.txt
2024-09-05 20:03:43 - INFO - root - Will attempt merge via feature hash
2024-09-05 20:03:43 - INFO - root - Creating labels in column 'label' with GPT-4 threshold '0.658'
2024-09-05 20:03:43 - INFO - root - Saving 162 results to results.csv
2024-09-05 20:03:43 - INFO - root - Saved on results.csv





In [90]:
!ls data | wc -l

     330


Collate feature set for all instances


In [91]:
LEXICAL_FEATS_PATH = Path("features")
DATASET_PATH = Path("helpsteer2_human_vs_gpt4_weighted_for_llama.jsonl")


def get_dataset_features(
    feature_path=LEXICAL_FEATS_PATH, dataset_path=DATASET_PATH
) -> "pd.DataFrame":
    lexical_features = [
        "rouge",
        "bertscore",
        "bertscore_length",
        "entity_sim",
        "cosine_sim",
        "prompt_len",
        "len_longer",
        "len_shorter",
        "token_len_difference",
    ]
    lexical_feature_files = [
        file
        for file in feature_path.glob("*.jsonl")
        if any(file.stem in feat for feat in lexical_features)
    ]
    lexical_feats_df = reduce(
        lambda left, right: left.merge(
            right, on=["id", "prompt", "completion_a", "completion_b"], how="outer"
        ),
        [pd.read_json(file, lines=True) for file in lexical_feature_files],
    )

    df = pd.read_json(dataset_path, lines=True).rename(columns={"prompt_hash": "id"})
    finaldf = df.merge(lexical_feats_df, how="left", on="id").drop(
        columns=["prompt", "completion_a", "completion_b"]
    )

    # Hacky way for token_len_difference
    finaldf = finaldf.rename(columns={"token_len_diff": "token_len_difference"})
    return finaldf

In [92]:
results_df = pd.read_csv("results.csv").dropna()
features_df = get_dataset_features()
features_df.to_csv("helpsteer2_all_features.csv", index=False)
# print(len(results_df)), print(len(features_df))

This dataframe contains the features for instances in the dataset


In [93]:
features_df.head()

Unnamed: 0,id,text,response_a,response_b,pref_human,pref_gpt4,rating_human,rating_gpt4,completions,subject_of_expertise,...,format_constraints,cosine_sim,token_len_difference,rouge,bertscore,len_longer,prompt_len,bertscore_length,len_shorter,entity_sim
0,000be890ad02c46a3554c77f098f151f,In what ways can researching something be help...,Engaging in research is a valuable endeavor th...,There are several reasons why doing research c...,B-is-clearly-better,A-is-clearly-better,"[2.83, 6.28]","[6.09, 5.98]",[Engaging in research is a valuable endeavor t...,"[Education, Psychology]",...,"[#words <= 200, style = formal]",0.709485,166,0.328704,0.799727,376,25,0.435609,210,1.0
1,001458ba4f2925f2bc6ae9d8d197d543,Write an inspiring song about alena wicker for...,"Here's a song inspired by Alena Wicker, a youn...",Here's an inspiring song about Alena Wicker fo...,B-is-clearly-better,B-is-clearly-better,"[3.73, 4.77]","[1.59, 2.25]","[Here's a song inspired by Alena Wicker, a you...",[Performing arts],...,"[content = sone, style = inspiring, include: t...",0.844983,61,0.463866,0.842092,503,44,0.724686,442,0.333333
2,0014db4cd30cad106320d9043f45d607,Hi! I am a Hypnotist and transformational / mi...,They want a solution to their problems. They w...,They want a solution to lead a more fulfilling...,B-is-clearly-better,A-is-clearly-better,"[4.77, 5.95]","[5.87, 5.62]",[They want a solution to their problems. They ...,"[Business, Psychology]",...,[],0.508719,424,0.150259,0.772617,480,209,0.000456,56,0.0
3,003af4a043a0cb2186bc448a290d35f6,Australian Defence Force Military Police membe...,"Yes, members of the Australian Defence Force M...","Yes, members of the Australian Defence Force M...",Tie,B-is-clearly-better,"[6.28, 6.28]","[5.62, 6.09]","[Yes, members of the Australian Defence Force ...","[Military sciences, Law]",...,[],0.87233,121,0.510121,0.881362,217,30,0.268603,96,0.5
4,003dd8aa40b027ae755e47917c89c0a3,Heather: I want to attend the music festival t...,It is unclear if all girls have enough money t...,Yes\n\nBoth Heather and Kara state that they h...,A-is-clearly-better,A-is-clearly-better,"[6.14, 3.02]","[5.13, 0.64]",[It is unclear if all girls have enough money ...,"[Performing arts, Economics]",...,[],0.839198,16,0.474227,0.866347,68,107,0.671132,52,0.6


## Get proportion of instances that fulfill the conditions

1. For each row, get features that were activated
2. Then for each activated feature, we get the proportion by looking at the feature dataframe.
3. The proportion is computed as: `number_of_instance_that_fulfill_a_single_condition` / `total_number_of_instances`


In [94]:
# Inspect nan columns
rows_with_nan = features_df[features_df.isna().any(axis=1)]
nan_columns = rows_with_nan.columns[rows_with_nan.isna().any()]
df_nan_columns = rows_with_nan[nan_columns]
df_nan_columns

Unnamed: 0,expertise_level,format_constraints
289,,[]
1317,expert domain knowledge,
4613,basic domain knowledge,
4734,general public,


So what you're going to do instead, is to take the binary_cols, and then for each element of that binary_cols, you compute the "weight"


In [95]:
def compute_instances(feat: str, features_df: "pd.DataFrame") -> float:
    """Compute the ratio of instances that fulfill a given feature 'feat' vs. the total dataset 'len(features_df)'"""
    total = len(features_df)
    lexical_features = [
        "rouge",
        "bertscore",
        "bertscore_length",
        "entity_sim",
        "cosine_sim",
        "prompt_len",
        "len_longer",
        "len_shorter",
        "token_len_difference",
    ]

    if feat.split("__")[0] in lexical_features:
        feat_name, value = feat.split("__")
        min_val_str, max_val_str = value.split("|")
        min_val, max_val = float(min_val_str.split("=")[1]), float(
            max_val_str.split("=")[1]
        )
        return features_df[feat_name].between(min_val, max_val).mean()
    else:
        # Parse the feature
        feat_name, value = feat.split("=")
        meta_category = find_meta_category(feat_name)
        if meta_category == "scalar":
            v = value.replace("_", " ")
            return features_df[feat_name].value_counts().get(v) / total
        elif meta_category == "closed_set":
            v = value.replace("_", " ")
            list_of_values = features_df[feat_name].tolist()
            return sum([1 if v in listval else 0 for listval in list_of_values]) / total
        elif meta_category == "open_set":
            list_of_values = features_df[feat_name].tolist()
            return sum([1 if listval else 0 for listval in list_of_values]) / total

        return find_meta_category(feat_name)


# feats = results_df.columns[results_df.isin([0, 1]).all()]  # get binary columns
# feat_map = {
#    feat: compute_instances(feat, features_df) for feat in feats if feat != "label"
# }

# ratio_df = results_df.apply(
#    lambda row: row.map(lambda x: feat_map.get(row.name, 1) if x == 1 else x)
# )

For each result, we get the `hash`, find the extracted subset (because they were randomly-sampled) from `data`, and compute the ratio from there.


In [96]:
import re

get_per_hash_ratios = True


def extract_hash(string):
    match = re.search(r"FEATS_(.*?)_SWAPS", string)
    return match.group(1) if match else None


result_hashes = results_df["hash"].to_list()
subsets = {extract_hash(str(file)): file for file in Path("data").glob("*.jsonl")}
feats = results_df.columns[results_df.isin([0, 1]).all()]  # get binary collumns

hash_ratios = {}
for result_hash in tqdm_notebook(result_hashes):
    if result_hash in subsets:
        sampled_features_df = pd.read_json(subsets[result_hash], lines=True)
        sampled_features_df["id"] = sampled_features_df["prompt"].apply(
            lambda x: hashlib.md5(x.encode("utf-8")).hexdigest()
        )
        # Get the features from features_df based on the existing prompt_hashes in sampled_features_df
        sdf = features_df[features_df["id"].isin(sampled_features_df["id"].to_list())]
        hash_ratios[result_hash] = {
            feat: compute_instances(feat, sdf) for feat in feats if feat != "label"
        }


def replace_values(row):
    feat_map = hash_ratios.get(row["hash"], {})
    for col in feat_map:
        if row[col] == 1 and col in feat_map:
            row[col] = feat_map[col]
    return row


ratio_df = results_df.apply(replace_values, axis=1)


# Get feat_map with default counts
feats = results_df.columns[results_df.isin([0, 1]).all()]  # get binary columns
feat_map = {
    feat: compute_instances(feat, features_df) for feat in feats if feat != "label"
}

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for result_hash in tqdm_notebook(result_hashes):


  0%|          | 0/162 [00:00<?, ?it/s]

## Regressor training


In [97]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [98]:
force_new_split = False

feat_names = list(list(hash_ratios.values())[0].keys())
if Path("validation_set.jsonl").exists() and not force_new_split:
    print("Reusing existing validation set")
    val_df = pd.read_json("validation_set.jsonl", lines=True)
    train_df = ratio_df[~ratio_df["hash"].isin(val_df["hash"])]
    X_train = train_df[feat_names]
    y_train = train_df["Overall"]
    X_test = val_df[feat_names]
    y_test = val_df["Overall"]
else:
    X = ratio_df[feat_names]
    y = ratio_df["Overall"]
    X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(
        X, y, ratio_df.index, test_size=0.2, random_state=42
    )
    # Save the validation set
    validation_set = ratio_df.loc[test_idx]
    validation_set.to_json("validation_set.jsonl", lines=True, orient="records")

print(f"Train size: {len(X_train)}, test size: {len(X_test)}")

Reusing existing validation set
Train size: 136, test size: 26


### Train LinearRegressor


In [302]:
def train_linear_regressor(X_train, X_test, y_train, y_test):
    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    return model, {"mse": mse, "rmse": rmse}


# print(f"Feature names: {poly.get_feature_names_out(X.columns)}")
model, scores = train_linear_regressor(X_train, X_test, y_train, y_test)
print(scores)
print(f"intercept: {model.intercept_}")

{'mse': 0.00114341958753493, 'rmse': 0.033814487834875305}
intercept: 0.7096455171821233


In [303]:
pct_of_train = [0.25, 0.50, 0.75, 1]
for pct in pct_of_train:
    num_train = int(len(X_train) * pct)
    _, scores = train_linear_regressor(
        X_train[:num_train], X_test, y_train[:num_train], y_test
    )
    print(num_train, scores)

34 {'mse': 0.002986807169156422, 'rmse': 0.05465168953615636}
68 {'mse': 0.0010468886203582025, 'rmse': 0.03235565824331507}
102 {'mse': 0.001311752166949297, 'rmse': 0.03621811931822658}
136 {'mse': 0.00114341958753493, 'rmse': 0.033814487834875305}


### Train LightGBM


In [280]:
import lightgbm as lgb


def train_lightgbm(X_train, X_test, y_train, y_test):
    train_data = lgb.Dataset(X_train, label=y_train, params={"verbose": -1})
    test_data = lgb.Dataset(
        X_test, label=y_test, reference=train_data, params={"verbose": -1}
    )
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting": "gbdt",
        "learning_rate": 0.1,
        "num_leaves": 31,
        "scale_pos_weight": 0.4,
    }
    # Train the model
    model = lgb.train(params, train_data, valid_sets=[test_data])
    # Predict and evaluate
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    return model, {"mse": mse, "rmse": rmse}


model, scores = train_lightgbm(X_train, X_test, y_train, y_test)
print(scores)

importances = model.feature_importance()
importance_df = pd.DataFrame(
    {"feature": X_train.columns, "importance": importances}
).sort_values(by="importance", ascending=False)
importance_df.head(10)

{'mse': 0.0010441872370196086, 'rmse': 0.03231388613304826}


Unnamed: 0,feature,importance
1,bertscore__min_val=0.67|max_val=1.0,76
31,prompt_len__min_val=0.67|max_val=1.0,55
4,bertscore_length__min_val=0.67|max_val=1.0,25
33,rouge__min_val=0.33|max_val=0.67,20
24,len_shorter__min_val=0.67|max_val=1.0,14
63,token_len_difference__min_val=0.67|max_val=1.0,10
0,bertscore__min_val=0.33|max_val=0.67,0
42,subject_of_expertise=Earth_sciences,0
45,subject_of_expertise=Family_and_consumer_science,0
44,subject_of_expertise=Electrical_engineering,0


In [128]:
pct_of_train = [0.25, 0.50, 0.75, 1]
for pct in pct_of_train:
    num_train = int(len(X_train) * pct)
    _, scores = train_lightgbm(X_train[:num_train], X_test, y_train[:num_train], y_test)
    print(num_train, scores)

34 {'mse': 0.002460854569408138, 'rmse': 0.049607001213620425}
68 {'mse': 0.0014385044096877847, 'rmse': 0.03792762067000492}
102 {'mse': 0.0011444071579815999, 'rmse': 0.03382908745416583}
136 {'mse': 0.0010727582034843026, 'rmse': 0.03275298770317454}


### Train Polynomial Regressor


In [113]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np


def train_quadratic_regressor(X_train, X_test, y_train, y_test):
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    X_train_poly = poly.fit_transform(X_train_scaled)
    X_test_poly = poly.transform(X_test_scaled)

    print(X_train_poly)

    model = LinearRegression()
    model.fit(X_train_poly, y_train)

    y_pred = model.predict(X_test_poly)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    return model, {"mse": mse, "rmse": rmse}


model, scores = train_quadratic_regressor(X_train, X_test, y_train, y_test)
print(scores)
print(f"intercept: {model.intercept_}")

[[-0.17399853 -0.50917478 -0.39055714 ...  0.17036594  0.08063528
   0.03816519]
 [-0.17399853 -0.50917478 -0.39055714 ...  0.17036594  0.08063528
   0.03816519]
 [-0.17399853 -0.50917478 -0.39055714 ...  0.17036594  0.08063528
   0.03816519]
 ...
 [-0.17399853 -0.50917478 -0.39055714 ...  0.17036594  0.08063528
   0.03816519]
 [-0.17399853 -0.50917478 -0.39055714 ...  0.17036594  0.08063528
   0.03816519]
 [-0.17399853 -0.50917478 -0.39055714 ...  0.17036594  0.08063528
   0.03816519]]
{'mse': 632082.7304842963, 'rmse': 795.0363076516043}
intercept: -64.4694750623211


In [108]:
pct_of_train = [0.25, 0.50, 0.75, 1]
for pct in pct_of_train:
    num_train = int(len(X_train) * pct)
    _, scores = train_quadratic_regressor(
        X_train[:num_train], X_test, y_train[:num_train], y_test
    )
    print(num_train, scores)

34 {'mse': 0.005630193351098253, 'rmse': 0.07503461435296548}
68 {'mse': 0.2106703901077218, 'rmse': 0.4589884422376252}
102 {'mse': 66.04850532786098, 'rmse': 8.127023152905434}
136 {'mse': 917758.9197794044, 'rmse': 957.9973485242036}


### Linear Regression (statsmodels)


In [124]:
import statsmodels.api as sm


def train_linear_regressor_statsmodels(X_train, X_test, y_train, y_test):
    X_train_sm = sm.add_constant(X_train)  # Add a constant (intercept)
    X_test_sm = sm.add_constant(X_test)

    model = sm.OLS(y_train, X_train_sm).fit()  # Fit the model

    y_pred = model.predict(X_test_sm)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    return model, {"mse": mse, "rmse": rmse}


model, scores = train_linear_regressor_statsmodels(X_train, X_test, y_train, y_test)
print(scores)
print(model.summary())

{'mse': 0.0011434195875349245, 'rmse': 0.03381448783487522}
                            OLS Regression Results                            
Dep. Variable:                Overall   R-squared:                       0.509
Model:                            OLS   Adj. R-squared:                  0.288
Method:                 Least Squares   F-statistic:                     2.300
Date:                Thu, 05 Sep 2024   Prob (F-statistic):           0.000456
Time:                        20:38:07   Log-Likelihood:                 345.25
No. Observations:                 136   AIC:                            -604.5
Df Residuals:                      93   BIC:                            -479.3
Df Model:                          42                                         
Covariance Type:            nonrobust                                         
                                                                     coef    std err          t      P>|t|      [0.025      0.975]
-------------------

In [117]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd


def calculate_vif(X):
    # Add a constant (intercept) if needed
    X_with_const = sm.add_constant(X)

    # Create a DataFrame for VIF values
    vif_data = pd.DataFrame()
    vif_data["feature"] = X_with_const.columns

    # Calculate VIF for each feature
    vif_data["VIF"] = [
        variance_inflation_factor(X_with_const.values, i)
        for i in range(X_with_const.shape[1])
    ]

    return vif_data

In [118]:
vif_df = calculate_vif(X_train)
print(vif_df)

                                            feature        VIF
0                                             const  11.234942
1              bertscore__min_val=0.33|max_val=0.67   1.375240
2               bertscore__min_val=0.67|max_val=1.0   1.219384
3        bertscore_length__min_val=0.0|max_val=0.33   1.422304
4       bertscore_length__min_val=0.33|max_val=0.67   1.319649
..                                              ...        ...
61              subject_of_expertise=System_science        NaN
62   token_len_difference__min_val=0.0|max_val=0.33   1.308015
63  token_len_difference__min_val=0.33|max_val=0.67        NaN
64   token_len_difference__min_val=0.67|max_val=1.0   1.371943
65                    type_of_in_context_material=1   8.499547

[66 rows x 2 columns]


  return 1 - self.ssr/self.centered_tss


In [121]:
vif_df.sort_values(by="VIF", ascending=False).dropna()

Unnamed: 0,feature,VIF
0,const,11.234942
65,type_of_in_context_material=1,8.499547
16,expertise_level=expert_domain_knowledge,5.739525
54,subject_of_expertise=Medicine_and_health,5.615211
6,complexity_of_intents=complex,4.099281
26,open_endedness=high,3.79131
19,languages=English,3.358993
18,format_constraints=1,3.189217
55,subject_of_expertise=Military_sciences,3.185454
28,open_endedness=moderate,3.125325


## Ridge regression


In [123]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score


def train_ridge_regression(X_train, X_test, y_train, y_test, alpha=1.0):
    """
    Train a Ridge regression model with data scaling and evaluation.

    Args:
        X_train (array-like): Training feature matrix.
        X_test (array-like): Testing feature matrix.
        y_train (array-like): Training target variable.
        y_test (array-like): Testing target variable.
        alpha (float): Regularization strength (default=1.0).

    Returns:
        model: Trained Ridge regression model.
        results: Dictionary with model performance metrics.
    """
    # Standardize the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize and train the Ridge regression model
    model = Ridge(alpha=alpha)
    model.fit(X_train_scaled, y_train)

    # Make predictions and evaluate the model
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results = {
        "mse": mse,
        "r2": r2,
        "coefficients": model.coef_,
        "intercept": model.intercept_,
    }

    return model, results

In [126]:
model, results = train_ridge_regression(X_train, X_test, y_train, y_test, alpha=0.5)

print("Model performance metrics:")
print(f"MSE: {results['mse']:.4f}")
print(f"R^2: {results['r2']:.4f}")
print(f"Coefficients: {results['coefficients']}")
print(f"Intercept: {results['intercept']:.4f}")

Model performance metrics:
MSE: 0.0011
R^2: -0.1693
Coefficients: [-1.63557168e-03  2.38934635e-04 -2.29470908e-03 -1.16054283e-03
 -8.59406914e-03 -1.50614468e-03 -1.68555111e-03  7.23575323e-03
 -3.09365889e-03 -7.98728948e-04 -1.92057609e-03  1.00217595e-04
 -2.90232105e-03 -5.82696394e-03 -1.51368217e-03 -7.84774399e-04
  5.58307834e-03 -6.94789044e-04  3.02332461e-03  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.22006619e-03 -4.03653628e-03 -2.63795528e-03 -1.22159038e-03
  2.44241398e-05  0.00000000e+00  0.00000000e+00 -6.04454804e-04
 -3.53850355e-04 -1.70981606e-03 -5.73720050e-03 -1.40647581e-03
 -4.32289144e-03  0.00000000e+00 -2.01530580e-03  0.00000000e+00
 -5.70020350e-03  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.78423252e-04  0.00000000e+00
 -2.26371370e-03  0.00000000e+00 -1.99730761e-03  0.00000000e+00
  0.00000000e+00  7.41713044e-04  2.44464462e-03  0.00000000e+00
  1.82218868e-03 -4.0936

## Simulation


In [134]:
_, combinations = sample_feature_combinations(
    meta_analyzer_n_samples=2000, max_number=10
)

10it [00:00, 65027.97it/s]
45it [00:00, 97140.34it/s]
120it [00:00, 67117.81it/s]
210it [00:00, 50418.08it/s]
252it [00:00, 44522.52it/s]
210it [00:00, 40864.98it/s]
120it [00:00, 36285.52it/s]
45it [00:00, 28425.25it/s]
10it [00:00, 21323.36it/s]
1it [00:00, 9986.44it/s]

2024-09-05 20:45:51 - INFO - root - Adding meta analyzer features



10it [00:00, 30218.33it/s]
45it [00:00, 92794.34it/s]
120it [00:00, 80312.19it/s]
210it [00:00, 55019.29it/s]
252it [00:00, 50972.44it/s]
210it [00:00, 42051.17it/s]
120it [00:00, 26092.09it/s]
45it [00:00, 31732.29it/s]
10it [00:00, 19526.55it/s]
1it [00:00, 10645.44it/s]


In [135]:
sim_df = pd.DataFrame(0, index=np.arange(len(combinations)), columns=X_train.columns)
for idx, combination in tqdm_notebook(enumerate(combinations), total=len(combinations)):
    activated_feats = []
    for feat in combination:
        if "analyzer" in feat:
            feature_name_str, value_str = feat.split("::")[1].split("|")
            feature_name, value = (
                feature_name_str.split("=")[-1],
                value_str.split("=")[-1],
            )
            activated_feats.append(f"{feature_name}={value}")
        else:
            activated_feats.append(feat.replace("::", "__"))
    sim_df.loc[idx, activated_feats] = 1
sim_df = sim_df.apply(
    lambda row: row.map(lambda x: feat_map.get(row.name, 1) if x == 1 else x)
).dropna(axis=1, how="any")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx, combination in tqdm_notebook(enumerate(combinations), total=len(combinations)):


  0%|          | 0/4069 [00:00<?, ?it/s]

In [137]:
sim_results = sim_df.copy(deep=True)
sim_results["activated_features"] = sim_results.apply(
    lambda row: [col for col in sim_results.columns if row[col] != 0], axis=1
)
sim_results["pred"] = model.predict(sim_df)
sim_results = sim_results.sort_values(by="pred", ascending=False).reset_index(drop=True)
sim_results["hash"] = sim_results["activated_features"].apply(
    lambda x: hashlib.md5("___".join(x).encode("utf-8")).hexdigest()
)
sim_results = sim_results.drop_duplicates(subset=["hash"]).reset_index(drop=True)
sim_results[["activated_features", "pred"]].head(20)

Unnamed: 0,activated_features,pred
0,"[complexity_of_intents=simple, languages=Engli...",0.759464
1,"[complexity_of_intents=simple, languages=English]",0.758409
2,"[complexity_of_intents=simple, format_constrai...",0.757289
3,"[complexity_of_intents=simple, format_constrai...",0.757289
4,"[complexity_of_intents=simple, format_constrai...",0.756233
5,"[complexity_of_intents=simple, format_constrai...",0.756233
6,"[bertscore__min_val=0.67|max_val=1.0, complexi...",0.753125
7,"[complexity_of_intents=simple, languages=Engli...",0.751945
8,"[complexity_of_intents=simple, languages=Engli...",0.751945
9,"[complexity_of_intents=simple, languages=Engli...",0.751945


## Do some evals


In [307]:
results_df = pd.read_csv("../../data/top.csv")
actual_scores = results_df["Overall"]

results_df = results_df.head(16)


def is_binary(series):
    return set(series.dropna().unique()).issubset({0, 1})


def update_name(name):
    if "feature_name" in name:
        _, feature = name.split("__")
        name_str, val_str = feature.split("|")
        _, x = name_str.split("=")
        _, y = val_str.split("=")
        return f"{x}={y}"

    else:
        return name


binary_columns = [
    col for col in results_df.columns if is_binary(results_df[col]) and col != "label"
]
results_df = results_df[binary_columns]
results_df = results_df.rename(
    columns={col: update_name(col) for col in binary_columns}
)
results_df = results_df.rename(
    columns={
        "token_len_diff__min_val=0.0|max_val=0.33": "token_len_difference__min_val=0.0|max_val=0.33",
        "token_len_diff__min_val=0.33|max_val=0.67": "token_len_difference__min_val=0.33|max_val=0.67",
        "token_len_diff__min_val=0.67|max_val=1.0": "token_len_difference__min_val=0.67|max_val=1.0",
    }
)

results_df = results_df.apply(
    lambda row: row.map(lambda x: feat_map.get(row.name, 1) if x == 1 else x)
).dropna(axis=1, how="any")


for unused_feat in list(set(sim_df.columns) - set(results_df.columns)):
    results_df[unused_feat] = 0

results_df = results_df.reindex(sorted(results_df.columns), axis=1)

results_df["predicted_scores"] = model.predict(results_df)

results_df["activated_features"] = results_df.apply(
    lambda row: [
        col
        for col in results_df.columns
        if row[col] != 0 and col not in ("predicted_scores")
    ],
    axis=1,
)

results_df["actual_scores"] = actual_scores

results_df["actual_scores_rank"] = results_df["actual_scores"].rank(ascending=False)
results_df["predicted_scores_rank"] = results_df["predicted_scores"].rank(
    ascending=False
)

In [308]:
results_df[
    [
        "activated_features",
        "actual_scores",
        "actual_scores_rank",
        "predicted_scores",
        "predicted_scores_rank",
    ]
]

Unnamed: 0,activated_features,actual_scores,actual_scores_rank,predicted_scores,predicted_scores_rank
0,"[bertscore__min_val=0.67|max_val=1.0, complexi...",0.775226,1.0,0.759017,1.0
1,"[complexity_of_intents=simple, rouge__min_val=...",0.754287,2.0,0.743325,5.0
2,"[complexity_of_intents=simple, open_endedness=...",0.746361,3.0,0.701636,15.0
3,"[complexity_of_intents=simple, cosine_sim__min...",0.742213,4.0,0.74272,6.0
4,"[cosine_sim__min_val=0.33|max_val=0.67, langua...",0.741107,5.0,0.708555,11.0
5,"[bertscore__min_val=0.67|max_val=1.0, cosine_s...",0.728751,6.0,0.709163,10.0
6,"[complexity_of_intents=simple, rouge__min_val=...",0.728602,7.0,0.747429,3.0
7,"[bertscore__min_val=0.67|max_val=1.0, complexi...",0.727363,8.0,0.738763,7.0
8,"[complexity_of_intents=complex, cosine_sim__mi...",0.723358,9.0,0.689556,16.0
9,"[complexity_of_intents=simple, entity_sim__min...",0.72303,10.0,0.707394,12.0


In [309]:
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(results_df["actual_scores"], results_df["predicted_scores"])
rmse = np.sqrt(mse)

# Compute Spearman's rho
rho, p_value = spearmanr(results_df["actual_scores"], results_df["predicted_scores"])

print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Spearman's rho: {rho:.4f}")
print(f"p-value: {p_value:.4f}")

MSE: 0.0005
RMSE: 0.0223
Spearman's rho: 0.2471
p-value: 0.3563


In [26]:
top_n = 100
human_score = 0.715
better_than_humans = sim_results[sim_results["pred"] > human_score]
top_combinations = (
    better_than_humans.activated_features.head(top_n).drop_duplicates().to_list()
)
print(top_combinations)

[['complexity_of_intents=simple', 'languages=English', 'subject_of_expertise=Agriculture'], ['complexity_of_intents=simple', 'languages=English'], ['complexity_of_intents=simple', 'languages=English', 'rouge__min_val=0.0|max_val=0.33'], ['bertscore__min_val=0.67|max_val=1.0', 'complexity_of_intents=simple', 'languages=English', 'open_endedness=no', 'token_len_difference__min_val=0.0|max_val=0.33'], ['complexity_of_intents=simple', 'languages=English', 'safety_concern=high'], ['bertscore__min_val=0.67|max_val=1.0', 'complexity_of_intents=simple', 'entity_sim__min_val=0.0|max_val=0.33', 'languages=English', 'safety_concern=moderate', 'token_len_difference__min_val=0.67|max_val=1.0'], ['complexity_of_intents=simple', 'languages=English', 'open_endedness=moderate'], ['complexity_of_intents=simple', 'languages=English', 'open_endedness=no', 'safety_concern=safe'], ['complexity_of_intents=simple', 'cosine_sim__min_val=0.0|max_val=0.33', 'languages=English', 'open_endedness=no', 'rouge__min_v

So now you have determined 10 feature combinations that seem to work well. The next step is to train RMs and evaluate them.


In [51]:
from beaker import Beaker, ExperimentSpec
from copy import deepcopy

In [48]:
def create_beaker_experiments(
    combinations, *, template="../../beaker/template.yml", output_file="experiments.yml"
):
    spec = ExperimentSpec.from_file(template)
    exp_spec = deepcopy(spec)
    template_task = exp_spec.tasks.pop(0)

    new_tasks = []
    for idx, combination in enumerate(combinations):
        feats_to_run = []
        for feat in combination:
            if "min_val" in feat:
                if "token_len_difference" in feat:
                    feat = feat.replace("difference", "diff")
                feats_to_run.append(feat.replace("__", "::"))
            else:
                feat_name, value = feat.split("=")
                category = find_meta_category(feat_name)
                if category == "closed_set":
                    key = "constraints"
                elif category == "scalar":
                    key = "value"
                elif category == "open_set":
                    key = "check_for_existence"
                feats_to_run.append(
                    f"{category}::feature_name={feat_name}|{key}={value}"
                )
        # Create beaker task
        task = deepcopy(template_task)
        task.name = f"get-features-datamodel-{idx}"
        task.arguments.extend(["--features"] + feats_to_run)
        new_tasks.append(task)

    exp_spec.tasks = new_tasks
    exp_spec.validate()
    exp_spec.to_file(output_file)

In [None]:
create_beaker_experiments(top_combinations)

Get finished jobs and download the subsets and create an `experiments.txt` file


In [49]:
def create_experiments_file(
    beaker_experiment_id: str, output_path: Path, cache_dir: Path
):
    beaker = Beaker.from_env("ai2/ljm-oe-adapt")
    experiment = beaker.experiment.get(beaker_experiment_id)

    experiment_ids = []
    for job in tqdm_notebook(experiment.jobs):
        if job.is_done:
            # Get output
            dataset_id = job.execution.result.beaker
            beaker.dataset.fetch(
                dataset_id,
                force=True,
                target=cache_dir,
                prefix="data/",
                quiet=True,
            )

            beaker.dataset.fetch(
                dataset_id,
                force=True,
                target=cache_dir,
                prefix="experiments.txt",
                quiet=True,
            )

            if (cache_dir / "experiments.txt").exists():
                with open(cache_dir / "experiments.txt", "r") as f:
                    data = f.read().splitlines()
                    if data:
                        id = data[0]
                        experiment_ids.append(id)
                    else:
                        print(f"No data found in cache for {job}")

    print(experiment_ids)
    with open(output_path, "a") as f:
        for id in set(experiment_ids):
            f.write("\n" + id)

In [None]:
# experiment_id = "01J6TS47Q2KNKYRCYHC8A0DE4B"
# experiment_id = "01J6WDKDPQCM92REXJ1VCNJ0NW"
# experiment_id = "01J6XJSWMSAM2ARJ1WXAP6PV8T"
experiment_id = "01J6Z8EX5D9M0EBER108Z4HJ5B"
top_subsets_dir = Path("top_n_subsets")
top_subsets_dir.mkdir(parents=True, exist_ok=True)
experiments_file = top_subsets_dir / "top_n_subsets_experiments.txt"

create_experiments_file(experiment_id, experiments_file)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for job in tqdm_notebook(experiment.jobs):


  0%|          | 0/100 [00:00<?, ?it/s]



KeyboardInterrupt: 

# Finding the feature combination that maximizes the score

Here, we try different approaches to get the best features. Let's see how that works.


In [27]:
n_features = (3, 5, 7, 9, 11)

## Recursive Feature Elimination

> Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features.


In [79]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold


rfe = RFECV(
    model,
    step=1,
    cv=4,  # StratifiedKFold(5),
    scoring="r2",
    min_features_to_select=3,
)
rfe.fit(X_train, y_train)
rfe_features = list(X_train.columns[rfe.support_])

## Select features according to the k highest scores


In [88]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# from sklearn.model_selection import cross_val_score

selections_kbest = {}
for n in n_features:
    selector = SelectKBest(score_func=f_regression, k=n)
    selector.fit(X_train, y_train)
    selected_features = np.array(X_train.columns)[selector.get_support()]
    selections_kbest[n] = selected_features

selections_kbest

{3: array(['bertscore_length__min_val=0.67|max_val=1.0',
        'complexity_of_intents=simple', 'expertise_level=general_public'],
       dtype=object),
 5: array(['bertscore_length__min_val=0.67|max_val=1.0',
        'complexity_of_intents=simple',
        'entity_sim__min_val=0.67|max_val=1.0',
        'expertise_level=general_public',
        'rouge__min_val=0.33|max_val=0.67'], dtype=object),
 7: array(['bertscore_length__min_val=0.67|max_val=1.0',
        'complexity_of_intents=simple',
        'entity_sim__min_val=0.67|max_val=1.0',
        'expertise_level=general_public',
        'rouge__min_val=0.33|max_val=0.67',
        'rouge__min_val=0.67|max_val=1.0',
        'subject_of_expertise=Social_work'], dtype=object),
 9: array(['bertscore_length__min_val=0.67|max_val=1.0',
        'complexity_of_intents=simple',
        'entity_sim__min_val=0.67|max_val=1.0',
        'expertise_level=general_public', 'format_constraints=1',
        'rouge__min_val=0.33|max_val=0.67',
        'r

## Grid Search


In [59]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel, SelectKBest

pipeline = Pipeline(
    [
        ("feature_selection", SelectKBest(score_func=mutual_info_regression)),
        # ("feature_selection", SelectFromModel(model)),
        ("model", model),
    ]
)

# param_grid = {
#     "feature_selection__threshold": ["mean", "median"]
# }

param_grid = {
    "feature_selection__k": [2, 3, 5, 10, 15]  # Example values for top-n features
}


grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="r2")
grid_search.fit(X_train, y_train)

best_k = grid_search.best_params_["feature_selection__k"]
best_features = grid_search.best_estimator_.named_steps[
    "feature_selection"
].get_support()
selected_features = np.array(X_train.columns)[best_features]
print(selected_features)

['complexity_of_intents=simple' 'entity_sim__min_val=0.67|max_val=1.0']


## Combine all these top features into one experiment


In [85]:
best_combinations = (
    [rfe_features]
    # list(selections_rfe.values())
    + list(selections_kbest.values())
    + [selected_features]
)

create_beaker_experiments(
    best_combinations,
    output_file="experiments_best.yml",
)

In [87]:
# experiment_id = "01J6ZBPHM8GG5M560PPBFFYB28"
experiment_id = "01J70785358MQTVAQYGVW0CY3X"
top_subsets_dir = Path("best_feature_combinations")
top_subsets_dir.mkdir(parents=True, exist_ok=True)
experiments_file = top_subsets_dir / "best_features_experiments.txt"

create_experiments_file(experiment_id, experiments_file, cache_dir=top_subsets_dir)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for job in tqdm_notebook(experiment.jobs):


  0%|          | 0/8 [00:00<?, ?it/s]

['human_datamodel_7000_FEATS_03be1580ea90b94dbb7e1ae699e01efa_SWAPS_2022::scalar__feature_name-complexity_of_intents|value-simple___entity_sim__min_val-0.67|max_val-1.0', 'human_datamodel_7000_FEATS_03be1580ea90b94dbb7e1ae699e01efa_SWAPS_2022::scalar__feature_name-complexity_of_intents|value-simple___entity_sim__min_val-0.67|max_val-1.0', 'human_datamodel_7000_FEATS_03be1580ea90b94dbb7e1ae699e01efa_SWAPS_2022::scalar__feature_name-complexity_of_intents|value-simple___entity_sim__min_val-0.67|max_val-1.0']


In [83]:
len(best_combinations)

7