## Analysis for Paper

Later on you should make this into reproducible scripts


In [1]:
import sys

sys.path.append("../../")

%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import numpy as np
from pathlib import Path

from src.feature_extractor import get_all_features
from scripts.sample_best_subset import compute_gain_linear, compute_gain_quadratic

In [3]:
DATA_DIR = Path("../../data")

## Feature Importance and Correlation

Let's get the feature importances for the linear model.


In [20]:
coef = (
    # pd.read_json(DATA_DIR / "multipref_linear_model" / "coef.jsonl", lines=True)
    pd.read_json(
        DATA_DIR
        / "AAA_helpsteer2_feats_results"
        / "helpsteer2_linear_model"
        / "coef.jsonl",
        lines=True,
    )
    .sort_values(by="coef", ascending=False)
    .reset_index(drop=True)
)
n = 20
feats = pd.concat([coef.head(n), coef.tail(n)]).reset_index(drop=True)
model = joblib.load(DATA_DIR / "multipref_linear_model" / "model.pkl")

## Simulate preformance gain computation given a set of features


In [18]:
df = pd.read_json(DATA_DIR / "multipref_all_features" / "features.jsonl", lines=True)
df["len_longer"] = df["len_longer"].rank(pct=True)
df["len_shorter"] = df["len_shorter"].rank(pct=True)
df["token_len_diff"] = df["token_len_diff"].rank(pct=True)

### len_longer::min_val=0.33|max_val=0.67


In [25]:
feats["feat"].tail(20).to_list()

['analyzer_closed_set::feature_name=subject_of_expertise|constraints=Law',
 'len_longer::min_val=0.67|max_val=1.0',
 'analyzer_closed_set::feature_name=subject_of_expertise|constraints=Media studies and communication',
 'analyzer_closed_set::feature_name=subject_of_expertise|constraints=Library and museum studies',
 'analyzer_scalar::feature_name=safety_concern|value=safe',
 'analyzer_closed_set::feature_name=subject_of_expertise|constraints=Space sciences',
 'analyzer_scalar::feature_name=safety_concern|value=low',
 'len_shorter::min_val=0.33|max_val=0.67',
 'analyzer_closed_set::feature_name=subject_of_expertise|constraints=Military sciences',
 'token_len_diff::min_val=0.67|max_val=1.0',
 'analyzer_closed_set::feature_name=subject_of_expertise|constraints=Family and consumer science',
 'analyzer_scalar::feature_name=expertise_level|value=expert domain knowledge',
 'analyzer_closed_set::feature_name=subject_of_expertise|constraints=Materials science and engineering',
 'analyzer_scalar

In [13]:
len_longer_dfs = df[(df["len_longer"] <= 0.67) & (df["len_longer"] >= 0.33)]
len_longer_ids = len_longer_dfs["id"].to_list()

In [14]:
len_longer_ids

['cdccd42baceb4358888fda8c23a7fb32',
 'aa8af1e5ea024ad08e71273f5695993b',
 'fb0315fba0f1484994e2a9b3e7e1d4cb',
 'ddd1dfb23b984ef89af61a26146d3f31',
 'bf2f6a0585204d4e8f5bdecf11f5e6e4',
 '5304317faf42412fb838b3268e944239',
 '7f44c25a3cae40d3b2cada3d3d6bf40a',
 '7dc0853b417345c2ba85d5e4233fe2e4',
 'd0be61f1d511417f96d22ed5d15a3a16',
 '81c20e1103d1444ca37c6bdda38efa9f',
 '765c2c793fd7490888b251d5a9041bd9',
 '9c60e8a0356f40bda23275c03072a613',
 '163da37a34c445d1af7553f8bc33d7c2',
 'fad7b92d687a42eca8372218d679f2bb',
 '09be42593fb942d4ac935a52d94895ac',
 'a4924e29b7e14a60a2aa83910c653fd5',
 'a748dbcfac614e638dde29a6baa4b71a',
 '88b476d3547a4a5d9db6c1b416bfda5a',
 '8cde2bc64331498f964e5ceb17d5c979',
 'b70f3ac2945f45389b34b1e5e0af2c89',
 '8e63ea3a689e40e6981b27bf5e095e99',
 '25c47012f3db47c983b3b94689bc22f5',
 'ce9e1a11fcbb4e59bbddcf7c9c96e9ec',
 'c845e8878c084d89958788840e1b6a27',
 'cc5861b3d8f6416299898aa1bde40c15',
 '6050bd2c1950494082ac70d7272404d8',
 '7cdcd9c0975d4c13bf891c173e81a550',
 

In [9]:
top_n_feats_names = top_n_feats["feat"].to_list()
feature_progression = np.arange(0, 5000, 1000)

NameError: name 'top_n_feats' is not defined

In [51]:
simulation_map = {}
for feat_name in top_n_feats_names:
    print(feat_name)
    df_simulation = pd.DataFrame(
        0,
        index=np.arange(len(feature_progression)),
        columns=model.feature_names_in_,
    )
    df_simulation[feat_name] = feature_progression
    df_simulation["preds"] = model.predict(df_simulation)
    # Add to map
    simulation_map[feat_name] = df_simulation

bertscore::min_val=0.0|max_val=0.33
len_longer::min_val=0.33|max_val=0.67
len_longer::min_val=0.67|max_val=1.0
len_longer::min_val=0.0|max_val=0.33
analyzer_closed_set::feature_name=languages|constraints=English
analyzer_closed_set::feature_name=subject_of_expertise|constraints=Library and museum studies
analyzer_closed_set::feature_name=subject_of_expertise|constraints=Earth sciences
analyzer_closed_set::feature_name=subject_of_expertise|constraints=Public administration
analyzer_closed_set::feature_name=subject_of_expertise|constraints=Electrical engineering
analyzer_closed_set::feature_name=subject_of_expertise|constraints=Anthropology


In [52]:
simulation_map["analyzer_closed_set::feature_name=languages|constraints=English"][
    "preds"
].diff().fillna(0)

0    0.0000
1    2.7674
2    2.7674
3    2.7674
4    2.7674
Name: preds, dtype: float64

In [54]:
simulation_map["analyzer_closed_set::feature_name=languages|constraints=English"]

Unnamed: 0,bertscore::min_val=0.0|max_val=0.33,bertscore::min_val=0.33|max_val=0.67,bertscore::min_val=0.67|max_val=1.0,bertscore_length::min_val=0.0|max_val=0.33,bertscore_length::min_val=0.33|max_val=0.67,bertscore_length::min_val=0.67|max_val=1.0,cosine_sim::min_val=0.0|max_val=0.33,cosine_sim::min_val=0.33|max_val=0.67,cosine_sim::min_val=0.67|max_val=1.0,entity_sim::min_val=0.0|max_val=0.33,...,analyzer_scalar::feature_name=safety_concern|value=safe,analyzer_scalar::feature_name=safety_concern|value=low,analyzer_scalar::feature_name=safety_concern|value=moderate,analyzer_scalar::feature_name=safety_concern|value=high,analyzer_scalar::feature_name=complexity_of_intents|value=simple,analyzer_scalar::feature_name=complexity_of_intents|value=moderate,analyzer_scalar::feature_name=complexity_of_intents|value=complex,analyzer_open_set::feature_name=type_of_in_context_material|check_for_existence=1,analyzer_open_set::feature_name=format_constraints|check_for_existence=1,preds
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.657555
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.424954
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.192354
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8.959754
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11.727153


In [66]:
df

Unnamed: 0,bertscore::min_val=0.0|max_val=0.33,bertscore::min_val=0.33|max_val=0.67,bertscore::min_val=0.67|max_val=1.0,bertscore_length::min_val=0.0|max_val=0.33,bertscore_length::min_val=0.33|max_val=0.67,bertscore_length::min_val=0.67|max_val=1.0,cosine_sim::min_val=0.0|max_val=0.33,cosine_sim::min_val=0.33|max_val=0.67,cosine_sim::min_val=0.67|max_val=1.0,entity_sim::min_val=0.0|max_val=0.33,...,analyzer_scalar::feature_name=open_endedness|value=high,analyzer_scalar::feature_name=safety_concern|value=safe,analyzer_scalar::feature_name=safety_concern|value=low,analyzer_scalar::feature_name=safety_concern|value=moderate,analyzer_scalar::feature_name=safety_concern|value=high,analyzer_scalar::feature_name=complexity_of_intents|value=simple,analyzer_scalar::feature_name=complexity_of_intents|value=moderate,analyzer_scalar::feature_name=complexity_of_intents|value=complex,analyzer_open_set::feature_name=type_of_in_context_material|check_for_existence=1,analyzer_open_set::feature_name=format_constraints|check_for_existence=1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
