In [1]:
import sys

sys.path.append("../../")

%load_ext autoreload
%autoreload 2

In [61]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import hashlib
from ast import literal_eval

from src.utils import find_meta_category

For now, we'll combine the lexical-only features and those with Bowen's features

In [3]:
lexical = pd.read_csv("lexical_only.csv").dropna()
lexical_bowen = pd.read_csv("with_bowen.csv").dropna()


lexical_features = []

# Add hashes for lexical features
lexical = lexical.rename(
    columns={
        "Unnamed: 0": "experiment_name",
        "rouge>0.4": "rouge",
        "bertscore>0.8": "bertscore",
        "cosine_sim>0.8": "cosine_sim",
        "entity_sim>0.8": "entity_sim",
        "bertscore_length>0.4": "bertscore_length",
    }
)
lexical["hash"] = lexical["experiment_name"].apply(
    lambda x: hashlib.md5(
        x.split("FEATS")[-1].removeprefix("_").encode("utf-8")
    ).hexdigest(),
)
lexical_feats = lexical[
    [
        "hash",
        "rouge",
        "bertscore",
        "cosine_sim",
        "entity_sim",
        "bertscore_length",
        "label",
        "Overall",
    ]
]

lexical_bowen = lexical_bowen[
    ["hash"]
    + ["rouge", "bertscore", "cosine_sim", "entity_sim", "bertscore_length"]
    + [col for col in lexical_bowen.columns if "=" in col]
    + ["label", "Overall"]
]


df = pd.concat([lexical_feats, lexical_bowen]).fillna(0)

## Get proportion of instances that fulfill the conditions

1. For each row, get features that were activated
2. Then for each activated feature, we get the proportion by looking at the feature dataframe.
3. The proportion is computed as: `number_of_instance_that_fulfill_a_single_condition` / `total_number_of_instances`

In [88]:
features_df = pd.read_csv("helpsteer2_featureset.csv").dropna().reset_index(drop=True)

In [86]:
# Inspect nan columns
rows_with_nan = features_df[features_df.isna().any(axis=1)]
nan_columns = rows_with_nan.columns[rows_with_nan.isna().any()]
df_nan_columns = rows_with_nan[nan_columns]
df_nan_columns

So what you're going to do instead, is to take the binary_cols, and then for each element of that binary_cols, you compute the "weight"

In [89]:
def compute_instances(feat: str, features_df: "pd.DataFrame") -> float:
    total = len(features_df)
    # Hacky approach
    thresholds = {
        "rouge": 0.4,
        "bertscore": 0.8,
        "cosine_sim": 0.8,
        "entity_sim": 0.8,
        "bertscore_length": 0.4,
    }
    if feat in thresholds:
        thresh = thresholds[feat]
        return sum(features_df[feat] > thresh) / total
    else:
        # Parse the feature
        feat_name, value = feat.split("=")
        meta_category = find_meta_category(feat_name)
        if meta_category == "scalar":
            v = value.replace("_", " ")
            return features_df[feat_name].value_counts().get(v) / total
        elif meta_category == "closed_set":
            v = value.replace("_", " ")
            list_of_values = features_df[feat_name].apply(literal_eval).tolist()
            return sum([1 if v in listval else 0 for listval in list_of_values]) / total
        elif meta_category == "open_set":
            list_of_values = features_df[feat_name].apply(literal_eval).tolist()
            return sum([1 if listval else 0 for listval in list_of_values]) / total

        return find_meta_category(feat_name)

In [90]:
feats = df.columns[df.isin([0, 1]).all()]  # get binary columns
feat_map = {
    feat: compute_instances(feat, features_df) for feat in feats if feat != "label"
}

In [91]:
feat_map

{'rouge': 0.6213076014178811,
 'bertscore': 0.6832414336352894,
 'cosine_sim': 0.4565773926742812,
 'entity_sim': 0.25551398188263097,
 'bertscore_length': 0.5574044899566759,
 'complexity_of_intents=complex': 0.09531311539976368,
 'complexity_of_intents=moderate': 0.18294604174871998,
 'complexity_of_intents=simple': 0.7216423788893265,
 'expertise_level=basic_domain_knowledge': 0.49320598660890114,
 'expertise_level=expert_domain_knowledge': 0.10594722331626624,
 'expertise_level=general_public': 0.4005513981882631,
 'format_constraints=1': 0.265852697912564,
 'languages=English': 0.9988184324537219,
 'open_endedness=high': 0.3955297361165813,
 'open_endedness=low': 0.1209137455691217,
 'open_endedness=moderate': 0.44436786136274126,
 'open_endedness=no': 0.03909019298936589,
 'safety_concern=high': 0.0030523828278849943,
 'safety_concern=low': 0.01811736904293029,
 'safety_concern=moderate': 0.006301693580149665,
 'safety_concern=safe': 0.9725285545490351,
 'subject_of_expertise=Agr

Now, let's use those ratios and create features

In [105]:
ratio_df = df.apply(
    lambda row: row.map(lambda x: feat_map.get(row.name) if x == 1 else x)
)

Unnamed: 0,hash,rouge,bertscore,cosine_sim,entity_sim,bertscore_length,label,Overall,complexity_of_intents=complex,complexity_of_intents=moderate,...,subject_of_expertise=Political_science,subject_of_expertise=Psychology,subject_of_expertise=Public_administration,subject_of_expertise=Religion,subject_of_expertise=Social_work,subject_of_expertise=Sociology,subject_of_expertise=Space_sciences,subject_of_expertise=System_science,subject_of_expertise=Visual_arts,type_of_in_context_material=1
0,a9a4c59b129c0a6816b12b6ca883e049,0.000000,0.683241,0.000000,0.0,0.557404,,0.757970,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,52009160febbb60893944fedd6d927fc,0.000000,0.683241,0.456577,0.0,0.000000,,0.735185,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,8275988da88b1aae55e874392fe1cb30,0.621308,0.000000,0.000000,0.0,0.557404,,0.732124,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,d8b9477149b0d921769c9e2041155645,0.000000,0.000000,0.000000,0.0,0.557404,,0.731826,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,d809a5a3c85aaf3e7fc43f648361f31b,0.621308,0.000000,0.000000,0.0,0.557404,,0.730485,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,01414f78b534bd2b158581a9e90bb978,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.656139,0.0,0.182946,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
118,7c8418ae17d6ca4b5c5c332685d31241,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.655546,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
119,78875c3aa32dbc58dbdd46096a9c877e,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.655163,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.107818
120,e01c9cf683de4b21fe2546acc5eefb64,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.648629,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
