In [1]:
import sys

# Add the directory two levels above this notebook to the import path so that
# `src.*` modules can be imported (presumably the repository root -- confirm).
sys.path.append("../../")

# Automatically reload imported modules before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [61]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import hashlib
from ast import literal_eval

from src.utils import find_meta_category

For now, we'll combine the lexical-only feature table with the one that also contains Bowen's features.

In [3]:
# Similarity scores shared by both input tables.
_SCORE_COLUMNS = ["rouge", "bertscore", "cosine_sim", "entity_sim", "bertscore_length"]


def _experiment_hash(experiment_name: str) -> str:
    """Return the MD5 hex digest of the feature suffix of an experiment name.

    The suffix is whatever follows the last "FEATS" marker, with one leading
    underscore removed. Presumably this reproduces the `hash` column that
    with_bowen.csv already ships with -- verify against that file.
    """
    suffix = experiment_name.split("FEATS")[-1].removeprefix("_")
    return hashlib.md5(suffix.encode("utf-8")).hexdigest()


# Lexical-only table: drop incomplete rows, strip the threshold suffix from the
# score column names, and derive a `hash` identifier from the experiment name.
lexical = (
    pd.read_csv("lexical_only.csv")
    .dropna()
    .rename(
        columns={
            "Unnamed: 0": "experiment_name",
            "rouge>0.4": "rouge",
            "bertscore>0.8": "bertscore",
            "cosine_sim>0.8": "cosine_sim",
            "entity_sim>0.8": "entity_sim",
            "bertscore_length>0.4": "bertscore_length",
        }
    )
    .assign(hash=lambda frame: frame["experiment_name"].map(_experiment_hash))
)
lexical_bowen = pd.read_csv("with_bowen.csv").dropna()

# NOTE(review): not referenced by any cell visible here -- confirm it is unused
# before removing it.
lexical_features = []

lexical_feats = lexical[["hash", *_SCORE_COLUMNS, "label", "Overall"]]

# Keep the shared columns plus every "<feature>=<value>" indicator column.
indicator_columns = [col for col in lexical_bowen.columns if "=" in col]
lexical_bowen = lexical_bowen[
    ["hash", *_SCORE_COLUMNS, *indicator_columns, "label", "Overall"]
]

# Stack both tables. Indicator columns that only exist in with_bowen.csv come
# out as NaN for the lexical-only rows and are filled with 0.
df = pd.concat([lexical_feats, lexical_bowen]).fillna(0)

## Get proportion of instances that fulfill the conditions

1. For each row, get features that were activated
2. Then for each activated feature, we get the proportion by looking at the feature dataframe.
3. The proportion is computed as: `number_of_instance_that_fulfill_a_single_condition` / `total_number_of_instances`

In [88]:
# Load the per-instance feature set, discard rows with any missing value, and
# renumber the remaining rows from 0.
features_df = (
    pd.read_csv("helpsteer2_featureset.csv")
    .dropna()
    .reset_index(drop=True)
)

In [86]:
# Inspect which columns contain NaNs in the *raw* feature set.
# `features_df` has already had its NaN rows dropped when it was loaded, so
# inspecting it would always produce an empty frame; re-read the CSV instead.
features_raw = pd.read_csv("helpsteer2_featureset.csv")
rows_with_nan = features_raw[features_raw.isna().any(axis=1)]
nan_columns = rows_with_nan.columns[rows_with_nan.isna().any()]
df_nan_columns = rows_with_nan[nan_columns]
df_nan_columns

Instead, we take the binary columns of `df` and, for each one, compute its "weight": the proportion of instances in the feature set that satisfy that column's condition.

In [89]:
# Cut-offs above which a lexical similarity score counts as "activated".
# They match the thresholds embedded in the original column names of
# lexical_only.csv (e.g. "rouge>0.4").
_LEXICAL_THRESHOLDS = {
    "rouge": 0.4,
    "bertscore": 0.8,
    "cosine_sim": 0.8,
    "entity_sim": 0.8,
    "bertscore_length": 0.4,
}


def compute_instances(
    feat: str,
    features_df: "pd.DataFrame",
    thresholds: "dict[str, float] | None" = None,
) -> float:
    """Return the fraction of rows in ``features_df`` that satisfy ``feat``.

    Args:
        feat: Either the name of a thresholded score column (a key of
            ``thresholds``) or a string of the form ``"<column>=<value>"``.
        features_df: Table with one row per instance.
        thresholds: Optional mapping of score column -> cut-off. Defaults to
            the notebook's built-in lexical thresholds.

    Returns:
        ``matching_rows / len(features_df)``, where "matching" depends on the
        feature kind:

        * thresholded score: the score is strictly greater than its cut-off;
        * ``"scalar"``: the column equals ``<value>`` (underscores in
          ``<value>`` are read as spaces);
        * ``"closed_set"``: the column cell, parsed with ``literal_eval``,
          contains ``<value>`` (underscores read as spaces);
        * ``"open_set"``: the column cell, parsed with ``literal_eval``, is
          non-empty (``<value>`` is ignored).

    Raises:
        ValueError: If ``feat`` is not a threshold key and has no ``=``, or if
            ``find_meta_category`` returns a category not handled here.
        ZeroDivisionError: If ``features_df`` has no rows.
    """
    total = len(features_df)
    if thresholds is None:
        thresholds = _LEXICAL_THRESHOLDS

    if feat in thresholds:
        activated = features_df[feat] > thresholds[feat]
        return int(activated.sum()) / total

    # Everything else is encoded as "<column>=<value>".
    feat_name, sep, value = feat.partition("=")
    if not sep:
        raise ValueError(
            f"Cannot parse feature {feat!r}: expected a threshold feature "
            "or '<name>=<value>'"
        )

    meta_category = find_meta_category(feat_name)
    if meta_category == "scalar":
        v = value.replace("_", " ")
        # Default to 0 so a value that never occurs yields 0.0 instead of
        # failing on `None / total`.
        return int(features_df[feat_name].value_counts().get(v, 0)) / total
    if meta_category == "closed_set":
        v = value.replace("_", " ")
        parsed = features_df[feat_name].apply(literal_eval)
        return sum(1 for items in parsed if v in items) / total
    if meta_category == "open_set":
        parsed = features_df[feat_name].apply(literal_eval)
        return sum(1 for items in parsed if items) / total

    # Previously the category itself was returned here, which silently put a
    # non-numeric value into the ratio map.
    raise ValueError(
        f"Unsupported meta category {meta_category!r} for feature {feat!r}"
    )

In [90]:
# Binary columns are those whose every value is 0 or 1.
is_binary = df.isin([0, 1]).all()
feats = df.columns[is_binary]

# Map each binary feature (the label column excluded) to the proportion of
# instances in features_df that satisfy it.
scored_feats = [name for name in feats if name != "label"]
feat_map = {name: compute_instances(name, features_df) for name in scored_feats}

Now, let's use those ratios and create features

In [108]:
def _activations_to_ratio(column):
    """Replace every 1 in ``column`` with that feature's ratio from ``feat_map``.

    ``DataFrame.apply`` passes one *column* at a time here (default axis), so
    ``column.name`` is the feature name. Columns without an entry in
    ``feat_map`` keep their 1s; all other values are left untouched.
    """
    ratio = feat_map.get(column.name, 1)
    return column.map(lambda cell: ratio if cell == 1 else cell)


ratio_df = df.apply(_activations_to_ratio)

In [111]:
# Display the column labels of ratio_df to check which features are present.
ratio_df.columns

Index(['hash', 'rouge', 'bertscore', 'cosine_sim', 'entity_sim',
       'bertscore_length', 'label', 'Overall', 'complexity_of_intents=complex',
       'complexity_of_intents=moderate', 'complexity_of_intents=simple',
       'expertise_level=basic_domain_knowledge',
       'expertise_level=expert_domain_knowledge',
       'expertise_level=general_public', 'format_constraints=1',
       'languages=English', 'open_endedness=high', 'open_endedness=low',
       'open_endedness=moderate', 'open_endedness=no', 'safety_concern=high',
       'safety_concern=low', 'safety_concern=moderate', 'safety_concern=safe',
       'subject_of_expertise=Agriculture', 'subject_of_expertise=Anthropology',
       'subject_of_expertise=Biology', 'subject_of_expertise=Chemistry',
       'subject_of_expertise=Computer_sciences',
       'subject_of_expertise=Culinary_arts',
       'subject_of_expertise=Earth_sciences', 'subject_of_expertise=Economics',
       'subject_of_expertise=Electrical_engineering',
       

# Initial LightGBM training

In [110]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [126]:
# Everything except the identifier and the two target-like columns is a model input.
excluded_columns = ["hash", "Overall", "label"]
X = ratio_df.drop(columns=excluded_columns)
y = ratio_df["Overall"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = lgb.Dataset(X_train, label=y_train)
# The evaluation set is declared with the training set as its reference.
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [127]:
# Gradient-boosted regression trees scored with mean squared error.
params = dict(
    objective="regression",
    metric="mse",
    boosting_type="gbdt",
    learning_rate=0.1,
    num_leaves=31,
)

# Fit on the training split; the held-out split doubles as the validation set.
model = lgb.train(params, train_data, valid_sets=[test_data])

# Score the held-out split.
# NOTE(review): no early stopping is configured in this cell, so confirm that
# `best_iteration` selects the intended number of trees.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002789 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16
[LightGBM] [Info] Number of data points in the train set: 127, number of used features: 8
[LightGBM] [Info] Start training from score 0.686147
Mean Squared Error: 0.00037589768804514254


In [128]:
# Gain-based importance; the other option LightGBM offers is "split".
importances = model.feature_importance(importance_type="gain")

# Pair each input column with its importance, most important first.
feature_importance_df = pd.DataFrame(
    {"Feature": X.columns, "Importance": importances}
)
feature_importance_df = feature_importance_df.sort_values(
    by="Importance", ascending=False
)

print(feature_importance_df)

                                              Feature  Importance
3                                          entity_sim    0.035810
54                      type_of_in_context_material=1    0.035582
12                                  languages=English    0.009497
2                                          cosine_sim    0.008946
11                               format_constraints=1    0.002425
4                                    bertscore_length    0.001839
0                                               rouge    0.000101
7                        complexity_of_intents=simple    0.000000
8              expertise_level=basic_domain_knowledge    0.000000
33                       subject_of_expertise=History    0.000000
34  subject_of_expertise=Human_physical_performanc...    0.000000
35                    subject_of_expertise=Journalism    0.000000
36      subject_of_expertise=Linguistics_and_language    0.000000
37                    subject_of_expertise=Literature    0.000000
38        