# scripts/generate_feature_dataframe.py

In [80]:
import joblib
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm

In [68]:
def vars_values_for_a_line(holder, i, variables, len_sub_variables):
    """Flatten the deliverable of one holder entry into a single list.

    Args:
        holder: Container indexable by ``i``; each entry exposes a
            ``"deliverable"`` item.
        i: Key of the entry to read from ``holder``.
        variables: Names of the deliverable items to read, in output order.
        len_sub_variables: Number of NaN placeholders to return when the
            entry has no (falsy) deliverable.

    Returns:
        A list with the contents of ``deliverable[name][0].tolist()`` for
        every name in ``variables``, concatenated in order. When the
        deliverable is falsy, a list of ``len_sub_variables`` NaN values.
    """
    deliverable = holder[i]["deliverable"]
    if deliverable:
        # Only the first row of each stored value is kept.
        return [
            value
            for name in variables
            for value in deliverable[name][0].tolist()
        ]
    # No deliverable for this entry: pad so the row keeps the expected width.
    return [np.nan] * len_sub_variables


def generate_deliverable_variables(holder):
    """Derive the variable names and flattened column names of a holder.

    The schema is read from ``holder[0]["deliverable"]``. If that entry has
    no (falsy) deliverable, the first entry of ``holder`` with a non-empty
    deliverable is used instead, so that one missing sample at key ``0``
    does not wipe out every feature column.

    Args:
        holder: Container indexable by its own iteration keys (and by
            ``0``); each entry exposes a ``"deliverable"`` item mapping a
            variable name to an object with a ``shape`` attribute.

    Returns:
        A ``(variables, sub_variables)`` tuple of lists. ``variables`` holds
        the deliverable keys in their stored order; ``sub_variables`` holds
        one ``"{var}_{j}"`` name per position along the last axis of each
        variable. Both lists are empty when no entry has a deliverable.
    """
    reference = holder[0]["deliverable"]
    if not reference:
        # Same emptiness test as vars_values_for_a_line: look for any entry
        # that can actually describe the schema.
        reference = next(
            (
                holder[key]["deliverable"]
                for key in holder
                if holder[key]["deliverable"]
            ),
            None,
        )
    if not reference:
        return [], []

    variables = list(reference.keys())
    sub_variables = [
        f"{var}_{j}"
        for var in variables
        for j in range(reference[var].shape[-1])
    ]
    return variables, sub_variables


def generate_vars_df(holder):
    """Build the DataFrame of flattened deliverable values for a holder.

    Args:
        holder: Container whose iteration yields the keys of its entries.

    Returns:
        A ``pandas.DataFrame`` with one row per key of ``holder`` (in
        iteration order) and one column per name returned by
        ``generate_deliverable_variables``.
    """
    variables, sub_variables = generate_deliverable_variables(holder)
    width = len(sub_variables)
    # tqdm only wraps the iteration to show progress.
    rows = [
        vars_values_for_a_line(holder, key, variables, width)
        for key in tqdm(holder)
    ]
    return pd.DataFrame(np.array(rows), columns=sub_variables)


def generate_index_df(holder):
    """Build the DataFrame of identifying columns for a holder.

    Args:
        holder: Container whose iteration yields the keys of its entries;
            each entry exposes ``"unique_id"`` and ``"primary_key"`` items.

    Returns:
        A ``pandas.DataFrame`` with one row per key of ``holder``. The first
        column is the unique id, the remaining ones come from
        ``primary_key``, which must therefore supply seven values per entry
        to match the eight column names below.
    """
    column_names = [
        "unique_id",
        "attack_name",
        "attack_toolchain",
        "attack_id",
        "scenario",
        "target_model",
        "target_model_dataset",
        "attack_id_bis",
    ]
    # Ids become a single column so they can be stacked beside the key fields.
    id_column = np.array([holder[key]["unique_id"] for key in holder]).reshape(-1, 1)
    key_columns = np.array([holder[key]["primary_key"] for key in holder])
    stacked = np.hstack([id_column, key_columns])
    return pd.DataFrame(stacked, columns=column_names)


def get_df_from_holder(holder):
    """Assemble the complete feature DataFrame for a holder.

    Args:
        holder: Container accepted by ``generate_index_df`` and
            ``generate_vars_df``.

    Returns:
        A ``pandas.DataFrame`` with the identifying columns on the left and
        the flattened deliverable columns on the right, joined column-wise.
    """
    frames = [generate_index_df(holder), generate_vars_df(holder)]
    return pd.concat(frames, axis=1)

In [69]:
# Load the serialized holder from disk; it is turned into a DataFrame below.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
holder = joblib.load(
    "../data_tcab/reprs/samplewise/fr+small_distilcamembert_allocine_ALL.joblib"
)

In [70]:
# Build the main feature frame: identifying columns + flattened deliverables.
df = get_df_from_holder(holder)

  0%|          | 0/28913 [00:00<?, ?it/s]

In [71]:
# Load the second holder (canine file), later used to add "canine_" columns.
# The path uses forward slashes only: the previous backslash-separated form
# resolved on Windows only, while "/" works on both Windows and POSIX.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
canine_holder = joblib.load(
    "../data_tcab/reprs/samplewise/fr+canine_distilcamembert_allocine_ALL_TP.joblib"
)

In [72]:
# Build the feature frame of the canine holder with the same pipeline as df.
canine_df = get_df_from_holder(canine_holder)

  0%|          | 0/28913 [00:00<?, ?it/s]

In [73]:
# Index the canine frame by unique_id so it can be joined on that key.
canine_df = canine_df.set_index("unique_id")
# Keep only the columns whose name starts with "tp" ...
canine_df = canine_df[
    [name for name in canine_df.columns if name.startswith("tp")]
]
# ... and mark their origin so they cannot clash with the columns of df.
canine_df = canine_df.add_prefix("canine_")

In [84]:
# Left join: every row of df is kept; rows whose unique_id is absent from
# the canine index get NaN in the canine_* columns.
df_with_canine = df.merge(
    canine_df,
    how="left",
    left_on="unique_id",
    right_index=True,
)

In [88]:
# Sanity check: number of rows where this canine feature is NaN after the
# left merge (unmatched unique_id, or a NaN already present in canine_df).
df_with_canine["canine_tp_num_lowercase_after_punctuation_0"].isna().sum()

1

In [75]:
# Persist the main feature frame (without canine columns) as a pickle file.
df.to_pickle("../data_tcab/whole_feature_dataset.pickle")

In [76]:
# Persist the merged frame (main features + canine_* columns) as a pickle file.
df_with_canine.to_pickle("../data_tcab/whole_feature_dataset_with_canine.pickle")

In [79]:
# Display the main feature frame as the cell output.
df

Unnamed: 0,unique_id,attack_name,attack_toolchain,attack_id,scenario,target_model,target_model_dataset,attack_id_bis,tp_avg_word_length_0,tp_avg_word_length_1,...,tm_gradient_2073,tm_gradient_2074,tm_gradient_2075,tm_posterior_0,tm_posterior_1,tm_saliency_0,tm_saliency_1,tm_saliency_2,tm_saliency_3,tm_saliency_4
0,166201344496741554700726886801777804838,clean,none,0,sentiment,distilcamembert,allocine,0,6.000000,6.500000,...,0.459151,-2.980232e-08,1.998116e+00,0.000471,0.999529,0.018868,0.000557,0.002822,0.010654,0.022047
1,209587608850316071685484207225827196902,clean,none,1,sentiment,distilcamembert,allocine,1,4.333333,6.888889,...,0.076857,1.490116e-08,2.879451e-01,0.620563,0.379437,0.010417,0.000237,0.001063,0.004222,0.012058
2,2212791274013000891344938268017941685,clean,none,2,sentiment,distilcamembert,allocine,2,4.333333,2.888889,...,0.302853,-2.980232e-08,1.975316e+00,0.006190,0.993810,0.066667,0.006728,0.024232,0.039765,0.062128
3,117702114055223057842675184853669073226,clean,none,3,sentiment,distilcamembert,allocine,3,3.857143,6.408163,...,0.000131,3.195601e-08,1.502552e-07,0.999726,0.000274,0.024390,0.000841,0.007422,0.012783,0.025159
4,214365221041725899015601930476588894361,clean,none,4,sentiment,distilcamembert,allocine,4,4.000000,6.000000,...,0.000149,-1.001172e-08,2.015418e-07,0.999683,0.000317,0.034483,0.000998,0.012688,0.023816,0.053114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28908,178870397565426230261279084696916458885,textfooler,textattack,2063,sentiment,distilcamembert,allocine,2063,4.500000,6.704545,...,0.091875,1.490116e-08,4.392960e-01,0.531334,0.468666,0.007634,0.000100,0.002760,0.004835,0.008063
28909,297009786167894233980202763751134804379,textfooler,textattack,2064,sentiment,distilcamembert,allocine,2064,4.647059,9.561707,...,0.077698,-1.490116e-08,2.199249e-01,0.668394,0.331606,0.003300,0.000016,0.000643,0.001924,0.004340
28910,328367081408891296663448180370005165130,textfooler,textattack,2065,sentiment,distilcamembert,allocine,2065,5.720930,9.038399,...,0.085552,-5.960464e-08,3.524275e-01,0.580222,0.419778,0.004082,0.000038,0.000741,0.002107,0.004601
28911,48183569796911915420194851451235655871,textfooler,textattack,2066,sentiment,distilcamembert,allocine,2066,4.187500,3.152344,...,0.109585,-2.980232e-08,5.981153e-01,0.453138,0.546862,0.011765,0.000153,0.003648,0.007564,0.016643
