To demonstrate the ``CPP().eval()`` method, we load the ``DOM_GSEC_PU`` example dataset and its respective feature set (see [Breimann24c]_):

In [14]:
# Import the package and switch its "verbose" option off for this example
import aaanalysis as aa
aa.options["verbose"] = False
# Load the DOM_GSEC_PU example dataset and keep its "label" column as a plain list
df_seq = aa.load_dataset(name="DOM_GSEC_PU", n=50)
labels = df_seq["label"].to_list()
# Derive the sequence parts from the loaded sequences
sf = aa.SequenceFeature()
df_parts = sf.get_df_parts(df_seq=df_seq)
# Scale categories ("scales_cat") and the scales returned by default; both are reused by later cells
df_cat = aa.load_scales(name="scales_cat")
df_scales = aa.load_scales()
# Feature set loaded with default arguments; it is the first set passed to CPP().eval() below
df_feat_best = aa.load_features()

We can now create feature sets using the ``CPP().run()`` method:

In [2]:
# Use all scales (no ``df_scales`` is passed to CPP here)
cpp = aa.CPP(df_parts=df_parts)
# Run CPP with the dataset labels to obtain the feature set for this configuration
df_feat_all_scales = cpp.run(labels=labels, label_ref=2)

In [3]:
# Restrict CPP to the scales whose category is "Conformation"
scales_conformation = (
    df_cat.loc[df_cat["category"] == "Conformation", "scale_id"]
    .to_list()
)
cpp = aa.CPP(
    df_parts=df_parts,
    df_scales=df_scales[scales_conformation],
)
df_feat_conformation = cpp.run(labels=labels, label_ref=2)

In [4]:
# Restrict CPP to the scales whose category is "Energy"
scales_energy = (
    df_cat.loc[df_cat["category"] == "Energy", "scale_id"]
    .to_list()
)
cpp = aa.CPP(
    df_parts=df_parts,
    df_scales=df_scales[scales_energy],
)
df_feat_energy = cpp.run(labels=labels, label_ref=2)

In [5]:
# Restrict CPP to the scales whose category is "Polarity"
scales_polarity = (
    df_cat.loc[df_cat["category"] == "Polarity", "scale_id"]
    .to_list()
)
cpp = aa.CPP(
    df_parts=df_parts,
    df_scales=df_scales[scales_polarity],
)
df_feat_polarity = cpp.run(labels=labels, label_ref=2)

These sets can be evaluated using the ``CPP().eval()`` method, which needs the list of feature DataFrames (``list_df_feat``) and ``labels`` as input:

In [15]:
# Feature sets to evaluate: the loaded feature set first, then the four sets created above
list_df_feat = [
    df_feat_best,
    df_feat_all_scales,
    df_feat_conformation,
    df_feat_energy,
    df_feat_polarity,
]

# Create new CPP object with all scales
cpp = aa.CPP(df_parts=df_parts, df_scales=df_scales)

# Evaluate every feature set against the same labels and show the resulting table
df_eval = cpp.eval(
    list_df_feat=list_df_feat,
    labels=labels,
    label_ref=2,
    min_th=-0.3,
)
aa.display_df(df_eval)

Unnamed: 0,name,n_features,avg_ABS_AUC,max_ABS_AUC,avg_MEAN_DIF,avg_STD_TEST,n_clusters,avg_n_feat_per_clust,std_n_feat_per_clust
1,Set 1,"(150, [18, 0, 56, 27, 0, 16, 17, 16])",0.164,0.244,"(0.083, -0.08)",0.13,42,3.57,3.08
2,Set 2,"(100, [11, 9, 28, 14, 12, 14, 7, 5])",0.251,0.32,"(0.114, -0.105)",0.115,22,4.55,2.31
3,Set 3,"(100, [0, 0, 100, 0, 0, 0, 0, 0])",0.209,0.293,"(0.104, -0.095)",0.123,19,5.26,4.19
4,Set 4,"(53, [0, 0, 0, 53, 0, 0, 0, 0])",0.188,0.32,"(0.096, -0.089)",0.132,21,2.52,1.56
5,Set 5,"(60, [0, 0, 0, 0, 0, 60, 0, 0])",0.182,0.305,"(0.098, -0.094)",0.132,10,6.0,4.12


The feature sets can be named using the ``names_feature_sets`` parameter:

In [16]:
# One display name per feature set, given in the same order as ``list_df_feat``
names_feature_sets = [
    "Best features",
    "All scales",
    "Conformation",
    "Energy",
    "Polarity",
]
df_eval = cpp.eval(
    list_df_feat=list_df_feat,
    labels=labels,
    label_ref=2,
    names_feature_sets=names_feature_sets,
    min_th=-0.3,
)
aa.display_df(df_eval)

Unnamed: 0,name,n_features,avg_ABS_AUC,max_ABS_AUC,avg_MEAN_DIF,avg_STD_TEST,n_clusters,avg_n_feat_per_clust,std_n_feat_per_clust
1,Best features,"(150, [18, 0, 56, 27, 0, 16, 17, 16])",0.164,0.244,"(0.083, -0.08)",0.13,42,3.57,3.08
2,All scales,"(100, [11, 9, 28, 14, 12, 14, 7, 5])",0.251,0.32,"(0.114, -0.105)",0.115,22,4.55,2.31
3,Conformation,"(100, [0, 0, 100, 0, 0, 0, 0, 0])",0.209,0.293,"(0.104, -0.095)",0.123,19,5.26,4.19
4,Energy,"(53, [0, 0, 0, 53, 0, 0, 0, 0])",0.188,0.32,"(0.096, -0.089)",0.132,21,2.52,1.56
5,Polarity,"(60, [0, 0, 0, 0, 0, 60, 0, 0])",0.182,0.305,"(0.098, -0.094)",0.132,10,6.0,4.12


To compare feature sets with different sets of parts, provide a list of part DataFrames (``list_df_parts``) that matches the list of feature DataFrames one-to-one:

In [19]:
# Split settings restricted to the "Segment" split type (n_split_max=5)
split_kws = sf.get_split_kws(split_types=["Segment"], n_split_max=5)
# Load one of the provided top scale datasets
df_scales = aa.load_scales(top60_n=38)

# Build one feature set per part and keep the matching parts DataFrame next to it
list_parts = ["tmd", "tmd_jmd", "jmd_n_tmd_n", "tmd_c_jmd_c"]
list_df_feat, list_df_parts = [], []
for part in list_parts:
    df_parts = sf.get_df_parts(df_seq=df_seq, list_parts=part)
    cpp = aa.CPP(
        df_parts=df_parts,
        split_kws=split_kws,
        df_scales=df_scales,
    )
    # NOTE(review): max_overlap=1 / max_cor=1 presumably switch off redundancy
    # filtering (the following cell calls these features "unfiltered") - confirm
    df_feat = cpp.run(labels=labels, label_ref=2, max_overlap=1, max_cor=1)
    list_df_parts.append(df_parts)
    list_df_feat.append(df_feat)

In [22]:
# Create evaluation for unfiltered features; each feature set is passed
# together with the parts DataFrame it was created from
df_eval = cpp.eval(
    list_df_feat=list_df_feat,
    labels=labels,
    label_ref=2,
    names_feature_sets=list_parts,
    list_df_parts=list_df_parts,
)
aa.display_df(df_eval)

Unnamed: 0,name,n_features,avg_ABS_AUC,max_ABS_AUC,avg_MEAN_DIF,avg_STD_TEST,n_clusters,avg_n_feat_per_clust,std_n_feat_per_clust
1,tmd,"(100, [9, 16, 28, 2, 10, 12, 10, 13])",0.139,0.21,"(0.055, -0.057)",0.108,22,4.55,2.25
2,tmd_jmd,"(100, [11, 13, 18, 14, 5, 23, 5, 11])",0.165,0.275,"(0.056, -0.053)",0.087,32,3.12,1.67
3,jmd_n_tmd_n,"(100, [14, 10, 25, 5, 10, 17, 9, 10])",0.148,0.246,"(0.054, -0.061)",0.106,42,2.38,1.41
4,tmd_c_jmd_c,"(100, [13, 17, 29, 18, 1, 17, 0, 5])",0.165,0.32,"(0.074, -0.07)",0.121,55,1.82,0.97
