To demonstrate the ``CPP().eval()`` method, we load the ``DOM_GSEC_PU`` example dataset and its respective feature set (see [Breimann25a]_):

In [1]:
import aaanalysis as aa
# Switch off verbose output via the package-level options
aa.options["verbose"] = False
# Scales, their category annotation, and the example feature set
df_scales = aa.load_scales()
df_cat = aa.load_scales(name="scales_cat")
df_feat_best = aa.load_features()
# Example sequences (n=50) together with their labels
df_seq = aa.load_dataset(name="DOM_GSEC_PU", n=50)
labels = df_seq["label"].tolist()
# Sequence parts obtained from the loaded sequences
sf = aa.SequenceFeature()
df_parts = sf.get_df_parts(df_seq=df_seq)

We can now create feature sets using the ``CPP().run()`` method:

In [2]:
# Use all scales: no ``df_scales`` argument is passed to CPP in this cell
cpp = aa.CPP(df_parts=df_parts)
df_feat_all_scales = cpp.run(labels=labels, label_ref=2)

In [3]:
# Restrict CPP to the scales whose category is 'Conformation'
scales_conformation = df_cat.loc[df_cat["category"] == "Conformation", "scale_id"].to_list()
cpp = aa.CPP(df_scales=df_scales[scales_conformation], df_parts=df_parts)
df_feat_conformation = cpp.run(labels=labels, label_ref=2)

In [4]:
# Restrict CPP to the scales whose category is 'Energy'
scales_energy = df_cat.loc[df_cat["category"] == "Energy", "scale_id"].to_list()
cpp = aa.CPP(df_scales=df_scales[scales_energy], df_parts=df_parts)
df_feat_energy = cpp.run(labels=labels, label_ref=2)

In [5]:
# Restrict CPP to the scales whose category is 'Polarity'
scales_polarity = df_cat.loc[df_cat["category"] == "Polarity", "scale_id"].to_list()
cpp = aa.CPP(df_scales=df_scales[scales_polarity], df_parts=df_parts)
df_feat_polarity = cpp.run(labels=labels, label_ref=2)

These sets can be evaluated using the ``CPP().eval()`` method, which needs the list of feature DataFrames (``list_df_feat``) and ``labels`` as input:

In [6]:
# Create a new CPP object with all scales and evaluate the five feature sets
cpp = aa.CPP(df_parts=df_parts, df_scales=df_scales)
list_df_feat = [
    df_feat_best,
    df_feat_all_scales,
    df_feat_conformation,
    df_feat_energy,
    df_feat_polarity,
]
df_eval = cpp.eval(list_df_feat=list_df_feat, labels=labels, label_ref=2)
aa.display_df(df_eval)

Unnamed: 0,name,n_features,avg_ABS_AUC,range_ABS_AUC,avg_MEAN_DIF,n_clusters,avg_n_feat_per_clust,std_n_feat_per_clust
1,Set 1,"(150, [18, 0, 56, 27, 0, 16, 17, 16])",0.164,"[0.126, 0.142, 0.162, 0.181, 0.244]","(np.float64(0.083), np.float64(-0.08))",21,7.14,5.1
2,Set 2,"(100, [11, 9, 28, 14, 12, 14, 7, 5])",0.251,"[0.224, 0.238, 0.248, 0.264, 0.32]","(np.float64(0.114), np.float64(-0.105))",13,7.69,5.78
3,Set 3,"(100, [0, 0, 100, 0, 0, 0, 0, 0])",0.209,"[0.17, 0.183, 0.206, 0.229, 0.293]","(np.float64(0.104), np.float64(-0.095))",10,10.0,5.37
4,Set 4,"(53, [0, 0, 0, 53, 0, 0, 0, 0])",0.188,"[0.082, 0.153, 0.186, 0.225, 0.32]","(np.float64(0.096), np.float64(-0.089))",3,17.67,3.86
5,Set 5,"(60, [0, 0, 0, 0, 0, 60, 0, 0])",0.182,"[0.044, 0.142, 0.178, 0.222, 0.305]","(np.float64(0.098), np.float64(-0.094))",8,7.5,3.5


The feature sets can be named using the ``names_feature_sets`` parameter:

In [7]:
# One display name per feature set, in the same order as list_df_feat
names_feature_sets = [
    "Best features",
    "All scales",
    "Conformation",
    "Energy",
    "Polarity",
]
df_eval = cpp.eval(
    list_df_feat=list_df_feat,
    labels=labels,
    label_ref=2,
    names_feature_sets=names_feature_sets,
)
aa.display_df(df_eval)

Unnamed: 0,name,n_features,avg_ABS_AUC,range_ABS_AUC,avg_MEAN_DIF,n_clusters,avg_n_feat_per_clust,std_n_feat_per_clust
1,Best features,"(150, [18, 0, 56, 27, 0, 16, 17, 16])",0.164,"[0.126, 0.142, 0.162, 0.181, 0.244]","(np.float64(0.083), np.float64(-0.08))",24,6.25,4.58
2,All scales,"(100, [11, 9, 28, 14, 12, 14, 7, 5])",0.251,"[0.224, 0.238, 0.248, 0.264, 0.32]","(np.float64(0.114), np.float64(-0.105))",13,7.69,4.19
3,Conformation,"(100, [0, 0, 100, 0, 0, 0, 0, 0])",0.209,"[0.17, 0.183, 0.206, 0.229, 0.293]","(np.float64(0.104), np.float64(-0.095))",8,12.5,5.81
4,Energy,"(53, [0, 0, 0, 53, 0, 0, 0, 0])",0.188,"[0.082, 0.153, 0.186, 0.225, 0.32]","(np.float64(0.096), np.float64(-0.089))",3,17.67,9.03
5,Polarity,"(60, [0, 0, 0, 0, 0, 60, 0, 0])",0.182,"[0.044, 0.142, 0.178, 0.222, 0.305]","(np.float64(0.098), np.float64(-0.094))",11,5.45,2.93


The evaluation can be focused on specific scale categories using the ``list_cat`` parameter:

In [8]:
# Evaluate the same feature sets, focused on three scale categories
df_eval = cpp.eval(
    list_df_feat=list_df_feat,
    labels=labels,
    label_ref=2,
    list_cat=["Conformation", "Energy", "Polarity"],
)
aa.display_df(df_eval)

Unnamed: 0,name,n_features,avg_ABS_AUC,range_ABS_AUC,avg_MEAN_DIF,n_clusters,avg_n_feat_per_clust,std_n_feat_per_clust
1,Set 1,"(99, [56, 27, 16])",0.165,"[0.126, 0.142, 0.165, 0.181, 0.244]","(np.float64(0.083), np.float64(-0.079))",17,5.82,4.12
2,Set 2,"(56, [28, 14, 14])",0.252,"[0.224, 0.234, 0.248, 0.266, 0.32]","(np.float64(0.114), np.float64(-0.106))",9,6.22,3.08
3,Set 3,"(100, [100, 0, 0])",0.209,"[0.17, 0.183, 0.206, 0.229, 0.293]","(np.float64(0.104), np.float64(-0.095))",12,8.33,5.53
4,Set 4,"(53, [0, 53, 0])",0.188,"[0.082, 0.153, 0.186, 0.225, 0.32]","(np.float64(0.096), np.float64(-0.089))",5,10.6,5.71
5,Set 5,"(60, [0, 0, 60])",0.182,"[0.044, 0.142, 0.178, 0.222, 0.305]","(np.float64(0.098), np.float64(-0.094))",8,7.5,3.43


To compare feature sets created from different sets of parts, provide a list of part DataFrames (``list_df_parts``) that matches the list of feature DataFrames:

In [9]:
# Only 'Segment' splits, with n_split_max=5
split_kws = sf.get_split_kws(split_types=["Segment"], n_split_max=5)
# Load one of the provided top scale datasets
df_scales = aa.load_scales(top60_n=38)
list_parts = ["tmd", "tmd_jmd", "jmd_n_tmd_n", "tmd_c_jmd_c"]
# Collect one part DataFrame and one feature DataFrame per part, in matching order
list_df_parts = []
list_df_feat1 = []
for part in list_parts:
    df_parts = sf.get_df_parts(df_seq=df_seq, list_parts=part)
    cpp = aa.CPP(df_parts=df_parts, df_scales=df_scales, split_kws=split_kws)
    df_feat = cpp.run(labels=labels, label_ref=2, max_overlap=1, max_cor=1)
    list_df_parts.append(df_parts)
    list_df_feat1.append(df_feat)

In [10]:
# Create evaluation for unfiltered features; each feature set is paired
# with the part DataFrame at the same position in list_df_parts
df_eval = cpp.eval(
    list_df_feat=list_df_feat1,
    labels=labels,
    label_ref=2,
    names_feature_sets=list_parts,
    list_df_parts=list_df_parts,
)
aa.display_df(df_eval)

Unnamed: 0,name,n_features,avg_ABS_AUC,range_ABS_AUC,avg_MEAN_DIF,n_clusters,avg_n_feat_per_clust,std_n_feat_per_clust
1,tmd,"(100, [9, 16, 28, 2, 10, 12, 10, 13])",0.139,"[0.067, 0.115, 0.142, 0.162, 0.21]","(np.float64(0.055), np.float64(-0.057))",11,9.09,3.34
2,tmd_jmd,"(100, [11, 13, 18, 14, 5, 23, 5, 11])",0.165,"[0.092, 0.135, 0.161, 0.19, 0.275]","(np.float64(0.056), np.float64(-0.053))",23,4.35,2.08
3,jmd_n_tmd_n,"(100, [14, 10, 25, 5, 10, 17, 9, 10])",0.148,"[0.077, 0.122, 0.143, 0.17, 0.246]","(np.float64(0.054), np.float64(-0.061))",10,10.0,7.71
4,tmd_c_jmd_c,"(100, [13, 17, 29, 18, 1, 17, 0, 5])",0.165,"[0.077, 0.134, 0.162, 0.193, 0.32]","(np.float64(0.074), np.float64(-0.07))",14,7.14,3.66
