To demonstrate the ``SequenceFeature().get_df_feat()`` method, we load the ``DOM_GSEC`` example dataset along with its corresponding features (see [Breimann24c]_):

In [14]:
import aaanalysis as aa
# Switch off verbose output for this example
aa.options["verbose"] = False
# Load the DOM_GSEC sequences and the matching feature DataFrame
df_seq = aa.load_dataset(name="DOM_GSEC")
df_feat = aa.load_features(name="DOM_GSEC")
# Per-sample labels and feature identifiers as plain lists
labels = df_seq["label"].to_list()
features = df_feat["feature"].to_list()
# Derive the sequence-parts DataFrame from the sequence DataFrame
sf = aa.SequenceFeature()
df_parts = sf.get_df_parts(df_seq=df_seq)
# Show the first rows of the loaded feature DataFrame
aa.display_df(df_feat, n_rows=5)


Unnamed: 0,feature,category,subcategory,scale_name,scale_description,abs_auc,abs_mean_dif,mean_dif,std_test,std_ref,p_val_mann_whitney,p_val_fdr_bh,positions,feat_importance,feat_importance_std
1,TMD_C_JMD_C-...)-KLEP840101,Energy,Charge,Charge,"Net charge (...t al., 1984)",0.244,0.104,0.104,0.107,0.111,0.0,0.0,3132333435,0.97,1.439
2,TMD_C_JMD_C-...)-FINA910104,Conformation,α-helix (C-cap),α-helix termination,"Helix termin...t al., 1991)",0.243,0.085,0.085,0.099,0.097,0.0,0.0,3132333435,0.0,0.0
3,TMD_C_JMD_C-...)-LEVM760105,Shape,Side chain length,Side chain length,"Radius of gy...evitt, 1976)",0.233,0.137,0.137,0.162,0.177,0.0,0.0,3233,1.555,2.11
4,TMD_C_JMD_C-...)-HUTJ700102,Energy,Entropy,Entropy,"Absolute ent...chens, 1970)",0.229,0.098,0.098,0.107,0.125,0.0,0.0,3132333435,3.111,3.11
5,TMD_C_JMD_C-...)-RADA880106,ASA/Volume,Volume,Accessible s...e area (ASA),"Accessible s...enden, 1988)",0.223,0.095,0.095,0.115,0.133,0.0,0.0,3233,0.0,0.0


To retrieve the feature DataFrame, you must provide ``features``, ``df_parts``, and the ``labels`` of the samples in the sequence DataFrame:

In [15]:
# Negative samples serve as the reference dataset here (Breimann24c used unlabeled ones),
# which is why the mean difference values are higher
df_feat = sf.get_df_feat(
    features=features,
    df_parts=df_parts,
    labels=labels,
)
aa.display_df(df_feat, n_rows=5)

Unnamed: 0,feature,category,subcategory,scale_name,scale_description,abs_auc,abs_mean_dif,mean_dif,std_test,std_ref,p_val_mann_whitney,p_val_fdr_bh,positions
1,TMD_C_JMD_C-...)-KLEP840101,Energy,Charge,Charge,"Net charge (...t al., 1984)",0.335,0.168,0.168,0.107,0.125,0.0,0.0,3132333435
2,TMD_C_JMD_C-...)-FINA910104,Conformation,α-helix (C-cap),α-helix termination,"Helix termin...t al., 1991)",0.333,0.151,0.151,0.099,0.12,0.0,0.0,3132333435
3,TMD_C_JMD_C-...)-LEVM760105,Shape,Side chain length,Side chain length,"Radius of gy...evitt, 1976)",0.33,0.247,0.247,0.162,0.197,0.0,0.0,3233
4,TMD_C_JMD_C-...)-HUTJ700102,Energy,Entropy,Entropy,"Absolute ent...chens, 1970)",0.327,0.162,0.162,0.107,0.135,0.0,0.0,3132333435
5,TMD_C_JMD_C-...)-RADA880106,ASA/Volume,Volume,Accessible s...e area (ASA),"Accessible s...enden, 1988)",0.322,0.184,0.184,0.115,0.165,0.0,0.0,3233


You can specify which label values denote the test group and the reference group using ``label_test`` and ``label_ref``:

In [16]:
# Swapping test and reference labels flips the sign of the mean difference values, since they
# are computed as the mean of the test group minus the mean of the reference group
df_feat = sf.get_df_feat(
    features=features,
    df_parts=df_parts,
    labels=labels,
    label_test=0,
    label_ref=1,
)
aa.display_df(df_feat, n_rows=5)

Unnamed: 0,feature,category,subcategory,scale_name,scale_description,abs_auc,abs_mean_dif,mean_dif,std_test,std_ref,p_val_mann_whitney,p_val_fdr_bh,positions
1,TMD_C_JMD_C-...)-KLEP840101,Energy,Charge,Charge,"Net charge (...t al., 1984)",0.335,0.168,-0.168,0.125,0.107,0.0,0.0,3132333435
2,TMD_C_JMD_C-...)-FINA910104,Conformation,α-helix (C-cap),α-helix termination,"Helix termin...t al., 1991)",0.333,0.151,-0.151,0.12,0.099,0.0,0.0,3132333435
3,TMD_C_JMD_C-...)-LEVM760105,Shape,Side chain length,Side chain length,"Radius of gy...evitt, 1976)",0.33,0.247,-0.247,0.197,0.162,0.0,0.0,3233
4,TMD_C_JMD_C-...)-HUTJ700102,Energy,Entropy,Entropy,"Absolute ent...chens, 1970)",0.327,0.162,-0.162,0.135,0.107,0.0,0.0,3132333435
5,TMD_C_JMD_C-...)-RADA880106,ASA/Volume,Volume,Accessible s...e area (ASA),"Accessible s...enden, 1988)",0.322,0.184,-0.184,0.165,0.115,0.0,0.0,3233


The residue positions can be adjusted using the ``start``, ``tmd_len``, ``jmd_n_len``, and ``jmd_c_len`` parameters:

In [17]:
# start=11 shifts all reported residue positions by 10
df_feat = sf.get_df_feat(
    features=features,
    df_parts=df_parts,
    labels=labels,
    start=11,
)
aa.display_df(df_feat, n_rows=5)

Unnamed: 0,feature,category,subcategory,scale_name,scale_description,abs_auc,abs_mean_dif,mean_dif,std_test,std_ref,p_val_mann_whitney,p_val_fdr_bh,positions
1,TMD_C_JMD_C-...)-KLEP840101,Energy,Charge,Charge,"Net charge (...t al., 1984)",0.335,0.168,0.168,0.107,0.125,0.0,0.0,4142434445
2,TMD_C_JMD_C-...)-FINA910104,Conformation,α-helix (C-cap),α-helix termination,"Helix termin...t al., 1991)",0.333,0.151,0.151,0.099,0.12,0.0,0.0,4142434445
3,TMD_C_JMD_C-...)-LEVM760105,Shape,Side chain length,Side chain length,"Radius of gy...evitt, 1976)",0.33,0.247,0.247,0.162,0.197,0.0,0.0,4243
4,TMD_C_JMD_C-...)-HUTJ700102,Energy,Entropy,Entropy,"Absolute ent...chens, 1970)",0.327,0.162,0.162,0.107,0.135,0.0,0.0,4142434445
5,TMD_C_JMD_C-...)-RADA880106,ASA/Volume,Volume,Accessible s...e area (ASA),"Accessible s...enden, 1988)",0.322,0.184,0.184,0.115,0.165,0.0,0.0,4243


In [18]:
# Use a TMD length of 50 instead of 20
df_feat = sf.get_df_feat(
    features=features,
    df_parts=df_parts,
    labels=labels,
    tmd_len=50,
)
aa.display_df(df_feat, n_rows=5)

Unnamed: 0,feature,category,subcategory,scale_name,scale_description,abs_auc,abs_mean_dif,mean_dif,std_test,std_ref,p_val_mann_whitney,p_val_fdr_bh,positions
1,TMD_C_JMD_C-...)-KLEP840101,Energy,Charge,Charge,"Net charge (...t al., 1984)",0.335,0.168,0.168,0.107,0.125,0.0,0.0,"53,54,55,56,...,58,59,60,61"
2,TMD_C_JMD_C-...)-FINA910104,Conformation,α-helix (C-cap),α-helix termination,"Helix termin...t al., 1991)",0.333,0.151,0.151,0.099,0.12,0.0,0.0,"53,54,55,56,...,58,59,60,61"
3,TMD_C_JMD_C-...)-LEVM760105,Shape,Side chain length,Side chain length,"Radius of gy...evitt, 1976)",0.33,0.247,0.247,0.162,0.197,0.0,0.0,55565758
4,TMD_C_JMD_C-...)-HUTJ700102,Energy,Entropy,Entropy,"Absolute ent...chens, 1970)",0.327,0.162,0.162,0.107,0.135,0.0,0.0,"53,54,55,56,...,58,59,60,61"
5,TMD_C_JMD_C-...)-RADA880106,ASA/Volume,Volume,Accessible s...e area (ASA),"Accessible s...enden, 1988)",0.322,0.184,0.184,0.115,0.165,0.0,0.0,55565758


A t-test can be used instead of the Mann-Whitney U test by setting ``parametric=True``:

In [19]:
# With parametric=True the p-value column is reported as ``p_val_ttest_indep``
df_feat = sf.get_df_feat(
    features=features,
    df_parts=df_parts,
    labels=labels,
    parametric=True,
)
aa.display_df(df_feat, n_rows=5)

Unnamed: 0,feature,category,subcategory,scale_name,scale_description,abs_auc,abs_mean_dif,mean_dif,std_test,std_ref,p_val_ttest_indep,p_val_fdr_bh,positions
1,TMD_C_JMD_C-...)-KLEP840101,Energy,Charge,Charge,"Net charge (...t al., 1984)",0.335,0.168,0.168,0.107,0.125,0.0,0.0,3132333435
2,TMD_C_JMD_C-...)-FINA910104,Conformation,α-helix (C-cap),α-helix termination,"Helix termin...t al., 1991)",0.333,0.151,0.151,0.099,0.12,0.0,0.0,3132333435
3,TMD_C_JMD_C-...)-LEVM760105,Shape,Side chain length,Side chain length,"Radius of gy...evitt, 1976)",0.33,0.247,0.247,0.162,0.197,0.0,0.0,3233
4,TMD_C_JMD_C-...)-HUTJ700102,Energy,Entropy,Entropy,"Absolute ent...chens, 1970)",0.327,0.162,0.162,0.107,0.135,0.0,0.0,3132333435
5,TMD_C_JMD_C-...)-RADA880106,ASA/Volume,Volume,Accessible s...e area (ASA),"Accessible s...enden, 1988)",0.322,0.184,0.184,0.115,0.165,0.0,0.0,3233
