In [1]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

import aaanalysis as aa

In [10]:
# Fetch the amino acid scales and the scale classification table from AAanalysis
df_scales = aa.load_scales()
df_cat = aa.load_scales(name="scale_classification")

# Reduce the scale set with AAclust, wrapping Ward-linkage agglomerative clustering
aac = aa.AAclust(
    model=AgglomerativeClustering,
    model_kwargs={"linkage": "ward"},
)
X = np.array(df_scales).T  # transposed so that each row corresponds to one scale column
# Number of clusters = number of selected scales (100 is recommended)
scales = aac.fit(
    X,
    n_clusters=10,
    names=list(df_scales),
)

# Restrict both tables to the scales returned by AAclust
df_cat = df_cat.loc[df_cat["scale_id"].isin(scales)]
df_scales = df_scales.loc[:, scales]

# Load the training data
df_info = aa.load_dataset()
df = aa.load_dataset(
    name="SEQ_DISULFIDE",
    min_len=300,
    n=100,
)
print(df_scales)

    YUTK870101  BROC820101  ISOY800101  NISK860101  CHOP780203  MIYS990104  \
AA                                                                           
A        0.506       0.565       0.908       0.406       0.248       0.479   
C        0.655       0.000       0.454       0.906       0.450       0.000   
D        0.506       0.216       0.532       0.006       0.963       0.803   
E        0.524       0.072       0.979       0.055       0.440       0.859   
F        0.667       0.973       0.688       0.968       0.174       0.000   
G        0.423       0.274       0.135       0.262       1.000       0.662   
H        0.601       0.243       0.553       0.559       0.440       0.479   
I        1.000       0.541       0.582       1.000       0.000       0.056   
K        0.470       0.188       0.716       0.000       0.661       1.000   
L        0.893       1.000       0.759       0.942       0.028       0.014   
M        0.792       0.507       1.000       0.788       0.119  

In [11]:
# Feature engineering setup: sequence parts, labels and shared keyword arguments
sf = aa.SequenceFeature()
df_parts = sf.get_df_parts(
    df_seq=df,
    jmd_n_len=50,
    jmd_c_len=50,
)
# Class labels taken from the dataset's "label" column
y = list(df["label"])
# Keyword arguments reused by both aa.CPP and sf.feat_matrix in the cells below
args = {
    "df_scales": df_scales,
    "df_parts": df_parts,
    "accept_gaps": True,
}

In [12]:
# Small set of features: at most one split per part, "Segment" split type only.
# With the 10 scales selected above, CPP reports 30 created features (see the log
# below); the earlier "300 features" note presumably assumed the recommended
# 100 scales -- TODO confirm.
split_kws = sf.get_split_kws(n_split_max=1, split_types=["Segment"])
cpp = aa.CPP(df_cat=df_cat, **args, split_kws=split_kws)
df_feat = cpp.run(labels=y, tmd_len=200, n_processes=8)
# Build the feature matrix for the features returned by CPP
X = sf.feat_matrix(**args, features=df_feat["feature"])
# ML evaluation
# Seed the forest so the reported accuracy is reproducible on Restart & Run All
rf = RandomForestClassifier(random_state=42)
cv = cross_val_score(rf, X, y, scoring="accuracy", cv=5, n_jobs=8)  # Set n_jobs=1 to disable multi-processing
# Mean accuracy over the 5 cross-validation folds
print(np.mean(cv))


1. CPP creates 30 features for 200 samples
   |#########################| 100.00%
2. CPP pre-filters 1 features (5%) with highest 'abs_mean_dif' and 'max_std_test' <= 0.2
3. CPP filtering algorithm
4. CPP returns df with 1 unique features including general information and statistics
0.5549999999999999


In [13]:
# Default set of features: split settings left at the get_split_kws() defaults.
# With the 10 scales selected above, CPP reports 9900 created features (see the
# log below); the earlier "around 100.000" note presumably assumed the
# recommended 100 scales -- TODO confirm.
split_kws = sf.get_split_kws()
cpp = aa.CPP(df_cat=df_cat, **args, split_kws=split_kws)
df_feat = cpp.run(labels=y, tmd_len=200, n_processes=8)
# Build the feature matrix for the features returned by CPP
X = sf.feat_matrix(**args, features=df_feat["feature"])
# ML evaluation
# Seed the forest so the reported accuracy is reproducible on Restart & Run All
rf = RandomForestClassifier(random_state=42)
cv = cross_val_score(rf, X, y, scoring="accuracy", cv=5, n_jobs=1)  # n_jobs=1: multi-processing disabled for this run
# Mean accuracy over the 5 cross-validation folds
print(np.mean(cv))

1. CPP creates 9900 features for 200 samples
   |#########################| 100.00%
2. CPP pre-filters 495 features (5%) with highest 'abs_mean_dif' and 'max_std_test' <= 0.2
3. CPP filtering algorithm
4. CPP returns df with 32 unique features including general information and statistics
0.71
