# Feature Engineering

Featurization has finished. Featurized data was saved in `mrp7pred/feats/all_features_cc` (cancer cell paper data) and `mrp7pred/feats/all_features_man` (manually curated data). 

In this notebook we experiment with different feature selection strategies to determine a subset of features for further model training.

In [13]:
# Autoreload modules
%load_ext autoreload
%autoreload 2

# Disable warnings
import warnings
warnings.filterwarnings("ignore")

from pandas import DataFrame
import pandas as pd
import numpy as np
from mrp7pred.feats.feature_selection import (
    _remove_low_variance_features,
    _remove_similar_features,
    _univariate,
    _from_model,
    _rfecv
)

# Featurized Cancer Cell paper data (CSV), given as a relative path
DATA_CC = "../mrp7pred/feats/all_features_cc/full_features_828_20210105-194733.csv"
# Featurized manually curated data (CSV), given as a relative path
DATA_MAN = "../mrp7pred/feats/all_features_man/full_features_828_20210105-013717.csv"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Load Cancer Cell paper and manually curated data

In [42]:
# Read each featurized CSV (first column becomes the index) and drop every
# row that contains at least one missing value.
df_cc = pd.read_csv(DATA_CC, index_col=0).dropna(axis=0)
df_man = pd.read_csv(DATA_MAN, index_col=0).dropna(axis=0)

In [21]:
# Preview the first five rows of the Cancer Cell paper data
df_cc.head(n=5)

Unnamed: 0,name,label,smiles,FractionCSP3,HeavyAtomCount,HeavyAtomMolWt,NHOHCount,NOCount,RingCount,NumAliphaticCarbocycles,...,ATSe7,ATSe8,ATSp1,ATSp2,ATSp3,ATSp4,ATSp5,ATSp6,ATSp7,ATSp8
0,Citriain,1,C=CCc1cc(-c2cc(CC=C)c(O)c(CN(CCCl)CCCl)c2)cc(C...,0.428571,36.0,538.132,2.0,4.0,2.0,0.0,...,4.264,4.176,3.573,3.818,3.917,4.057,4.099,4.149,4.09,4.031
2,AZS,0,Oc1ncc(F)c(O)n1,0.0,9.0,127.054,2.0,4.0,1.0,0.0,...,0.0,0.0,1.906,2.026,1.725,0.817,0.135,0.0,0.0,0.0
3,GT 41,0,Nc1nc(O)c2ncn([C@@H]3O[C@@H](CO)[C@H](O)[C@@H]...,0.5,20.0,270.14,6.0,10.0,3.0,0.0,...,3.1,2.497,2.771,3.107,3.03,2.924,2.744,2.531,2.21,1.479
4,TG,0,CCN(CCO)CCNc1ccc(C)c2sc3ccccc3c(=O)c12,0.35,25.0,332.299,2.0,4.0,3.0,0.0,...,3.281,3.217,3.271,3.603,3.744,3.561,3.359,3.239,3.12,3.044
5,MP,0,COc1cc2c(cc1Oc1c(Oc3c(O)c(OC)cc4c3[C@@H](CC(C)...,0.6,54.0,678.51,1.0,9.0,6.0,0.0,...,5.007,5.043,3.924,4.329,4.464,4.503,4.64,4.637,4.667,4.76


In [22]:
# Preview the first five rows of the manually curated data
df_man.head(n=5)

Unnamed: 0,name,label,smiles,rdk_FractionCSP3,rdk_HeavyAtomCount,rdk_HeavyAtomMolWt,rdk_NHOHCount,rdk_NOCount,rdk_RingCount,rdk_NumAliphaticCarbocycles,...,pychem_ATSe7,pychem_ATSe8,pychem_ATSp1,pychem_ATSp2,pychem_ATSp3,pychem_ATSp4,pychem_ATSp5,pychem_ATSp6,pychem_ATSp7,pychem_ATSp8
0,paclitaxel,1,CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(...,0.446809,62.0,802.51,4.0,15.0,7.0,3.0,...,5.327,5.184,4.058,4.421,4.61,4.691,4.805,4.894,4.829,4.748
1,vincristine,1,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...,0.565217,60.0,768.524,3.0,14.0,9.0,1.0,...,5.285,5.159,4.047,4.528,4.759,4.967,4.988,4.985,4.913,4.799
2,LTC4,1,CCCCCC=CCC=CC=CC=CC(SCC(NC(=O)CCC(N)C(=O)O)C(=...,0.566667,43.0,578.409,8.0,12.0,0.0,0.0,...,4.247,4.316,3.625,3.715,3.76,3.789,3.831,3.863,3.902,3.888
3,E217bG,1,CC12CCC3c4ccc(O)cc4CCC3C1CCC2OC1OC(C(=O)O)C(O)...,0.708333,32.0,416.256,5.0,8.0,5.0,3.0,...,3.89,3.811,3.451,3.843,3.915,3.791,3.648,3.59,3.502,3.258
4,gemcitabine,1,Nc1ccn(C2OC(CO)C(O)C2(F)F)c(=O)n1,0.555556,18.0,252.112,4.0,7.0,2.0,0.0,...,2.614,1.615,2.614,2.903,2.835,2.645,2.489,2.138,1.619,0.86


In [19]:
# (rows, columns) of each dataset, Cancer Cell first
(df_cc.shape, df_man.shape)

((1115, 825), (117, 831))

In [43]:
# df_man column names with everything up to (and including) the first "_" removed;
# columns without an underscore are left out of the list.
df_man_feats_l = [col.split("_", 1)[1] for col in df_man.columns if "_" in col]

man_feat_names = set(df_man_feats_l)
# The first three df_cc columns are skipped when collecting its feature names
cc_feat_names = set(df_cc.columns[3:])

# Names present on both sides, then the leftovers unique to each dataset
both = man_feat_names.intersection(df_cc.columns)
only_in_cc = cc_feat_names.difference(both)
only_in_man = man_feat_names.difference(both)
print(len(cc_feat_names), len(man_feat_names))
only_in_cc, only_in_man

822 822


(set(), set())

`df_cc` has 825 columns, while `df_man` has 831. Feature names in `df_cc` carry no prefix, so `df_man` may contain duplicated feature names once all prefixes are removed.

This is indeed the case: once the prefixes are stripped, both datasets have the same 822 unique features.

#### Remove duplicated features in `df_man`

In [50]:
# Strip the prefix (text up to and including the first "_") from every column
# name; names without an underscore pass through unchanged.
df_man_renamed = df_man.rename(columns=lambda col: col.split("_", 1)[-1])
df_man_renamed.head(n=5)

Unnamed: 0,name,label,smiles,FractionCSP3,HeavyAtomCount,HeavyAtomMolWt,NHOHCount,NOCount,RingCount,NumAliphaticCarbocycles,...,ATSe7,ATSe8,ATSp1,ATSp2,ATSp3,ATSp4,ATSp5,ATSp6,ATSp7,ATSp8
0,paclitaxel,1,CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(...,0.446809,62.0,802.51,4.0,15.0,7.0,3.0,...,5.327,5.184,4.058,4.421,4.61,4.691,4.805,4.894,4.829,4.748
1,vincristine,1,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...,0.565217,60.0,768.524,3.0,14.0,9.0,1.0,...,5.285,5.159,4.047,4.528,4.759,4.967,4.988,4.985,4.913,4.799
2,LTC4,1,CCCCCC=CCC=CC=CC=CC(SCC(NC(=O)CCC(N)C(=O)O)C(=...,0.566667,43.0,578.409,8.0,12.0,0.0,0.0,...,4.247,4.316,3.625,3.715,3.76,3.789,3.831,3.863,3.902,3.888
3,E217bG,1,CC12CCC3c4ccc(O)cc4CCC3C1CCC2OC1OC(C(=O)O)C(O)...,0.708333,32.0,416.256,5.0,8.0,5.0,3.0,...,3.89,3.811,3.451,3.843,3.915,3.791,3.648,3.59,3.502,3.258
4,gemcitabine,1,Nc1ccn(C2OC(CO)C(O)C2(F)F)c(=O)n1,0.555556,18.0,252.112,4.0,7.0,2.0,0.0,...,2.614,1.615,2.614,2.903,2.835,2.645,2.489,2.138,1.619,0.86


In [54]:
# Keep only the first occurrence of each column name and drop later repeats.
# NOTE(review): this assumes same-named columns carry the same values; the
# values themselves are not compared here.
first_occurrence = ~df_man_renamed.columns.duplicated(keep="first")
df_man_unique = df_man_renamed.loc[:, first_occurrence]
df_man_unique.head(n=5)

Unnamed: 0,name,label,smiles,FractionCSP3,HeavyAtomCount,HeavyAtomMolWt,NHOHCount,NOCount,RingCount,NumAliphaticCarbocycles,...,ATSe7,ATSe8,ATSp1,ATSp2,ATSp3,ATSp4,ATSp5,ATSp6,ATSp7,ATSp8
0,paclitaxel,1,CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(...,0.446809,62.0,802.51,4.0,15.0,7.0,3.0,...,5.327,5.184,4.058,4.421,4.61,4.691,4.805,4.894,4.829,4.748
1,vincristine,1,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...,0.565217,60.0,768.524,3.0,14.0,9.0,1.0,...,5.285,5.159,4.047,4.528,4.759,4.967,4.988,4.985,4.913,4.799
2,LTC4,1,CCCCCC=CCC=CC=CC=CC(SCC(NC(=O)CCC(N)C(=O)O)C(=...,0.566667,43.0,578.409,8.0,12.0,0.0,0.0,...,4.247,4.316,3.625,3.715,3.76,3.789,3.831,3.863,3.902,3.888
3,E217bG,1,CC12CCC3c4ccc(O)cc4CCC3C1CCC2OC1OC(C(=O)O)C(O)...,0.708333,32.0,416.256,5.0,8.0,5.0,3.0,...,3.89,3.811,3.451,3.843,3.915,3.791,3.648,3.59,3.502,3.258
4,gemcitabine,1,Nc1ccn(C2OC(CO)C(O)C2(F)F)c(=O)n1,0.555556,18.0,252.112,4.0,7.0,2.0,0.0,...,2.614,1.615,2.614,2.903,2.835,2.645,2.489,2.138,1.619,0.86


In [56]:
# Both frames must expose the same columns in the same order before merging.
# Index.equals checks length, order and labels in a single call, so a
# column-count mismatch yields False instead of the ValueError raised by an
# element-wise `==` on indexes of different length.
df_man_unique.columns.equals(df_cc.columns)

True

In [58]:
# Merge two dataframe
df_all = pd.concat([df_man_unique, df_cc], ignore_index=True)
print(df_all.shape)
df_all.sample(n=10)

(1232, 825)


Unnamed: 0,name,label,smiles,FractionCSP3,HeavyAtomCount,HeavyAtomMolWt,NHOHCount,NOCount,RingCount,NumAliphaticCarbocycles,...,ATSe7,ATSe8,ATSp1,ATSp2,ATSp3,ATSp4,ATSp5,ATSp6,ATSp7,ATSp8
73,Sipholenone E,0,C[C@@H]1CC[C@H]2[C@@H](/C1=C/C[C@@H]1[C@@](C)(...,0.9,34.0,424.326,2.0,4.0,4.0,3.0,...,4.084,4.093,3.563,4.024,4.183,4.127,3.955,3.751,3.875,3.901
463,SIOMYCIN A,0,C=C(NC(=O)C(=C)NC(=O)c1csc(C2=N[C@@H]3c4csc(n4...,0.394366,113.0,1567.231,18.0,37.0,10.0,1.0,...,5.778,5.836,4.656,4.974,5.081,5.221,5.321,5.379,5.439,5.487
1013,"3,17-dihydroxy-2-(2,2,2-trifluoroethoxy)estra-...",0,C[C@@]12CC[C@@H]3c4cc(OCC(F)(F)F)c(O)cc4/C(=N/...,0.65,28.0,375.217,3.0,5.0,4.0,3.0,...,3.725,3.573,3.284,3.692,3.786,3.664,3.501,3.344,3.117,2.793
902,"(4-(1,3-dihydro-2H-pyrrolo[3,4-b]quinolin-2-yl...",0,CC(C)(C)C(=O)OC[C@H]1C(=O)OC[C@H]1C(=O)N1Cc2cc...,0.454545,29.0,372.251,0.0,7.0,4.0,0.0,...,3.827,3.588,3.307,3.695,3.554,3.443,3.444,3.416,3.333,3.111
819,14-Chloro-20(S)-camptothecin hydrate,0,CC[C@@]1(O)C(=O)OCc2c1c(Cl)c1n(c2=O)Cc2cc3cccc...,0.25,27.0,367.683,1.0,6.0,5.0,0.0,...,3.528,3.254,3.319,3.741,3.854,3.817,3.599,3.389,3.213,2.938
1217,5-(4-((4-chlorophenyl)thio)-2-pentenyl)-6-meth...,0,Cc1nnc(N)nc1C/C=C/[C@H](C)Sc1ccc(Cl)cc1,0.266667,21.0,303.713,2.0,4.0,2.0,0.0,...,2.974,2.866,3.093,3.367,3.294,3.075,2.986,2.857,2.907,2.669
706,2-(4-((4-(dimethylamino)phenyl)imino)-1(4H)-na...,0,CN(C)c1ccc(N=C2C=CC(=C(C#N)C#N)c3ccccc32)cc1,0.095238,25.0,308.259,0.0,4.0,3.0,1.0,...,3.232,3.082,3.234,3.555,3.634,3.556,3.447,3.253,3.058,2.951
436,"RESIBUFOGENIN, METHACRYLATE DERIV",0,C=C(C)C(=O)O[C@@H]1CC[C@@]2(C)[C@@H](CC[C@H]3[...,0.714286,33.0,416.303,0.0,5.0,6.0,4.0,...,3.834,3.692,3.545,3.97,4.099,4.007,3.867,3.743,3.553,3.403
881,"1-chloro-2-(2-chloroethyl)-3-methylpyrido[1,2-...",0,Cc1c(CCCl)c(Cl)n2c(nc3ccccc32)c1C#N,0.2,20.0,293.092,0.0,3.0,3.0,0.0,...,2.624,1.511,3.055,3.389,3.547,3.533,3.354,3.007,2.423,1.499
1062,3-(4-(2-((tert-butyl(dimethyl)silyl)oxy)-4-oxo...,0,CC(C)[C@@H]1CCC(=O)C[C@H]1CCCC[C@H]1CC(=O)C[C@...,0.916667,28.0,364.347,0.0,3.0,2.0,2.0,...,3.323,3.261,3.553,3.857,3.73,3.694,3.618,3.456,3.21,3.214


In [69]:
# Class balance of the merged data: number of rows labelled 1 and labelled 0,
# plus their ratio rounded to three decimals.
len_1 = int(df_all["label"].eq(1).sum())
len_0 = int(df_all["label"].eq(0).sum())
len_1, len_0, f"pos/neg = {round(len_1/len_0, 3)}"

(129, 1103, 'pos/neg = 0.117')