# Feature Engineering

Featurization has finished. Featurized data was saved in `mrp7pred/feats/all_features_cc` (cancer cell paper data) and `mrp7pred/feats/all_features_man` (manually curated data). 

In this notebook we experiment with different feature selection strategies to determine a subset of features for further model training.

In [1]:
# Autoreload modules
%load_ext autoreload
%autoreload 2

# Disable warnings
import warnings
warnings.filterwarnings("ignore")

from pandas import DataFrame
import pandas as pd
import numpy as np
from mrp7pred.feats.feature_selection import (
    _remove_low_variance_features,
    _remove_similar_features,
    _univariate,
    _from_model,
    _rfecv
)

# Relative path to the featurized Cancer Cell paper dataset (CSV).
DATA_CC = "../mrp7pred/feats/all_features_cc/full_features_828_20210105-194733.csv"
# Relative path to the featurized manually curated dataset (CSV).
DATA_MAN = "../mrp7pred/feats/all_features_man/full_features_828_20210105-013717.csv"

## 1. Load Cancer Cell paper and manually curated data

In [2]:
# Read both feature tables (the first CSV column becomes the index) and
# discard every row that contains at least one missing value.
df_cc = pd.read_csv(DATA_CC, index_col=0).dropna(axis=0)
df_man = pd.read_csv(DATA_MAN, index_col=0).dropna(axis=0)

In [3]:
# Preview the first rows of the Cancer Cell paper dataset.
df_cc.head()

Unnamed: 0,name,label,smiles,FractionCSP3,HeavyAtomCount,HeavyAtomMolWt,NHOHCount,NOCount,RingCount,NumAliphaticCarbocycles,...,ATSe7,ATSe8,ATSp1,ATSp2,ATSp3,ATSp4,ATSp5,ATSp6,ATSp7,ATSp8
0,Citriain,1,C=CCc1cc(-c2cc(CC=C)c(O)c(CN(CCCl)CCCl)c2)cc(C...,0.428571,36.0,538.132,2.0,4.0,2.0,0.0,...,4.264,4.176,3.573,3.818,3.917,4.057,4.099,4.149,4.09,4.031
2,AZS,0,Oc1ncc(F)c(O)n1,0.0,9.0,127.054,2.0,4.0,1.0,0.0,...,0.0,0.0,1.906,2.026,1.725,0.817,0.135,0.0,0.0,0.0
3,GT 41,0,Nc1nc(O)c2ncn([C@@H]3O[C@@H](CO)[C@H](O)[C@@H]...,0.5,20.0,270.14,6.0,10.0,3.0,0.0,...,3.1,2.497,2.771,3.107,3.03,2.924,2.744,2.531,2.21,1.479
4,TG,0,CCN(CCO)CCNc1ccc(C)c2sc3ccccc3c(=O)c12,0.35,25.0,332.299,2.0,4.0,3.0,0.0,...,3.281,3.217,3.271,3.603,3.744,3.561,3.359,3.239,3.12,3.044
5,MP,0,COc1cc2c(cc1Oc1c(Oc3c(O)c(OC)cc4c3[C@@H](CC(C)...,0.6,54.0,678.51,1.0,9.0,6.0,0.0,...,5.007,5.043,3.924,4.329,4.464,4.503,4.64,4.637,4.667,4.76


In [4]:
# Preview the first rows of the manually curated dataset.
df_man.head()

Unnamed: 0,name,label,smiles,rdk_FractionCSP3,rdk_HeavyAtomCount,rdk_HeavyAtomMolWt,rdk_NHOHCount,rdk_NOCount,rdk_RingCount,rdk_NumAliphaticCarbocycles,...,pychem_ATSe7,pychem_ATSe8,pychem_ATSp1,pychem_ATSp2,pychem_ATSp3,pychem_ATSp4,pychem_ATSp5,pychem_ATSp6,pychem_ATSp7,pychem_ATSp8
0,paclitaxel,1,CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(...,0.446809,62.0,802.51,4.0,15.0,7.0,3.0,...,5.327,5.184,4.058,4.421,4.61,4.691,4.805,4.894,4.829,4.748
1,vincristine,1,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...,0.565217,60.0,768.524,3.0,14.0,9.0,1.0,...,5.285,5.159,4.047,4.528,4.759,4.967,4.988,4.985,4.913,4.799
2,LTC4,1,CCCCCC=CCC=CC=CC=CC(SCC(NC(=O)CCC(N)C(=O)O)C(=...,0.566667,43.0,578.409,8.0,12.0,0.0,0.0,...,4.247,4.316,3.625,3.715,3.76,3.789,3.831,3.863,3.902,3.888
3,E217bG,1,CC12CCC3c4ccc(O)cc4CCC3C1CCC2OC1OC(C(=O)O)C(O)...,0.708333,32.0,416.256,5.0,8.0,5.0,3.0,...,3.89,3.811,3.451,3.843,3.915,3.791,3.648,3.59,3.502,3.258
4,gemcitabine,1,Nc1ccn(C2OC(CO)C(O)C2(F)F)c(=O)n1,0.555556,18.0,252.112,4.0,7.0,2.0,0.0,...,2.614,1.615,2.614,2.903,2.835,2.645,2.489,2.138,1.619,0.86


In [5]:
# Compare (rows, columns) of the two frames after dropping incomplete rows.
df_cc.shape, df_man.shape

((1115, 825), (117, 831))

In [6]:
# Feature names of `df_man` with everything up to and including the first
# "_" removed; columns without an underscore are skipped.
df_man_feats_l = [
    col.split("_", 1)[1] for col in df_man.columns if "_" in col
]

# In `df_cc` the first three columns (name, label, smiles) are not features.
cc_feature_names = set(df_cc.columns[3:])
man_feature_names = set(df_man_feats_l)

both = man_feature_names & set(df_cc.columns)
only_in_cc = cc_feature_names - both
only_in_man = man_feature_names - both
print(len(cc_feature_names), len(man_feature_names))
only_in_cc, only_in_man

822 822


(set(), set())

`df_cc` has 825 columns, while `df_man` has 831. Because the feature names in `df_cc` carry no prefix, stripping the prefixes in `df_man` may produce duplicated feature names.

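It does: after stripping prefixes, the 828 feature columns of `df_man` reduce to 822 unique names, so both datasets have the same 822 unique features.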

#### Remove duplicated features in `df_man`

In [7]:
# Drop everything up to and including the first "_" from each column name;
# names without an underscore are kept unchanged.
df_man_renamed = df_man.rename(columns=lambda name: name.split("_", 1)[-1])
df_man_renamed.head()

Unnamed: 0,name,label,smiles,FractionCSP3,HeavyAtomCount,HeavyAtomMolWt,NHOHCount,NOCount,RingCount,NumAliphaticCarbocycles,...,ATSe7,ATSe8,ATSp1,ATSp2,ATSp3,ATSp4,ATSp5,ATSp6,ATSp7,ATSp8
0,paclitaxel,1,CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(...,0.446809,62.0,802.51,4.0,15.0,7.0,3.0,...,5.327,5.184,4.058,4.421,4.61,4.691,4.805,4.894,4.829,4.748
1,vincristine,1,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...,0.565217,60.0,768.524,3.0,14.0,9.0,1.0,...,5.285,5.159,4.047,4.528,4.759,4.967,4.988,4.985,4.913,4.799
2,LTC4,1,CCCCCC=CCC=CC=CC=CC(SCC(NC(=O)CCC(N)C(=O)O)C(=...,0.566667,43.0,578.409,8.0,12.0,0.0,0.0,...,4.247,4.316,3.625,3.715,3.76,3.789,3.831,3.863,3.902,3.888
3,E217bG,1,CC12CCC3c4ccc(O)cc4CCC3C1CCC2OC1OC(C(=O)O)C(O)...,0.708333,32.0,416.256,5.0,8.0,5.0,3.0,...,3.89,3.811,3.451,3.843,3.915,3.791,3.648,3.59,3.502,3.258
4,gemcitabine,1,Nc1ccn(C2OC(CO)C(O)C2(F)F)c(=O)n1,0.555556,18.0,252.112,4.0,7.0,2.0,0.0,...,2.614,1.615,2.614,2.903,2.835,2.645,2.489,2.138,1.619,0.86


In [8]:
# `Index.duplicated()` flags every repeat of an already-seen column name;
# inverting the mask keeps only the first occurrence of each name.
is_first_occurrence = ~df_man_renamed.columns.duplicated()
df_man_unique = df_man_renamed.loc[:, is_first_occurrence]
df_man_unique.head()

Unnamed: 0,name,label,smiles,FractionCSP3,HeavyAtomCount,HeavyAtomMolWt,NHOHCount,NOCount,RingCount,NumAliphaticCarbocycles,...,ATSe7,ATSe8,ATSp1,ATSp2,ATSp3,ATSp4,ATSp5,ATSp6,ATSp7,ATSp8
0,paclitaxel,1,CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(...,0.446809,62.0,802.51,4.0,15.0,7.0,3.0,...,5.327,5.184,4.058,4.421,4.61,4.691,4.805,4.894,4.829,4.748
1,vincristine,1,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...,0.565217,60.0,768.524,3.0,14.0,9.0,1.0,...,5.285,5.159,4.047,4.528,4.759,4.967,4.988,4.985,4.913,4.799
2,LTC4,1,CCCCCC=CCC=CC=CC=CC(SCC(NC(=O)CCC(N)C(=O)O)C(=...,0.566667,43.0,578.409,8.0,12.0,0.0,0.0,...,4.247,4.316,3.625,3.715,3.76,3.789,3.831,3.863,3.902,3.888
3,E217bG,1,CC12CCC3c4ccc(O)cc4CCC3C1CCC2OC1OC(C(=O)O)C(O)...,0.708333,32.0,416.256,5.0,8.0,5.0,3.0,...,3.89,3.811,3.451,3.843,3.915,3.791,3.648,3.59,3.502,3.258
4,gemcitabine,1,Nc1ccn(C2OC(CO)C(O)C2(F)F)c(=O)n1,0.555556,18.0,252.112,4.0,7.0,2.0,0.0,...,2.614,1.615,2.614,2.903,2.835,2.645,2.489,2.138,1.619,0.86


In [9]:
# Both frames must expose identical columns in identical order before they
# are stacked. `Index.equals` returns False when the number of columns
# differs, whereas an element-wise `==` raises ValueError in that case.
df_man_unique.columns.equals(df_cc.columns)

True

In [10]:
# Stack the manually curated rows on top of the Cancer Cell rows and
# renumber the combined index from 0.
frames = [df_man_unique, df_cc]
df_all = pd.concat(frames, ignore_index=True)
print(df_all.shape)
df_all.sample(n=10)

(1232, 825)


Unnamed: 0,name,label,smiles,FractionCSP3,HeavyAtomCount,HeavyAtomMolWt,NHOHCount,NOCount,RingCount,NumAliphaticCarbocycles,...,ATSe7,ATSe8,ATSp1,ATSp2,ATSp3,ATSp4,ATSp5,ATSp6,ATSp7,ATSp8
53,﻿Probenecid,0,CCCN(CCC)S(=O)(=O)c1ccc(C(=O)O)cc1,0.461538,19.0,266.213,1.0,5.0,1.0,0.0,...,2.951,2.333,2.909,3.147,3.201,3.12,2.94,2.71,2.341,1.92
1086,"3-(4-(6-(3-(4,5-dihydro-1H-imidazol-2-ylhydraz...",0,CC(=O)O[C@@H](C)c1cc2c(s1)C(=O)c1scc([C@H](C)O...,0.333333,26.0,376.326,0.0,6.0,3.0,1.0,...,3.498,3.425,3.304,3.674,3.741,3.744,3.528,3.324,3.116,2.916
292,"9a,11a-dimethyl-1-(2-oxo-2H-pyran-5-yl)hexadec...",0,CC(=O)O[C@@H]1CC[C@@]2(C)[C@@H](CC[C@H]3[C@H]2...,0.769231,31.0,392.281,0.0,5.0,6.0,4.0,...,3.744,3.531,3.485,3.911,4.068,3.97,3.78,3.643,3.431,3.18
647,1-bromo-2-pentadecanone (ACD/Name),0,CCCCCCCCCCCCCC(=O)CBr,0.933333,17.0,276.068,0.0,1.0,0.0,0.0,...,2.442,2.351,2.844,2.812,2.771,2.652,2.579,2.5,2.415,2.321
787,"N-(3-chloro-1,4-dioxo-1,4-dihydro-2-naphthalen...",0,O=C(NC1=C(Cl)C(=O)c2ccccc2C1=O)C(=O)[C@H](C(=O...,0.0625,41.0,545.805,1.0,8.0,6.0,1.0,...,4.432,4.44,3.741,4.09,4.159,4.061,4.017,4.007,4.036,4.085
934,"5,6-dimethyl-2-(3,3,3-trifluoro-2-(trifluorome...",0,CC(=O)O[C@@H]1C(=O)[C@]2(C)[C@H](O)C[C@H]3OC[C...,0.555556,60.0,778.488,4.0,15.0,6.0,3.0,...,5.298,5.116,4.005,4.397,4.571,4.671,4.796,4.882,4.78,4.683
246,IPD,0,CS(=O)(=O)OCCCNCCCOS(C)(=O)=O,1.0,17.0,270.223,1.0,7.0,0.0,0.0,...,2.574,2.503,2.705,2.597,2.507,2.458,2.281,2.151,2.2,2.119
703,(-)-Roehybridine,0,COc1cc2[nH]c3c(c2cc1OC)CCN[C@]31CC[C@@]2(C[C@@...,0.548387,39.0,494.357,3.0,8.0,7.0,2.0,...,4.275,4.227,3.65,4.134,4.296,4.273,4.174,4.066,3.877,3.886
600,"2,4-Pyrimidinediamine, {6-ethyl-5-[4-[methyl(p...",0,CCc1nc(N)nc(N)c1-c1ccc(N(C)Cc2ccccc2)c([N+](=O...,0.2,28.0,356.26,4.0,8.0,3.0,0.0,...,3.815,3.543,3.251,3.564,3.643,3.605,3.527,3.372,3.257,3.058
113,CCTA-47,0,O=C(Nc1cccc(-c2cc(F)ccc2F)c1)C(Cl)Cl,0.071429,20.0,307.062,1.0,2.0,2.0,0.0,...,2.969,2.807,2.987,3.254,3.177,3.055,3.067,2.849,2.59,2.365


In [11]:
# Count positive (label == 1) and negative (label == 0) compounds and
# report their ratio.
labels = df_all["label"]
len_1 = int((labels == 1).sum())
len_0 = int((labels == 0).sum())
len_1, len_0, f"pos/neg = {round(len_1 / len_0, 3)}"

(129, 1103, 'pos/neg = 0.117')

In [12]:
# Persist the merged dataset as CSV.
# NOTE(review): the file name says 828 features, while the merged frame
# printed above has 825 columns — confirm the naming is intentional.
df_all.to_csv("../data/all_compounds_828_features.csv")

## Split train test

## Univariate