 **Removing Low Variance Features and Selecting Best Features Based on F-Score**

In [1]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, RFECV
from sklearn.model_selection import StratifiedKFold

# Loading Data
# The combined fingerprint table is read straight from the project's GitHub
# repository, so this cell needs network access.
url = 'https://raw.githubusercontent.com/YuanColab/Dengue-NS3-active-small-molecules-classifier/main/Data/combined_fingerprints.csv'
df = pd.read_csv(url)

# Target labels come from the 'class' column.
y = df['class']

# Features: every column except the first and the last one.
# NOTE(review): presumably the first column is a row/compound identifier and
# the last one is 'class' -- confirm against the CSV header.
# The extra drop is a no-op when 'class' is the last column; it only guards
# against the label leaking into the feature matrix if the column order differs.
X = df.iloc[:, 1:-1].drop(columns=['class'], errors='ignore')

# Removing Low Variance Features
def remove_low_variance(input_data: pd.DataFrame, threshold: float = 0.1) -> pd.DataFrame:
    """Drop the columns of ``input_data`` rejected by a variance filter.

    A ``VarianceThreshold`` selector is fitted on ``input_data`` and only the
    columns it reports as supported are kept, in their original order.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Feature table with one column per feature.
    threshold : float, default 0.1
        Value forwarded to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        The retained columns of ``input_data`` with their original labels.

    Notes
    -----
    Any exception raised by ``VarianceThreshold.fit`` (for example when no
    column passes the filter) propagates unchanged.
    """
    selection = VarianceThreshold(threshold=threshold)
    selection.fit(input_data)

    # Select by position rather than by label: a label-based lookup would
    # return every column sharing a kept name if the frame ever contained
    # duplicated column labels.
    kept_positions = selection.get_support(indices=True)
    return input_data.iloc[:, kept_positions]

X_var_thresh = remove_low_variance(X, threshold=0.1)

# Selecting Best Features Based on F-Score
# Keep at most 1000 features. k is capped at the number of columns that
# survived the variance filter, because SelectKBest rejects (or, depending on
# the scikit-learn version, warns about) a k larger than the feature count.
n_best_features = min(1000, X_var_thresh.shape[1])
selector = SelectKBest(f_classif, k=n_best_features)

# NOTE(review): the selector is fitted on the full dataset with the labels,
# before any cross-validation split; CV scores computed later on X_f_score may
# therefore be optimistic.
X_f_score = selector.fit_transform(X_var_thresh, y)

# Column labels of the retained features, aligned with the columns of X_f_score.
selected_features_f_score = X_var_thresh.columns[selector.get_support()]

In [None]:
# @title **SVM-RFE**
from sklearn.svm import SVC

# Create an instance of the SVM classifier
clf = SVC(kernel='linear')

# Define a cross-validation strategy
cv = StratifiedKFold(n_splits=10)

# Recursive feature elimination, removing one feature per iteration and scoring
# each subset by 10-fold cross-validated accuracy.
# n_jobs=-1 spreads the per-fold fits over all available cores; it does not
# change which features are selected.
rfe_cv = RFECV(estimator=clf, step=1, cv=cv, scoring='accuracy', n_jobs=-1)
rfe_cv.fit(X_f_score, y)

# Select features identified as important by RFECV
# rfe_cv.get_support() is a boolean mask over the F-score-selected columns.
selected_features_rfe_cv = selected_features_f_score[rfe_cv.get_support()]

# Map that mask back to column positions in X_var_thresh and select
# positionally, in one step, instead of filtering by mask and then by label.
svm_positions = selector.get_support(indices=True)[rfe_cv.get_support()]
X_com_rfe_cv = X_var_thresh.iloc[:, svm_positions]

# Save the resulting dataset to a CSV file
# (selected features followed by the 'class' label; the index is not written)
NS3_feature_SVM_RFE = pd.concat([X_com_rfe_cv, y], axis=1)
NS3_feature_SVM_RFE.to_csv('NS3_feature_SVM_RFE.csv', index=False)
NS3_feature_SVM_RFE

Unnamed: 0,AD2D13,AD2D93,AD2D102,AD2D238,AD2D248,AD2D316,AD2D336,AD2D414,AD2D482,AD2D492,...,PubchemFP704,PubchemFP716,PubchemFP779,SubFP1,SubFP18,SubFP23,SubFP100,SubFP135,SubFP303,class
0,0,0,1,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0,0,0,1,1,1,1,1,1,1,...,1,0,0,1,1,1,0,0,0,1
2,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,1,0,1
3,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,0,1,1,1,1,0,1,0,0,...,1,0,0,0,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,1,0,0,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
587,0,1,0,1,1,1,1,1,1,0,...,1,0,0,0,0,0,1,0,0,1
588,0,0,0,0,1,0,0,1,0,0,...,1,1,1,1,1,0,0,0,1,0
589,1,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [None]:
# @title **RF-RFE**
from sklearn.ensemble import RandomForestClassifier

# Create an instance of the RF classifier
# (fixed random_state so repeated runs build the same forest)
rf_clf = RandomForestClassifier(n_estimators=500, random_state=42)
cv = StratifiedKFold(n_splits=10)

# Recursive feature elimination, removing one feature per iteration and scoring
# each subset by 10-fold cross-validated accuracy.
# n_jobs=-1 spreads the per-fold fits over all available cores; each forest is
# still seeded with random_state=42, so the selection itself is unaffected.
rfe_cv_rf = RFECV(estimator=rf_clf, step=1, cv=cv, scoring='accuracy', n_jobs=-1)
rfe_cv_rf.fit(X_f_score, y)

# Select features identified as important by RFECV
# rfe_cv_rf.get_support() is a boolean mask over the F-score-selected columns.
selected_features_rfe_cv_rf = selected_features_f_score[rfe_cv_rf.get_support()]

# Map that mask back to column positions in X_var_thresh and select
# positionally, in one step, instead of filtering by mask and then by label.
rf_positions = selector.get_support(indices=True)[rfe_cv_rf.get_support()]
X_com_rfe_cv_rf = X_var_thresh.iloc[:, rf_positions]

# Save the resulting dataset to a CSV file
# (selected features followed by the 'class' label; the index is not written)
NS3_feature_RF_RFE = pd.concat([X_com_rfe_cv_rf, y], axis=1)
NS3_feature_RF_RFE.to_csv('NS3_feature_RF_RFE.csv', index=False)
NS3_feature_RF_RFE

Unnamed: 0,AD2D4,AD2D82,AD2D91,AD2D93,AD2D102,AD2D160,AD2D169,AD2D248,AD2D258,AD2D316,...,PubchemFP714,PubchemFP716,PubchemFP779,SubFP1,SubFP18,SubFP23,SubFP135,SubFP275,SubFP303,class
0,1,1,0,0,1,1,0,0,1,1,...,1,0,0,1,0,0,0,0,0,1
1,1,1,0,0,0,1,1,1,1,1,...,1,0,0,1,1,1,0,1,0,1
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,0,1
3,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,1,0,0,1,1,1,1,0,1,...,0,0,0,0,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,0,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
587,1,1,1,1,0,1,1,1,0,1,...,0,0,0,0,0,0,0,1,0,1
588,0,0,1,0,0,0,0,1,1,0,...,1,1,1,1,1,0,0,1,1,0
589,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [2]:
# @title **XGBoost-RFE**
from xgboost import XGBClassifier

# Create an instance of the XGBoost classifier
# (fixed random_state for repeatable boosting runs)
xgb_clf = XGBClassifier(n_estimators=500, random_state=42)
cv = StratifiedKFold(n_splits=10)

# Recursive feature elimination, removing one feature per iteration and scoring
# each subset by 10-fold cross-validated accuracy.
rfe_cv_xgb = RFECV(estimator=xgb_clf, step=1, cv=cv, scoring='accuracy')
rfe_cv_xgb.fit(X_f_score, y)

# Select features identified as important by RFECV
# rfe_cv_xgb.get_support() is a boolean mask over the F-score-selected columns.
selected_features_rfe_cv_xgb = selected_features_f_score[rfe_cv_xgb.get_support()]

# Map that mask back to column positions in X_var_thresh and select
# positionally, in one step, instead of filtering by mask and then by label.
xgb_positions = selector.get_support(indices=True)[rfe_cv_xgb.get_support()]
X_com_rfe_cv_xgb = X_var_thresh.iloc[:, xgb_positions]

# Save the resulting dataset to a CSV file
# (selected features followed by the 'class' label; the index is not written)
NS3_feature_XGBoost_RFE = pd.concat([X_com_rfe_cv_xgb, y], axis=1)
# NOTE(review): the trailing 'b' in the file name differs from the naming used
# for the other RFE outputs ('NS3_feature_<model>_RFE.csv'). It looks like a
# typo, but it is kept as-is in case downstream code reads this exact file --
# confirm and rename if unintended.
NS3_feature_XGBoost_RFE.to_csv('NS3_feature_XGBoost_RFEb.csv', index=False)
NS3_feature_XGBoost_RFE

Unnamed: 0,AD2D482,AD2D637,AD2D716,ExtFP8,ExtFP35,ExtFP82,ExtFP100,ExtFP128,ExtFP143,ExtFP155,...,MACCSFP97,MACCSFP149,PubchemFP16,PubchemFP193,PubchemFP385,PubchemFP553,PubchemFP685,PubchemFP703,SubFP100,class
0,0,0,0,0,0,1,0,1,1,0,...,0,1,0,1,0,0,0,0,0,1
1,1,1,1,0,0,0,0,0,0,1,...,1,1,1,1,1,0,1,0,0,1
2,0,0,1,0,0,0,0,0,0,1,...,1,0,1,0,1,1,1,0,1,1
3,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0
4,0,1,1,0,1,0,0,0,1,0,...,1,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,0,0,0,1,0,1,0,0,0,1,...,1,0,1,0,0,1,0,1,0,0
587,1,0,0,1,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,1
588,0,0,1,0,0,0,0,1,0,0,...,1,1,0,1,0,1,1,0,0,0
589,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0
