In [1]:
import sys
sys.path.append('..')

In [2]:
from core.dataset import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [3]:
# Directory the data sheets are read from (machine-specific absolute path).
load_dir = 'D:/Data/hoffmanlab/featureselection/data/'

# Ligand names: passed to Data() and reused as the class names in the
# classification report.
ligands = [
    'CpG', 'FLA', 'FSL', 'LPS',
    'P3K', 'PIC', 'R84', 'TNF',
]

# Sheet selector forwarded to Data(); its meaning is defined in core.dataset.
sheet_type = 'am'

## Comparing XGBoost feature rankings: booster `gain` scores vs. `feature_importances_`

In [4]:
# Build the dataset object from load_dir for the listed ligands and sheet type.
# NOTE(review): Data comes from the star import of core.dataset; merge=True
# presumably combines all ligands into a single table — confirm in core.dataset.
data = Data(load_dir, ligands, sheet_type, merge=True)

In [7]:
# Columns 0..983 are taken as the features; column 984 is taken as the target.
# NOTE(review): 984 is hard-coded — confirm it matches the column layout of `data`.
Xdf = data.iloc[:, list(range(984))]
Ydf = data.iloc[:, [984]]

In [8]:
# Keep the column labels: the NumPy arrays handed to xgboost do not carry them,
# and they are re-attached to the booster further down.
feature_names = Xdf.columns.tolist()

In [10]:
# Convert to plain arrays; the single-column target frame is flattened to 1-D.
X = Xdf.to_numpy()
Y = Ydf.to_numpy().reshape(-1)

In [11]:
# Classifier using the GPU histogram tree method; all other hyperparameters are
# left at their defaults.
model = xgb.XGBClassifier(
    tree_method='gpu_hist',
    use_label_encoder=False,
)

In [13]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

In [14]:
# Train on the training split. As the last expression in the cell, the fitted
# estimator's repr (with the resolved hyperparameters) is displayed below.
model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='gpu_hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [15]:
# Predict on the held-out split and build a per-class text report.
# NOTE(review): target_names=ligands assumes the class labels in Y sort into the
# same order as `ligands` — confirm against how Data encodes the target column.
pred = model.predict(X_val)
cr = classification_report(
    y_true=y_val,
    y_pred=pred,
    target_names=ligands,
)

In [16]:
# print() so the report's line breaks render as a table rather than as an
# escaped string repr.
print(cr)

              precision    recall  f1-score   support

         CpG       0.50      0.59      0.54       196
         FLA       0.52      0.50      0.51       143
         FSL       0.60      0.62      0.61       189
         LPS       0.63      0.76      0.69       149
         P3K       0.63      0.53      0.57       185
         PIC       0.77      0.68      0.72       165
         R84       0.84      0.83      0.83       249
         TNF       0.75      0.67      0.71       162

    accuracy                           0.66      1438
   macro avg       0.65      0.65      0.65      1438
weighted avg       0.66      0.66      0.66      1438



In [17]:
# Get the underlying Booster so importance scores can be queried with get_score().
boost = model.get_booster()

In [19]:
# The model was fit on a bare NumPy array, so attach the original column names to
# the booster; the intent is for get_score() to report scores under these names.
boost.feature_names = feature_names

In [22]:
# Ranking 1: per-feature 'gain' scores taken straight from the booster.
gain_dict = boost.get_score(importance_type='gain')

# Ranking 2: the estimator's feature_importances_, turned into column names
# ordered from most to least important.
# NOTE(review): the fitted model above reports importance_type=None; for tree
# boosters xgboost then appears to derive feature_importances_ from normalised
# 'gain', so both rankings may come from the same statistic — confirm for the
# installed xgboost version.
rev_fi = np.argsort(model.feature_importances_)  # ascending importance
fi = rev_fi[::-1]                                # descending importance
feature_importances = Xdf.columns[fi]

# get_score() called with its default importance type.
# NOTE(review): that default is presumably 'weight' (split counts), not an
# average, so the variable name may be misleading — confirm.
avg_importance = boost.get_score()

In [29]:
# Re-key the gain scores from smallest to largest; the order is flipped to
# descending in a later cell.
sorted_gain_dict = {
    name: gain_dict[name]
    for name in sorted(gain_dict, key=gain_dict.get)
}

In [31]:
# Feature names in ascending-gain order, i.e. the order that still has to be
# flipped to put the most important feature first.
sorted_gain_dict_list_reversed = list(sorted_gain_dict)

In [33]:
# Flip to descending gain: highest-gain feature first.
sorted_gain_dict_list = list(reversed(sorted_gain_dict_list_reversed))

In [35]:
# Count the positions at which the gain ranking and the feature_importances_
# ranking name the same feature, to gauge how much the two orderings differ.
# zip() stops at the shorter sequence, so if the rankings differ in length only
# their common prefix is compared.
count = sum(
    1
    for gain_name, fi_name in zip(sorted_gain_dict_list, feature_importances)
    if gain_name == fi_name
)
print(count)

753


In [36]:
# Same position-by-position comparison, restricted to the first 100 entries of
# each ranking.
count = sum(
    1
    for gain_name, fi_name in zip(sorted_gain_dict_list[:100], feature_importances[:100])
    if gain_name == fi_name
)
print(count)

100


### The `gain`-based ranking and the `feature_importances_` ranking are very similar (the top 100 features match exactly)