# Analyze top feature importances and train smaller models

* `full_output`: XGB (XGBoost) using all manually extracted features
* `full_output-with_embedding`: XGB (XGBoost) using all manually extracted features **with** autoencoder embeddings


In [67]:

import json
import pickle
from glob import glob
import pandas as pd

import numpy as np
import xgboost
from utils import ElapsedTimer


In [66]:
# Read the SNOMED CT -> DX lookup table from its JSON file.
# It is indexed later by the keys of the pickled model dictionaries.
with open("data/snomed_ct_dx_map.json") as snomed_map_file:
    SNOMED_CODE_MAP = json.load(snomed_map_file)
    


In [69]:
print("Loading features...")
with ElapsedTimer() as t:
    # `header_file` is used as the index; rows are ordered by it so the
    # frame has a stable row order regardless of how the CSV was written.
    features = pd.read_csv(
        "full_output/features.csv", index_col="header_file"
    ).sort_values(by=["header_file"])
print(f"Took {t.duration:.2f}s")

Loading features...
Took 444.94s


In [92]:
# Inspect the feature importances of the pickled classifiers found in
# `full_output/` and look up the corresponding columns of `features`.
raw_model_paths = glob("full_output/*.pkl")

for raw_model_path in raw_model_paths:
    # Open the file of the *current* iteration. The previous code always
    # opened raw_model_paths[0], so every iteration reloaded the first pickle.
    # NOTE(security): pickle.load can execute arbitrary code -- only load
    # model files that come from a trusted source.
    with open(raw_model_path, "rb") as f:
        m = pickle.load(f)

    print(raw_model_path)
    # Path with the "_models.pkl" suffix stripped.
    print(raw_model_path.split("_models.pkl")[0])
    for key in m.keys():
        # Keys of the model mapping are looked up in the SNOMED CT map.
        print(SNOMED_CODE_MAP[key])
        xgb_classifier = m[key]

        # Read the importances once and reuse them for printing and sorting.
        importances = xgb_classifier.feature_importances_
        print(importances)
        sorted_feat_idxs = importances.argsort()
        print(sorted_feat_idxs)  # ascending order, so index 0 is the least important

        # Reverse to descending importance and keep the first 100 indices.
        top_100 = sorted_feat_idxs[::-1][:100]
        print(top_100)

        # Column names and column data of the top-100 features.
        labels = list(features.columns[top_100])
        print(features.iloc[:, top_100])
        break  # only inspect the first classifier for now

    break  # only inspect the first model file for now

full_output/v17_models.pkl
full_output/v17
['IAVB', '1st degree av block']
[1.3748114e-03 3.7994151e-04 5.2086471e-05 ... 0.0000000e+00 0.0000000e+00
 0.0000000e+00]
[ 9474 11977 11978 ... 15794  4746     4]
[    4  4746 15794   926  4741  5667   925  5663   437 12641 14238   745
     7 10078 15792 17374  9481  7061  5252 11991 10067  2074  2027 10982
 10401 13031  5107   391  9922  1586  5563 16558 12634  2061 11151 11048
 10398  7242 11073  9482  2127 12792  4040  2105  8039  3178  5041 17378
  7904  3160  5024 17384  2005 14240  6340 17371 11061 10133 10044     0
  9744 17961  8164 11058  7902 10156 14219 10233  2216  8288 18484  1606
  9527 10552 10405  8521 14770  9743 14225 10066 18252  5566 11100 10372
  8310 17383 12643  5130  9614  2194 10357  2518  1849 10682  2016 10407
 15801 12647 11165   499]
['I_HRV_MeanNN', 'aVR_HRV_MedianNN', 'V5_HRV_MeanNN', 'I_sig__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.2', 'aVR_HRV_MeanNN', 'aVR_sig__change_quantiles__f_agg_"mean"_

(18950,)