# Differentiable feature selection (DFS) demo

Performs feature subset selection on a dataset and then evaluates those features with a linear model.

In [None]:
import subprocess
import os
import yaml
from sklearn.metrics import roc_auc_score

# the following helper class is adapted from 
# https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
class Runner():
    """Run an external command and capture (optionally echo) its output.

    The child process is started as soon as the instance is created; call
    ``run()`` to consume its output and wait for it to finish.

    Attributes:
        cmd: the command exactly as given by the caller.
        print: whether output is echoed line by line while the process runs.
        stdout: captured standard output ('' until ``run()`` has been called).
        stderr: captured standard error. Stays '' in echo mode, because
            stderr is merged into stdout there.
        proc: the underlying ``subprocess.Popen`` object.
        returncode: exit status of the process, or None while it has not
            been waited on yet.
    """

    def __init__(self, cmd, print=True):
        """Start ``cmd`` as a child process (no shell is involved).

        Args:
            cmd: the command line. A string is split on whitespace, so no
                argument may contain spaces; a list/tuple of arguments is
                passed through unchanged and has no such restriction.
            print: if true, stderr is merged into stdout and ``run()`` echoes
                every line as it arrives. If false, stdout and stderr are
                captured separately and nothing is echoed.
        """
        self.cmd = cmd
        self.print = print
        self.stdout = ''
        # always defined, so callers can read runner.stderr in either mode
        self.stderr = ''
        argv = cmd.split() if isinstance(cmd, str) else list(cmd)
        stderr = subprocess.STDOUT if self.print else subprocess.PIPE
        self.proc = subprocess.Popen(argv, stdout=subprocess.PIPE,
                                     stderr=stderr, universal_newlines=True)
        self.returncode = None

    def __call__(self):
        """Yield the child's stdout line by line until end of stream.

        Once the stream is exhausted the pipe is closed and ``returncode``
        is set from ``proc.wait()``. The pipe is also closed if the
        generator is abandoned early or the consumer raises.
        """
        try:
            for stdout_line in iter(self.proc.stdout.readline, ''):
                yield stdout_line
        finally:
            self.proc.stdout.close()
        self.returncode = self.proc.wait()

    def run(self):
        """Consume the process output and wait for the process to exit.

        In echo mode each line is printed as it arrives and appended to
        ``stdout``. Otherwise ``stdout`` and ``stderr`` are filled in one go
        by ``communicate()``. ``returncode`` is set in both modes.
        """
        if self.print:
            lines = []
            try:
                for line in self():
                    print(line, end='')
                    lines.append(line)
            finally:
                # keep whatever was read, even if the loop was interrupted
                self.stdout += ''.join(lines)
        else:
            self.stdout, self.stderr = self.proc.communicate()
            self.returncode = self.proc.returncode

## Perform subset selection with DFS on the training set

In [None]:
"""
usage: dfs.py [-h] [--dataset_config PATH] [--dn_data DIR] [--device DEVICE]
              [--order ORDER] [--penalty PENALTY] [--lr LR] [--epochs EPOCHS]
              [--workers WORKERS] [--seed SEED] [--batch BATCH]
              [--path_output PATH]
              dataset
              
optional arguments:
  -h, --help            show this help message and exit
  --dataset_config PATH
                        yml file holding values for fn_train, fn_eval, ncols,
                        nrows, nrows_test, zero_based, neg_label, binary
                        (default: ./datasets.yml)
  --dn_data DIR         location of train/test files; mappings/datastats files
                        stored here (default: .)
  --device DEVICE       (default: 'cuda' if available, else 'cpu')
  --order ORDER         {1..12} (default: 4)
  --penalty PENALTY     (0,infty) (default: 10)
  --lr LR               Adam learning rate (default: 0.1)
  --epochs EPOCHS       (default: 1.0)
  --workers WORKERS     for the pytorch dataloader (default: 4)
  --seed SEED           pytorch seed (default: 0)
  --batch BATCH         target batchsize (default: 1000)
  --path_output PATH    output text file with selected features (default:
                        ./dfs.features.NUM_SELECTED_FEATURES.txt)
"""

# the urls for the train/test files are provided in the README
dataset = 'rcv1'
dn_data = os.path.expanduser('~/dfs_data')

order = 4
w_penalty = 2e1
# the output file name records the dataset, order and penalty of this run
path_output = '%s.dfs.%d.%g.features.txt' % (dataset,order,w_penalty)

# --order is passed explicitly so that the run always uses the same order that
# is recorded in path_output (previously dfs.py silently used its own default).
# NOTE: Runner splits a command string on whitespace, so dn_data and
# path_output must not contain spaces.
cmd = ('python dfs.py %s --dn_data %s --order %d --penalty %g --path_output %s'
       % (dataset, dn_data, order, w_penalty, path_output))

"""
for the first run on a dataset, DFS:
    locates newlines for the train file,
    estimates means/standard deviations,
    estimates spectral norm iteratively (10 iters).
the estimates are based on the first 10000 examples/labels.
the newline mappings and dataset statistics are then saved (in dn_data) and loaded for subsequent runs.
"""

# echo DFS output live; a non-zero exit status aborts the cell
runner = Runner(cmd)
runner.run()
assert not runner.returncode, 'Failed.'

## Train a linear model on the training set with the selected features using MISSION's SGD, then predict on the test set

In [None]:
# NOTE: mission_logistic_eval in MISSION/src/ needs to be compiled first

mission_eval_exec = 'MISSION/src/mission_logistic_eval'
# load the per-dataset settings; the with-block closes the file right away
with open('datasets.yml','rt') as f_datasets:
    datasets = yaml.safe_load(f_datasets)

# arguments: train file, eval file, selected-features file, negative label
cmd = ('%s %s %s %s %d'%(mission_eval_exec,
                         os.path.join(dn_data,datasets[dataset]['fn_train']),
                         os.path.join(dn_data,datasets[dataset]['fn_eval']),
                         path_output, datasets[dataset]['neg_label']))

# capture stdout (parsed in the next cell) and stderr (shown on failure)
runner = Runner(cmd, print=False)
runner.run()
assert not runner.returncode, 'Failed: %s'%runner.stderr

## Evaluate predictions

In [None]:
# Parse the captured stdout: every line is a row of space-separated floats.
# Column 0 is passed to roc_auc_score as the true label, column 1 as the score.
Y = [list(map(float, row.split(' '))) for row in runner.stdout.splitlines()]
print('DFS AUC on test: %g' % roc_auc_score(
    [pair[0] for pair in Y],
    [pair[1] for pair in Y],
))

## Compare w/ MISSION feature selection

In [None]:
# NOTE: mission_logistic in MISSION/src/ needs to be compiled first

mission_exec = 'MISSION/src/mission_logistic'

# count the non-empty trailing-stripped lines of the DFS output file
# (presumably one selected feature per line -- confirm against dfs.py);
# the with-block closes the file right away
with open(path_output,'rt') as f_features:
    n_feats = len(f_features.read().strip().splitlines())

# arguments: train file, eval file, feature count taken from the DFS run,
# negative label
cmd = ('%s %s %s %s %d'%(mission_exec,
                         os.path.join(dn_data,datasets[dataset]['fn_train']),
                         os.path.join(dn_data,datasets[dataset]['fn_eval']),
                         n_feats, datasets[dataset]['neg_label']))

# capture stdout (parsed in the next cell) and stderr (shown on failure)
runner = Runner(cmd, print=False)
runner.run()
assert not runner.returncode, 'Failed: %s'%runner.stderr

In [None]:
# Parse the captured stdout: every line is a row of space-separated floats.
# Column 0 is passed to roc_auc_score as the true label, column 1 as the score.
Y = [list(map(float, row.split(' '))) for row in runner.stdout.splitlines()]
print('MISSION AUC on test: %g' % roc_auc_score(
    [pair[0] for pair in Y],
    [pair[1] for pair in Y],
))