# Feature evaluation testing
#### Algos tested:
1. reliefF
2. reliefe
3. Mutual information
4. XGBoost
5. RF classifier
6. Linear regression
7. Permutation importance
8. Chi-squared

TODO: Check more packages

TODO: export good candidates into rank_algos.py

TODO: evaluate rankings for small subsets

In [37]:
import helper_functions
import pandas as pd
import numpy as np
import ReliefF
import reliefe
import rank_algos
from sklearn.pipeline import make_pipeline
from skrebate import SURF
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import xgboost as xgb

In [2]:
# Load the complete dataset from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/full_data.csv')

In [3]:
# Subsample the full dataset (frac=0.001, i.e. 0.1% of it, with a fixed seed
# of 0) and split the result into X and y.
# NOTE(review): presumably X is the feature matrix and y the target vector —
# confirm against helper_functions.sample_and_split.
X, y = helper_functions.sample_and_split(data, frac=0.001, seed=0)

### reliefF

In [4]:
# ReliefF: fit the ranker, then map every feature index to its own score.
#
# `top_features` holds feature indices ordered from best to worst, whereas
# `feature_scores` is indexed by feature position (it is NOT sorted). Zipping
# the indices against the reversed score array therefore attaches scores to
# the wrong features, so each score is looked up by its feature index instead.
# Iterating over `top_features` keeps the dict ordered from best to worst.
reliefF_ranker = ReliefF.ReliefF()
reliefF_ranker.fit(X, y)
reliefF_results = {
    idx: reliefF_ranker.feature_scores[idx]
    for idx in reliefF_ranker.top_features
}

### reliefe - running time is prohibitively long, so it is unlikely to be useful

In [5]:
# ReliefE: fit the ranker and key each importance by its feature position.
reliefe_ranker = reliefe.ReliefE()
reliefe_ranker.fit(X, y)
reliefe_results = dict(enumerate(reliefe_ranker.feature_importances_))

### Mutual information

In [6]:
# Mutual information: score every feature with the project helper and key
# each score by its feature position.
mutual_info_results = dict(enumerate(rank_algos.mutual_info_score(X, y)))

### XGBoost

In [14]:
# XGBoost: fit a default classifier and key its per-feature importances by
# feature position.
xgb_ranker = xgb.XGBClassifier()
xgb_ranker.fit(X, y)
xgb_results = dict(enumerate(xgb_ranker.feature_importances_))

### Random Forest Classifier

In [13]:
# Random forest: fit a default classifier and key its per-feature importances
# by feature position.
rf_ranker = RandomForestClassifier()
rf_ranker.fit(X, y)
rf_results = dict(enumerate(rf_ranker.feature_importances_))

### Linear regression

In [45]:
# Linear regression: fit an ordinary least-squares model and key each fitted
# coefficient by its feature position.
# NOTE(review): the coefficients are signed and depend on feature scale, so
# presumably any ranking built on them should compare magnitudes — confirm.
lr_ranker = LinearRegression()
lr_ranker.fit(X, y)
lr_results = dict(enumerate(lr_ranker.coef_))

### Permutation Importance

In [36]:
# Permutation importance: fit a default k-NN classifier, then measure how much
# its score changes when each feature is shuffled (5 repeats, fixed seed, all
# available cores). The importances are computed on the same X, y the model
# was fitted on.
perm_model = KNeighborsClassifier()
perm_model.fit(X, y)
perm_results = permutation_importance(
    perm_model,
    X,
    y,
    n_repeats=5,
    random_state=0,
    n_jobs=-1,
)
# Show the mean importance of every feature across the repeats.
perm_results.importances_mean

array([-2.54291163e-03, -1.90718373e-03, -6.35727908e-03, -3.05149396e-03,
        0.00000000e+00,  7.62873490e-04,  0.00000000e+00, -3.43293071e-03,
       -1.65289256e-03, -1.01716465e-03,  2.54291163e-04,  1.78003814e-03,
        1.78003814e-03,  0.00000000e+00,  5.21296885e-03,  1.27145582e-04,
        0.00000000e+00, -1.27145582e-03, -3.17863954e-03, -3.17863954e-03,
       -6.35727908e-04, -5.46726001e-03,  5.08582327e-04, -6.35727908e-04,
       -1.39860140e-03, -4.19580420e-03, -3.17863954e-03,  5.08582327e-04,
       -1.78003814e-03, -6.66133815e-17, -6.66133815e-17,  2.03432931e-03,
       -5.08582327e-04, -2.67005722e-03, -1.39860140e-03, -1.01716465e-03,
        1.52574698e-03,  1.01716465e-03,  6.35727908e-04, -1.52574698e-03,
       -5.08582327e-04,  1.65289256e-03, -2.16147489e-03,  1.65289256e-03,
       -2.28862047e-03, -3.05149396e-03, -5.08582327e-04,  2.54291163e-04,
        1.52574698e-03, -5.08582327e-04, -1.27145582e-04, -1.65289256e-03,
        1.65289256e-03, -

### Chi-Squared

In [43]:
# Chi-squared: score all features (k="all" keeps every one) and key each
# score by its feature position.
cs_ranker = SelectKBest(score_func=chi2, k="all")
cs_ranker.fit(X, y)
cs_results = dict(enumerate(cs_ranker.scores_))
# Show the scores rescaled by the largest one, so the top score becomes 1.
np.array(list(cs_results.values())) / np.max(cs_ranker.scores_)

array([2.22328058e-03, 8.09056739e-04, 1.07480566e-04, 1.29943673e-03,
       9.75816874e-07, 2.53735300e-03, 5.38888520e-07, 1.63706550e-06,
       5.46708858e-04, 1.21500156e-03, 4.67873961e-04, 8.69789416e-04,
       2.52597607e-05, 7.11428003e-08, 4.37042483e-03, 1.40376864e-06,
       3.31636034e-05, 4.85055367e-04, 9.79206347e-05, 2.20614904e-04,
       3.48664665e-04, 2.12898870e-03, 3.64047017e-03, 2.25474590e-03,
       1.31430532e-05, 1.75185521e-04, 1.48502419e-03, 3.47907716e-04,
       6.83131802e-04, 6.23105782e-05, 2.38383158e-05, 1.90815514e-04,
       2.70343235e-03, 4.61411664e-04, 9.25568476e-04, 2.25976068e-04,
       4.51959405e-04, 6.37791410e-05, 6.97239513e-04, 1.17568631e-03,
       2.74715390e-04, 1.16152818e-03, 1.56777989e-03, 1.26778699e-04,
       7.66137713e-04, 3.65602072e-04, 2.80000755e-04, 1.27275641e-03,
       3.23139231e-03, 3.98035422e-04, 2.97548051e-03, 3.66323871e-03,
       3.93944109e-03, 1.22700940e-04, 2.24816740e-04, 1.42372371e-03,
      