# re-running the feature selection analysis

In [36]:
%matplotlib inline
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import csv as csv
from sklearn import metrics
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import Lasso

In [39]:
# Nicely prints coefficients of linear models [0].
# [0]: http://blog.datadive.net/selecting-good-features-part-ii-linear-models-and-regularization/
def prettyprint(coefs, names=None, sort=False, n_coefs=20):
    if names == None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst, key = lambda x:-np.abs(x[0]))
    return " + \n".join("%s * %s" % (round(coef, 3), name) for coef, name in lst)

### Data IO and variable selection

In [4]:
aging      = pd.read_csv('/Users/jorge/Dropbox/research/data/2015-11/lassomodel/data/madb_intclasses_use.csv', header=0).astype(np.float) # Full dataset
superagers = pd.read_csv('/Users/jorge/Dropbox/research/data/2015-11/lassomodel/data/super-agers.csv', header=0).astype(np.float)         # Only super-agers
mci        = pd.read_csv('/Users/jorge/Dropbox/research/data/2015-11/lassomodel/data/mcis.csv', header=0).astype(np.float)                # Only MCIs
train_set  = pd.read_csv('/Users/jorge/Dropbox/research/data/2015-11/lassomodel/data/train_data.csv', header=0).astype(np.float)          # Remaining set from below
test_set   = pd.read_csv('/Users/jorge/Dropbox/research/data/2015-11/lassomodel/data/test_data.csv', header=0).astype(np.float)           # Small set with mix of all

In [29]:
aging.columns      = map(str.upper, aging.columns)
superagers.columns = map(str.upper, superagers.columns)
mci.columns        = map(str.upper, mci.columns)
train_set.columns  = map(str.upper, train_set.columns)
test_set.columns   = map(str.upper, test_set.columns)

In [30]:
col = list(aging.columns.values)
col.remove('SUBJECT')
col.remove('RAVLT_DEL')

In [31]:
superagers

Unnamed: 0,SUBJECT,CLASS,RAVLT_DEL,AGE,EDUCATION,SEX_NUMBERIC,ANIMALS,BNT_30,LM_1,LM_2,...,C3_BETAPOWER_FIXED,C3_GAMMAPOWER_FIXED,O1_DELTAPOWER_FIXED,O1_THETAPOWER_FIXED,O1_ALPHAPOWER_FIXED,O1_ALPHA1POWER_FIXED,O1_ALPHA2POWER_FIXED,O1_ALPHA3POWER_FIXED,O1_BETAPOWER_FIXED,O1_GAMMAPOWER_FIXED
0,101.0,2.0,3.0,80.84873,15.0,2.0,18.0,27.0,10.0,9.0,...,0.36673,0.020071,4.1044,4.8808,3.6693,3.6693,2.1157,0.97352,0.47525,0.10681
1,104.0,2.0,8.0,83.61396,14.0,2.0,22.0,27.0,16.0,15.0,...,0.58826,0.052404,3.5768,6.7582,23.6668,23.6668,36.5612,6.3716,1.3978,0.091309
2,105.0,2.0,7.0,80.69541,18.0,2.0,17.0,28.0,20.0,15.0,...,0.21081,0.069191,4.7605,1.3096,0.53454,0.53454,0.60461,0.45338,0.20894,0.047831
3,107.0,3.0,11.0,91.57837,14.0,2.0,16.0,24.0,12.0,10.0,...,0.2219,0.040332,1.7243,1.1502,1.658,1.658,2.5476,1.0033,0.3387,0.053188
4,108.0,2.0,6.0,80.42984,16.0,2.0,17.0,30.0,16.0,17.0,...,0.26934,0.032368,1.6509,0.70736,2.0272,2.0272,3.8226,2.2,0.29825,0.04938
5,109.0,2.0,9.0,85.39357,14.0,2.0,16.0,29.0,11.0,10.0,...,0.083566,0.032703,1.4273,0.68391,0.65195,0.65195,0.50853,0.49838,0.19491,0.049281
6,110.0,2.0,6.0,80.76112,20.0,2.0,19.0,29.0,18.0,16.0,...,0.2011,0.040893,2.1515,0.73455,1.2196,1.2196,1.1431,0.86858,0.28955,0.052375
7,113.0,3.0,10.0,83.34839,14.0,2.0,20.0,28.0,16.0,16.0,...,0.42152,0.043721,2.7651,3.4892,4.0525,4.0525,4.6633,1.7231,1.4536,0.06323
8,114.0,2.0,5.0,80.12868,18.0,2.0,21.0,26.0,13.0,8.0,...,0.55598,0.088651,3.9469,2.6632,3.1626,3.1626,2.5106,1.693,0.55662,0.079901
9,115.0,3.0,10.0,84.73374,14.0,2.0,24.0,26.0,9.0,10.0,...,0.093661,0.01898,2.0989,0.95206,1.0637,1.0637,1.6378,0.68903,0.23498,0.058289


### Construct training sets and their targets

In [32]:
X_aging, y_aging = aging[col], aging[interest]
X_sa, y_sa       = superagers[col], superagers[interest]
X_mci, y_mci     = mci[col], mci[interest]
X_train, y_train = train_set[col], train_set[interest]
X_test, y_test   = test_set[col], test_set[interest]

score = 'mean_squared_error'
tuned_params_lasso = [{'alpha': np.linspace(-1, 1, 100),
                       'normalize': [True, False]}]

### Across whole dataset

In [40]:
skf = StratifiedKFold(y_aging, n_folds=6)
# regr_cv = GridSearchCV(Lasso(max_iter=100000), tuned_params_lasso, cv=skf)
# regr_cv.fit( X_aging, y_aging )
# regr = regr_cv.best_estimator_

regr = 

print("Best estimator for WHOLE DATASET: \n{0}\n".format(regr))
print("Percent variance explained: {0}".format(regr.score( X_aging, y_aging)))
print("Coefficients found: \n{0}\n".format(prettyprint(regr.coef_, col, sort=True)))



Best estimator for WHOLE DATASET: 
Lasso(alpha=0.010101010101010166, copy_X=True, fit_intercept=True,
   max_iter=100000, normalize=True, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

Percent variance explained: 0.8700043573799712
Coefficients found: 
11.542 * RB2_ACC_F + 
-7.21 * RB2_N_ACC + 
3.468 * RB_CHECK_N_ACC + 
2.243 * C19_DELTAPOWER + 
-0.994 * RB1_N_ACC + 
0.925 * O1_DIAGNOSIS_1.42 + 
0.838 * C19_DELTAPOWER_FIXED + 
-0.46 * O1_DELTAPOWER_FIXED + 
-0.431 * EDUCATION + 
0.407 * MMSE + 
-0.3 * C19_ALPHA32RATIO + 
-0.287 * C3_ALPHATHETARATIO_FIXED + 
0.244 * BNT_30 + 
0.24 * DSPAN_B + 
-0.2 * C3_TF + 
0.174 * O1_ALPHA3POWER_FIXED + 
0.156 * LM_2 + 
0.126 * C3_ALPHA2POWER + 
0.09 * ANIMALS + 
-0.067 * C3_DELTAPOWER_FIXED + 
0.045 * DSYMBOL + 
-0.044 * AGE + 
0.031 * GNG_GO_RT + 
0.024 * LM_1 + 
0.012 * O1_THETAPOWER_FIXED + 
0.003 * RB2_F_RT + 
0.001 * GNG_GO_HR + 
0.001 * ALPHATHETA_DIAGNOSIS_33 + 
-0.0 * RB1_N_RT + 
-0

In [43]:
# plot coefficient progression
m_log_alphas = -np.log10(regr_cv.best_estimator_.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, regr.coef_path_.T)
plt.axvline(-np.log10(regr.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')

AttributeError: 'Lasso' object has no attribute 'alphas_'

In [45]:
regr_cv.grid_scores_

[mean: -67315821285.81667, std: 106497666267.27710, params: {'normalize': True, 'alpha': -1.0},
 mean: -1634884391092.78711, std: 4068248207087.63086, params: {'normalize': False, 'alpha': -1.0},
 mean: -64623276711.89996, std: 102238304541.60092, params: {'normalize': True, 'alpha': -0.97979797979797978},
 mean: -1569496912616.50024, std: 3905534181867.81201, params: {'normalize': False, 'alpha': -0.97979797979797978},
 mean: -61985866452.82086, std: 98065757740.28726, params: {'normalize': True, 'alpha': -0.95959595959595956},
 mean: -1505442553270.33276, std: 3746141364379.60596, params: {'normalize': False, 'alpha': -0.95959595959595956},
 mean: -56666248265.44531, std: 86751739924.24515, params: {'normalize': True, 'alpha': -0.93939393939393945},
 mean: -1442722663548.33276, std: 3590069234014.60645, params: {'normalize': False, 'alpha': -0.93939393939393945},
 mean: -54254859467.40997, std: 83059908912.62259, params: {'normalize': True, 'alpha': -0.91919191919191923},
 mean: -138