In [24]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the processed study table: the first CSV column becomes the index and
# the 'Disease' column is read as text.
data = pd.read_csv(
    'study_72_processed.csv',
    index_col=0,
    dtype={'Disease': str},
)

In [3]:
# The rows labelled 'npeaks', 'pcgroup' and 'drt' hold per-feature annotations;
# pull them out (without the 'Disease' label column) as `feature_data`.
feature_data = (
    data.transpose()[['npeaks', 'pcgroup', 'drt']]
    .transpose()
    .drop('Disease', axis=1)
)
# All remaining rows (label column included) form the table that is later
# split into predictors and target.
data_d = (
    data.transpose()
    .drop(labels=['npeaks', 'pcgroup', 'drt'], axis=1)
    .transpose()
)

In [4]:
# Separate the predictors from the raw 'Disease' label column.
X = data_d.drop('Disease', axis=1)
y = data_d['Disease']
# One-hot encode the labels and keep only the first indicator column as the
# binary target array.
dummies = pd.get_dummies(y)
y = dummies.iloc[:, 0].values

In [5]:
# Standardise the 'npeaks' and 'drt' annotation rows across features;
# 'pcgroup' is left untouched.
feature_scaler = StandardScaler()
feature_data_scaled = feature_scaler.fit_transform(feature_data.T[['npeaks', 'drt']])
# Write the result back through .loc on feature_data itself. Assigning into
# `feature_data.T[...]` only reaches the original frame when the transpose
# happens to be a view, which pandas does not guarantee (and never is under
# copy-on-write), so the scaling could be silently lost.
# fit_transform returns an (n_features, 2) array, hence the transpose.
feature_data.loc[['npeaks', 'drt']] = feature_data_scaled.T

In [6]:
# train_test_split lives in sklearn.model_selection (the same module this
# notebook already imports GridSearchCV from); the legacy
# sklearn.cross_validation module was deprecated and later removed.
from sklearn.model_selection import train_test_split



In [7]:
# Hold out 20% of the samples for the final evaluation. The seed is fixed so
# that restarting the kernel and re-running the notebook yields the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [8]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Filter, and optionally aggregate, feature columns using per-feature annotations.

    Parameters
    ----------
    feature_data : pandas.DataFrame
        Annotation table with rows labelled 'npeaks', 'pcgroup' and 'drt' and
        one column per feature. It is expected to share its column labels with
        the frame passed to ``transform``.
    npeaks : float, default 0
        Features whose 'npeaks' annotation is below this value are dropped.
    drt : float, default 0
        Features whose 'drt' annotation is below this value are dropped.
    group : bool, default False
        If True, the surviving features are averaged within each 'pcgroup'
        value; if False, they are returned individually.
    log_scale : bool, default False
        If True, missing values are replaced by 1 and log10 is applied (so
        missing entries end up as 0); if False, missing values are replaced
        by 0 and the values are used as-is.
    """

    def __init__(self, feature_data, npeaks=0, drt=0, group=False, log_scale=False):
        # Parameters are stored unmodified so that get_params / set_params
        # (and therefore GridSearchCV) can read and override them.
        self.npeaks = npeaks
        self.drt = drt
        self.group = group
        self.feature_data = feature_data
        self.log_scale = log_scale

    def fit(self, x, y=None):
        """No-op: the transformer learns nothing from the data. Returns self."""
        return self

    def transform(self, data):
        """Return the selected (and optionally grouped) features as a NumPy array.

        Parameters
        ----------
        data : pandas.DataFrame
            One row per sample, one column per feature.

        Returns
        -------
        numpy.ndarray
            One row per sample; one column per surviving feature, or per
            'pcgroup' value when ``group`` is True.
        """
        # Cast first so that object-dtype input (numbers stored as objects)
        # reaches np.log10 as real floats.
        data = data.astype(float)
        if self.log_scale:
            data = data.fillna(1).apply(np.log10)
        else:
            data = data.fillna(0)

        # Append the annotation rows and flip the table: afterwards each row is
        # a feature and 'npeaks' / 'pcgroup' / 'drt' are ordinary columns.
        # Uses the instance's own annotation table, not a notebook global.
        data = pd.concat([data, self.feature_data], axis=0).T

        # A feature is rejected when either annotation is strictly below its
        # threshold. NaN annotations never compare as "below", so such
        # features are kept.
        below_threshold = (
            (data['npeaks'].astype(float) < self.npeaks)
            | (data['drt'].astype(float) < self.drt)
        )
        data = data.loc[~below_threshold].drop(['npeaks', 'drt'], axis=1)

        if self.group:
            # Average all features sharing a pcgroup, then go back to
            # samples-by-columns orientation.
            data = data.astype(float).groupby('pcgroup').mean().T.values
        else:
            data = data.astype(float).drop('pcgroup', axis=1).T.values
        return data

In [9]:
# Individual pipeline stages.
feature_selector = FeatureSelector(
    feature_data=feature_data,
    npeaks=0.5,
    drt=0.5,
    group=True,
    log_scale=True,
)
scaler = StandardScaler()
pca = PCA(n_components=2)
logistic = LogisticRegression()

# The step names below are the prefixes used by the hyper-parameter grid.
estimators = [
    ('select_features', feature_selector),
    ('scale', scaler),
    ('reduce_dim', pca),
    ('clf', logistic),
]
pipe = Pipeline(estimators)

In [25]:
# Candidate thresholds for the 'npeaks' and 'drt' annotations.
npeaks = [0, 0.2, 0.5]
drts = [0, 0.2, 0.5]
# Log-spaced regularisation strengths; not referenced by the grid below.
Cs = np.logspace(-4, 4, 3)

# Keys follow the '<step>__<parameter>' convention of the pipeline; a bare step
# name ('scale', 'clf') swaps the whole step for each listed estimator.
param_grid = dict(
    select_features__npeaks=npeaks,
    select_features__drt=drts,
    select_features__group=[True, False],
    select_features__log_scale=[True],
    scale=[StandardScaler()],
    reduce_dim__n_components=[None, 5, 50],
    clf=[
        # The L1 penalty needs a solver that supports it. liblinear does (and
        # was the historical default); the lbfgs default of newer scikit-learn
        # releases rejects penalty='l1'.
        LogisticRegression(C=1000, penalty='l1', solver='liblinear'),
        LinearDiscriminantAnalysis(),
        RandomForestClassifier(),
    ],
)

In [None]:
# Exhaustive search over param_grid; n_jobs=-1 runs the fits in parallel and
# verbose=9 prints progress for the individual fits.
estimator = GridSearchCV(
    pipe,
    param_grid=param_grid,
    verbose=9,
    n_jobs=-1,
)
estimator.fit(X_train, y_train)

In [12]:
# Tabulate the cross-validation results collected by the grid search.
results = pd.DataFrame.from_dict(estimator.cv_results_)
results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__C,param_clf__penalty,param_reduce_dim__n_components,param_scale,param_select_features__drt,param_select_features__group,...,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,5.226706,3.993078,0.684729,0.692121,0.0001,l2,,"StandardScaler(copy=True, with_mean=True, with...",0,True,...,0.705882,0.681481,0.661765,0.703704,0.686567,0.691176,0.259680,0.086443,0.018101,0.009097
1,4.718307,4.121351,0.684729,0.694572,0.0001,l2,,"StandardScaler(copy=True, with_mean=True, with...",0,True,...,0.705882,0.681481,0.661765,0.703704,0.686567,0.698529,0.071623,0.088945,0.018101,0.009494
2,5.355288,3.828359,0.694581,0.699473,0.0001,l2,,"StandardScaler(copy=True, with_mean=True, with...",0,True,...,0.705882,0.681481,0.661765,0.703704,0.716418,0.713235,0.466248,0.115597,0.023683,0.013304
3,4.955198,3.788267,0.694581,0.697023,0.0001,l2,,"StandardScaler(copy=True, with_mean=True, with...",0,False,...,0.705882,0.681481,0.661765,0.703704,0.716418,0.705882,0.483461,0.014992,0.023683,0.011025
4,4.278636,3.695289,0.694581,0.699473,0.0001,l2,,"StandardScaler(copy=True, with_mean=True, with...",0,False,...,0.705882,0.681481,0.661765,0.703704,0.716418,0.713235,0.106192,0.018094,0.023683,0.013304
5,4.501654,3.671533,0.684729,0.709277,0.0001,l2,,"StandardScaler(copy=True, with_mean=True, with...",0,False,...,0.705882,0.681481,0.661765,0.703704,0.686567,0.742647,0.242606,0.028505,0.018101,0.025280
6,4.554237,4.393705,0.684729,0.692121,0.0001,l2,,"StandardScaler(copy=True, with_mean=True, with...",0.2,True,...,0.705882,0.681481,0.661765,0.703704,0.686567,0.691176,0.267409,0.991077,0.018101,0.009097
7,6.591768,4.627092,0.684729,0.694572,0.0001,l2,,"StandardScaler(copy=True, with_mean=True, with...",0.2,True,...,0.705882,0.681481,0.661765,0.703704,0.686567,0.698529,0.419671,0.356342,0.018101,0.009494
8,8.375657,4.961098,0.694581,0.699473,0.0001,l2,,"StandardScaler(copy=True, with_mean=True, with...",0.2,True,...,0.705882,0.681481,0.661765,0.703704,0.716418,0.713235,0.656714,0.298754,0.023683,0.013304
9,5.573445,4.151487,0.694581,0.697023,0.0001,l2,,"StandardScaler(copy=True, with_mean=True, with...",0.2,False,...,0.705882,0.681481,0.661765,0.703704,0.716418,0.705882,0.449605,0.559134,0.023683,0.011025


In [13]:
# Predict labels for the held-out test samples with the fitted grid search.
y_pred = estimator.predict(X_test)

In [14]:
# Show the predicted labels for the held-out test samples.
y_pred

array([0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1], dtype=uint8)

In [15]:
from sklearn.metrics import confusion_matrix

# Compare the true test labels with the predictions. The result gets its own
# name: binding it to `confusion_matrix` would replace the imported function
# with an array, so any later call to confusion_matrix(...) would fail.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

[[ 8  2]
 [ 7 34]]


In [21]:
# Inspect the first step ('select_features') of the best pipeline found by the
# grid search to see which FeatureSelector settings were chosen.
estimator.best_estimator_.steps[0]

('select_features', FeatureSelector(drt=0,
         feature_data=                  0          1        2         3         4         5  \
 new_index
 npeaks     0.881725  -0.870168  1.80835 -0.392379   2.70601 -0.602317
 pcgroup          41        612      114       127       180        4...        41        41        41
 drt       -0.666853 -0.683101 -0.755192
 
 [3 rows x 10946 columns],
         group=False, log_scale=True, npeaks=0))