In [24]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cross_decomposition import PLSCanonical
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import logistic
from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools import add_constant

In [2]:
# Load the processed study table: the first CSV column becomes the row index and
# the 'Disease' column is parsed with dtype str.
data = pd.read_csv(
    filepath_or_buffer='study_19_processed.csv',
    index_col=0,
    dtype={'Disease': str},
)

In [3]:
# Split `data` by row label into per-feature metadata and everything else.
# feature_data: only the rows labelled 'npeaks', 'pcgroup' and 'drt', with the
# 'Disease' column removed.
feature_data = data.T[['npeaks','pcgroup','drt']].T.drop('Disease', axis=1)
# data_d: all remaining rows (presumably one per sample -- TODO confirm), with the
# 'Disease' column kept.
# NOTE(review): both frames go through a transpose round trip, which may leave them
# with object dtype -- confirm before relying on numeric dtypes downstream.
data_d = data.T.drop(labels=['npeaks','pcgroup', 'drt'], axis=1).T

In [4]:
# Separate the predictor columns from the 'Disease' label column.
X = data_d.drop('Disease', axis=1)
y = data_d['Disease']

# One-hot encode the label and keep only the first indicator column as an array.
dummies = pd.get_dummies(y)
y = dummies.iloc[:, 0].values

In [5]:
# Standardise the 'npeaks' and 'drt' metadata rows across features.
feature_scaler = StandardScaler()
feature_data_scaled = feature_scaler.fit_transform(feature_data.T[['npeaks', 'drt']])

# Write the scaled values back through .loc on feature_data itself. Assigning into
# `feature_data.T[[...]]` targets a temporary transposed frame, so whether
# feature_data is actually updated depends on pandas view/copy internals (and it is
# a silent no-op in current pandas).
# fit_transform returns one row per feature and one column per metadata field, while
# the .loc target has one row per metadata field, hence the transpose.
# Note: after this cell the FeatureSelector thresholds for npeaks/drt are compared
# against standardised values, not raw ones.
feature_data.loc[['npeaks', 'drt']] = feature_data_scaled.T

In [6]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `train_test_split` is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split



In [7]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split, and
# everything fitted on or saved from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Filter feature columns by metadata thresholds, optionally log-scaling and grouping.

    Parameters
    ----------
    feature_data : pandas.DataFrame
        Per-feature metadata. Must contain rows labelled 'npeaks', 'pcgroup' and
        'drt'; its columns are aligned with the feature columns given to
        ``transform``.
    npeaks : number, default 0
        Features whose 'npeaks' value is strictly below this are dropped.
    drt : number, default 0
        Features whose 'drt' value is strictly below this are dropped.
    group : bool, default False
        If True, the remaining features are averaged per 'pcgroup' value;
        otherwise 'pcgroup' is simply discarded.
    log_scale : bool, default False
        If True, missing and zero entries are replaced by 1 and log10 is applied;
        otherwise missing entries are replaced by 0.
    """

    def __init__(self, feature_data, npeaks=0, drt=0, group=False, log_scale=False):
        self.npeaks = npeaks
        self.drt = drt
        self.group = group
        self.feature_data = feature_data
        self.log_scale = log_scale

    def fit(self, x, y=None):
        """Do nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, data):
        """Return the selected (and optionally grouped) features as a NumPy array.

        Parameters
        ----------
        data : pandas.DataFrame
            One row per observation, one column per feature.

        Returns
        -------
        numpy.ndarray
            One row per observation; one column per surviving feature, or per
            'pcgroup' value when ``group`` is True.
        """
        if self.log_scale:
            # Missing and zero entries become 1 so that log10 maps them to 0.
            data = data.fillna(1).replace(0, 1).apply(np.log10)
        else:
            data = data.fillna(0)
        data = data.astype(float)

        # Append the metadata rows held by this instance (not a notebook-level
        # global), then transpose so each row is a feature and 'npeaks', 'pcgroup'
        # and 'drt' are ordinary columns.
        data = pd.concat([data, self.feature_data], axis=0).T

        # Collect the labels of features that fall below either threshold.
        below_threshold = (data['npeaks'] < self.npeaks) | (data['drt'] < self.drt)
        rejected = data.index[below_threshold.values]
        data = data.drop(rejected).drop(['npeaks', 'drt'], axis=1).astype(float)

        if self.group:
            data = data.groupby('pcgroup').mean()
        else:
            data = data.drop('pcgroup', axis=1)
        # Back to observations as rows.
        return data.T.values

In [9]:
# Preprocessing pipeline: feature selection (thresholds of 0, no grouping, log10
# scaling on) -> standardisation -> PCA with n_components left as None.
feature_selector = FeatureSelector(
    feature_data=feature_data,
    npeaks=0,
    drt=0,
    group=False,
    log_scale=True,
)
scaler = StandardScaler()
pca = PCA(n_components=None)
estimators = [
    ('select_features', feature_selector),
    ('scale', scaler),
    ('reduce_dim', pca),
]
pipe = Pipeline(estimators)

In [10]:
# Fit every pipeline step on the training rows and keep the transformed output.
X_fit = pipe.fit_transform(X_train)

In [28]:
# Persist the transformed training matrix and the matching training labels as CSV.
pd.DataFrame(data=X_fit).to_csv(path_or_buf='X_fit.csv')
pd.DataFrame(data=y_train).to_csv(path_or_buf='y_fit.csv')