In [170]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from sklearn import neighbors
from sklearn import svm
from sklearn import linear_model

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# metrics
from sklearn.metrics import accuracy_score, log_loss

# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# global variables
# Relative path to the Cleveland heart-disease data file.
# NOTE(review): no cell visible here reads this variable; presumably it is
# meant to be passed to load_data() -- TODO confirm.
dataset_file_path = 'data/processed.cleveland.data'

def load_data(filename):
    '''
    Reads the specified header-less .csv file and returns an X dataframe
    (features) and a y series (label).

    The file is expected to hold exactly 14 columns, in this order:
    > 0. age
    > 1. sex
    > 2. chest pain type (4 values)
    > 3. resting blood pressure
    > 4. serum cholesterol in mg/dl
    > 5. fasting blood sugar > 120 mg/dl
    > 6. resting electrocardiographic results (values 0, 1, 2)
    > 7. maximum heart rate achieved
    > 8. exercise induced angina
    > 9. oldpeak = ST depression induced by exercise relative to rest
    > 10. the slope of the peak exercise ST segment
    > 11. number of major vessels (0-3) colored by fluoroscopy
    > 12. thal: 3 = normal, 6 = fixed defect, 7 = reversible defect
    > 13. num: 0 = no presence, 4 = present

    Arguments: path (or anything else pd.read_csv accepts) of the .csv file
    Return: (X, y) where X holds the first 13 columns and y the last ('num')
    Raises: the original pd.read_csv error if the file cannot be read;
            ValueError if the file does not have 14 columns
    '''
    attributes = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                  'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']

    # reading the data
    print("Reading .csv")
    try:
        data = pd.read_csv(filename, header=None)
    except Exception:
        # Report the failure but re-raise: swallowing the error here would
        # only postpone the crash to the first use of `data` below and hide
        # the real cause (missing file, parse error, ...).
        print("Unable to read .csv")
        raise
    print("Finished reading .csv")

    # set column names (fails with ValueError on a column-count mismatch)
    data.columns = attributes

    # every column but the last is a feature; the last one is the label
    X, y = data.iloc[:, :-1], data.iloc[:, -1]

    return X, y

def preprocess_data(data):
    '''
    Standardizes the numerical columns and one-hot encodes the categorical
    columns of a feature dataframe.

    Arguments: Pandas Dataframe (X_train or X_test) containing the 13 feature
               columns named below
    Return: Preprocessed data as returned by ColumnTransformer.fit_transform
            (scaled numerical columns first, then the one-hot columns);
            column names and the index are not preserved

    NOTE: a fresh transformer is fitted on every call, so calling this
    separately on X_train and X_test scales each with its own mean/std.
    '''
    # categorical features mapped to the full list of values each may take,
    # so every call produces the same one-hot layout even when a value is
    # absent from `data`
    categorical_levels = {
        'sex': [0, 1],
        # chest pain type has 4 values; leaving 3 out made
        # handle_unknown='ignore' silently encode it as an all-zero row
        'cp': [1, 2, 3, 4],
        'fbs': [0, 1],
        'restecg': [0, 1, 2],
        'exang': [0, 1],
        'slope': [1, 2, 3],
        'ca': [0, 1, 2, 3],
        'thal': [3, 6, 7],
    }
    categorical_features = list(categorical_levels)
    categories = list(categorical_levels.values())
    numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

    # creating and applying ColumnTransformer
    # (columns not listed in either group are dropped by default)
    preprocessor = ColumnTransformer(
        transformers=[('num', StandardScaler(), numerical_features),
                      ('cat', OneHotEncoder(categories=categories,
                                            handle_unknown='ignore'),
                       categorical_features)],
        n_jobs=-1)

    return preprocessor.fit_transform(data)

In [142]:
# Feature files: the first CSV column ("Unnamed: 0") is dropped right after
# reading, leaving only the feature columns.
X_train = pd.read_csv('data/train_data.csv').drop(columns=["Unnamed: 0"])

# Label files have no header row: column 0 is dropped and the remaining
# column 1 is renamed to "label".
y_train = (
    pd.read_csv('data/train_labels.csv', header=None)
    .drop(columns=[0])
    .rename(columns={1: "label"})
)

X_test = pd.read_csv('data/test_data.csv').drop(columns=["Unnamed: 0"])

y_test = (
    pd.read_csv('data/test_labels.csv', header=None)
    .drop(columns=[0])
    .rename(columns={1: "label"})
)

In [143]:
# Score every training feature against the labels with the chi-squared test.
# The selector is configured with k=10, but only its per-feature scores are
# used below, so all features appear in the table.
bestfeatures = SelectKBest(k=10, score_func=chi2)
fit = bestfeatures.fit(X_train, y_train)

# One single-column frame for the feature names and one for the scores,
# placed side by side so each row pairs a feature with its score.
dfcolumns = pd.DataFrame(X_train.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['feature', 'Score']

# Show up to 14 rows, ranked from the highest score to the lowest.
print(featureScores.nlargest(14, 'Score'))

     feature       Score
7    thalach  130.638311
11        ca   67.494039
9    oldpeak   58.660191
12      thal   49.734096
4       chol   28.444992
8      exang   27.189943
0        age   14.737128
2         cp   12.416770
3   trestbps    9.563405
1        sex    7.681980
6    restecg    6.465654
10     slope    5.019455
5        fbs    0.105166


In [144]:
# Feature names sorted by ascending score (weakest feature first), re-indexed
# from 0 so positions match the rank.
feat_ordered = (
    featureScores
    .sort_values(['Score'])["feature"]
    .reset_index(drop=True)
)
# Plain-list copy that later cells consume.
feats = list(feat_ordered)

## Random Forest

In [171]:
# Random Forest sweep over feature subsets.
# feat_ordered is sorted by ascending chi2 score, so feats[n_dropped:] is the
# feature set left after discarding the n_dropped weakest features. The first
# iteration uses every feature, the last one only the top-scoring feature.
feats = list(feat_ordered)

# scikit-learn expects 1-D label arrays; flatten both splits the same way.
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

test_scores_RF = []
for n_dropped in range(len(feats)):
    kept = feats[n_dropped:]
    Xi_train = X_train[kept]

    # candidate values for max_features: 1 .. number of features in the subset
    f_list = list(range(1, Xi_train.shape[1] + 1))
    parameters = {'max_features': f_list}

    # finds best max_features parameter
    # - `iid` is no longer passed: it was removed from GridSearchCV in
    #   scikit-learn 0.24 and raises TypeError there.
    # - random_state pins the forest so a re-run reproduces the same scores.
    rf = GridSearchCV(ensemble.RandomForestClassifier(criterion="entropy",
                                                      n_estimators=100,
                                                      random_state=0),
                      parameters, cv=5,
                      return_train_score=True, n_jobs=-1)
    rf.fit(Xi_train.values, y_train_1d)
    test_score = rf.score(X_test[kept].values, y_test_1d)

    # Prepend so that the score for n features ends up at test_scores_RF[n-1].
    test_scores_RF.insert(0, test_score)

#### Note: the test score obtained with the top n features is stored at test_scores_RF[n-1]

In [169]:
# Test-set scores from the Random Forest sweep; index n-1 holds the score
# obtained with n features.
# NOTE(review): this cell's execution count (169) is lower than the sweep
# cell's (171), so the saved output may come from an earlier run -- re-run to confirm.
print(test_scores_RF)

[0.6333333333333333, 0.6833333333333333, 0.7333333333333333, 0.8, 0.8, 0.8, 0.8, 0.8333333333333334, 0.85, 0.8166666666666667, 0.8333333333333334, 0.8, 0.8333333333333334]


## KNN

In [None]:
# NOTE(review): disabled KNN version of the feature sweep. Before re-enabling:
# `n_list` is not defined in any cell visible here, and the `iid` argument was
# removed from GridSearchCV in scikit-learn 0.24.
# feats = list(feat_ordered)
# test_scores_KNN = []
# for i in range(len(feat_ordered)):
#     Xi_train = X_train[feats]
#     parameters = {'n_neighbors': n_list}
#     kn = GridSearchCV(neighbors.KNeighborsClassifier(), 
#                              parameters, cv = 5, iid = True, 
#                                   return_train_score = True, n_jobs=-1)
#     kn.fit(Xi_train.values, y_train.values.ravel())
#     test_score = kn.score(X_test[feats].values, y_test.values)
#     test_scores_KNN.insert(0, test_score)
#     del feats[0]

## Log Regression

In [None]:
# NOTE(review): disabled logistic-regression version of the feature sweep.
# Before re-enabling: GridSearchCV is called without a parameter grid (the
# `parameters` dict is never passed to it), and `max_features` is not a
# LogisticRegression parameter.
# feats = list(feat_ordered)
# test_scores_LR = []
# for i in range(len(feat_ordered)):
#     Xi_train = X_train[feats]
#     parameters = {'max_features': f_list}
#     lr = GridSearchCV(linear_model.LogisticRegression())
#     lr.fit(Xi_train.values, y_train.values.ravel())
#     test_score = lr.score(X_test[feats].values, y_test.values)
#     test_scores_LR.insert(0, test_score)
#     del feats[0]

## SVM with Linear Kernel

In [None]:
# NOTE(review): disabled linear-SVM version of the feature sweep. Before
# re-enabling: `c_list` is not defined in any cell visible here, and the `iid`
# argument was removed from GridSearchCV in scikit-learn 0.24.
# feats = list(feat_ordered)
# test_scores_SVM = []
# for i in range(len(feat_ordered)):
#     Xi_train = X_train[feats]
#     parameters = {'C': c_list}
#     svmclf = GridSearchCV(svm.SVC(kernel = 'linear'), parameters, cv = 5, 
#                       return_train_score = True, iid = True, n_jobs=-1)
#     svmclf.fit(Xi_train.values, y_train.values.ravel())
#     test_score = svmclf.score(X_test[feats].values, y_test.values)
#     test_scores_SVM.insert(0, test_score)
#     del feats[0]