In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Notebook-wide configuration.
# Relative path of the processed Cleveland data file that load_data() reads.
dataset_file_path = "data/processed.cleveland.data"

In [3]:
def load_data(filename):
    '''
    Description: reads the specified header-less .csv file, names its 14
                 columns and splits it into a feature frame and a label column.
    Arguments: filename - path of the .csv file (passed straight to pd.read_csv)
    Return: data - dataframe holding all 14 named columns
            X - dataframe containing the features (every column except the last)
            y - series containing the labels (the last column, 'num')
    Raises: the original pd.read_csv error (e.g. FileNotFoundError) when the
            file cannot be read; "Unable to read .csv" is printed first.
            ValueError (from pandas) when the file does not have exactly
            14 columns.

    Column order expected in the file:
    > 0. age
    > 1. sex
    > 2. chest pain type (4 values)
    > 3. resting blood pressure
    > 4. serum cholesterol in mg/dl
    > 5. fasting blood sugar > 120 mg/dl
    > 6. resting electrocardiographic results (values 0, 1, 2)
    > 7. maximum heart rate achieved
    > 8. exercise induced angina
    > 9. oldpeak = ST depression induced by exercise relative to rest
    > 10. the slope of the peak exercise ST segment
    > 11. number of major vessels (0-3) colored by fluoroscopy
    > 12. thal: 3 = normal, 6 = fixed defect, 7 = reversible defect
    > 13. num: 0 = no presence, 1-4 = presence of heart disease
    '''

    # reading the data
    print("Reading .csv")
    try:
        data = pd.read_csv(filename, header=None)
    except Exception:
        # Report the failure, then re-raise the real error. Swallowing it here
        # would leave `data` unbound and crash below with a misleading
        # UnboundLocalError instead of the actual cause.
        print("Unable to read .csv")
        raise
    print("Finished reading .csv")

    # set column names (the file itself has no header row)
    attributes = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                  'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
    data.columns = attributes

    # features are every column but the last; the last column is the label
    X, y = data.iloc[:, 0:-1], data.iloc[:, -1]

    return data, X, y

In [4]:
# Read the file once: the full table plus its feature / label split.
data, X, y = load_data(dataset_file_path)

# Collapse the label to a boolean: True where the value is above 0
# (heart disease), False where it is 0 (no heart disease).
y = y.gt(0)

Reading .csv
Finished reading .csv


In [5]:
# Score every feature against the label with the chi-squared statistic;
# the selector is configured to keep the 10 highest-scoring features.
bestfeatures = SelectKBest(k=10, score_func=chi2)
fit = bestfeatures.fit(X, y)

# One single-column frame for the feature names and one for their scores.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)

In [6]:
# Put names and scores side by side so each row reads (feature, score).
featureScores = pd.concat((dfcolumns, dfscores), axis='columns')
featureScores.columns = ['Specs', 'Score']  # label the two columns

In [7]:
# Show the 10 rows with the highest 'Score', best first.
print(featureScores.nlargest(n=10, columns='Score'))

       Specs       Score
7    thalach  187.053104
11        ca   82.730613
9    oldpeak   68.570533
12      thal   65.221093
8      exang   35.508090
0        age   22.917697
4       chol   20.855084
3   trestbps   16.707463
2         cp   14.591587
6    restecg    8.134652
