This notebook filters the data, normalizes the features, and plots the correlations between the features.

In [17]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the raw training data into a DataFrame first. The path is relative,
# so the CSV is looked up in the notebook's working directory.
input_file = "cardio_train.csv"
cardio_train = pd.read_csv(input_file)


In [18]:
# Initial sanity checks: balance of the target classes and the gender split.

# Map the target onto 1/0. Because True == 1 and False == 0 in Python, values
# that are already 1/0 pass through unchanged; anything else becomes NaN.
cardio_map = {True: 1, False: 0}
cardio_train['cardio'] = cardio_train['cardio'].map(cardio_map)

# Rows with / without the 'cardio' flag set, counted through boolean masks.
num_obs = cardio_train.shape[0]
num_true = int((cardio_train['cardio'] == 1).sum())
num_false = int((cardio_train['cardio'] == 0).sum())
print("Number of True cases:  {0} ({1:2.2f}%)".format(num_true, (num_true / num_obs) * 100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false / num_obs) * 100))

# Gender split: code 1 is reported as female, code 2 as male.
num_all = cardio_train.shape[0]
num_women = int((cardio_train['gender'] == 1).sum())
num_men = int((cardio_train['gender'] == 2).sum())
print("Number of Female cases:  {0} ({1:2.2f}%)".format(num_women, (num_women / num_all) * 100))
print("Number of Male cases: {0} ({1:2.2f}%)".format(num_men, (num_men / num_all) * 100))

Number of True cases:  34979 (49.97%)
Number of False cases: 35021 (50.03%)
Number of Female cases:  45530 (65.04%)
Number of Male cases: 24470 (34.96%)


In [19]:

def remove_outliers(dataframe, column_name,  threshold_low, threshold_hi):
    """
    Keep only the rows whose value in ``column_name`` lies strictly between
    ``threshold_low`` and ``threshold_hi``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter. It is not modified.
    column_name : str
        Column whose values are tested against the thresholds.
    threshold_low, threshold_hi : scalar
        Exclusive lower / upper bounds. Rows equal to either bound are
        removed, as are rows where the value is NaN (both comparisons are
        False for NaN).

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows, with their original index
        labels. Returning a copy lets callers assign columns on the result
        without triggering SettingWithCopyWarning.

    Side effects
    ------------
    Prints the number of rows removed.
    """
    values = dataframe[column_name]
    # Vectorised element-wise AND of the two comparisons (no Python-level loop).
    column_valid = (values > threshold_low) & (values < threshold_hi)
    truncated_data = dataframe[column_valid].copy()
    removeAmount = len(dataframe) - len(truncated_data)
    print('Number removed:', removeAmount)
    return truncated_data

def plot_features(x,feature_names):
    """
    Show one 32-bin histogram per entry of ``feature_names``.

    ``x`` is indexed with each feature name (``x[name]``) to obtain the values
    to plot. The histogram of the feature called "bmi" is additionally written
    to 'BMI_graph_updated.png' before it is shown.
    """
    num_bins = 32
    for column in feature_names:
        plt.hist(x[column], num_bins)
        plt.xlabel(column)
        # Only the BMI histogram is persisted to disk.
        if column == "bmi":
            plt.savefig('BMI_graph_updated' + '.png', bbox_inches='tight')
        plt.show()
        
        
def normalize(X):
    """
    Standardize ``X``: subtract its mean, then divide by its standard deviation.

    The standard deviation is taken with ``np.std`` (ddof=0, i.e. the
    population form). A constant input has zero standard deviation, so the
    division then yields NaN rather than raising.
    """
    centered = X - np.mean(X)        # shift to zero mean
    return centered / np.std(centered)  # scale to unit standard deviation


def plot_corr(df,size=11):
    """
    Draw the absolute pairwise correlation matrix of ``df`` as a heat map.

    Parameters:
        df:   dataframe whose columns are correlated against each other
              (via ``df.corr()``).
        size: width and height of the square figure.

    Displays:
        a colour-coded matrix of |correlation| with a colour bar and the
        column names as tick labels on both axes. Per the original note,
        yellow marks highly correlated pairs (this depends on the active
        colormap).
    """
    corr_matrix = df.corr()
    labels = corr_matrix.columns
    tick_positions = range(len(labels))

    fig, ax = plt.subplots(figsize=(size, size))
    heatmap = ax.matshow(np.abs(corr_matrix))  # colour encodes |correlation|
    plt.colorbar(heatmap)
    plt.xticks(tick_positions, labels)  # one x tick per column
    plt.yticks(tick_positions, labels)  # one y tick per column
    plt.show()
   

In [20]:

def print_corr_info(x):
    """
    Print and plot the absolute correlations in ``x``, strongest first.

    Parameters
    ----------
    x : pandas.Series
        Correlation values indexed by feature name, e.g. one column of a
        correlation matrix such as ``df.corr()['cardio']``.

    Behaviour
    ---------
    * Prints the absolute values and the descending sort order.
    * Skips the single largest absolute value. When ``x`` is a column of a
      correlation matrix this is the entry for the column itself (1.0).
    * Prints one "name value" line per remaining feature and shows them in a
      bar chart.
    """
    feat_names = x.index
    unsorted_corr = np.abs(x)
    print("Unsorted Abs", unsorted_corr)
    sorted_order = np.argsort(unsorted_corr)[::-1]
    print('sorted order', sorted_order)

    # argsort yields integer *positions*. Use them positionally (.iloc / Index
    # indexing) instead of Series[int], which treats integers as labels on a
    # name-indexed Series and only worked through a deprecated fallback.
    positions = np.asarray(sorted_order)[1:]  # [1:] drops the top entry
    sorted_corr = unsorted_corr.iloc[positions]
    sorted_feat_names = feat_names[positions]
    for name, value in zip(sorted_feat_names, sorted_corr):
        print('{0:<20} {1:.4f}'.format(name, value))

    plt.bar(sorted_feat_names, sorted_corr, width = 0.3, color=(0.75, 0.6, 0, 1))
    ax = plt.gca()
    plt.grid()
    plt.title('Correlation Analysis')
    plt.xlabel('Feature')
    plt.ylabel('Correlation')
    plt.setp(ax.get_xticklabels(), fontsize = 10, rotation = 45)
    plt.show()


In [21]:
# Add BMI to the dataframe using BMI = weight / height^2. Height is divided
# by 100 first (presumably converting cm to m -- confirm against the data
# dictionary).
#
# The computation is vectorised rather than looping over rows, and the cell is
# guarded so that re-running it does not fail on the already-inserted 'bmi'
# column or on the already-dropped columns. Unlike the row loop, a height of 0
# yields an infinite BMI instead of raising ZeroDivisionError.
if 'bmi' not in cardio_train.columns:
    height = cardio_train['height'] / 100
    weight = cardio_train['weight']
    bmi = weight / height ** 2
    cardio_train.insert(0, 'bmi', bmi)  # keep 'bmi' as the first column

# 'weight' and 'height' are replaced by 'bmi'; 'id' is dropped as well.
cardio_train.drop(columns=['weight', 'height', 'id'], inplace=True, errors='ignore')

# Every column except the last one is treated as a feature.
feature_cols = list(cardio_train.columns[:-1])


# Filter out-of-range rows one column at a time. Bounds are exclusive, and each
# call prints how many rows it removed.
bmi_truncated_data = remove_outliers(
    cardio_train, column_name='bmi', threshold_low=10, threshold_hi=50
)
ap_hi_truncated_data = remove_outliers(
    bmi_truncated_data, column_name="ap_hi", threshold_low=0, threshold_hi=250
)
final_data = remove_outliers(
    ap_hi_truncated_data, column_name="ap_lo", threshold_low=0, threshold_hi=250
)



Number removed: 250
Number removed: 47
Number removed: 960


In [23]:
# Keep only rows where ap_hi (systolic) is greater than ap_lo (diastolic).
valid = final_data['ap_hi'] > final_data['ap_lo']
truncated_valid_bp_data = final_data[valid].copy()
print('Removed {numsamp:d} samples'.format(numsamp=len(final_data) - len(truncated_valid_bp_data)))

# Carry the filtered frame forward. The following cells read `final_data`, so
# without this rebinding the rows reported as removed above would still be
# plotted, normalized and exported. Re-running this cell then reports 0.
final_data = truncated_valid_bp_data


Removed 275 samples


In [None]:
# Correlation overview: heat map of all pairwise correlations first, then the
# ranked correlations of every column with the 'cardio' target.
plot_corr(df=final_data)

corr = final_data.corr()
x = corr.loc[:, 'cardio']
print_corr_info(x)

#plt.savefig('Correlation Graph' + '.png', bbox_inches='tight')

plt.show()

In [None]:
print(feature_cols)
print(final_data.head())

# final_data comes out of boolean filtering. Take an explicit copy before
# assigning columns so the assignments below act on an independent frame
# and do not raise SettingWithCopyWarning.
final_data = final_data.copy()
for feature in feature_cols:
    final_data[feature] = normalize(final_data[feature])  # zero mean, unit std per column

# Give the columns more readable names before exporting.
final_data = final_data.rename(
    columns={
        'ap_hi': 'systolic',
        'ap_lo': 'diastolic',
        'gluc': 'glucose',
        'alco': 'alcoholic',
        'smoke': 'smoking',
    }
)
final_data.to_csv("cardio_train_filtered.csv", index=False)

