In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mstats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Getting the data from csv file to pandas dataframe
df = pd.read_csv('heart.csv')

<b>Check for missing values</b>

In [3]:
# count the number of NaN values in each column
print(df.isnull().sum())

hr        0
bp        0
pkhr      0
sbp       0
mphr      0
age       0
baseef    0
gender    0
dtype: int64


In [4]:
df.isnull().values.any()

False

In [5]:
df.describe()

Unnamed: 0,hr,bp,pkhr,sbp,mphr,age,baseef,gender
count,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0
mean,75.290323,135.324373,120.550179,146.915771,78.569892,67.344086,55.603943,0.605735
std,15.415197,20.770106,22.567835,36.529544,15.121101,12.049415,10.321534,0.489131
min,42.0,85.0,52.0,40.0,38.0,26.0,20.0,0.0
25%,64.0,120.0,106.25,120.0,69.0,60.0,52.0,0.0
50%,74.0,133.0,122.0,141.0,78.0,69.0,57.0,1.0
75%,84.0,150.0,135.0,170.0,88.0,75.0,62.0,1.0
max,210.0,203.0,210.0,309.0,133.0,93.0,83.0,1.0


In [6]:
z, pval = mstats.normaltest(df)

#Using the confidence value as 95% 
if(pval.all() < 0.05):
    print('Not normal distribution')
else:
    print('normal distribution')

Not normal distribution


In [7]:
# Reading the target variable into y and removing it from original dataframe
print(df.head(5))

y = df['gender']
del(df['gender'])

print(y.head(5))
print(df.head(5))

   hr   bp  pkhr  sbp  mphr  age  baseef  gender
0  92  103   114   86    74   85      27       0
1  62  139   120  158    82   73      39       0
2  62  139   120  157    82   73      39       0
3  93  118   118  105    72   57      42       1
4  89  103   129  173    69   34      45       0
0    0
1    0
2    0
3    1
4    0
Name: gender, dtype: int64
   hr   bp  pkhr  sbp  mphr  age  baseef
0  92  103   114   86    74   85      27
1  62  139   120  158    82   73      39
2  62  139   120  157    82   73      39
3  93  118   118  105    72   57      42
4  89  103   129  173    69   34      45


In [8]:
scalar = StandardScaler()
scalar.fit(df)

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [9]:
scaled_data = scalar.transform(df)
scaled_data

  """Entry point for launching an IPython kernel.


array([[ 1.08494689, -1.55768954, -0.29050445, ..., -0.30249072,
         1.46660701, -2.77377457],
       [-0.86293073,  0.17712598, -0.02440078, ...,  0.22704598,
         0.46981442, -1.61011349],
       [-0.86293073,  0.17712598, -0.02440078, ...,  0.22704598,
         0.46981442, -1.61011349],
       ...,
       [ 0.17593733, -1.7022575 ,  0.10865105, ..., -0.17010655,
        -0.27778001, -0.34948065],
       [-1.05771849, -0.2565779 , -0.60095873, ..., -0.50106698,
         0.22061628,  1.00812394],
       [ 1.47452242,  0.2253153 ,  1.79397426, ...,  1.08754311,
        -0.9423084 ,  0.81418043]])

In [10]:
for i in range(1, df.shape[1]+1):
    pca = PCA(n_components=i)
    pca.fit(scaled_data)
    print(sum(pca.explained_variance_ratio_))

0.3246546654792435
0.5191665730938422
0.6876284497415968
0.8352642177010621
0.9286993074588749
0.9941558675995965
1.0000000000000002


Therefore after applying PCA we can see that one variable can be removed as rest are capturing approximately 99% of the variance. 
Hence we are considering 6 features for our analysis.

In [11]:
pca = PCA(n_components = 6)
pca.fit(scaled_data)

PCA(copy=True, iterated_power='auto', n_components=6, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [12]:
x_pca = pca.transform(scaled_data)

In [13]:
scaled_data.shape

(558, 7)

In [14]:
x_pca.shape

(558, 6)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size = 0.2)

In [16]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
predicted = logreg.predict(X_test)
print(metrics.accuracy_score(y_test, predicted)) 
print(classification_report(y_test, predicted))

0.5892857142857143
              precision    recall  f1-score   support

           0       0.50      0.26      0.34        46
           1       0.61      0.82      0.70        66

   micro avg       0.59      0.59      0.59       112
   macro avg       0.56      0.54      0.52       112
weighted avg       0.57      0.59      0.55       112

