In [36]:
#-----------------------------------------------#
#      Libraries used in this Project           #
#-----------------------------------------------#
import sys
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix


In [37]:
#------------------------------------------------------#
#  Load 'dataset.csv' into the `data` DataFrame        #
#------------------------------------------------------#
data = pd.read_csv(filepath_or_buffer='dataset.csv')

In [38]:
#------------------------------------------------#
#  Preview the first five rows of the dataset    #
#------------------------------------------------#
data.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [39]:
#---------------------------------------------------------------------#
#  Print the DataFrame's shape, then the row whose index label is 1   #
#---------------------------------------------------------------------#
# One print call with a newline separator emits the same two outputs.
print('DataFrame: {}'.format(data.shape), data.loc[1], sep='\n')

DataFrame: (303, 14)
age          37.0
sex           1.0
cp            2.0
trestbps    130.0
chol        250.0
fbs           0.0
restecg       1.0
thalach     187.0
exang         0.0
oldpeak       3.5
slope         0.0
ca            0.0
thal          2.0
target        1.0
Name: 1, dtype: float64


In [40]:
#--------------------------------------------------------------#
#  Show every row from index label 270 onward (all columns)    #
#--------------------------------------------------------------#
data.loc[270:, :]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
270,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
271,61,1,3,134,234,0,1,145,0,2.6,1,2,2,0
272,67,1,0,120,237,0,1,71,0,1.0,1,0,2,0
273,58,1,0,100,234,0,1,156,0,0.1,2,1,3,0
274,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
275,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
276,58,1,0,146,218,0,1,105,0,2.0,1,1,3,0
277,57,1,1,124,261,0,1,141,0,0.3,2,0,3,0
278,58,0,1,136,319,1,0,152,0,0.0,2,2,2,0
279,61,1,0,138,166,0,0,125,1,3.6,1,1,2,0


In [41]:
#------------------------------------------------------------------#
#  Mask '?' placeholders as NaN (no rows are removed in this cell)  #
#------------------------------------------------------------------#
# Cells equal to '?' become NaN; every other value is kept as-is.
data1 = data.mask(data.isin(['?']))
data1.loc[270:, :]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
270,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
271,61,1,3,134,234,0,1,145,0,2.6,1,2,2,0
272,67,1,0,120,237,0,1,71,0,1.0,1,0,2,0
273,58,1,0,100,234,0,1,156,0,0.1,2,1,3,0
274,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
275,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
276,58,1,0,146,218,0,1,105,0,2.0,1,1,3,0
277,57,1,1,124,261,0,1,141,0,0.3,2,0,3,0
278,58,0,1,136,319,1,0,152,0,0.0,2,2,2,0
279,61,1,0,138,166,0,0,125,1,3.6,1,1,2,0


In [42]:
#---------------------------------------------------------#
# Dropping rows with NaN values from the masked DataFrame #
#---------------------------------------------------------#
# Drop from `data1` (where '?' placeholders were replaced by NaN) rather than
# from the raw `data`. Dropping from `data` would ignore the masking step, so
# rows containing '?' would survive and the later `data.apply(pd.to_numeric)`
# would raise on them. The result is still bound to `data`, which the
# following cells read.
data = data1.dropna(axis=0)
data.loc[270:]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
270,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
271,61,1,3,134,234,0,1,145,0,2.6,1,2,2,0
272,67,1,0,120,237,0,1,71,0,1.0,1,0,2,0
273,58,1,0,100,234,0,1,156,0,0.1,2,1,3,0
274,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
275,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
276,58,1,0,146,218,0,1,105,0,2.0,1,1,3,0
277,57,1,1,124,261,0,1,141,0,0.3,2,0,3,0
278,58,0,1,136,319,1,0,152,0,0.0,2,2,2,0
279,61,1,0,138,166,0,0,125,1,3.6,1,1,2,0


In [43]:
#------------------------------------------------------------#
#  Print the shape of `data1`, then the dtype of each column  #
#------------------------------------------------------------#
# One print call with a newline separator emits the same two outputs.
print(data1.shape, data1.dtypes, sep='\n')

(303, 14)
age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object


In [44]:
#----------------------------------------------------------------#
#  Convert every column of `data` to a numeric dtype for analysis #
#----------------------------------------------------------------#
# axis=0 applies pd.to_numeric column by column (the default direction).
data1 = data.apply(pd.to_numeric, axis=0)
data1.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [45]:
#---------------------------------------------------------------------#
#  Summary statistics of `data1` via pandas' built-in describe()      #
#---------------------------------------------------------------------#
# The quartiles listed here are describe()'s defaults, spelled out.
data1.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0
