In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Reading file

In [2]:
data1 = pd.read_csv('heart_disease_data.csv')
data2 = pd.read_csv('ECGmy.csv')

# Merging Files on the basis of "SEX" & "AGE"

In [3]:
df = pd.merge(data1, data2, on=('sex','age'))
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,PR interval,RR interval,HR
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,114,1.65,62
1,63,1,0,130,254,0,0,147,0,1.4,1,1,3,0,114,1.65,62
2,63,1,0,130,330,1,0,132,1,1.8,2,3,3,0,114,1.65,62
3,63,1,0,140,187,0,0,144,1,4.0,2,2,3,0,114,1.65,62
4,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,114,1.66,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1426,60,1,0,140,293,0,0,170,0,1.2,1,2,3,0,114,1.65,62
1427,61,0,0,130,330,0,0,169,0,0.0,2,0,2,0,114,1.66,73
1428,61,0,0,145,307,0,0,146,1,1.0,1,0,3,0,114,1.66,73
1429,59,0,0,174,249,0,1,143,1,0.0,1,0,2,0,114,1.66,71


# Writing the merged data into a new file. (dont compile this section again!)

In [4]:
dd = df.to_csv("Heart.csv")
dd1 = pd.read_csv("Heart.csv")

In [5]:
dd1.rename(columns = {'Unnamed: 0':'SL_No'}, inplace = True)
dd1
dd1.drop(['SL_No'], axis=1)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,PR interval,RR interval,HR
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,114,1.65,62
1,63,1,0,130,254,0,0,147,0,1.4,1,1,3,0,114,1.65,62
2,63,1,0,130,330,1,0,132,1,1.8,2,3,3,0,114,1.65,62
3,63,1,0,140,187,0,0,144,1,4.0,2,2,3,0,114,1.65,62
4,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,114,1.66,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1426,60,1,0,140,293,0,0,170,0,1.2,1,2,3,0,114,1.65,62
1427,61,0,0,130,330,0,0,169,0,0.0,2,0,2,0,114,1.66,73
1428,61,0,0,145,307,0,0,146,1,1.0,1,0,3,0,114,1.66,73
1429,59,0,0,174,249,0,1,143,1,0.0,1,0,2,0,114,1.66,71


# file INFO!

In [6]:
dd1.info()

#just checking for any missing values!
dd1.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1431 entries, 0 to 1430
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SL_No        1431 non-null   int64  
 1   age          1431 non-null   int64  
 2   sex          1431 non-null   int64  
 3   cp           1431 non-null   int64  
 4   trestbps     1431 non-null   int64  
 5   chol         1431 non-null   int64  
 6   fbs          1431 non-null   int64  
 7   restecg      1431 non-null   int64  
 8   thalach      1431 non-null   int64  
 9   exang        1431 non-null   int64  
 10  oldpeak      1431 non-null   float64
 11  slope        1431 non-null   int64  
 12  ca           1431 non-null   int64  
 13  thal         1431 non-null   int64  
 14  target       1431 non-null   int64  
 15  PR interval  1431 non-null   int64  
 16  RR interval  1431 non-null   float64
 17  HR           1431 non-null   int64  
dtypes: float64(2), int64(16)
memory usage: 201.4 KB


SL_No          0
age            0
sex            0
cp             0
trestbps       0
chol           0
fbs            0
restecg        0
thalach        0
exang          0
oldpeak        0
slope          0
ca             0
thal           0
target         0
PR interval    0
RR interval    0
HR             0
dtype: int64

# Checking the target class -> count!

In [7]:
dd1['target'].value_counts()

1    756
0    675
Name: target, dtype: int64

# I am using a large data set so that no OVER FIT occours!
# Now just splitting the features and targets

In [8]:
x = dd1.drop(columns='target', axis=1)
y = dd1['target']

In [9]:
print(x)

      SL_No  age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  \
0         0   63    1   3       145   233    1        0      150      0   
1         1   63    1   0       130   254    0        0      147      0   
2         2   63    1   0       130   330    1        0      132      1   
3         3   63    1   0       140   187    0        0      144      1   
4         4   37    1   2       130   250    0        1      187      0   
...     ...  ...  ...  ..       ...   ...  ...      ...      ...    ...   
1426   1426   60    1   0       140   293    0        0      170      0   
1427   1427   61    0   0       130   330    0        0      169      0   
1428   1428   61    0   0       145   307    0        0      146      1   
1429   1429   59    0   0       174   249    0        1      143      1   
1430   1430   59    0   0       174   249    0        1      143      1   

      oldpeak  slope  ca  thal  PR interval  RR interval  HR  
0         2.3      0   0     1      

In [10]:
print(y)

0       1
1       0
2       0
3       0
4       1
       ..
1426    0
1427    0
1428    0
1429    0
1430    0
Name: target, Length: 1431, dtype: int64


# Now splitting the data into test and training data set!

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)
print(x.shape, x_train.shape, x_test.shape)

(1431, 17) (1144, 17) (287, 17)


# Now training the LogisticRegression model with training dataset!

In [12]:
model = LogisticRegression()
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# accuracy on training data

In [13]:
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)

In [14]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.8304195804195804


# accuracy on test data

In [15]:
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)

In [16]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.8501742160278746


# Building a predictive System 
# (With no disease example)

In [17]:
input_data = (63,1,0,130,254,0,0,147,0,1.4,1,1,3,0,114,1.65,62) #with no disease
             

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

[1]
The Person has Heart Disease




# With disease exmple!

In [18]:
input_data = (57,1,0,110,201,0,1,126,1,1.5,1,0,1,1,114,1.66,72) #with disease

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

[1]
The Person has Heart Disease




# LableEncoder

In [19]:
from sklearn.preprocessing import LabelEncoder

#target

hr = LabelEncoder()
dd1['HR'] = hr.fit_transform(dd1['HR'])
dd1['HR'].unique()


array([16, 19, 18, 15,  0, 21, 24, 25, 23, 26, 22, 27,  6,  7, 20, 28, 30,
       55,  1,  5, 50, 10,  9,  8, 12, 13, 14, 29, 17, 34, 51, 52, 40, 37,
       38, 32, 33, 39, 46, 57,  3, 43, 56, 53, 41, 36, 42, 45, 47, 31, 48,
       49, 35, 59,  2, 54, 44, 11, 60, 58,  4], dtype=int64)

In [20]:

#age

age = LabelEncoder()
dd1['age'] = age.fit_transform(dd1['age'])
dd1['age'].unique()

array([29,  3,  7, 22, 23, 10, 18, 20, 14, 15, 24, 16,  9, 25,  8, 27,  6,
       17, 19, 30, 12, 11,  5, 13, 31, 28,  1,  2,  0, 21,  4, 26],
      dtype=int64)

In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

In [22]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state = 0)
dtr.fit(x , y.values)

In [23]:
y_pred = dtr.predict(x)
error = np.sqrt(mean_squared_error(y, y_pred))
error

0.0

In [24]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state = 0)
rfr.fit(x, y.values)

In [25]:
y_pred = rfr.predict(x)
error = np.sqrt(mean_squared_error(y, y_pred))
error

0.023574672832542675

# Finding the grid search


In [26]:
from sklearn.model_selection import GridSearchCV

max_depth = [None, 2, 4, 6, 8, 10, 12]
parameters = {"max_depth" : max_depth}

regressor = DecisionTreeRegressor(random_state = 0)
gs = GridSearchCV(regressor, parameters , scoring = 'neg_mean_squared_error')
gs.fit(x, y.values)

In [27]:
regressor = gs.best_estimator_
regressor.fit(x, y.values)
y_pred = regressor.predict(x)
error = np.sqrt(mean_squared_error(y, y_pred))
error


0.3686615080119556

In [28]:
x

Unnamed: 0,SL_No,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,PR interval,RR interval,HR
0,0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,114,1.65,62
1,1,63,1,0,130,254,0,0,147,0,1.4,1,1,3,114,1.65,62
2,2,63,1,0,130,330,1,0,132,1,1.8,2,3,3,114,1.65,62
3,3,63,1,0,140,187,0,0,144,1,4.0,2,2,3,114,1.65,62
4,4,37,1,2,130,250,0,1,187,0,3.5,0,0,2,114,1.66,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1426,1426,60,1,0,140,293,0,0,170,0,1.2,1,2,3,114,1.65,62
1427,1427,61,0,0,130,330,0,0,169,0,0.0,2,0,2,114,1.66,73
1428,1428,61,0,0,145,307,0,0,146,1,1.0,1,0,3,114,1.66,73
1429,1429,59,0,0,174,249,0,1,143,1,0.0,1,0,2,114,1.66,71


In [29]:
#full data frame
x = np.array([[63,1,0,130,254,0,0,147,0,1.4,1,1,3,0,114,1.65,62]])
x

array([[ 63.  ,   1.  ,   0.  , 130.  , 254.  ,   0.  ,   0.  , 147.  ,
          0.  ,   1.4 ,   1.  ,   1.  ,   3.  ,   0.  , 114.  ,   1.65,
         62.  ]])

In [30]:
y_pred = regressor.predict(x)
y_pred



array([0.84527221])

In [31]:
import pickle
data = {"model" : regressor , "HR" : hr, "age" : age}
with open('saved.pkl' , 'wb') as file:
    pickle.dump(data, file)

In [32]:
with open('saved.pkl' , 'wb') as file:
    pickle.dump(data, file)

regressor_loaded = data['model']
hr = data['HR']
age = data['age']

In [36]:
y_pred = regressor_loaded.predict(x)
y_pred



array([0.84527221])