In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split as split

#for selecting K best features
from sklearn.feature_selection import SelectKBest,chi2

#for selecting optimal hyperparameyters - hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


from sklearn.tree import DecisionTreeClassifier as decision_tree
from sklearn.ensemble import RandomForestClassifier as random_forest
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier as xgb


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

%matplotlib inline 

In [3]:
heart = pd.read_csv('heart.csv')
heart.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1
6,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
7,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0
8,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
9,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0


In [4]:
x = heart.iloc[:,0:13]
y = heart.iloc[:,13]

In [5]:
cat_col = []
for col in heart.columns:
    if len(heart[col].unique()) < 10:
        cat_col.append(col)
        print(col ,"->" , heart[col].unique())
        
cat_col

sex -> [1 0]
cp -> [0 1 2 3]
fbs -> [0 1]
restecg -> [1 0 2]
exang -> [0 1]
slope -> [2 0 1]
ca -> [2 0 1 3 4]
thal -> [3 2 1 0]
target -> [0 1]


['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

In [6]:
# One hot encoding all the categorical features
cat_col = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
x_new = pd.get_dummies(x, columns = cat_col)

In [7]:
#The dataset have to be scaled in standardized form
#Scaling on -> age, trestbps, chol, thalach, oldpeak

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scale_col = ["age", "trestbps", "chol", "thalach", "oldpeak"]
x_new[scale_col] = scaler.fit_transform(x[scale_col]) 

x_new.head().T

Unnamed: 0,0,1,2,3,4
age,-0.268437,-0.158157,1.716595,0.724079,0.834359
trestbps,-0.377636,0.479107,0.764688,0.936037,0.364875
chol,-0.659332,-0.833861,-1.396233,-0.833861,0.930822
thalach,0.821321,0.255968,-1.048692,0.5169,-1.874977
oldpeak,-0.060888,1.727137,1.301417,-0.912329,0.705408
sex_0,0.0,0.0,0.0,0.0,1.0
sex_1,1.0,1.0,1.0,1.0,0.0
cp_0,1.0,1.0,1.0,1.0,1.0
cp_1,0.0,0.0,0.0,0.0,0.0
cp_2,0.0,0.0,0.0,0.0,0.0


# Inside Flask Pre-processing

In [8]:
import pickle as p
model_file='heart_disease.pickle'
model=p.load(open(model_file,'rb'))



data = {"age":[67],"sex":[1],"cp":[2],"trestbps":[133],"chol":[182],
            "fbs":[1], "restecg":[2],"thalach":[168],"exang":[0],
            "oldpeak":[3.2],"slope":[2],"ca" : [3],"thal":[2]}

data = pd.DataFrame(data)
heart = pd.read_csv('heart.csv')
x_test = heart.iloc[:,0:13]
x_test = pd.concat([x_test,data],axis = 0)
x_test = x_test.reset_index(drop = True) 

x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2
1024,54,1,0,120,188,0,1,113,0,1.4,1,1,3


In [9]:
cat_col = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
x_test_new = pd.get_dummies(x_test, columns = cat_col)
x_test_new

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,52,125,212,168,1.0,0,1,1,0,0,...,1,0,0,1,0,0,0,0,0,1
1,53,140,203,155,3.1,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,1
2,70,145,174,125,2.6,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,1
3,61,148,203,161,0.0,0,1,1,0,0,...,1,0,1,0,0,0,0,0,0,1
4,62,138,294,106,1.9,1,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1021,60,125,258,141,2.8,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,1
1022,47,110,275,118,1.0,0,1,1,0,0,...,0,0,1,0,0,0,0,0,1,0
1023,50,110,254,159,0.0,1,0,1,0,0,...,1,1,0,0,0,0,0,0,1,0
1024,54,120,188,113,1.4,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,1


In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scale_col = ["age", "trestbps", "chol", "thalach", "oldpeak"]
x_test_new[scale_col] = scaler.fit_transform(x_test[scale_col]) 
x_test_new

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,-0.269667,-0.377896,-0.657950,0.820650,-0.062585,0,1,1,0,0,...,1,0,0,1,0,0,0,0,0,1
1,-0.159436,0.479262,-0.832433,0.255208,1.723456,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,1
2,1.714478,0.764982,-1.394657,-1.049659,1.298208,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,1
3,0.722406,0.936413,-0.832433,0.516181,-0.913081,0,1,1,0,0,...,1,0,1,0,0,0,0,0,0,1
4,0.832636,0.364974,0.931787,-1.876075,0.702861,1,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1021,0.612175,-0.377896,0.233854,-0.353730,1.468307,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,1
1022,-0.820818,-1.235054,0.563433,-1.354128,-0.062585,0,1,1,0,0,...,0,0,1,0,0,0,0,0,1,0
1023,-0.490127,-1.235054,0.156306,0.429190,-0.913081,1,0,1,0,0,...,1,1,0,0,0,0,0,0,1,0
1024,-0.049206,-0.663615,-1.123238,-1.571606,0.277613,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,1


In [11]:
a = x_test_new.iloc[-1,:]
new_data = []

for i in range(len(a.index)):
    new_data.append(a[i])

print(len(new_data))

30


In [12]:
new_data = [new_data]

In [13]:
a = np.array2string(model.predict(new_data))

In [19]:
a[1]

'1'