In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
# Colab-only: mount Google Drive at /content/drive so files stored under
# MyDrive can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Absolute path under the Drive mount point ('/content/drive'), so loading the
# CSV does not depend on the notebook's current working directory.
DATA_PATH = "/content/drive/MyDrive/HeartData/heartData.csv"
data = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [5]:
# (rows, columns) of the frame as loaded.
data.shape

(4240, 16)

In [6]:
# Per-column dtype and non-null count; shows which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB


In [7]:
# Keep only complete rows: any record containing a missing value is discarded.
data = data.dropna(axis="index", how="any")

In [8]:
# Shape after the rows with missing values were removed.
data.shape

(3658, 16)

In [9]:
# Separate the TenYearCHD label from the remaining predictor columns.
y = data["TenYearCHD"]
X = data.drop(columns=["TenYearCHD"])

In [10]:
# Plain random 80/20 hold-out split. The fixed seed makes the split (and every
# result derived from it) reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=10
)

In [11]:
# Peek at the training predictors produced by the random split.
X_train.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
704,1,58,1.0,1,20.0,0.0,0,0,0,251.0,135.0,85.5,21.24,88.0,103.0
2623,0,43,2.0,0,0.0,0.0,0,0,0,308.0,110.0,70.0,24.83,55.0,83.0
2194,1,42,3.0,1,43.0,0.0,0,1,0,272.0,128.0,83.0,33.26,80.0,63.0
2985,0,57,1.0,0,0.0,1.0,0,1,0,432.0,153.0,85.0,26.13,98.0,75.0
570,0,49,3.0,1,3.0,0.0,0,0,0,247.0,121.0,82.0,29.07,72.0,69.0


In [12]:
from sklearn.model_selection import StratifiedShuffleSplit

In [13]:
# Splitter configured for a single seeded 80/20 stratified shuffle split.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=10)

In [14]:
# The splitter is configured with n_splits=1, so it yields exactly one pair of
# positional index arrays; stratification is on the TenYearCHD label.
train_index, test_index = next(split.split(data, data["TenYearCHD"]))
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

In [15]:
# Share of each class in the stratified test set.
strat_test_set["TenYearCHD"].value_counts().div(len(strat_test_set))

0    0.848361
1    0.151639
Name: TenYearCHD, dtype: float64

In [16]:
# Share of each class in the stratified training set.
strat_train_set["TenYearCHD"].value_counts().div(len(strat_train_set))

0    0.847573
1    0.152427
Name: TenYearCHD, dtype: float64

In [17]:
# Shapes of the stratified train and test frames, one per line.
for subset in (strat_train_set, strat_test_set):
    print(subset.shape)

(2926, 16)
(732, 16)


In [18]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [19]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [20]:
# Predictor columns used for the reduced-feature models.
imp_features = [
    "age",
    "totChol",
    "sysBP",
    "diaBP",
    "BMI",
    "heartRate",
    "glucose",
    "male",
]

In [21]:
# Label and predictors for the stratified training set.
y_train = strat_train_set["TenYearCHD"]
X_train = strat_train_set.drop(columns=["TenYearCHD"])

In [22]:
# Label and predictors for the stratified test set.
y_test = strat_test_set["TenYearCHD"]
X_test = strat_test_set.drop(columns=["TenYearCHD"])

In [23]:
# Search space for the RBF-kernel SVM: every (C, gamma) pair is scored with
# 10-fold cross-validation.
Cs, gammas = [0.001, 0.01, 0.1, 1, 10], [0.001, 0.01, 0.1, 1]
param_grid = dict(C=Cs, gamma=gammas)
svm_clf = GridSearchCV(
    estimator=SVC(kernel='rbf', probability=True),
    param_grid=param_grid,
    cv=10,
)

In [None]:
# Run the grid search on the training split, then display the best-scoring
# parameter combination.
svm_clf.fit(X_train,y_train)
svm_clf.best_params_

In [None]:
# Rebinds svm_clf: the fitted grid search is replaced by a single SVC whose
# C and gamma are set by hand rather than read from best_params_.
svm_clf = SVC(C=10, gamma=1, probability=True)

In [None]:
svm_clf.fit(X_train, y_train)

In [None]:
# Test-set predictions of the SVC trained on all predictor columns.
p = svm_clf.predict(X_test)

In [None]:
# Training predictors restricted to the imp_features columns.
X_train_x = X_train.loc[:, imp_features]

In [None]:
# Test predictors restricted to the imp_features columns.
X_test_x = X_test.loc[:, imp_features]

In [None]:
# Second grid search over the same param_grid, this time for the
# reduced-feature inputs.
svm_clf_x = GridSearchCV(SVC(kernel='rbf', probability=True), param_grid, cv=10)

In [None]:
# Fit on the imp_features columns only and display the best parameters found.
svm_clf_x.fit(X_train_x, y_train)
svm_clf_x.best_params_

In [None]:
# predictions
# svm_clf is the hand-configured SVC fitted on all predictors; svm_clf_x is the
# grid search fitted on the imp_features subset, so each gets its matching test frame.
svm_predict = svm_clf.predict(X_test)
svm_predict_x = svm_clf_x.predict(X_test_x)

In [None]:
# Accuracy of the all-predictor SVC on the stratified test set.
accuracy_score(y_test, svm_predict)

In [None]:
# Sanity check: both prediction arrays should hold one entry per test label.
print(svm_predict.shape)
print(svm_predict_x.shape)
print(y_test.shape)

In [None]:
# Accuracy of the reduced-feature grid-search model on the same test labels.
accuracy_score(y_test, svm_predict_x)

In [None]:
# Confusion matrix of the all-predictor SVC (predictions in `p`), rendered as
# an annotated heat map with integer cell counts.
cm = confusion_matrix(y_test, p)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")

In [None]:
# Absolute class counts in the training labels.
y_train.value_counts()

In [None]:
# Absolute class counts in the test labels.
y_test.value_counts()

### Scaling the data

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied, without refitting, to the test split.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
# NOTE: X_train is rebound to the scaled values. Wrapping the array in a bare
# DataFrame discards the original column names and row index, and re-running
# this cell would fit the scaler on already-scaled data.
X_train = pd.DataFrame(X_train_scaled)

X_test_scaled = scaler.transform(X_test)
# Same rebinding for the test split.
X_test = pd.DataFrame(X_test_scaled)

In [None]:
# Fresh SVC with the same hand-set C and gamma, to be trained on scaled inputs.
svm_clf = SVC(C=10, gamma=1, probability=True)

In [None]:
# Train on the scaled training array.
svm_clf.fit(X_train_scaled, y_train)

In [None]:
# Predict on the scaled test array.
pred = svm_clf.predict(X_test_scaled)

In [None]:
# Confusion matrix for the current `pred` array, rendered as an annotated
# heat map with integer cell counts.
cm = confusion_matrix(y_test, pred)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")

In [None]:
# New scaler for the reduced feature set: fitted on the imp_features training
# columns, then applied to the matching test columns.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_x)
# NOTE: X_train / X_test are rebound again, now to the scaled reduced-feature
# values (bare DataFrames without the original column names or index).
X_train = pd.DataFrame(X_train_scaled)

X_test_scaled = scaler.transform(X_test_x)
X_test = pd.DataFrame(X_test_scaled)

In [None]:
# SVC for the scaled reduced-feature inputs. gamma=2 is a hand-picked value
# that lies outside the grid searched earlier ([0.001, 0.01, 0.1, 1]).
svm_clf = SVC(C=10, gamma=2, probability=True)

In [None]:
# X_train here is the DataFrame wrapping the scaled imp_features values.
svm_clf.fit(X_train, y_train)

In [None]:
# Predicts from the ndarray form of the scaled test data; it holds the same
# values as the X_test DataFrame built from it.
pred = svm_clf.predict(X_test_scaled)

In [None]:
# Confusion matrix for the current `pred` array, rendered as an annotated
# heat map with integer cell counts.
cm = confusion_matrix(y_test, pred)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")

### Removing data bias, i.e. balancing the classes

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

In [None]:
# Defensive re-drop of incomplete rows. Expected to be a no-op when the earlier
# in-place dropna cell has already run in this kernel.
data = data.dropna()

In [None]:
imp_features = ["age", "totChol", "sysBP", "diaBP", "BMI", "heartRate", "glucose", "male"]
X = data[imp_features]
y = data.iloc[:,-1]

In [None]:
# the numbers before smote
num_before = dict(Counter(y))

#perform smoting

# define pipeline
over = SMOTE(sampling_strategy=0.8)
under = RandomUnderSampler(sampling_strategy=0.8)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X_smote, y_smote = pipeline.fit_resample(X, y)


#the numbers after smote
num_after =dict(Counter(y_smote))

In [None]:
print(num_before)

In [None]:
print(num_after)

In [None]:
X_smote.head(1)

In [None]:
n_data = pd.concat([pd.DataFrame(X_smote), pd.DataFrame(y_smote)], axis=1)
n_data.columns = ["age", "totChol", "sysBP", "diaBP", "BMI", "heartRate", "glucose", "male", "TenYearCHD"]
n_data.head() 


In [None]:
n_data["TenYearCHD"].value_counts()

In [None]:
X_n = n_data.drop("TenYearCHD",axis=1)
y_n = n_data["TenYearCHD"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_n, y_n, test_size=0.2, random_state=10)

In [None]:
# Fit a new scaler on the training rows only, then rebind X_train to the
# scaled values (a bare DataFrame without the original column names).
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_scaled_train)

In [None]:
# Apply the training-set statistics to the test rows (transform only, no refit).
X_scaled_test = scaler.transform(X_test)
X_test = pd.DataFrame(X_scaled_test)

In [None]:
# Spot-check one scaled training row.
X_train.head(1)

In [None]:
# Spot-check one scaled test row.
X_test.head(1)

In [None]:
# Search space for the RBF-kernel SVM on the balanced data: every (C, gamma)
# pair is scored with 10-fold cross-validation.
Cs, gammas = [0.001, 0.01, 0.1, 1, 10], [0.001, 0.01, 0.1, 1]
param_grid = dict(C=Cs, gamma=gammas)
svm_clf = GridSearchCV(
    estimator=SVC(kernel='rbf', probability=True),
    param_grid=param_grid,
    cv=10,
)

In [None]:
# Run the grid search on the scaled training data and display the best
# parameter combination.
svm_clf.fit(X_train,y_train)
svm_clf.best_params_

In [None]:
# Score the grid search's refitted best estimator on the scaled test data.
pred = svm_clf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
# Confusion matrix for the grid-search predictions in `pred`, rendered as an
# annotated heat map with integer cell counts.
cm = confusion_matrix(y_test, pred)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")

In [None]:
# Display the fitted grid-search object.
svm_clf

In [None]:
# Final stand-alone model with hand-set hyper-parameters and a fixed seed.
# Unlike the earlier SVCs, probability estimates are not enabled here.
svm_clf_s = SVC(gamma=1, C=10, random_state=10)

In [None]:
# Train the final model on the scaled training data.
svm_clf_s.fit(X_train, y_train)

In [None]:
# Predict with svm_clf_s, the model that was just fitted and is the one being
# pickled, not the earlier grid-search object svm_clf. This way the metrics
# computed from pred_s describe the model that is actually saved.
pred_s = svm_clf_s.predict(X_test)

In [None]:
# Accuracy of the predictions stored in pred_s.
accuracy_score(y_test, pred_s)

In [None]:
# Confusion matrix for the predictions in `pred_s`, rendered as an annotated
# heat map with integer cell counts.
cm = confusion_matrix(y_test, pred_s)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")

In [None]:
# F1 score of the predictions stored in pred_s (positive class = 1 by default).
f1_score(y_test, pred_s)

### Saving the model

In [None]:
import pickle

In [None]:
# save the model to disk
filename = 'SVM_final.sav'
# The context manager guarantees the file is flushed and closed even if
# pickling raises; the bare open() call previously left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(svm_clf_s, model_file)

In [None]:
#"age", "totChol", "sysBP", "diaBP", "BMI", "heartRate", "glucose", "male", "TenYearCHD"
i = [[61,225,150,95,28.58,65,103,0]]

In [None]:
# Scale the sample with the already-fitted scaler (transform only, no refit).
t = scaler.transform(i)

In [None]:
# Show the scaled sample.
t

In [None]:
# Reload the persisted classifier from disk into `heart`.
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open('SVM_final.sav', mode='rb') as model_file:
    heart = pickle.load(model_file)

In [None]:
# Classify the scaled sample with the reloaded model (rebinds `pred`).
pred = heart.predict(t)

In [None]:
# Show the predicted class label.
pred

In [None]:
# Persist the fitted scaler next to the model so new inputs can be scaled with
# the same statistics at prediction time.
filename_s = 'Scaler.pkl'
# The context manager closes the file deterministically; the bare open() call
# previously left the handle open.
with open(filename_s, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)