## 1. Import data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Column names for the heart-disease table; the CSV has no header row,
# so the names are supplied explicitly. "num" is the target column.
columns = [
    "age", "sex", "cp", "restbp", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num",
]
data = pd.read_csv("./heart_disease_all.csv", header=None, names=columns)

In [3]:
# Show summary statistics of the freshly loaded frame as a sanity check.
data.describe()

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0
mean,54.542088,0.676768,3.158249,131.693603,247.350168,0.144781,0.996633,149.599327,0.326599,1.055556,1.602694,0.676768,4.73064,0.946128
std,9.049736,0.4685,0.964859,17.762806,51.997583,0.352474,0.994914,22.941562,0.469761,1.166123,0.618187,0.938965,1.938629,1.234551
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,243.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,276.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,4.0


## 2. data preprocessing

### 2.1 One-hot encoding

In [4]:
# Replace each categorical column with its one-hot indicator columns.
# Each pair is (source column, prefix for the generated indicator columns).
# The indicators are appended on the right in this order, and the source
# column is dropped afterwards.
# NOTE: the source columns are removed, so re-running this cell without
# reloading `data` raises a KeyError.
for column, prefix in [
    ("cp", "cp"),
    ("restecg", "recg"),
    ("slope", "slope"),
    ("ca", "ca"),
    ("thal", "thal"),
]:
    dummies = pd.get_dummies(data[column], prefix=prefix)
    data = data.join(dummies).drop(column, axis=1)

In [5]:
# Preview the first rows to confirm the indicator columns were added.
data.head()

Unnamed: 0,age,sex,restbp,chol,fbs,thalach,exang,oldpeak,num,cp_1.0,...,slope_1.0,slope_2.0,slope_3.0,ca_0.0,ca_1.0,ca_2.0,ca_3.0,thal_3.0,thal_6.0,thal_7.0
0,63.0,1.0,145.0,233.0,1.0,150.0,0.0,2.3,0.0,1,...,0,0,1,1,0,0,0,0,1,0
1,67.0,1.0,160.0,286.0,0.0,108.0,1.0,1.5,2.0,0,...,0,1,0,0,0,0,1,1,0,0
2,67.0,1.0,120.0,229.0,0.0,129.0,1.0,2.6,1.0,0,...,0,1,0,0,0,1,0,0,0,1
3,37.0,1.0,130.0,250.0,0.0,187.0,0.0,3.5,0.0,0,...,0,0,1,1,0,0,0,1,0,0
4,41.0,0.0,130.0,204.0,0.0,172.0,0.0,1.4,0.0,0,...,1,0,0,1,0,0,0,1,0,0


### 2.2 data normalization

In [6]:
from sklearn import preprocessing

# Continuous features to rescale to [0, 1]. These are the columns that sit at
# positions 0, 2, 3, 5 and 7 of `data` after the one-hot step; naming them
# keeps the cell correct even if the column order changes.
numeric_cols = ["age", "restbp", "chol", "thalach", "oldpeak"]

# MinMaxScaler scales every column independently, so a single fit_transform
# over the 2-D selection gives the same values as five per-column calls.
# Selecting with a list of labels yields a DataFrame (already 2-D), which
# avoids Series.reshape -- that method was deprecated and later removed from
# pandas, so the per-column `.reshape(-1, 1)` form fails on current versions.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set min/max values leak into the features;
# consider fitting on the training rows only.
scaler = preprocessing.MinMaxScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys


In [7]:
# Preview the first rows to confirm the continuous columns were rescaled.
data.head()

Unnamed: 0,age,sex,restbp,chol,fbs,thalach,exang,oldpeak,num,cp_1.0,...,slope_1.0,slope_2.0,slope_3.0,ca_0.0,ca_1.0,ca_2.0,ca_3.0,thal_3.0,thal_6.0,thal_7.0
0,0.708333,1.0,0.481132,0.244292,1.0,0.603053,0.0,0.370968,0.0,1,...,0,0,1,1,0,0,0,0,1,0
1,0.791667,1.0,0.622642,0.365297,0.0,0.282443,1.0,0.241935,2.0,0,...,0,1,0,0,0,0,1,1,0,0
2,0.791667,1.0,0.245283,0.23516,0.0,0.442748,1.0,0.419355,1.0,0,...,0,1,0,0,0,1,0,0,0,1
3,0.166667,1.0,0.339623,0.283105,0.0,0.885496,0.0,0.564516,0.0,0,...,0,0,1,1,0,0,0,1,0,0
4,0.25,0.0,0.339623,0.178082,0.0,0.770992,0.0,0.225806,0.0,0,...,1,0,0,1,0,0,0,1,0,0


### 2.3 split train data and test data

In [8]:
# Separate the target ("num") from the features. The target is selected by
# label rather than by position so this stays correct if the column order
# changes, and so both lines refer to the target the same way.
# Despite the names, both objects still hold the full data set here; the
# actual train/test split happens in the next cell.
Y_train = data["num"]
X_train = data.drop("num", axis=1)

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE: this rebinds X_train / Y_train, so re-running this cell on its own
# would split the already-reduced training set again -- re-run the previous
# cell first.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Preview the training features; the row labels show the rows were shuffled.
X_train.head()

Unnamed: 0,age,sex,restbp,chol,fbs,thalach,exang,oldpeak,cp_1.0,cp_2.0,...,slope_1.0,slope_2.0,slope_3.0,ca_0.0,ca_1.0,ca_2.0,ca_3.0,thal_3.0,thal_6.0,thal_7.0
273,0.208333,0.0,0.415094,0.214612,0.0,0.618321,0.0,0.0,0,0,...,0,1,0,1,0,0,0,1,0,0
259,0.645833,0.0,0.528302,0.260274,0.0,0.763359,0.0,0.145161,1,0,...,1,0,0,1,0,0,0,1,0,0
30,0.833333,0.0,0.433962,0.257991,0.0,0.610687,0.0,0.290323,1,0,...,1,0,0,0,0,1,0,1,0,0
22,0.604167,1.0,0.245283,0.360731,0.0,0.679389,0.0,0.290323,0,1,...,0,1,0,1,0,0,0,1,0,0
277,0.375,1.0,0.339623,0.289954,0.0,0.824427,0.0,0.0,0,0,...,1,0,0,1,0,0,0,1,0,0


## 3. SVM baseline model

In [11]:
from sklearn.svm import SVC
# Baseline: an SVC with no arguments, fitted on the original multi-valued
# target. The repr printed below shows the settings in effect
# (kernel='rbf', C=1.0, gamma='auto').
svm_clf = SVC()
# Left as a bare last expression so the fitted estimator's repr is displayed.
svm_clf.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
from sklearn.metrics import accuracy_score

# Predict the held-out rows with the multi-class baseline and report the
# fraction of exact label matches.
y_pre = svm_clf.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=y_pre)

0.6166666666666667

In [13]:
# Build a binary-target copy of the data: every positive "num" value becomes
# 1 and all other values are left untouched. `assign` returns a new frame,
# so `data` keeps the original multi-valued target.
df = data.assign(num=data["num"].mask(data["num"] > 0, 1))

In [14]:
# Separate the binary target ("num") from the features. The target is
# selected by label rather than by position so this stays correct if the
# column order changes.
# Despite the names, both objects still hold the full data set here; the
# actual train/test split happens in the next cell.
Y_train_two = df["num"]
X_train_two = df.drop("num", axis=1)

In [15]:
# Same 80/20 split and seed as the multi-class experiment, applied to the
# binary-target data.
# NOTE: this rebinds X_train_two / Y_train_two, so re-running this cell on
# its own would split the already-reduced training set again -- re-run the
# previous cell first.
X_train_two, X_test_two, Y_train_two, Y_test_two = train_test_split(
    X_train_two,
    Y_train_two,
    test_size=0.2,
    random_state=42,
)

In [16]:
# SVC was already imported in an earlier cell; the import is repeated here.
from sklearn.svm import SVC
# Same no-argument SVC as before, now fitted on the binary target.
# NOTE: this rebinds svm_clf, replacing the multi-class model.
svm_clf = SVC()
# Left as a bare last expression so the fitted estimator's repr is displayed.
svm_clf.fit(X_train_two, Y_train_two)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
from sklearn.metrics import accuracy_score

# Predict the held-out rows with the binary model and report the fraction of
# exact label matches. This overwrites the earlier multi-class `y_pre`.
y_pre = svm_clf.predict(X_test_two)
accuracy_score(y_true=Y_test_two, y_pred=y_pre)

0.8833333333333333