# Heart Attack Analysis & Prediction Dataset

In this task you are asked to use `heart-data.csv` to train a support vector machine to predict heart attacks.

See `Data description.docx` or `Data description.pdf` for description of dataset.

# Reading Dataset

In [1]:
import pandas as pd

data = pd.read_csv('heart-data.csv')

data.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,Male,Non-anginal pain,145,233,High,Hypertrophy,150,No,2.3,Down-sloping,0.0,Fixed defect,1
1,37,Male,Atypical angina,130,250,Low,Normal,187,No,3.5,Down-sloping,0.0,Normal,1
2,41,Female,Typical angina,130,204,Low,Hypertrophy,172,No,1.4,Up-sloping,0.0,Normal,1
3,56,Male,Typical angina,120,236,Low,Normal,178,No,0.8,Up-sloping,0.0,Normal,1
4,57,Female,Asymptomatic,120,354,Low,Normal,163,Yes,0.6,Up-sloping,0.0,Normal,1


# TODO
1. Remove samples with missing data (there are **7 samples** with missing data).
2. Split the data to input and output.
3. Replace categorical values with numeric values (Use numeric encoding and one-hot encoding when suitable).
4. Split the dataset to (train - validation - test) by calling `train_test_split` two times:
    - First time: use `test_size=0.20` and `random_state=0`.
    - Second time: use `test_size=0.25` and `random_state=0`.
5. Apply feature scaling using `MinMaxScaler`.
6. Train a support vector machine classifier using suitable hyper-parameter values. 
7. Print the accuracy of both training and validation. Try to achieve **validation accuracy > 82%**.
8. Test your support vector machine and print accuracy of testing.

## Remove samples with missing data (there are 7 samples with missing data).

In [2]:
mask = data.isnull().any(axis=1)                # create a mask of rows with missing data
num_of_rows_with_nan =  mask.sum()              # calculate number of rows with missing data
print(num_of_rows_with_nan/len(data))           # print the ratio of rows with missing data

0.0231023102310231


In [3]:
data_clean = data[~mask]  #  remove rows with missing data

## Split the data to input and output.

In [4]:
data_input = data_clean.drop(columns=['output'])  #  Select all columns from `data_clean` except 'output'
data_output = data_clean['output']             #Select 'output' column from `data_clean`

In [5]:
data_input.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,63,Male,Non-anginal pain,145,233,High,Hypertrophy,150,No,2.3,Down-sloping,0.0,Fixed defect
1,37,Male,Atypical angina,130,250,Low,Normal,187,No,3.5,Down-sloping,0.0,Normal
2,41,Female,Typical angina,130,204,Low,Hypertrophy,172,No,1.4,Up-sloping,0.0,Normal
3,56,Male,Typical angina,120,236,Low,Normal,178,No,0.8,Up-sloping,0.0,Normal
4,57,Female,Asymptomatic,120,354,Low,Normal,163,Yes,0.6,Up-sloping,0.0,Normal


In [6]:
data_output

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: output, Length: 296, dtype: int64

## Replace categorical values with numeric values (Use numeric encoding and one-hot encoding when suitable).

## numeric encoding

In [7]:
#  print unique values of categorical features

print('sex',data['sex'].unique())
print('cp',data ['cp'].unique()) 
print('fbs', data['fbs'].unique()) 
print('restecg', data['restecg'].unique())
print('exng', data['exng'].unique()) 
print('slp', data['slp'].unique())
print('thall', data['thall'].unique()) 

sex ['Male' 'Female']
cp ['Non-anginal pain' 'Atypical angina' 'Typical angina' 'Asymptomatic']
fbs ['High' 'Low']
restecg ['Hypertrophy' 'Normal' 'ST-T wave abnormality']
exng ['No' 'Yes']
slp ['Down-sloping' 'Up-sloping' 'Flat']
thall ['Fixed defect' 'Normal' 'Reversable defect' nan]


In [8]:
#  use numeric encoding to encode `sex`, `fbs`, and `exng `
data_input_encoded_1 = data_input.replace({
    'sex':{'Male':0 ,'Female':1},
    'fbs':{'Low':0, 'High':1},
    'exng':{'No':0 ,'Yes':1}
})

In [9]:
data_input_encoded_1.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,63,0,Non-anginal pain,145,233,1,Hypertrophy,150,0,2.3,Down-sloping,0.0,Fixed defect
1,37,0,Atypical angina,130,250,0,Normal,187,0,3.5,Down-sloping,0.0,Normal
2,41,1,Typical angina,130,204,0,Hypertrophy,172,0,1.4,Up-sloping,0.0,Normal
3,56,0,Typical angina,120,236,0,Normal,178,0,0.8,Up-sloping,0.0,Normal
4,57,1,Asymptomatic,120,354,0,Normal,163,1,0.6,Up-sloping,0.0,Normal


## one-hot encoding

In [10]:
data_input_encoded_2 =pd.get_dummies(data_input_encoded_1, prefix='is')  # apply one-hot encoding to data_input_encoded_1

In [11]:
data_input_encoded_2.head()

Unnamed: 0,age,sex,trtbps,chol,fbs,thalachh,exng,oldpeak,caa,is_Asymptomatic,...,is_Typical angina,is_Hypertrophy,is_Normal,is_ST-T wave abnormality,is_Down-sloping,is_Flat,is_Up-sloping,is_Fixed defect,is_Normal.1,is_Reversable defect
0,63,0,145,233,1,150,0,2.3,0.0,0,...,0,1,0,0,1,0,0,1,0,0
1,37,0,130,250,0,187,0,3.5,0.0,0,...,0,0,1,0,1,0,0,0,1,0
2,41,1,130,204,0,172,0,1.4,0.0,0,...,1,1,0,0,0,0,1,0,1,0
3,56,0,120,236,0,178,0,0.8,0.0,0,...,1,0,1,0,0,0,1,0,1,0
4,57,1,120,354,0,163,1,0.6,0.0,1,...,0,0,1,0,0,0,1,0,1,0


## Split the dataset to (train - validation - test) 

## First time: use test_size=0.20 and random_state=0.
## Second time: use test_size=0.25 and random_state=0.

In [14]:
from sklearn.model_selection import train_test_split

  
    
X, X_test, y, y_test = train_test_split(data_input_encoded_2, data_output, test_size=0.20 , random_state=0)

    

X_train, X_val, y_train, y_val = train_test_split(X, y ,test_size=0.25 , random_state=0)


In [15]:
print(X_train.shape)
print(y_train.shape)
print('---------------------')
print(X_val.shape)
print(y_val.shape)
print('---------------------')
print(X_test.shape)
print(y_test.shape)

(177, 22)
(177,)
---------------------
(59, 22)
(59,)
---------------------
(60, 22)
(60,)


## Apply feature scaling using MinMaxScaler

In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 

scaler.fit(X_train)  
X_train_scaled =scaler.transform(X_train)   
X_val_scaled =scaler.transform(X_val) 
X_test_scaled =scaler.transform(X_test)  

## Train a support vector machine classifier using suitable hyper-parameter values.

## Linear SVM

In [19]:
from sklearn.svm import SVC        
from sklearn.metrics import accuracy_score
svc = SVC(kernel='linear', random_state=0, C=0.2) 
svc.fit(X_train_scaled, y_train)

y_pred_train = svc.predict(X_train_scaled)
y_pred_val = svc.predict(X_val_scaled)

print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_val, y_pred_val))




0.864406779661017
0.847457627118644


## Poly SVM

In [27]:
# Poly SVM =====> non-linear
svc = SVC(kernel='poly', degree=2, random_state=0, C=0.2)
# degree=2 ====> if degree big الانحناءات هتزيد
svc.fit(X_train_scaled, y_train)

y_pred_train = svc.predict(X_train_scaled)
y_pred_val = svc.predict(X_val_scaled)

print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_val, y_pred_val))

0.8700564971751412
0.8305084745762712


## RBF SVM

In [28]:
svc = SVC(kernel='rbf', gamma=0.01, random_state=0, C=120)
svc.fit(X_train_scaled, y_train)

y_pred_train = svc.predict(X_train_scaled)
y_pred_val = svc.predict(X_val_scaled)

print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_val, y_pred_val))

0.903954802259887
0.8305084745762712


## Test your support vector machine and print accuracy of testing.

In [29]:
y_pred_test = svc.predict(X_test_scaled)
print(accuracy_score(y_test, y_pred_test))

0.8333333333333334
