In [1]:
import pandas as pd   #for data frames, reading data, data processing, and analysis
import numpy as np    #for numerical computations
from sklearn.preprocessing import StandardScaler   #for scaling features
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
# Read data
data = pd.read_excel(r'C:\Users\LENOVO\Desktop\Heart Attack.xlsx', sheet_name='Heart Attack')

In [3]:
data

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class
0,64,1,66,160,83,160.0,1.80,0.012,negative
1,21,1,94,98,46,296.0,6.75,1.060,positive
2,55,1,64,160,77,270.0,1.99,0.003,negative
3,64,1,70,120,55,270.0,13.87,0.122,positive
4,55,1,64,112,65,300.0,1.08,0.003,negative
...,...,...,...,...,...,...,...,...,...
1314,44,1,94,122,67,204.0,1.63,0.006,negative
1315,66,1,84,125,55,149.0,1.33,0.172,positive
1316,45,1,85,168,104,96.0,1.24,4.250,positive
1317,54,1,58,117,68,443.0,5.80,0.359,positive


In [4]:
# Display int columns and their counts, mean, standard deviation, minimum, maximum,and three quantiles
data.describe()

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin
count,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0
mean,56.191812,0.659591,78.336619,127.170584,72.269143,146.634344,15.274306,0.360942
std,13.647315,0.474027,51.63027,26.12272,14.033924,74.923045,46.327083,1.154568
min,14.0,0.0,20.0,42.0,38.0,35.0,0.321,0.001
25%,47.0,0.0,64.0,110.0,62.0,98.0,1.655,0.006
50%,58.0,1.0,74.0,124.0,72.0,116.0,2.85,0.014
75%,65.0,1.0,85.0,143.0,81.0,169.5,5.805,0.0855
max,103.0,1.0,1111.0,223.0,154.0,541.0,300.0,10.3


In [5]:
# Display each variable with the number of not null values and their data types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            1319 non-null   int64  
 1   gender         1319 non-null   int64  
 2   impluse        1319 non-null   int64  
 3   pressurehight  1319 non-null   int64  
 4   pressurelow    1319 non-null   int64  
 5   glucose        1319 non-null   float64
 6   kcm            1319 non-null   float64
 7   troponin       1319 non-null   float64
 8   class          1319 non-null   object 
dtypes: float64(3), int64(5), object(1)
memory usage: 92.9+ KB


In [6]:
# Count number of nulls in each column
data.isna().sum()

age              0
gender           0
impluse          0
pressurehight    0
pressurelow      0
glucose          0
kcm              0
troponin         0
class            0
dtype: int64

In [7]:
# Display shape of data (number of rows and columns)
data.shape

(1319, 9)

In [8]:
# Display types of data
data.dtypes

age                int64
gender             int64
impluse            int64
pressurehight      int64
pressurelow        int64
glucose          float64
kcm              float64
troponin         float64
class             object
dtype: object

In [9]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform 'class' column
data['class_encoded'] = label_encoder.fit_transform(data['class'])
data.drop('class',axis=1,inplace=True)
data

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class_encoded
0,64,1,66,160,83,160.0,1.80,0.012,0
1,21,1,94,98,46,296.0,6.75,1.060,1
2,55,1,64,160,77,270.0,1.99,0.003,0
3,64,1,70,120,55,270.0,13.87,0.122,1
4,55,1,64,112,65,300.0,1.08,0.003,0
...,...,...,...,...,...,...,...,...,...
1314,44,1,94,122,67,204.0,1.63,0.006,0
1315,66,1,84,125,55,149.0,1.33,0.172,1
1316,45,1,85,168,104,96.0,1.24,4.250,1
1317,54,1,58,117,68,443.0,5.80,0.359,1


In [10]:
data.describe()

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class_encoded
count,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0
mean,56.191812,0.659591,78.336619,127.170584,72.269143,146.634344,15.274306,0.360942,0.614102
std,13.647315,0.474027,51.63027,26.12272,14.033924,74.923045,46.327083,1.154568,0.486991
min,14.0,0.0,20.0,42.0,38.0,35.0,0.321,0.001,0.0
25%,47.0,0.0,64.0,110.0,62.0,98.0,1.655,0.006,0.0
50%,58.0,1.0,74.0,124.0,72.0,116.0,2.85,0.014,1.0
75%,65.0,1.0,85.0,143.0,81.0,169.5,5.805,0.0855,1.0
max,103.0,1.0,1111.0,223.0,154.0,541.0,300.0,10.3,1.0


In [11]:
def remove_outliers_iqr(data, columns):
    for col in columns:
        # Calculate the first quartile (Q1)
        Q1 = data[col].quantile(0.25)
        # Calculate the third quartile (Q3)
        Q3 = data[col].quantile(0.75)
        # Calculate the interquartile range (IQR)
        IQR = Q3 - Q1
        # Define the lower bound
        lower_bound = Q1 - 1.5 * IQR
        # Define the upper bound
        upper_bound = Q3 + 1.5 * IQR
        # Remove outliers
        data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]
    return data

In [12]:
# Specify the columns to remove outliers from
columns_to_clean = ['impluse', 'pressurehight','pressurelow','glucose','kcm','troponin']
#
# Remove outliers using the interquartile range
data = remove_outliers_iqr(data, columns_to_clean)

# Print the first few rows of the cleaned DataFrame to verify
data

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class_encoded
0,64,1,66,160,83,160.0,1.80,0.012,0
2,55,1,64,160,77,270.0,1.99,0.003,0
5,58,0,61,112,58,87.0,1.83,0.004,0
6,32,0,40,179,68,102.0,0.71,0.003,0
8,44,0,60,154,81,135.0,2.35,0.004,0
...,...,...,...,...,...,...,...,...,...
1311,85,1,112,115,69,114.0,2.19,0.062,1
1312,48,1,84,118,68,96.0,5.33,0.006,0
1313,86,0,40,179,68,147.0,5.22,0.011,0
1314,44,1,94,122,67,204.0,1.63,0.006,0


In [13]:
data.describe()

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class_encoded
count,790.0,790.0,790.0,790.0,790.0,790.0,790.0,790.0,790.0
mean,55.496203,0.624051,74.989873,125.637975,71.832911,129.989494,3.110443,0.022247,0.432911
std,13.860224,0.484674,13.855671,23.49329,13.375154,47.782404,2.150951,0.033013,0.495793
min,14.0,0.0,36.0,65.0,38.0,35.0,0.321,0.002,0.0
25%,45.0,0.0,63.0,110.0,61.0,96.0,1.6,0.005,0.0
50%,56.0,1.0,74.0,124.0,71.0,111.0,2.49,0.01,0.0
75%,66.0,1.0,84.0,142.75,81.0,152.0,4.065,0.02275,1.0
max,91.0,1.0,116.0,193.0,105.0,277.0,11.94,0.193,1.0


In [14]:
# Put data in variable X excluding column"area"
X = data.drop(['class_encoded'],axis=1)
Y= data['class_encoded']

In [15]:
# drop column customer id as it is not needed for normalization
# normlize the data
data_normalized =StandardScaler().fit_transform(X) 
data_normalized

array([[ 0.61392836,  0.77616627, -0.64923371, ...,  0.62846394,
        -0.60962474, -0.31058655],
       [-0.03582315,  0.77616627, -0.79367038, ...,  2.93202501,
        -0.52123577, -0.58338092],
       [ 0.18076069, -1.28838374, -1.01032539, ..., -0.90026294,
        -0.59566858, -0.55307043],
       ...,
       [ 2.20220983, -1.28838374, -2.52691046, ...,  0.35622491,
         0.98137658, -0.34089704],
       [-0.82996388,  0.77616627,  1.37287972, ...,  1.54988837,
        -0.6887096 , -0.49244946],
       [ 0.75831758,  0.77616627,  0.65069636, ...,  0.39810784,
        -0.82827112,  4.53909107]])

In [16]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data_normalized, Y, test_size=0.2, random_state=42)

In [17]:
# Hard SVM 
print("Hard SVM:")
hard_svm = SVC(kernel='linear', C=1e6)
hard_svm.fit(X_train, y_train)
print("Accuracy:", hard_svm.score(X_test, y_test))
print("Support Vectors:", len(hard_svm.support_))

# Printing alphas
print("Alphas (Coefficients):", hard_svm.dual_coef_)

# Cross-validation for Hard SVM
hard_svm_cv_scores = cross_val_score(hard_svm, data_normalized, Y, cv=3)
print("Cross-Validation Scores:", hard_svm_cv_scores)
print("Mean CV Accuracy:", np.mean(hard_svm_cv_scores))

Hard SVM:
Accuracy: 0.930379746835443
Support Vectors: 98
Alphas (Coefficients): [[-1000000.         -1000000.           -89254.04593006 -1000000.
  -1000000.          -483529.09951058 -1000000.         -1000000.
  -1000000.         -1000000.         -1000000.         -1000000.
  -1000000.         -1000000.          -214429.42305897  -916116.29007593
  -1000000.         -1000000.         -1000000.         -1000000.
  -1000000.         -1000000.         -1000000.         -1000000.
  -1000000.         -1000000.         -1000000.         -1000000.
  -1000000.         -1000000.         -1000000.         -1000000.
   -926067.98494653  -543107.62890963 -1000000.          -821958.98075334
  -1000000.         -1000000.         -1000000.         -1000000.
  -1000000.         -1000000.         -1000000.         -1000000.
  -1000000.         -1000000.         -1000000.         -1000000.
  -1000000.         -1000000.           380388.62961509  1000000.
   1000000.          1000000.          100000

In [18]:
# Soft SVM with a range of C values for tuning
print("\nSoft SVM:")
soft_svm = SVC(kernel='linear')
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
soft_svm_cv = GridSearchCV(soft_svm, param_grid, cv=3)
soft_svm_cv.fit(X_train, y_train)
print("Best Parameters:", soft_svm_cv.best_params_)
print("Accuracy:", soft_svm_cv.best_estimator_.score(X_test, y_test))
print("Support Vectors:", len(soft_svm_cv.best_estimator_.support_))

# Cross-validation for Soft SVM
soft_svm_cv_scores = cross_val_score(soft_svm_cv.best_estimator_, data_normalized, Y, cv=3)
print("Cross-Validation Scores:", soft_svm_cv_scores)
print("Mean CV Accuracy:", np.mean(soft_svm_cv_scores))


Soft SVM:
Best Parameters: {'C': 100}
Accuracy: 0.930379746835443
Support Vectors: 114
Cross-Validation Scores: [0.91287879 0.9391635  0.91634981]
Mean CV Accuracy: 0.9227973652878596


In [19]:
# Kernel SVM with a range of C and gamma values for tuning
print("\nKernel SVM:")
kernel_svm = SVC(kernel='rbf')
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
kernel_svm_cv = GridSearchCV(kernel_svm, param_grid, cv=3)
kernel_svm_cv.fit(X_train, y_train)
print("Best Parameters:", kernel_svm_cv.best_params_)
print("Accuracy:", kernel_svm_cv.best_estimator_.score(X_test, y_test))
print("Support Vectors:", len(kernel_svm_cv.best_estimator_.support_))

# Cross-validation for Kernel SVM
kernel_svm_cv_scores = cross_val_score(kernel_svm_cv.best_estimator_, data_normalized, Y, cv=3)
print("Cross-Validation Scores:", kernel_svm_cv_scores)
print("Mean CV Accuracy:", np.mean(kernel_svm_cv_scores))


Kernel SVM:
Best Parameters: {'C': 100, 'gamma': 0.01}
Accuracy: 0.9493670886075949
Support Vectors: 142
Cross-Validation Scores: [0.90530303 0.94296578 0.9391635 ]
Mean CV Accuracy: 0.92914410262319
