In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Reading the data set 

In [2]:
df = pd.read_csv("https://github.com/mpourhoma/CS4661/raw/master/Heart_s2.csv")


In [3]:
df.head()

Unnamed: 0,Age,ChestPain,RestBP,Chol,RestECG,MaxHR,Oldpeak,Thal,AHD
0,63,typical,145,233,2,150,2.3,fixed,No
1,67,asymptomatic,160,286,2,108,1.5,normal,Yes
2,67,asymptomatic,120,229,2,129,2.6,reversable,Yes
3,37,nonanginal,130,250,0,187,3.5,normal,No
4,41,nontypical,130,204,2,172,1.4,normal,No


# info about the dataset

- contains age, chestpain, restECG, maxHR, oldpeak, thal and AHD

In [4]:
feature_cols = ['Age','RestECG', 'Chol', 'RestECG', 'MaxHR', 'Oldpeak']
y = df['AHD']
X = df[feature_cols]  

# Splitting the dataset and trainign sets with following parameters: 
test_size=0.20, random_state=9

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=9)


# Using Knn, decision Tree and Logistic Regression

Use KNN (with k=7), Decision Tree (with random_state=5 (this random state is used when you define your decision tree classifier, and it is different from the random state that you used to split the data in part D)), and Logistic Regression Classifiers (with max_iter=400) to predict Heart Disease based on the training/testing datasets that you built in part (D). Then check, compare, and report the accuracy of these 3 classifiers. Which one is the best? Which one is the worst?

## Knn

In [9]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
knn_predictions = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
print(f"KNN Accuracy: {knn_accuracy}")

KNN Accuracy: 0.6229508196721312


# Decision tree

In [10]:
dt = DecisionTreeClassifier(random_state=5)
dt.fit(X_train, y_train)
dt_predictions = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)
print(f"Decision Tree Accuracy: {dt_accuracy}")

Decision Tree Accuracy: 0.5901639344262295


## WHich one is the best? 

Logistic regression gives us the best accuracy and the worst is decision tree classifier. 


In [11]:
lr = LogisticRegression(max_iter=400)
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_predictions)
print(f"Logistic Regression Accuracy: {lr_accuracy}")

Logistic Regression Accuracy: 0.7704918032786885


# One hot encoding

To this end, we have to perform a feature engineering process called OneHotEncoding for the categorical features. To do this, each categorical feature should be replaced with dummy columns in the feature table (one column for each possible value of a categorical feature), and then encode it in a binary manner such that only one of the dummy columns can take “1” at a time (and zero for the rest). For example, “Thal” can take three values “fixed” and “normal” and "reversable". Thus, we need to replace this feature (in the feature table) with 3 columns titled “fixed”, “normal”, and "reversable".  Wherever we have a value "fixed", we should put “1”, ”0”, "0" in the columns “fixed” and “normal” and "reversable". 

How does the prediction accuracy change for each method?

In [26]:
df = pd.read_csv("https://github.com/mpourhoma/CS4661/raw/master/Heart_s2.csv")


In [15]:
categorical_features = ['ChestPain', 'Thal']
numerical_features = [col for col in df.columns if col not in categorical_features + ['AHD']]

In [27]:
print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

Numerical features: ['Age', 'RestBP', 'Chol', 'RestECG', 'MaxHR', 'Oldpeak']
Categorical features: ['ChestPain', 'Thal']


In [28]:
X_numerical = df[numerical_features]
print("Numerical features shape:", X_numerical.shape)

Numerical features shape: (301, 6)


In [29]:
X_categorical = pd.get_dummies(df[categorical_features], prefix=categorical_features)
print("One-hot encoded categorical features shape:", X_categorical.shape)
print("One-hot encoded columns:", X_categorical.columns.tolist())

One-hot encoded categorical features shape: (301, 7)
One-hot encoded columns: ['ChestPain_asymptomatic', 'ChestPain_nonanginal', 'ChestPain_nontypical', 'ChestPain_typical', 'Thal_fixed', 'Thal_normal', 'Thal_reversable']


In [31]:
X = pd.concat([X_numerical, X_categorical], axis=1)
print("Combined feature matrix shape:", X.shape)


Combined feature matrix shape: (301, 13)


In [32]:
# preparing label vector 
y = df['AHD']
print("Label distribution:")
print(y.value_counts())

Label distribution:
AHD
No     163
Yes    138
Name: count, dtype: int64


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=9)

In [34]:
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")
print(f"Number of features: {X_train.shape[1]}")

Training set size: 240
Testing set size: 61
Number of features: 13


In [37]:
knn = KNeighborsClassifier(n_neighbors=7)
dt = DecisionTreeClassifier(random_state=5)
lr = LogisticRegression(max_iter=400)

results = {}


classifiers = {
    'KNN (k=7)': knn,
    'Decision Tree': dt,
    'Logistic Regression': lr
}


In [38]:
for name, clf in classifiers.items():
    print(f"\nTraining {name}...")
    
    # Train the classifier
    clf.fit(X_train, y_train)
    
    # Make predictions
    y_pred = clf.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    
    print(f"{name} Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")


Training KNN (k=7)...
KNN (k=7) Accuracy: 0.6230 (62.30%)

Training Decision Tree...
Decision Tree Accuracy: 0.7377 (73.77%)

Training Logistic Regression...
Logistic Regression Accuracy: 0.7869 (78.69%)
