In [81]:
import warnings

import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

## Part A ##
**Read the data file “Heart_s2.csv” (from GitHub using the following command), and assign it to a Pandas DataFrame.**  

In [82]:
# Read the heart-disease CSV directly from its GitHub URL into a DataFrame.
HEART_DATA_URL = "https://github.com/mpourhoma/CS4661/raw/master/Heart_s2.csv"
df = pd.read_csv(HEART_DATA_URL)

In [83]:
# Preview the first five rows to confirm the file loaded as expected.
df.head(n=5)

Unnamed: 0,Age,ChestPain,RestBP,Chol,RestECG,MaxHR,Oldpeak,Thal,AHD
0,63,typical,145,233,2,150,2.3,fixed,No
1,67,asymptomatic,160,286,2,108,1.5,normal,Yes
2,67,asymptomatic,120,229,2,129,2.6,reversable,Yes
3,37,nonanginal,130,250,0,187,3.5,normal,No
4,41,nontypical,130,204,2,172,1.4,normal,No


## Part C ##
**As you see, there are several categorical features in the dataset (ChestPain, Thal). Let’s ignore these categorical features for now, and only keep the numerical features and build your feature matrix and label vector.**

In [84]:
# Numeric-only feature matrix: leave out the two categorical columns
# (ChestPain, Thal) and the label column (AHD).
non_feature_columns = ["ChestPain", "Thal", "AHD"]
features = df.drop(non_feature_columns, axis=1)
print(features)

     Age  RestBP  Chol  RestECG  MaxHR  Oldpeak
0     63     145   233        2    150      2.3
1     67     160   286        2    108      1.5
2     67     120   229        2    129      2.6
3     37     130   250        0    187      3.5
4     41     130   204        2    172      1.4
..   ...     ...   ...      ...    ...      ...
296   45     110   264        0    132      1.2
297   68     144   193        0    141      3.4
298   57     130   131        0    115      1.2
299   57     130   236        2    174      0.0
300   38     138   175        0    173      0.0

[301 rows x 6 columns]


In [85]:
# Label vector: the AHD column.
target = df.loc[:, "AHD"]
print(target)

0       No
1      Yes
2      Yes
3       No
4       No
      ... 
296    Yes
297    Yes
298    Yes
299    Yes
300     No
Name: AHD, Length: 301, dtype: object


## Part D ##
**Split the dataset into testing and training sets with the following parameters: test_size=0.20, random_state=9**

In [86]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
split_parts = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=9,
)
X_train, X_test, y_train, y_test = split_parts

## Part E ##
**Use KNN (with k=7), Decision Tree (with random_state=5 (this random state is used when you define your decision tree classifier. It is different from the random state that you used to split the data in part D)), and Logistic Regression Classifiers (with max_iter=400) to predict Heart Disease based on the training/testing datasets that you built in part (D). Then check, compare, and report the accuracy of these 3 classifiers**

In [87]:
# Part E classifiers, configured as the assignment specifies:
#   - KNN with k = 7
#   - decision tree with random_state = 5 (this seed belongs to the tree itself
#     and is unrelated to the random_state used for the train/test split)
#   - logistic regression with max_iter = 400
knn = KNeighborsClassifier(n_neighbors=7)
my_decisiontree = DecisionTreeClassifier(random_state=5)
my_logreg = LogisticRegression(max_iter=400)

In [88]:
# KNN: fit on the training split, predict the held-out split, and measure
# the fraction of test labels predicted correctly.
knn_predictions = knn.fit(X_train, y_train).predict(X_test)
knn_accuracy = accuracy_score(y_true=y_test, y_pred=knn_predictions)

In [89]:
# Decision tree: fit on the training split, predict the held-out split, and
# measure the fraction of test labels predicted correctly.
decision_tree_predictions = my_decisiontree.fit(X_train, y_train).predict(X_test)
decision_tree_accuracy = accuracy_score(
    y_true=y_test, y_pred=decision_tree_predictions
)

In [90]:
# Logistic regression: fit on the training split, predict the held-out split,
# and measure the fraction of test labels predicted correctly.
logistic_regression_predictions = my_logreg.fit(X_train, y_train).predict(X_test)
logistic_regression_accuracy = accuracy_score(
    y_true=y_test, y_pred=logistic_regression_predictions
)

In [91]:
# Report the hold-out accuracy of each Part E classifier, one per line.
accuracy_report = (
    ("KNN Classifier Accuracy:", knn_accuracy),
    ("Decision Tree Classifier Accuracy:", decision_tree_accuracy),
    ("Logistic Regression Classifier Accuracy:", logistic_regression_accuracy),
)
for label, accuracy in accuracy_report:
    print(label, accuracy)

KNN Classifier Accuracy: 0.6229508196721312
Decision Tree Classifier Accuracy: 0.6885245901639344
Logistic Regression Classifier Accuracy: 0.7213114754098361


**Which one is the best? Which one is the worst?**

Logistic Regression has the best accuracy (about 0.72), and the KNN classifier has the worst (about 0.62); the Decision Tree falls in between (about 0.69).

## Part F ##
**Now, we want to use the categorical features as well! To this end, we have to perform a feature engineering process called One-Hot Encoding for the categorical features. To do this, each categorical feature should be replaced with dummy columns in the feature table (one column for each possible value of the categorical feature), encoded in a binary manner such that only one of the dummy columns can take “1” at a time (and “0” for the rest). For example, “Thal” can take three values: “fixed”, “normal”, and “reversable”. Thus, we need to replace this feature (in the feature table) with 3 columns titled “fixed”, “normal”, and “reversable”. Wherever we have the value “fixed”, we should put “1”, “0”, “0” in the columns “fixed”, “normal”, and “reversable”, respectively. (Hint: Similarly, you will need 4 columns to encode “ChestPain”.)**

In [92]:
# One-hot encode the categorical features. `pd.get_dummies(..., columns=...)`
# replaces each listed column with one indicator column per category, named
# "<column>_<category>", appended after the remaining columns in the order
# the columns are listed here (Thal first, then ChestPain).
categorical_columns = ["Thal", "ChestPain"]

# `df` is rebound to the encoded frame, so after the first run the original
# categorical columns no longer exist. Encoding only the columns that are still
# present makes a re-run of this cell a no-op instead of a KeyError.
columns_to_encode = [name for name in categorical_columns if name in df.columns]
if columns_to_encode:
    df = pd.get_dummies(df, columns=columns_to_encode)

# Printing new data frame
print(df.head())

   Age  RestBP  Chol  RestECG  MaxHR  Oldpeak  AHD  Thal_fixed  Thal_normal  \
0   63     145   233        2    150      2.3   No        True        False   
1   67     160   286        2    108      1.5  Yes       False         True   
2   67     120   229        2    129      2.6  Yes       False        False   
3   37     130   250        0    187      3.5   No       False         True   
4   41     130   204        2    172      1.4   No       False         True   

   Thal_reversable  ChestPain_asymptomatic  ChestPain_nonanginal  \
0            False                   False                 False   
1            False                    True                 False   
2             True                    True                 False   
3            False                   False                  True   
4            False                   False                 False   

   ChestPain_nontypical  ChestPain_typical  
0                 False               True  
1                 False   

## Part G ##
**Repeat parts (d) and (e) with the new dataset that you built in part (f). How does the prediction accuracy change for each method?**

In [56]:
# Features: every column of the one-hot-encoded frame except the label.
features = df.drop(columns=["AHD"])

# Target variable
target = df["AHD"]

# Split with the same parameters as Part D (test_size=0.20, random_state=9).
# train_test_split is already imported in the notebook's first cell, so it is
# not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.20, random_state=9
)


In [57]:
def fit_and_score(classifier, X_fit, y_fit, X_eval, y_eval):
    """Fit ``classifier`` on one split and score it on another.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``.
    X_fit, y_fit
        Features and labels used for fitting.
    X_eval, y_eval
        Features and true labels used for evaluation.

    Returns
    -------
    tuple
        ``(predictions, accuracy)``: the output of ``classifier.predict(X_eval)``
        and ``accuracy_score(y_eval, predictions)``.
    """
    predictions = classifier.fit(X_fit, y_fit).predict(X_eval)
    return predictions, accuracy_score(y_eval, predictions)


# Initializing classifiers (same hyper-parameters as Part E)
knn_classifier = KNeighborsClassifier(n_neighbors=7)
decision_tree_classifier = DecisionTreeClassifier(random_state=5)
logistic_regression_classifier = LogisticRegression(max_iter=400)

# Training and testing each classifier on the one-hot-encoded split.
# One helper call per model replaces three copies of fit/predict/score.
knn_predictions, knn_accuracy = fit_and_score(
    knn_classifier, X_train, y_train, X_test, y_test
)
decision_tree_predictions, decision_tree_accuracy = fit_and_score(
    decision_tree_classifier, X_train, y_train, X_test, y_test
)
logistic_regression_predictions, logistic_regression_accuracy = fit_and_score(
    logistic_regression_classifier, X_train, y_train, X_test, y_test
)

# Printing accuracies
print("KNN Classifier Accuracy (After One-Hot Encoding):", knn_accuracy)
print("Decision Tree Classifier Accuracy (After One-Hot Encoding):", decision_tree_accuracy)
print("Logistic Regression Classifier Accuracy (After One-Hot Encoding):", logistic_regression_accuracy)


KNN Classifier Accuracy (After One-Hot Encoding): 0.6229508196721312
Decision Tree Classifier Accuracy (After One-Hot Encoding): 0.7377049180327869
Logistic Regression Classifier Accuracy (After One-Hot Encoding): 0.7868852459016393


In [58]:
# Fresh, unfitted classifiers with the same hyper-parameters as before.
knn_classifier = KNeighborsClassifier(n_neighbors=7)
decision_tree_classifier = DecisionTreeClassifier(random_state=5)
logistic_regression_classifier = LogisticRegression(max_iter=400)

# 10-fold Cross-Validation and compute accuracies.
# ConvergenceWarning is silenced only inside this `with` block; the previous
# warning filters are restored on exit, so later cells still report warnings.
# NOTE(review): if LogisticRegression does emit ConvergenceWarning here, the
# reported score comes from a fit that stopped at max_iter=400 -- confirm that
# is acceptable (scaling the features or raising max_iter are the usual remedies).
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    knn_cv_scores = cross_val_score(
        knn_classifier, features, target, cv=10, scoring='accuracy'
    )
    decision_tree_cv_scores = cross_val_score(
        decision_tree_classifier, features, target, cv=10, scoring='accuracy'
    )
    logistic_regression_cv_scores = cross_val_score(
        logistic_regression_classifier, features, target, cv=10, scoring='accuracy'
    )

# Printing Cross-Validation accuracies (mean over the 10 folds)
print("KNN Classifier Accuracy (After One-Hot Encoding and 10-fold CV):", knn_cv_scores.mean())
print("Decision Tree Classifier Accuracy (After One-Hot Encoding and 10-fold CV):", decision_tree_cv_scores.mean())
print("Logistic Regression Classifier Accuracy (After One-Hot Encoding and 10-fold CV):", logistic_regression_cv_scores.mean())


KNN Classifier Accuracy (After One-Hot Encoding and 10-fold CV): 0.6711827956989247
Decision Tree Classifier Accuracy (After One-Hot Encoding and 10-fold CV): 0.7308602150537634
Logistic Regression Classifier Accuracy (After One-Hot Encoding and 10-fold CV): 0.7906451612903226
