# 1A. KNN Implementation Using KNeighborsClassifier

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# Loading of dataset
data = pd.read_csv('raw_heart_disease_dataset.csv')

# Separate features and target
features = data.drop('target', axis=1)
target = data['target']

# Encoding Categorical Variables
for column in features.select_dtypes(include=['object']).columns:
    features[column] = LabelEncoder().fit_transform(features[column])

# Normalization/Standardization
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Save and load the preprocessed data
preprocessed_data = pd.DataFrame(scaled_features, columns=features.columns)
preprocessed_data['target'] = target
preprocessed_data.to_csv('preprocessed_heart_disease_dataset.csv', index=False)

preprocessed_data = pd.read_csv('preprocessed_heart_disease_dataset.csv')
X = preprocessed_data.drop('target', axis=1)
y = preprocessed_data['target']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the KNeighborsClassifier with Euclidean distance
knn = KNeighborsClassifier(n_neighbors=5, metric="euclidean")

# Fit the classifier to the training data
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Model Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')


Model Accuracy: 0.55
F1 Score: 0.51


## 1B. KNN Implementation Using scipy KDTree

In [2]:
from sklearn.neighbors import KDTree
from collections import Counter

# Load the preprocessed data
preprocessed_data = pd.read_csv('preprocessed_heart_disease_dataset.csv')
X = preprocessed_data.drop('target', axis=1).values
y = preprocessed_data['target'].values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create KDTree with the training data
tree = KDTree(X_train)

# Query KDTree for k nearest neighbors
k = 5
distances, indices = tree.query(X_test, k=k)

# Aggregate  the predictions using majority voting
predicted_labels = []
for index_list in indices:
    neighbor_labels = y_train[index_list]
    # Use mode to determine the most common class among neighbors
    most_common = Counter(neighbor_labels).most_common(1)[0][0]
    predicted_labels.append(most_common)

accuracy = accuracy_score(y_test, predicted_labels)
f1 = f1_score(y_test, predicted_labels, average='weighted')

print(f'Model Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')

Model Accuracy: 0.58
F1 Score: 0.56


## Implementation using Scikit-learn DecisionTreeClassifier

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

# Loading of the dataset
data = pd.read_csv('raw_heart_disease_dataset.csv')

# Data Cleaning
# Convert '?' to NaN and drop rows with any NaN values
data.replace('?', np.nan, inplace=True)
if data.isna().any().any():
    print("Missing values detected. Handling missing values.")
    data.dropna(inplace=True)

# Validate data entries (check for impossible values)
print("Checking for unrealistic data entries.")
out_of_bounds = (data['age'] < 25) | (data['age'] > 100) | \
                (data['trestbps'] < 50) | (data['trestbps'] > 200) | \
                (data['chol'] < 100) | (data['chol'] > 600)
if out_of_bounds.any():
    data = data[~out_of_bounds]
    print(f"Removed {out_of_bounds.sum()} rows with unrealistic entries.")

# Handle categorical data with LabelEncoder
for column in ['ca', 'thal']:
    if data[column].dtype == 'object' or data[column].isna().any():
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column].astype(str))

# Remove duplicates
initial_count = len(data)
data.drop_duplicates(inplace=True)
print(f"Duplicates removed: {initial_count - len(data)}")

# Outlier Detection and Removal using IQR
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
outliers = ((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).any(axis=1)
print(f"Identified {outliers.sum()} outliers.")
data = data[~outliers]

# Feature Scaling or normalization
scaler = StandardScaler()
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# To Ensure that the target is an integer for classification
data['target'] = data['target'].astype(int)

# Dimensionality Reduction using PCA
pca = PCA(n_components=0.95)  # Retain 95% of variance
X = pca.fit_transform(data.drop('target', axis=1))
y = data['target']

# Save the fully preprocessed data
data.to_csv('preprocessed2_heart_disease_dataset.csv', index=False)

# Load the preprocessed data
data = pd.read_csv('preprocessed2_heart_disease_dataset.csv')

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Initialization and training of the Decision Tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Predictions and Evaluation of the model
y_pred = decision_tree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')


Missing values detected. Handling missing values.
Checking for unrealistic data entries.
Duplicates removed: 0
Identified 85 outliers.
Accuracy: 0.75
F1 Score: 0.76


## Implementation using XGBOOST DecisionTree

In [5]:
from xgboost import XGBClassifier

# Loading the preprocessed data
data = pd.read_csv('preprocessed2_heart_disease_dataset.csv')

# Split data into testing & training datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Initialization and training of  the XGBoost classifier
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_classifier.fit(X_train, y_train)

# Predictions and Evaluation of the model
y_pred = xgb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')


Accuracy: 0.80
F1 Score: 0.79
