In [63]:
import os
os.environ['OMP_NUM_THREADS'] = '3' # for windows threading issue

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

In [64]:
# In this problem, you are required to apply various classification techniques on the benchmark
# dataset heart.csv from the UCI Machine Learning Repository. This dataset contains 14 fields,
# where the last one is the class: presence of heart disease (1) or absence of heart disease.

In [65]:
df = pd.read_csv('data/heart_disease_uci.csv', delimiter=',')

In [66]:
# 1. Data Preprocessing:
# a. Load the heart.csv dataset and handle any missing values by imputing them with the median
# of their respective columns.
# b. Normalize all numerical attributes to a range of [0, 1].

numeric_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num']
categorical_cols = ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# Updated categorical mappings
categorical_mapping = {
    'sex': {'Male': 1, 'Female': 0},
    'dataset': {country: idx for idx, country in enumerate(sorted(df['dataset'].unique()))},
    'cp': {'typical angina': 0, 'atypical angina': 1, 'non-anginal pain': 2, 'asymptomatic': 3},
    'fbs': {True: 1, False: 0},
    'restecg': {'normal': 0, 'st-t abnormality': 1, 'lv hypertrophy': 2},
    'exang': {True: 1, False: 0},
    'slope': {'downsloping': 0, 'flat': 1, 'upsloping': 2},
    'thal': {'fixed defect': 0, 'normal': 1, 'reversable defect': 2}
}

df_processed = df.copy()
df_processed[numeric_cols] = df_processed[numeric_cols].fillna(df_processed[numeric_cols].median())

for col in categorical_cols:
    if col in categorical_mapping:
        df_processed[col] = df_processed[col].map(categorical_mapping[col])
        print(df_processed[col].unique())

scaler = MinMaxScaler()
df_processed[numeric_cols] = scaler.fit_transform(df_processed[numeric_cols])

[1 0]
[0 1 2 3]
[ 0.  3. nan  1.]
[ 1.  0. nan]
[ 2.  0.  1. nan]
[ 0.  1. nan]
[ 0.  1.  2. nan]
[ 0.  1.  2. nan]


In [67]:
# 2. Data Splitting:
# Obtain 200 random splits of the dataset into training (75%) and test (25%) sets.

splits = []
for i in range(200):
    train, test = train_test_split(df, test_size=0.25)
    splits.append((train, test))

In [68]:
# 3. Classification Techniques:
# For each split, apply the following classification techniques:
# i. Decision Trees
# ii. K-Nearest Neighbors (KNN) with K = 5
# iii. Logistic Regression
# iv. Multinomial Naïve Bayes
# v. Support Vector Machines with polynomial kernel of degree 3.

results = []
for i, (train, test) in enumerate(splits):
    classifiers = {
        'Decision Trees': DecisionTreeClassifier(),
        'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
        'Logistic Regression': LogisticRegression(),
        'Multinomial Naïve Bayes': MultinomialNB(),
        'Support Vector Machines': SVC(kernel='poly', degree=3)
    }

    for name, clf in classifiers.items():
        clf.fit(train[train.columns[:-1]], train['num'])
        y_pred = clf.predict(test[test.columns[:-1]])
        acc = accuracy_score(test['num'], y_pred)
        results.append((i, name, acc))
    
results = pd.DataFrame(results, columns=['split', 'classifier', 'accuracy'])

ValueError: could not convert string to float: 'Male'

In [None]:
# 4. Evaluation:
# For each model and each split:
# a. Compute the following evaluation metrics on the test set:
# - Precision
# - Recall
# - F1 Score
# - Accuracy
# b. Store the results of all splits for summarization.

eval_results = []
for i, (train, test) in enumerate(splits):
    for name, clf in classifiers.items():
        clf.fit(train[train.columns[:-1]], train['target'])
        y_pred = clf.predict(test[test.columns[:-1]])
        precision = precision_score(test['target'], y_pred)
        recall = recall_score(test['target'], y_pred)
        f1 = f1_score(test['target'], y_pred)
        acc = accuracy_score(test['target'], y_pred)
        eval_results.append((i, name, precision, recall, f1, acc))
    
eval_results = pd.DataFrame(eval_results, columns=['split', 'classifier', 'precision', 'recall', 'f1', 'accuracy'])

In [None]:
# 5. Result Summarization:
# Create a summary table showing the average values of precision, recall, F1 score, and accuracy for
# each classification technique across the 200 splits.

summary = eval_results.groupby('classifier').mean().reset_index()

print(summary)