In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the data set together with its accompanying description file.
names_filepath = 'Congressional Voting Records/house-votes-84.names'
data_filepath = 'Congressional Voting Records/house-votes-84.data'

# Keep the full text of the .names file in memory for reference.
with open(names_filepath, mode='r') as names_file:
    names_content = names_file.read()

# header=None: the first line of the file is read as data, not as column
# names, so the columns are labelled with integers instead.
data = pd.read_csv(filepath_or_buffer=data_filepath, header=None)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [3]:
# Encode the votes numerically: y -> 1, n -> 0, ? -> NaN (missing).
#
# Only the vote columns are encoded; column 0 (the party label) is carried
# over unchanged. Mapping '?' to np.nan and casting to float explicitly means
# the result does not depend on pandas turning a `None` replacement into NaN
# or on its implicit object -> float downcasting.
VOTE_CODES = {'y': 1.0, 'n': 0.0, '?': np.nan}

party_label = data[[0]]
votes_encoded = (
    data.drop(columns=0)
    # Series.map yields NaN for any value that is not a key of VOTE_CODES.
    .apply(lambda column: column.map(VOTE_CODES))
    .astype(float)
)

# Re-attach the label as the first column so the layout matches the raw frame.
data_encoded = pd.concat([party_label, votes_encoded], axis=1)

In [4]:
# Build three variants of the encoded data, one per missing-value strategy.

# Strategy "impute": fill each NaN with the most frequent value (the mode)
# of its column. mode() returns a frame; its first row holds one mode per column.
# NOTE(review): the modes are computed over the whole of data_encoded, not per
# training split.
column_modes = data_encoded.mode().iloc[0]
data_impute = data_encoded.fillna(value=column_modes)

# Strategy "missing as a value": encode NaN as a third category, 2.
data_ternary = data_encoded.fillna(value=2)

# Strategy "discard": keep only the rows that have no missing value at all.
data_discard = data_encoded.dropna(axis=0, how='any')

In [5]:
# Preview the first rows of the encoded frame and of each derived variant.
preview_sources = [
    ('Original (Encoded)', data_encoded),
    ('Discard Missing', data_discard),
    ('Treat Missing as a Value', data_ternary),
    ('Impute Missing Values', data_impute),
]

# One head() per frame, keyed by a human-readable label.
data_samples = {label: frame.head() for label, frame in preview_sources}

print(data_samples)

{'Original (Encoded)':            0    1    2    3    4    5    6    7    8    9    10   11   12  \
0  republican  0.0  1.0  0.0  1.0  1.0  1.0  0.0  0.0  0.0  1.0  NaN  1.0   
1  republican  0.0  1.0  0.0  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0   
2    democrat  NaN  1.0  1.0  NaN  1.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0   
3    democrat  0.0  1.0  1.0  0.0  NaN  1.0  0.0  0.0  0.0  0.0  1.0  0.0   
4    democrat  1.0  1.0  1.0  0.0  1.0  1.0  0.0  0.0  0.0  0.0  1.0  NaN   

    13   14   15   16  
0  1.0  1.0  0.0  1.0  
1  1.0  1.0  0.0  NaN  
2  1.0  1.0  0.0  0.0  
3  1.0  0.0  0.0  1.0  
4  1.0  1.0  1.0  1.0  , 'Discard Missing':             0    1    2    3    4    5    6    7    8    9    10   11   12  \
5     democrat  0.0  1.0  1.0  0.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0   
8   republican  0.0  1.0  0.0  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0   
19    democrat  1.0  1.0  1.0  0.0  0.0  0.0  1.0  1.0  1.0  0.0  1.0  0.0   
23    democrat  1.0  1.0  1.0  0.0  0.

In [6]:
# Compare two classifiers on each missing-value variant using 5-fold
# cross-validation, reporting mean precision, recall and F1 with
# 'republican' as the positive class.

datasets = {
    'Discard Missing': data_discard,
    'Treat Missing as Value': data_ternary,
    'Impute Missing Values': data_impute
}


def precision(y_true, y_pred):
    """Precision with 'republican' treated as the positive class."""
    return precision_score(y_true, y_pred, pos_label='republican')


def recall(y_true, y_pred):
    """Recall with 'republican' treated as the positive class."""
    return recall_score(y_true, y_pred, pos_label='republican')


def f1(y_true, y_pred):
    """F1 score with 'republican' treated as the positive class."""
    return f1_score(y_true, y_pred, pos_label='republican')


# Fixed seed so every model/dataset pair is scored on reproducible folds.
cross_val = KFold(n_splits=5, shuffle=True, random_state=0)

# Metric name (as reported in `results`) -> scorer.
scoring = {
    'Precision': make_scorer(precision),
    'Recall': make_scorer(recall),
    'F1 Score': make_scorer(f1),
}

decision_tree_model = DecisionTreeClassifier(random_state=0)
naive_bayes_model = GaussianNB()

# Model name (as reported in `results`) -> estimator.
models = {
    'Decision Tree': decision_tree_model,
    'Naive Bayes': naive_bayes_model,
}


def evaluate_model(model, frame):
    """Cross-validate `model` on `frame` and return the mean of each metric.

    Parameters
    ----------
    model : estimator
        Classifier passed to `cross_validate`.
    frame : pandas.DataFrame
        Column 0 holds the class label; all other columns are the features.

    Returns
    -------
    dict
        Maps each key of `scoring` to the mean of that metric over the folds.
    """
    features = frame.drop(columns=0)
    labels = frame[0]
    # A single cross_validate call computes all metrics on the same fitted
    # models instead of re-running the whole cross-validation once per metric.
    fold_scores = cross_validate(model, features, labels, cv=cross_val, scoring=scoring)
    # cross_validate reports each scorer under the key 'test_<name>'.
    return {metric: np.mean(fold_scores[f'test_{metric}']) for metric in scoring}


results = {}

# The loop variable is deliberately not called `data`, so the raw DataFrame
# of that name loaded earlier in the notebook is not overwritten.
for model_name, model in models.items():
    for dataset_name, frame in datasets.items():
        results[f'{model_name} with {dataset_name}'] = evaluate_model(model, frame)

print(results)


{'Decision Tree with Discard Missing': {'Precision': 0.9532794612794613, 'Recall': 0.949608695652174, 'F1 Score': 0.9495177865612648}, 'Decision Tree with Treat Missing as Value': {'Precision': 0.9280491551459293, 'Recall': 0.9206205145429284, 'F1 Score': 0.9235691210367183}, 'Decision Tree with Impute Missing Values': {'Precision': 0.9130882352941174, 'Recall': 0.9242763271211547, 'F1 Score': 0.9185209583779841}, 'Naive Bayes with Discard Missing': {'Precision': 0.9595317725752508, 'Recall': 0.9419420289855072, 'F1 Score': 0.949231568836432}, 'Naive Bayes with Treat Missing as Value': {'Precision': 0.9185752943612984, 'Recall': 0.9203202831651108, 'F1 Score': 0.9189395433862015}, 'Naive Bayes with Impute Missing Values': {'Precision': 0.8806089743589745, 'Recall': 0.9188018016466291, 'F1 Score': 0.8988999897552246}}
