[Probability and Statistics for Data Science](https://www.ps4ds.net/) 

Code for Section 4.8

Prediction of the political affiliation of representatives from the United States House of Representatives in 1984 based on their votes
Topics: Naive Bayes, classification

Author: Carlos Fernandez-Granda
Data source: https://archive.ics.uci.edu/dataset/105/congressional+voting+records

In [1]:
import numpy as np
# Load the raw table; every field (party label and votes) is kept as a string.
file_name = "../data/house-votes-84.data"
data = np.genfromtxt(file_name, delimiter=',', dtype=str)

# Fixed seed so that the random train/test split is reproducible.
np.random.seed(2019)
n_test = 10
n_pol = data.shape[0]
aux_ind = np.random.permutation(range(n_pol))

# The first n_test shuffled row indices form the test set; the rest are used for training.
ind_test, ind_train = aux_ind[:n_test], aux_ind[n_test:]
data_test = data[ind_test]
data_train = data[ind_train]

# Each row of data_test and data_train describes one representative. The first entry is the party
# ('republican' or 'democrat'). The remaining entries are the votes: 'y' = yes, 'n' = no,
# '?' = no recorded yes/no position (e.g. absent or voted 'present')

# Estimate, separately for each party, the fraction of 'y' votes on every bill
# among the training representatives whose entry for that bill is not '?'.
# Row 0 of each matrix corresponds to Republicans, row 1 to Democrats.
n_pol_train = data_train.shape[0]
n_votes = data.shape[1] - 1

party_labels = data_train[:, 0]
ballots = data_train[:, 1:]
said_yes = ballots == 'y'
took_position = ballots != '?'  # anything other than '?' counts towards the total

rep_rows = party_labels == 'republican'
dem_rows = party_labels == 'democrat'
# Rows whose label is neither party are ignored by both masks.
n_reps = int(rep_rows.sum())
n_dems = int(dem_rows.sum())

count_mat = np.zeros((2, n_votes))
total_votes = np.zeros((2, n_votes))
for party_row, members in enumerate((rep_rows, dem_rows)):
    count_mat[party_row] = said_yes[members].sum(axis=0)
    total_votes[party_row] = took_position[members].sum(axis=0)

# Empirical conditional probability of a 'y' vote given the party (no smoothing).
prob_cond = count_mat / total_votes

# Class priors: the fraction of training representatives that belong to each party.
print(f"Democrats: {n_dems}, Republicans: {n_reps}")
p_r = n_reps / (n_reps + n_dems)
p_d = 1 - p_r
print(f"p_r: {p_r}", f"p_d: {p_d}", sep="\n")
# Posterior probability that each test representative is a Republican, obtained
# with Bayes' rule from the per-vote conditional probabilities (the votes are
# treated as conditionally independent given the party).
p_rep_cond_v = []

print(data_test[1,:])

for i_test in range(n_test):
    test_row = data_test[i_test]
    # Likelihood of the observed votes under each party; entries that are
    # neither 'y' nor 'n' (i.e. '?') leave both likelihoods unchanged.
    p_v_r = 1
    p_v_d = 1
    for vote, p_yes_rep, p_yes_dem in zip(test_row[1:], prob_cond[0], prob_cond[1]):
        if vote == 'y':
            p_v_r = p_v_r * p_yes_rep
            p_v_d = p_v_d * p_yes_dem
        elif vote == 'n':
            p_v_r = p_v_r * (1 - p_yes_rep)
            p_v_d = p_v_d * (1 - p_yes_dem)
    # Bayes' rule: joint probability for Republicans over the sum of both joints.
    rep_joint = p_v_r * p_r
    dem_joint = p_v_d * p_d
    p_r_v = rep_joint / (rep_joint + dem_joint)
    p_rep_cond_v.append(p_r_v)

# Compare the naive Bayes prediction with the true party of every test example.
# The predicted party is 'republican' when P(Rep | votes) exceeds 0.5 and
# 'democrat' otherwise; any disagreement with the label is recorded as a mistake.
# Both directions are checked: a Republican who receives a low P(Rep | votes)
# is misclassified just like a Democrat who receives a high one.
mistake_inds = []
for i_test in range(n_test):
    p_r_v = p_rep_cond_v[i_test]
    truth = data_test[i_test, 0]
    predicted = 'republican' if p_r_v > 0.5 else 'democrat'
    print('Test example ' + str(i_test))
    print('P(Rep / data) = ' + str(p_r_v))
    print('Truth: ' + str(truth))
    if predicted != truth:
        print('MISTAKE')
        mistake_inds.append(i_test)
    else:
        print('CORRECT')

Democrats: 263, Republicans: 162
p_r: 0.3811764705882353
p_d: 0.6188235294117648
['democrat' 'n' '?' 'y' 'n' 'n' 'y' 'y' 'y' 'n' 'y' 'n' 'n' 'n' 'n' 'y'
 '?']
Test example 0
P(Rep / data) = 0.9999999208413131
Truth: republican
CORRECT
Test example 1
P(Rep / data) = 1.9486344814482765e-08
Truth: democrat
CORRECT
Test example 2
P(Rep / data) = 0.9999989560596692
Truth: republican
CORRECT
Test example 3
P(Rep / data) = 2.2748033467687386e-08
Truth: democrat
CORRECT
Test example 4
P(Rep / data) = 0.999999757942457
Truth: republican
CORRECT
Test example 5
P(Rep / data) = 0.9995379435625033
Truth: democrat
MISTAKE
Test example 6
P(Rep / data) = 0.9999999823379229
Truth: republican
CORRECT
Test example 7
P(Rep / data) = 0.9999999424856351
Truth: republican
CORRECT
Test example 8
P(Rep / data) = 6.222238991709903e-11
Truth: democrat
CORRECT
Test example 9
P(Rep / data) = 0.9989332386194845
Truth: republican
CORRECT
