In [210]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

congressional_voting_records = fetch_ucirepo(id=105)

X = congressional_voting_records.data.features
y = congressional_voting_records.data.targets
df = congressional_voting_records.data.original

default_table = {'republican':[0,0,0],'democrat':[0,0,0]}
default_df = pd.DataFrame(data=default_table, index=['y', 'n', 'nan'])
variables = congressional_voting_records.variables['name']


In [211]:
def simpleSample(data):
    testing = data.groupby('Class', group_keys=False).sample(frac=0.2)
    learning = data.drop(testing.index)
    return learning, testing


In [212]:
def fold10Sample(data):
    folds = []
    for i in range(0,10):
        folds.append(data.groupby('Class', group_keys=False).sample(frac=1/(10-i)))
        data = data.drop(folds[i].index)
    return folds


In [213]:
def transform(data):

    return data


In [None]:
import numpy as np

def creatBayesTables(data):
    bayestables = {}
    for category in data.columns:
        if category == 'Class':
            bayestables[category] = data['Class'].value_counts(dropna=False, ascending=True).to_frame()
        else:
            temp = data[['Class',category]].value_counts(dropna=False)
            bayestables[category] = (pd.DataFrame({'republican':temp['republican'], 'democrat':temp['democrat']}) / bayestables['Class']['count']).apply(np.log)
    
    return bayestables


In [215]:
def naiveBayesClassifier(learning, testing):
    categorical = creatBayesTables(learning)

    def func(test):
        temp = pd.DataFrame({"republican":test.drop('Class'), "democrat":test.drop('Class')})
        long = (
            temp
            .rename_axis('category')
            .reset_index()
            .melt(id_vars='category', var_name='Class', value_name='type')
        )

        long['proportion'] = long.apply(
            lambda r: categorical[r['category']]
                .loc[r['type'], r['Class']],
            axis=1
        )

        temp = (
            long
            .pivot(index='category', columns='Class', values='proportion')
        )
        results = temp.sum(axis=0)
        #print(results.idxmax())
        return results.idxmax()
    
    return testing.apply(func, axis=1)


In [216]:
import sys
#m = input("Mode?")
#m = int(m)
m = 0
if m == 1:
    df = transform(df)
elif m == 0:
    df.fillna('?')
else:
    sys.exit("Not a valid mode")
    
data, testing = simpleSample(df)
_ , validate = simpleSample(data)

divisor = len(validate.index)

validate['result'] = naiveBayesClassifier(data, validate)
success = validate[validate['result'] == validate['Class']].shape[0]

print("1.Train Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))

print("\n2.10-Fold Cross-Validation Results:")
folds = fold10Sample(data)
successes = [0] * 10
for i in range(0,10):
    divisor = len(folds[i].index)
    folds[i]['result'] = naiveBayesClassifier(data.drop(folds[i].index), folds[i])
    successes[i] = folds[i][folds[i]['result'] == folds[i]['Class']].shape[0]/divisor
    print("Accuracy Fold", i+1, ":", "{:.2%}".format(successes[i]))

print("\nAverage Accuracy:",  "{:.2%}".format(np.average(successes)))
print("Standard Deviation:",  "{:.2%}".format(np.std(successes)))

testing['result'] = naiveBayesClassifier(data, testing)
success = testing[testing['result'] == testing['Class']].shape[0]
divisor = len(testing.index)
print("\n3.Test Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))



1.Train Set Accuracy:
Accuracy: 91.43%

2.10-Fold Cross-Validation Results:
Accuracy Fold 1 : 91.18%
Accuracy Fold 2 : 82.35%
Accuracy Fold 3 : 88.89%
Accuracy Fold 4 : 91.18%
Accuracy Fold 5 : 80.56%
Accuracy Fold 6 : 94.12%
Accuracy Fold 7 : 94.44%
Accuracy Fold 8 : 94.12%
Accuracy Fold 9 : 86.11%
Accuracy Fold 10 : 88.24%

Average Accuracy: 89.12%
Standard Deviation: 4.65%

3.Test Set Accuracy:
Accuracy: 93.10%
