In [155]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

congressional_voting_records = fetch_ucirepo(id=105)

X = congressional_voting_records.data.features
y = congressional_voting_records.data.targets
df = congressional_voting_records.data.original

default_table = {'republican':[0,0,0],'democrat':[0,0,0]}
default_df = pd.DataFrame(data=default_table, index=['y', 'n', 'nan'])
variables = congressional_voting_records.variables['name']


In [156]:
def simpleSample(data):
    testing = data.groupby('Class', group_keys=False).sample(frac=0.2)
    learning = data.drop(testing.index)
    return learning, testing


In [157]:
def fold10Sample(data):
    folds = []
    for i in range(0,10):
        folds.append(data.groupby('Class', group_keys=False).sample(frac=1/(10-i)))
        data = data.drop(folds[i].index)
    return folds


In [158]:
def transform(data):

    return data


In [159]:
import numpy as np

data, testing = simpleSample(df)
classes = default_df
for category in data.columns:
    if category == 'Class':
        classes = data['Class'].value_counts(dropna=False, ascending=True).to_frame()
        print(classes)
    else:
        temp = data[['Class',category]].value_counts(dropna=False)
        temp = pd.DataFrame({'republican':temp['republican'], 'democrat':temp['democrat']})
        temp = temp / classes['count']
        print(temp)
        #print(pd.pivot_table(data,values=data.columns.drop(category), index='Class', columns=category, aggfunc='count', dropna=False))


            count
Class            
republican    134
democrat      214
                     republican  democrat
handicapped-infants                      
n                      0.820896  0.359813
y                      0.171642  0.602804
NaN                    0.007463  0.037383
                            republican  democrat
water-project-cost-sharing                      
n                             0.440299  0.448598
y                             0.462687  0.429907
NaN                           0.097015  0.121495
                                   republican  democrat
adoption-of-the-budget-resolution                      
n                                    0.873134  0.112150
y                                    0.104478  0.869159
NaN                                  0.022388  0.018692
                      republican  democrat
physician-fee-freeze                      
n                       0.007463  0.925234
y                       0.977612  0.051402
NaN                  

In [160]:
import numpy as np

def creatBayesTables(data):
    bayestables = {}
    for category in data.columns:
        if category == 'Class':
            bayestables[category] = data['Class'].value_counts(dropna=False, ascending=True).to_frame()
        else:
            temp = data[['Class',category]].value_counts(dropna=False)
            bayestables[category] = pd.DataFrame({'republican':temp['republican'], 'democrat':temp['democrat']}) / bayestables['Class']['count']
    
    return bayestables


In [None]:
def naiveBayesClassifier(learning, testing):
    categorical = creatBayesTables(learning)

    def func(test):
        temp = pd.DataFrame({"republican":test.drop('Class'), "democrat":test.drop('Class')})
        long = (
            temp
            .rename_axis('category')
            .reset_index()
            .melt(id_vars='category', var_name='Class', value_name='type')
        )

        long['proportion'] = long.apply(
            lambda r: categorical[r['category']]
                .loc[r['type'], r['Class']],
            axis=1
        )

        temp = (
            long
            .pivot(index='category', columns='Class', values='proportion')
        )
        results = temp.prod(axis=0)
        #print(results.idxmax())
        return results.idxmax()


"\n    leaning = {'republican': 1, 'democrat': 1}\n    for c in categorical:\n        for party in leaning:\n            el = testing[categorytuple[c]]\n            leaning[party] *= categorical[c][party][el]/categorical['Class']['count'][party][el]\n    leaning['democrat'] *= categorical['Class']['count']['democrat']/(categorical['Class']['count']['democrat'] + categorical['Class']['count']['republican'])\n    leaning['republican'] *= categorical['Class']['count']['republican']/(categorical['Class']['count']['democrat'] + categorical['Class']['count']['republican'])\n    return max(leaning, key=leaning.get)\n\ndata, testing = simpleSample(df)\ntesting['result']=naiveBayesClassifier(data, testing)\nprint(testing)\n"

In [165]:
import sys
#m = input("Mode?")
#m = int(m)
m = 0
if m == 1:
    df = transform(df)
elif m == 0:
    df.fillna('?')
else:
    sys.exit("Not a valid mode")
    
data, testing = simpleSample(df)
_ , validate = simpleSample(data)

divisor = len(validate.index)

validate['result'] = naiveBayesClassifier(data, validate)
success = validate[validate['result'] == validate['Class']].shape[0]

print("1.Train Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))

print("\n2.10-Fold Cross-Validation Results:")
folds = fold10Sample(data)
successes = [0] * 10
for i in range(0,10):
    divisor = len(folds[i].index)
    folds[i]['result'] = naiveBayesClassifier(data, folds[i])
    successes[i] = folds[i][folds[i]['result'] == folds[i]['Class']].shape[0]/divisor
    print("Accuracy Fold", i+1, ":", "{:.2%}".format(successes[i]))

print("\nAverage Accuracy:",  "{:.2%}".format(np.average(successes)))
print("Standard Deviation:",  "{:.2%}".format(np.std(successes)))

testing['result'] = naiveBayesClassifier(data, testing)
success = testing[testing['result'] == testing['Class']].shape[0]
divisor = len(testing.index)
print("\n3.Test Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))



1.Train Set Accuracy:
Accuracy: 94.29%

2.10-Fold Cross-Validation Results:
Accuracy Fold 1 : 91.18%
Accuracy Fold 2 : 88.24%
Accuracy Fold 3 : 91.67%
Accuracy Fold 4 : 94.12%
Accuracy Fold 5 : 94.44%
Accuracy Fold 6 : 94.12%
Accuracy Fold 7 : 86.11%
Accuracy Fold 8 : 88.24%
Accuracy Fold 9 : 88.89%
Accuracy Fold 10 : 97.06%

Average Accuracy: 91.41%
Standard Deviation: 3.32%

3.Test Set Accuracy:
Accuracy: 88.51%
