In [302]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

congressional_voting_records = fetch_ucirepo(id=105)

X = congressional_voting_records.data.features
y = congressional_voting_records.data.targets
df = congressional_voting_records.data.original

default_table = {'republican':[0,0,0],'democrat':[0,0,0]}
default_df = pd.DataFrame(data=default_table, index=['y', 'n', 'nan'])
variables = congressional_voting_records.variables['name']


In [303]:
def simpleSample(data):
    testing = data.groupby('Class', group_keys=False).sample(frac=0.2)
    learning = data.drop(testing.index)
    return learning, testing


In [304]:
def fold10Sample(data):
    folds = []
    for i in range(0,10):
        folds.append(data.groupby('Class', group_keys=False).sample(frac=1/(10-i)))
        data = data.drop(folds[i].index)
    return folds


### Method of transformation

kNN

In [305]:
def transform(data):
    
    return data


In [None]:
def laplaceSmoothing(data, totals):
    l = 1
    additive = pd.Series([l] * data.shape[0])
    data = data*totals
    for party in data.columns: 
        if 0 in data[party].values:
            totals[party] += l*data.shpae[0]
            data[party] = data[party] + additive
    
    return data/totals


data = pd.DataFrame({"a":[0,1,2],"b":[3,0,4]})
totals = {"a":3,"b":7}
data = data/totals
print(data,"\n",laplaceSmoothing(data,totals))


          a         b
0  0.000000  0.428571
1  0.333333  0.000000
2  0.666667  0.571429 
           a         b
0  0.166667  0.285714
1  0.333333  0.071429
2  0.500000  0.357143


In [None]:
import numpy as np

def creatBayesTables(data):
    bayestables = {}
    for category in data.columns:
        if category == 'Class':
            bayestables[category] = data['Class'].value_counts(dropna=False, ascending=True).to_frame()
        else:
            temp = data[['Class',category]].value_counts(dropna=False)
            bayestables[category] = pd.DataFrame({'republican':temp['republican'], 'democrat':temp['democrat']}) / bayestables['Class']['count']
            if any(0 in sublist for sublist in bayestables[category].values):
                print(temp)
                bayestables[category] = laplaceSmoothing(bayestables[category], bayestables['Class']['count'])
            bayestables[category] = bayestables[category].apply(np.log)
            
    return bayestables


### func explanation

1. Transforms the testing row to a DataFrame
2. The DataFrame into a Series of the two possible variations of Class for the row
3. The Series maps to the values for the corresponding class, category and answer
4. The mapped series back to a DataFrame with columns class and indecies category
5. Sums the columns and gets the argmax column

In [308]:
def naiveBayesClassifier(learning, testing):
    categorical = creatBayesTables(learning)

    def func(test):
        temp = pd.DataFrame({"republican":test.drop('Class'), "democrat":test.drop('Class')})
        long = (
            temp
            .rename_axis('category')
            .reset_index()
            .melt(id_vars='category', var_name='Class', value_name='type')
        )

        long['proportion'] = long.apply(
            lambda r: categorical[r['category']]
                .loc[r['type'], r['Class']],
            axis=1
        )

        temp = (
            long
            .pivot(index='category', columns='Class', values='proportion')
        )
        results = temp.sum(axis=0)
        #print(results.idxmax())
        return results.idxmax()
    
    return testing.apply(func, axis=1)


In [309]:
import sys
#m = input("Mode?")
#m = int(m)
m = 0
if m == 1:
    df = transform(df)
elif m != 0:
    sys.exit("Not a valid mode")
    
data, testing = simpleSample(df)
_ , validate = simpleSample(data)

divisor = len(validate.index)

validate['result'] = naiveBayesClassifier(data, validate)
success = validate[validate['result'] == validate['Class']].shape[0]

print("1.Train Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))

print("\n2.10-Fold Cross-Validation Results:")
folds = fold10Sample(data)
successes = [0] * 10
for i in range(0,10):
    divisor = len(folds[i].index)
    folds[i]['result'] = naiveBayesClassifier(data.drop(folds[i].index), folds[i])
    successes[i] = folds[i][folds[i]['result'] == folds[i]['Class']].shape[0]/divisor
    print("Accuracy Fold", i+1, ":", "{:.2%}".format(successes[i]))

print("\nAverage Accuracy:",  "{:.2%}".format(np.average(successes)))
print("Standard Deviation:",  "{:.2%}".format(np.std(successes)))

testing['result'] = naiveBayesClassifier(data, testing)
success = testing[testing['result'] == testing['Class']].shape[0]
divisor = len(testing.index)
print("\n3.Test Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))



1.Train Set Accuracy:
Accuracy: 88.57%

2.10-Fold Cross-Validation Results:
Accuracy Fold 1 : 88.24%
Accuracy Fold 2 : 73.53%
Accuracy Fold 3 : 94.44%
Accuracy Fold 4 : 97.06%
Accuracy Fold 5 : 100.00%
Accuracy Fold 6 : 91.18%
Accuracy Fold 7 : 75.00%
Accuracy Fold 8 : 91.18%
Accuracy Fold 9 : 88.89%
Accuracy Fold 10 : 94.12%

Average Accuracy: 89.36%
Standard Deviation: 8.28%

3.Test Set Accuracy:
Accuracy: 93.10%
