In [198]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

congressional_voting_records = fetch_ucirepo(id=105)

X = congressional_voting_records.data.features
y = congressional_voting_records.data.targets
df = congressional_voting_records.data.original

default_table = {'republican':[0,0,0],'democrat':[0,0,0]}
default_df = pd.DataFrame(data=default_table, index=['y', 'n', 'nan'])
variables = congressional_voting_records.variables['name']


In [199]:
def simpleSample(data):
    testing = data.groupby('Class', group_keys=False).sample(frac=0.2)
    learning = data.drop(testing.index)
    return learning, testing


In [200]:
def fold10Sample(data):
    folds = []
    for i in range(0,10):
        folds.append(data.groupby('Class', group_keys=False).sample(frac=1/(10-i)))
        data = data.drop(folds[i].index)
    return folds


### Method of transformation

Frist half of the observations with missing data: random assigned data

Second half of the observations with missing data: kNN with only complete data

In [201]:
def computeDistance(a, b):
    return sum([(ai == bi) for ai, bi in zip(a, b)])

def getMajorityLabel(neighboursList):
    majority = {"y":0, "n":0}
    furthest = neighboursList[-1][0] + 1
    for dist, label in neighboursList:
        majority[label[0]] += furthest - dist
    return max(majority, key=majority.get)

def kNNClassify(X, Y, xt, k):
    distancePairList = []
    for xi, yi in zip(X.values, Y.values):
        di = computeDistance(xi, xt)
        distancePairList.append((di, yi))
    distancePairList.sort()

    kNeighboursList = distancePairList[:k]
    yt = getMajorityLabel(kNeighboursList)

    return yt


In [202]:
import numpy as np

def transform(data):
    completeData = data.dropna()
    missingData = data.drop(completeData.index)
    randomData = missingData.sample(frac=0.5)
    knnData = missingData.drop(randomData.index)
    np.random.seed()

    for col in randomData.columns:
        m = randomData[col].isna()
        if m.any():
            randomData.loc[m, col] = np.random.choice(["y", "n"], size=m.sum())

    completeData = pd.concat([completeData, randomData])

    def mappingkNNClassify(xt):
        k = 10
        m = xt.isna()
        for ind in xt[m].index:
            xt[ind] = kNNClassify(completeData.drop(xt[m].index, axis=1), completeData[ind], xt.drop(xt[m].index), k )

        return xt

    knnData = knnData.apply(mappingkNNClassify,axis=1)
    return pd.concat([completeData, knnData])


In [203]:
def laplaceSmoothing(data, totals):
    l = 1
    additive = [l] * data.shape[0]
    data = data*totals
    for col in data.columns: 
        if 0 in data[col].values:
            totals[col] += l*data.shape[0]
            data[col] = data[col].add(additive)
    
    return data/totals


In [None]:
def creatBayesTables(data):
    bayestables = {}
    for category in data.columns:
        if category == 'Class':
            bayestables[category] = data['Class'].value_counts(dropna=False, ascending=True).to_frame()
        else:
            temp = data[['Class',category]].value_counts(dropna=False)
            bayestables[category] = pd.DataFrame({'republican':temp['republican'], 'democrat':temp['democrat']}) / bayestables['Class']['count']
            # print(any([any(np.isnan(sublist)) for sublist in bayestables[category].values]))
            bayestables[category].fillna(0, inplace=True)
            if any([0 in sublist for sublist in bayestables[category].values]):
                bayestables[category] = laplaceSmoothing(bayestables[category], bayestables['Class']['count'])
            bayestables[category] = bayestables[category].apply(np.log)
            
    return bayestables


### func explanation

1. Transforms the testing row to a DataFrame
2. The DataFrame into a Series of the two possible variations of Class for the row
3. The Series maps to the values for the corresponding class, category and answer
4. The mapped series back to a DataFrame with columns class and indecies category
5. Sums the columns and gets the argmax column

In [205]:
def naiveBayesClassifier(learning, testing):
    categorical = creatBayesTables(learning)

    def func(test):
        temp = pd.DataFrame({"republican":test.drop('Class'), "democrat":test.drop('Class')})
        long = (
            temp
            .rename_axis('category')
            .reset_index()
            .melt(id_vars='category', var_name='Class', value_name='type')
        )

        long['proportion'] = long.apply(
            lambda r: categorical[r['category']]
                .loc[r['type'], r['Class']],
            axis=1
        )

        temp = (
            long
            .pivot(index='category', columns='Class', values='proportion')
        )
        results = temp.sum(axis=0)
        #print(temp)
        return results.idxmax()
    
    return testing.apply(func, axis=1)


In [None]:
import sys
m = input("Mode?")
m = int(m)
# m = 1
if m == 1:
    d = transform(df)
elif m == 0:
    d = df
else:
    sys.exit("Not a valid mode")
    
data, testing = simpleSample(d)
_ , validate = simpleSample(data)

divisor = len(validate.index)

validate['result'] = naiveBayesClassifier(data, validate)
success = validate[validate['result'] == validate['Class']].shape[0]

print("1.Train Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))

print("\n2.10-Fold Cross-Validation Results:")
folds = fold10Sample(data)
successes = [0] * 10
for i in range(0,10):
    divisor = len(folds[i].index)
    folds[i]['result'] = naiveBayesClassifier(data.drop(folds[i].index), folds[i])
    successes[i] = folds[i][folds[i]['result'] == folds[i]['Class']].shape[0]/divisor
    print("Accuracy Fold", i+1, ":", "{:.2%}".format(successes[i]))

print("\nAverage Accuracy:",  "{:.2%}".format(np.average(successes)))
print("Standard Deviation:",  "{:.2%}".format(np.std(successes)))

testing['result'] = naiveBayesClassifier(data, testing)
success = testing[testing['result'] == testing['Class']].shape[0]
divisor = len(testing.index)
print("\n3.Test Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))

# res = []
# d = transform(df)
# for i in range(0,50):
#     data, testing = simpleSample(d)
#     testing['result'] = naiveBayesClassifier(data, testing)
#     success = testing[testing['result'] == testing['Class']].shape[0]
#     divisor = len(testing.index)
#     res.append(float("{:.2}".format(success/divisor)))

# np.mean(res)



1.Train Set Accuracy:
Accuracy: 92.86%

2.10-Fold Cross-Validation Results:
Accuracy Fold 1 : 91.18%
Accuracy Fold 2 : 79.41%
Accuracy Fold 3 : 94.44%
Accuracy Fold 4 : 85.29%
Accuracy Fold 5 : 88.89%
Accuracy Fold 6 : 97.06%
Accuracy Fold 7 : 86.11%
Accuracy Fold 8 : 94.12%
Accuracy Fold 9 : 94.44%
Accuracy Fold 10 : 88.24%

Average Accuracy: 89.92%
Standard Deviation: 5.11%

3.Test Set Accuracy:
Accuracy: 91.95%


np.float64(0.9053999999999999)