In [151]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np

breast_cancer = fetch_ucirepo(id=14)

X = breast_cancer.data.features
y = breast_cancer.data.targets
df = breast_cancer.data.original
variables = breast_cancer.variables['name']

def simpleSample(data):
    testing = data.groupby('Class', group_keys=False).sample(frac=0.2)
    learning = data.drop(testing.index)
    return learning, testing

def fold10Sample(data):
    folds = []
    for i in range(0,10):
        folds.append(data.groupby('Class', group_keys=False).sample(4))
        data = data.drop(folds[i].index)
    return folds


In [152]:
def preprune(df):
    return df


In [153]:
def postprune(df):
    return df


In [154]:
class Node:
    def __init__(self, feature=None, results=None, branches=None):
        self.feature = feature  # Feature to split on
        self.results = results  # Stores class labels if node is a leaf node
        self.branches = branches  # Branch for values that are True for the feature


In [155]:
def laplaceSmoothing(data, t):
    l = 1
    totals = t.copy()
    additive = [l] * data.shape[0]
    data[['no-recurrence-events', 'recurrence-events']] = data[['no-recurrence-events', 'recurrence-events']]*totals
    for col in data.columns: 
        if 0 in data[col].values:
            totals[col] += l*data.shape[0]
            data[col] = data[col].add(additive)
            data['total'] = data['total'].add(additive)
    data[['no-recurrence-events', 'recurrence-events']] /= totals
    return data

def creatFrequencyTables(data, smoothing):
    frequencyTable = {}
    frequencyTable['Class'] = data['Class'].value_counts(dropna=False, ascending=True).to_frame()
    for category in data.columns:
        if category == 'Class':
            continue
        else:
            temp = data[['Class',category]].value_counts(dropna=False)
            frequencyTable[category] = pd.DataFrame({'no-recurrence-events':temp['no-recurrence-events'], 'recurrence-events':temp['recurrence-events']})
            frequencyTable[category].fillna(0, inplace=True)
            frequencyTable[category]['total'] = frequencyTable[category]['no-recurrence-events'] + frequencyTable[category]['recurrence-events']
            #frequencyTable[category][['no-recurrence-events', 'recurrence-events']] /= frequencyTable['Class']['count']
            if smoothing and any([0 in sublist for sublist in frequencyTable[category].values]):
                frequencyTable[category] = laplaceSmoothing(frequencyTable[category], frequencyTable['Class']['count'])
    return frequencyTable

def entropy(a,b):
    sum = a + b
    a = a/sum
    b = b/sum
    return -(a*np.log2(a) if a != 0 else 0)- (b*np.log2(b) if b != 0 else 0)

def entropyTableGenerator(df):
    frequency = creatFrequencyTables(df, False)
    entropyTable = {}
    for category in frequency:
        if category == 'Class':
            entropyTable[category] = entropy(frequency[category]['count']['no-recurrence-events'],frequency[category]['count']['recurrence-events'])
        else:
            entropyTable[category] = frequency[category].apply(lambda x: entropy(x['no-recurrence-events'], x['recurrence-events']), axis=1)
            entropyTable[category] *= frequency[category]['total'].div(frequency[category]['total'].sum())
            entropyTable[category] = entropyTable[category].sum()

    return entropyTable


def informationGainTableGenerator(entropyTable):
    informationGain = {}
    for category in entropyTable:
        if category == 'Class':
            continue

        informationGain[category] = entropyTable['Class'] - entropyTable[category]
    #return informationGain
    maxkey = max(informationGain, key=informationGain.get)
    return maxkey, informationGain[maxkey]

informationGainTableGenerator(entropyTableGenerator(df))


('deg-malig', np.float64(0.07700985251661441))

In [156]:
def split_data(data, feature):
    result = {}
    for discription in data[feature].drop_duplicates():
        result[discription] = data[feature == discription]
    return result

def buildtree(data):
    if data.shape[0] == 1:
        return Node(results=data['Class'][0])
    
    bestFeature, bestGain = informationGainTableGenerator(entropyTableGenerator(data))

    if bestGain > 0:
        branchesData = split_data(data, bestFeature)
        branches = {}
        for discription in branchesData:
            branches[discription] = buildtree(branchesData[discription])
        return Node(feature=bestFeature, branches=branches)
    
    return Node(results=data['Class'][0])


def id3(training, testing):
    decisionTree = buildtree(training)

    def predict(tree, sample):
        if tree.results is not None:
            return tree.results
        else:
            branch = tree.branches[sample[tree.feature]]
            return predict(branch, sample)
    
    return testing.apply(lambda x: predict(decisionTree, x), axis = 1)


In [157]:
import sys
# m = input("Mode?")
# m = int(m)
m = 0
if m == 0:
    d = preprune(df)
elif m == 1:
    d = postprune(df)
elif m == 2:
    d = postprune(preprune(df))
else:
    sys.exit("Not a valid mode")
    
data, testing = simpleSample(d)
_ , validate = simpleSample(data)

divisor = validate.shape[0]
print(validate['age'].drop_duplicates())
validate['result'] = id3(data, validate)
success = validate[validate['result'] == validate['Class']].shape[0]

print("1.Train Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))

print("\n2.10-Fold Cross-Validation Results:")
folds = fold10Sample(data)
successes = [0] * 10
for i in range(0,10):
    divisor = folds[i].shape[0]
    folds[i]['result'] = id3(data.drop(folds[i].index), folds[i])
    successes[i] = folds[i][folds[i]['result'] == folds[i]['Class']].shape[0]/divisor
    print("Accuracy Fold", i+1, ":", "{:.2%}".format(successes[i]))

print("\nAverage Accuracy:",  "{:.2%}".format(np.average(successes)))
print("Standard Deviation:",  "{:.2%}".format(np.std(successes)))

testing['result'] = id3(data, testing)
success = testing[testing['result'] == testing['Class']].shape[0]
divisor = testing.shape[0]
print("\n3.Test Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))



158    50-59
120    60-69
55     70-79
137    40-49
118    30-39
Name: age, dtype: object


KeyError: False