In [33]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np

breast_cancer = fetch_ucirepo(id=14)

X = breast_cancer.data.features
y = breast_cancer.data.targets
df = breast_cancer.data.original
variables = breast_cancer.variables['name']
df.fillna("?", inplace=True)
# df['node-caps'].fillna(df['node-caps'].mode(), inplace=True)
# df['breast-quad'].fillna(df['breast-quad'].mode(), inplace=True)
def simpleSample(data):
    testing = data.groupby('Class', group_keys=False).sample(frac=0.2)
    learning = data.drop(testing.index)
    return learning, testing

def fold10Sample(data):
    folds = []
    for i in range(0,10):
        folds.append(data.groupby('Class', group_keys=False).sample(4))
        data = data.drop(folds[i].index)
    return folds


In [34]:
class Node:
    def __init__(self, feature=None, results=None, branches=None, n_samples = 0, errors = 0):
        self.feature = feature 
        self.results = results 
        self.branches = branches 
        self.n_samples = n_samples
        self.errors = errors


In [35]:
def laplaceSmoothing(data, t):
    l = 1
    totals = t.copy()
    additive = [l] * data.shape[0]
    data[['no-recurrence-events', 'recurrence-events']] = data[['no-recurrence-events', 'recurrence-events']]*totals
    for col in data.columns: 
        if 0 in data[col].values:
            totals[col] += l*data.shape[0]
            data[col] = data[col].add(additive)
            data['total'] = data['total'].add(additive)
    data[['no-recurrence-events', 'recurrence-events']] /= totals
    return data

def creatFrequencyTables(data, smoothing):
    frequencyTable = {}
    frequencyTable['Class'] = data['Class'].value_counts(dropna=False, ascending=True).to_frame()
    if frequencyTable['Class'].shape[0] == 1:
        return "no entropy"
    for category in data.columns:
        if category == 'Class':
            continue
        else:
            temp = data[['Class',category]].value_counts(dropna=False)
            frequencyTable[category] = pd.DataFrame({'no-recurrence-events':temp['no-recurrence-events'], 'recurrence-events':temp['recurrence-events']})
            frequencyTable[category].fillna(0, inplace=True)
            frequencyTable[category]['total'] = frequencyTable[category]['no-recurrence-events'] + frequencyTable[category]['recurrence-events']
            #frequencyTable[category][['no-recurrence-events', 'recurrence-events']] /= frequencyTable['Class']['count']
            if smoothing and any([0 in sublist for sublist in frequencyTable[category].values]):
                frequencyTable[category] = laplaceSmoothing(frequencyTable[category], frequencyTable['Class']['count'])
    return frequencyTable

def entropy(a,b):
    sum = a + b
    a = a/sum
    b = b/sum
    return -(a*np.log2(a) if a != 0 else 0)- (b*np.log2(b) if b != 0 else 0)

def entropyTableGenerator(df):
    frequency = creatFrequencyTables(df, False)

    if frequency == "no entropy":
        return frequency

    entropyTable = {}
    for category in frequency:
        if category == 'Class':
            entropyTable[category] = entropy(frequency[category]['count']['no-recurrence-events'],frequency[category]['count']['recurrence-events'])
        else:
            entropyTable[category] = frequency[category].apply(lambda x: entropy(x['no-recurrence-events'], x['recurrence-events']), axis=1)
            entropyTable[category] *= frequency[category]['total'].div(frequency[category]['total'].sum())
            entropyTable[category] = entropyTable[category].sum()

    return entropyTable


def informationGainTableGenerator(entropyTable):
    if entropyTable == 'no entropy':
        return entropyTable, entropyTable

    informationGain = {}
    for category in entropyTable:
        if category == 'Class':
            continue

        informationGain[category] = entropyTable['Class'] - entropyTable[category]
    #return informationGain
    maxkey = max(informationGain, key=informationGain.get)
    return maxkey, informationGain[maxkey]



In [36]:
def getErrorEstimate(node, is_subtree=False):
    if node.results is not None or not is_subtree:
        # Single node error: (E + 0.5) / n
        return (node.errors + 0.45) / node.n_samples  

    leaf_errors, leaf_count, total_n = sumLeafStats(node)
    return (leaf_errors + (0.5 * leaf_count)) / total_n  

def sumLeafStats(node):
    if node.results is not None:
        return node.errors, 1, node.n_samples
    
    total_e, total_l, total_n = 0, 0, 0
    for child in node.branches.values():
        e, l, n = sumLeafStats(child)
        total_e += e
        total_l += l
        total_n += n
    return total_e, total_l, total_n

def postpruning(node):
    if node.results is not None:
        return
    
    for child in node.branches.values():
        postpruning(child)

    canPrune = all(child.results is not None for child in node.branches.values())

    if canPrune:
        error_subtree = getErrorEstimate(node, is_subtree=True)
        error_leaf = getErrorEstimate(node, is_subtree=False)

        if error_leaf <= error_subtree:
            node.results = node.branches['default'].results
            node.branches = {}

    


In [37]:
def split_data(data, feature):
    result = {}
    for discription in data[feature].drop_duplicates():
        result[discription] = data[data[feature] == discription]
    return result

def buildtree(data, minimumGain):
    if data.shape[0] == 1:
        return Node(results=data['Class'].values[0], n_samples=1)
    
    bestFeature, bestGain = informationGainTableGenerator(entropyTableGenerator(data))
    if bestFeature == 'no entropy':
        return Node(results=data['Class'].values[0], n_samples=data.shape[0])
    
    # print(bestFeature, bestGain)
    if bestGain > minimumGain:
        branchesData = split_data(data, bestFeature)
        branches = {'default':Node(results=data["Class"].mode().values[0], n_samples=data.shape[0])}
        branches['default'].errors = data[data['Class'] != branches['default'].results].shape[0]
        for discription in branchesData:
            # print(bestFeature ," discription = ", discription)
            branches[discription] = buildtree(branchesData[discription], minimumGain)
        return Node(feature=bestFeature, branches=branches, n_samples=data.shape[0], errors=data[data['Class'] != branches['default'].results].shape[0])
    
    return Node(results=data['Class'].values[0],n_samples=1)


def id3(training, testing, minimumGain, postprune):
    decisionTree = buildtree(training, minimumGain)

    if postprune:
        postpruning(decisionTree)

    def predict(tree, sample):
        if tree.results is not None:
            return tree.results
        else:
            if sample[tree.feature] not in tree.branches:
                return tree.branches['default'].results 
            branch = tree.branches[sample[tree.feature]]
            return predict(branch, sample)
    
    return testing.apply(lambda x: predict(decisionTree, x), axis = 1)


In [None]:
import sys
# m = input("Mode?")
# m = int(m)
m = 2
if m == 0:
    postprune = False
    minimumGain = 0.07
elif m == 1:
    postprune = True
    minimumGain = 0
elif m == 2:
    postprune = True
    minimumGain = 0.07
else:
    sys.exit("Not a valid mode")
    
data, testing = simpleSample(df)
_ , validate = simpleSample(data)

divisor = validate.shape[0]

validate['result'] = id3(data, validate, minimumGain, postprune)
success = validate[validate['result'] == validate['Class']].shape[0]

print("1.Train Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))

print("\n2.10-Fold Cross-Validation Results:")
folds = fold10Sample(data)
successes = [0] * 10
for i in range(0,10):
    divisor = folds[i].shape[0]
    folds[i]['result'] = id3(data.drop(folds[i].index), folds[i], minimumGain, postprune)
    successes[i] = folds[i][folds[i]['result'] == folds[i]['Class']].shape[0]/divisor
    print("Accuracy Fold", i+1, ":", "{:.2%}".format(successes[i]))

print("\nAverage Accuracy:",  "{:.2%}".format(np.average(successes)))
print("Standard Deviation:",  "{:.2%}".format(np.std(successes)))

testing['result'] = id3(data, testing, minimumGain, postprune)
success = testing[testing['result'] == testing['Class']].shape[0]
divisor = testing.shape[0]
print("\n3.Test Set Accuracy:\nAccuracy:", "{:.2%}".format(success/divisor))



1.Train Set Accuracy:
Accuracy: 97.83%

2.10-Fold Cross-Validation Results:
Accuracy Fold 1 : 25.00%
Accuracy Fold 2 : 37.50%
Accuracy Fold 3 : 50.00%
Accuracy Fold 4 : 62.50%
Accuracy Fold 5 : 37.50%
Accuracy Fold 6 : 37.50%
Accuracy Fold 7 : 25.00%
Accuracy Fold 8 : 62.50%
Accuracy Fold 9 : 37.50%
Accuracy Fold 10 : 50.00%

Average Accuracy: 42.50%
Standard Deviation: 12.75%

3.Test Set Accuracy:
Accuracy: 68.42%


In [39]:
# for u in df['age'].drop_duplicates():
#     print(u, df[df['age'] == u])
# branchData = split_data(df, 'age')
# for discription in branchData:
#     print(branchData[discription])

#print(data.dtypes)
# selection = data.loc[(data['tumor-size'] == '30-34') & (data['inv-nodes'] == '0-2') & (data['breast-quad'] == 'left_low') & (data['age'] == '50-59')]
# first = selection['Class'].head(1)
# print(first[0])

print(data["Class"].mode().values[0])


no-recurrence-events
