In [1]:
#Extract Data
import numpy as np
import math
import random
import copy
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn import utils



datasetFile = open('./spambase.data', 'r')

spam = list()
notSpam = list()


for line in datasetFile:
    splitedLine = line.split(',')
    for a in range(0, len(splitedLine)):
        splitedLine[a] = float(splitedLine[a])
    if(splitedLine[-1] == 1):
        spam.append(splitedLine)
    else:
        notSpam.append(splitedLine)

#Randomize
random.shuffle(spam)
random.shuffle(notSpam)

training = copy.deepcopy(spam[0:1359])

notSpamTraining = copy.deepcopy(notSpam[0:2091])
training.extend(notSpamTraining)

testing = copy.deepcopy(spam[1359:-1])

notSpamTesting = copy.deepcopy(notSpam[2091:-1])
testing.extend(notSpamTesting)

random.shuffle(training)
random.shuffle(testing)

   
trainingX = np.array([row[:-1] for row in training])
trainingY = np.array([row[-1] for row in training])

testingX = np.array([row[:-1] for row in testing])
testingY = np.array([row[-1] for row in testing])

discreteTrainingX = list()
for a in trainingX:
    b = (a > 0).astype(int)[:-3]
    b = np.append(b, a[-3:])
    discreteTrainingX.append(b)

discreteTestX = list()
for a in testingX:
    b = (a > 0).astype(int)[:-3]
    b = np.append(b, a[-3:])
    discreteTestX.append(b)    
    
print("Data Preped")

Data Preped


In [38]:
#Sklearn's Random Forest
for numOfTrees in [10, 50 ,100]:
    clf = RandomForestClassifier(n_estimators=numOfTrees)
    clf = clf.fit(discreteTrainingX, trainingY)

    print("Random Forest with %d" % numOfTrees)
    #Metrics
    y_pred = clf.predict(discreteTrainingX)
    confMatrix = metrics.confusion_matrix(trainingY, y_pred)
    accuracy = (confMatrix[1][1]+confMatrix[0][0])/len(y_pred)
    precision = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[0][1])
    recall = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[1][0])
    print("For training set")
    print("Accuracy %.4f" % accuracy)
    print("Error %.4f" % (1-accuracy))
    print("Precision: %.4f" % precision)
    print("Recall: %.4f" % recall)
    print()


    y_pred = clf.predict(discreteTestX)
    confMatrix = metrics.confusion_matrix(testingY, y_pred)
    accuracy = (confMatrix[1][1]+confMatrix[0][0])/len(y_pred)
    precision = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[0][1])
    recall = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[1][0])
    print("For testing set")
    print("Accuracy %.4f" % accuracy)
    print("Error %.4f" % (1-accuracy))
    print("Precision: %.4f" % precision)
    print("Recall: %.4f" % recall)
    print()

Random Forest with 10
For training set
Accuracy 0.9974
Error 0.0026
Precision: 0.9971
Recall: 0.9963

For testing set
Accuracy 0.9487
Error 0.0513
Precision: 0.9417
Recall: 0.9272

Random Forest with 50
For training set
Accuracy 0.9994
Error 0.0006
Precision: 1.0000
Recall: 0.9985

For testing set
Accuracy 0.9582
Error 0.0418
Precision: 0.9451
Recall: 0.9492

Random Forest with 100
For training set
Accuracy 0.9994
Error 0.0006
Precision: 1.0000
Recall: 0.9985

For testing set
Accuracy 0.9600
Error 0.0400
Precision: 0.9492
Recall: 0.9492



In [34]:
#Own Random Forest Classification Creation
def calculateFeatureToUse(mergedIO, featuresSelected):
    #Use Gini
    giniList = list()
    for feature in featuresSelected:     
        featureIs0AndOutputIs0 = 0
        featureIs0AndOutputIs1 = 0
        featureIs1AndOutputIs0 = 0
        featureIs1AndOutputIs1 = 0
        #If feature value is less than 54 then it is discrete 0 and 1
        if(feature < 54):
            for example in mergedIO:
                if example[feature] == 0 and example[-1] == 0:
                    featureIs0AndOutputIs0 += 1
                elif example[feature] == 0 and example[-1] == 1:
                    featureIs0AndOutputIs1 += 1
                elif example[feature] == 1 and example[-1] == 0:
                    featureIs1AndOutputIs0 += 1
                elif example[feature] == 1 and example[-1] == 1:
                    featureIs1AndOutputIs1 += 1
             
            feature0 = 0
            if(featureIs0AndOutputIs0 != 0 and featureIs0AndOutputIs1 != 0):
                feature0Output0 = (featureIs0AndOutputIs0/(featureIs0AndOutputIs0+featureIs0AndOutputIs1))
                feature0Output1 = (featureIs0AndOutputIs1/(featureIs0AndOutputIs0+featureIs0AndOutputIs1))
                feature0 = (1-(feature0Output0**2+feature0Output1**2)) * ((featureIs0AndOutputIs0+featureIs0AndOutputIs1)/len(mergedIO))
            
            feature1 = 0
            if(featureIs1AndOutputIs0 != 0 or featureIs1AndOutputIs1 != 0):
                feature1Output0 = (featureIs1AndOutputIs0/(featureIs1AndOutputIs0+featureIs1AndOutputIs1))
                feature1Output1 = (featureIs1AndOutputIs1/(featureIs1AndOutputIs0+featureIs1AndOutputIs1))
                feature1 = (1-(feature1Output0**2+feature1Output1**2)) * ((featureIs1AndOutputIs0+featureIs1AndOutputIs1)/len(mergedIO))
            
            giniIndex = feature0 + feature1
            
            giniList.append(giniIndex)
        else:
            print("This should never happen")
            

    location = np.argmin(giniList)
    return featuresSelected[location], giniList[location]

def generateTree(mergedIO, numFeaturesToUse, tree, featureSelection):
    #Structure(featureToUse, howToSplit, leftLeafIndex, feature = 0, RightLeafIndex, feature = 1)
    random.shuffle(featureSelection)
    featureToUse, giniIndex = calculateFeatureToUse(mergedIO, featureSelection[:numFeaturesToUse])
    featureSelection.remove(featureToUse)
    
    
    indexForEdit = len(tree)   
    #Perfect Split
    if(giniIndex == 0):       
        tree.append([featureToUse, 0, False, True])
        return indexForEdit
            
    tree.append([featureToUse, 0, 0, 0])
    
    #Split the set
    lessThanSet = list()
    moreThanSet = list()
    for example in mergedIO:
        if(example[featureToUse] == 0):
            lessThanSet.append(example)
        elif(example[featureToUse] == 1):
            moreThanSet.append(example)
        else:
            print("Split Set Sanity Check")
    lessThanSet = np.array(lessThanSet)
    moreThanSet = np.array(moreThanSet) 
    
    if(len(featureSelection) == 0):
        #No more features to test so end
        (values,counts) = np.unique(lessThanSet[:,-1],return_counts=True)
        location=np.argmax(counts)
        tree[indexForEdit][2] = bool(values[location])

        (values,counts) = np.unique(moreThanSet[:,-1],return_counts=True)
        location=np.argmax(counts)
        tree[indexForEdit][3] = bool(values[location])
        print("No more features")
        return indexForEdit
    
    
    #Check if totally one sided
    #If when feature is 0, then find most common output for when feature is 1 and opposite for when feature is 0
    if(len(lessThanSet) == 0):
        (values,counts) = np.unique(moreThanSet[:,-1],return_counts=True)
        location=np.argmax(counts)
        tree[indexForEdit][2] = not bool(values[location])
        tree[indexForEdit][3] = bool(values[location])
        return indexForEdit
    #If when feature is 1, then find most common output for when feature is 0 and opposite for when feature is 1
    elif(len(moreThanSet) == 0):
        (values,counts) = np.unique(lessThanSet[:,-1],return_counts=True)
        location=np.argmax(counts)
        tree[indexForEdit][2] = bool(values[location])
        tree[indexForEdit][3] = not bool(values[location])
        return indexForEdit
            
    #Do left node (False)
    leftNodeLocation = generateTree(lessThanSet, numFeaturesToUse, tree, copy.deepcopy(featureSelection))
    tree[indexForEdit][2] = leftNodeLocation
    
    rightNodeLocation = generateTree(moreThanSet, numFeaturesToUse, tree, copy.deepcopy(featureSelection))
    tree[indexForEdit][3] = rightNodeLocation                  
    
    return indexForEdit
        
        

def randomForest(x, y, numOfTrees=10, featuresToUse=1):

    #Calculate number of features
    numOfFeatures = len(x[0])

    if featuresToUse > numOfFeatures:
        print("Error in number of features")
        return
    
    trees = list()
    for a in range(0, numOfTrees):
        #Bootstrap sampling
        bootStrapX, bootStrapY = utils.resample(x, y, replace=False)
        
        #Merge input and output
        mergedFeatureOutput = np.append(x, np.array([y]).transpose(), axis=1)
        
        #Create tree and store
        treeCreated = list()
        featureSelection = [a for a in range(0, len(mergedFeatureOutput[0])-4)]
        
        #-4 to skip the last 3
        generateTree(mergedFeatureOutput, featuresToUse, treeCreated, featureSelection)
        trees.append(treeCreated)
    
    return trees


trees = randomForest(discreteTrainingX, trainingY, 10, 7)
print("Done")





Done


In [35]:
def traverse(example, tree):
    nextToCheck = 0
    
    while(True):
        data = tree[nextToCheck]
        featureToCheck = data[0]
        leftNodeLocation = data[2]
        rightNodeLocation = data[3]

        if example[featureToCheck] == 0:
            if(type(leftNodeLocation) is not bool):
                nextToCheck = leftNodeLocation
            else:
                return leftNodeLocation
        elif example[featureToCheck] == 1:
            if(type(rightNodeLocation) is not bool):
                nextToCheck = rightNodeLocation
            else:
                return rightNodeLocation
            
def predictRandomForest(testX, trees):
    predictY = list()
    for example in testX:
        vote = list()
        for tree in trees:
            vote.append(traverse(example, tree))
        
        #print(vote)
        (values,counts) = np.unique(vote,return_counts=True)
        location=np.argmax(counts)
        predictY.append(int(values[location]))
        
    return predictY
     
        

In [53]:
for numOfTrees in [10, 50 ,100]:
    trees = randomForest(discreteTrainingX, trainingY, numOfTrees, 8)
    
    print("Random Forest with %d" % numOfTrees)
    #Metrics
    y_pred = predictRandomForest(discreteTrainingX, trees) 
    confMatrix = metrics.confusion_matrix(trainingY, y_pred)
    accuracy = (confMatrix[1][1]+confMatrix[0][0])/len(y_pred)
    precision = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[0][1])
    recall = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[1][0])
    print("For training set")
    print("Accuracy %.4f" % accuracy)
    print("Error %.4f" % (1-accuracy))
    print("Precision: %.4f" % precision)
    print("Recall: %.4f" % recall)
    print()


    y_pred = predictRandomForest(discreteTestX, trees) 
    confMatrix = metrics.confusion_matrix(testingY, y_pred)
    accuracy = (confMatrix[1][1]+confMatrix[0][0])/len(y_pred)
    precision = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[0][1])
    recall = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[1][0])
    print("For testing set")
    print("Accuracy %.4f" % accuracy)
    print("Error %.4f" % (1-accuracy))
    print("Precision: %.4f" % precision)
    print("Recall: %.4f" % recall)
    print()

Random Forest with 10
For training set
Accuracy 0.6067
Error 0.3933
Precision: 0.5063
Recall: 0.0589

For testing set
Accuracy 0.6084
Error 0.3916
Precision: 0.5429
Recall: 0.0419

Random Forest with 50
For training set
Accuracy 0.6119
Error 0.3881
Precision: 0.5877
Recall: 0.0493

For testing set
Accuracy 0.6040
Error 0.3960
Precision: 0.4688
Recall: 0.0331

Random Forest with 100
For training set
Accuracy 0.6107
Error 0.3893
Precision: 0.5606
Recall: 0.0545

For testing set
Accuracy 0.6066
Error 0.3934
Precision: 0.5152
Recall: 0.0375



In [52]:
#Sklearn's Random Forest feature varying
featuresLen = len(discreteTrainingX[0])

clf = RandomForestClassifier(n_estimators=50, max_features=None)
clf = clf.fit(discreteTrainingX, trainingY)

print("Random Forest with %d features" % featuresLen)
#Metrics
y_pred = clf.predict(discreteTrainingX)
confMatrix = metrics.confusion_matrix(trainingY, y_pred)
accuracy = (confMatrix[1][1]+confMatrix[0][0])/len(y_pred)
precision = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[0][1])
recall = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[1][0])
print("For training set")
print("Accuracy %.4f" % accuracy)
print("Error %.4f" % (1-accuracy))
print("Precision: %.4f" % precision)
print("Recall: %.4f" % recall)
print()


y_pred = clf.predict(discreteTestX)
confMatrix = metrics.confusion_matrix(testingY, y_pred)
accuracy = (confMatrix[1][1]+confMatrix[0][0])/len(y_pred)
precision = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[0][1])
recall = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[1][0])
print("For testing set")
print("Accuracy %.4f" % accuracy)
print("Error %.4f" % (1-accuracy))
print("Precision: %.4f" % precision)
print("Recall: %.4f" % recall)
print()



clf = RandomForestClassifier(n_estimators=50, max_features=(featuresLen//2))
clf = clf.fit(discreteTrainingX, trainingY)

print("Random Forest with %d features" % (featuresLen//2))
#Metrics
y_pred = clf.predict(discreteTrainingX)
confMatrix = metrics.confusion_matrix(trainingY, y_pred)
accuracy = (confMatrix[1][1]+confMatrix[0][0])/len(y_pred)
precision = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[0][1])
recall = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[1][0])
print("For training set")
print("Accuracy %.4f" % accuracy)
print("Error %.4f" % (1-accuracy))
print("Precision: %.4f" % precision)
print("Recall: %.4f" % recall)
print()


y_pred = clf.predict(discreteTestX)
confMatrix = metrics.confusion_matrix(testingY, y_pred)
accuracy = (confMatrix[1][1]+confMatrix[0][0])/len(y_pred)
precision = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[0][1])
recall = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[1][0])
print("For testing set")
print("Accuracy %.4f" % accuracy)
print("Error %.4f" % (1-accuracy))
print("Precision: %.4f" % precision)
print("Recall: %.4f" % recall)
print()



clf = RandomForestClassifier(n_estimators=50, max_features=int(featuresLen**(1/2)))
clf = clf.fit(discreteTrainingX, trainingY)

print("Random Forest with %d features" % int(featuresLen**(1/2)))
#Metrics
y_pred = clf.predict(discreteTrainingX)
confMatrix = metrics.confusion_matrix(trainingY, y_pred)
accuracy = (confMatrix[1][1]+confMatrix[0][0])/len(y_pred)
precision = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[0][1])
recall = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[1][0])
print("For training set")
print("Accuracy %.4f" % accuracy)
print("Error %.4f" % (1-accuracy))
print("Precision: %.4f" % precision)
print("Recall: %.4f" % recall)
print()


y_pred = clf.predict(discreteTestX)
confMatrix = metrics.confusion_matrix(testingY, y_pred)
accuracy = (confMatrix[1][1]+confMatrix[0][0])/len(y_pred)
precision = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[0][1])
recall = confMatrix[1][1]/(confMatrix[1][1]+confMatrix[1][0])
print("For testing set")
print("Accuracy %.4f" % accuracy)
print("Error %.4f" % (1-accuracy))
print("Precision: %.4f" % precision)
print("Recall: %.4f" % recall)
print()

Random Forest with 57 features
For training set
Accuracy 0.9994
Error 0.0006
Precision: 0.9993
Recall: 0.9993

For testing set
Accuracy 0.9434
Error 0.0566
Precision: 0.9330
Recall: 0.9227

Random Forest with 28 features
For training set
Accuracy 0.9991
Error 0.0009
Precision: 0.9985
Recall: 0.9993

For testing set
Accuracy 0.9504
Error 0.0496
Precision: 0.9342
Recall: 0.9404

Random Forest with 7 features
For training set
Accuracy 0.9991
Error 0.0009
Precision: 1.0000
Recall: 0.9978

For testing set
Accuracy 0.9539
Error 0.0461
Precision: 0.9425
Recall: 0.9404

