In [1]:
import findspark
# findspark.init() has to run before `import pyspark`: it makes the local
# Spark installation importable from this kernel.
findspark.init()
import pyspark
# getOrCreate() returns the already-running context when this cell is executed
# again, whereas calling SparkContext() a second time in the same kernel raises
# an error because only one active context is allowed.
sc = pyspark.SparkContext.getOrCreate()

#### Loading the test data 

In [17]:
# Hand-written toy data set of 15 records. Every record is a pair of an integer
# and a list of four integers -- the same (x[0], x[1]) shape that the functions
# further down read as (label, features).
sample = sc.parallelize([
    (1, [1, 1, 0, 1]),
    (0, [0, 0, 1, 1]),
    (1, [1, 1, 0, 2]),
    (0, [1, 1, 0, 1]),
    (0, [0, 0, 1, 0]),
    (2, [1, 0, 2, 1]),
    (2, [2, 0, 1, 1]),
    (2, [1, 2, 1, 1]),
    (0, [0, 1, 1, 1]),
    (1, [1, 2, 0, 2]),
    (0, [1, 1, 1, 1]),
    (0, [0, 1, 1, 0]),
    (2, [2, 2, 2, 1]),
    (2, [2, 1, 1, 1]),
    (2, [2, 2, 1, 1]),
])

In [134]:
# Read the balance-scale file and turn each line into (label, [int, int, ...]).
sample2 = sc.textFile('../data/balance-scale.csv')
rdd2 = (
    sample2
    # Keep only the first whitespace-delimited token of the line, drop any
    # surrounding single quotes and split it on commas.
    .map(lambda line: line.split()[0].strip("'").split(","))
    # First field is the label (kept as a string); the rest are parsed as ints.
    .map(lambda fields: (str(fields[0]), [int(value) for value in fields[1:]]))
)
# Human-readable names for the four feature positions, in file order.
columns2 = ['Left-Weight','Left-Distance','Right-Weight','Right-Distance']

In [135]:
# Peek at the first two parsed records to check the (label, features) layout.
rdd2.take(2)

[('B', [1, 1, 1, 1]), ('R', [1, 1, 1, 2])]

#### Loading the actual data

In [19]:
txtFile = sc.textFile('../data/covtype.csv')
# Turn every line into a (label, features) pair:
#   * the first whitespace-delimited token is stripped of single quotes and
#     split on commas, and every field is parsed as a float;
#   * the last field, minus one, becomes the label;
#   * all preceding fields form the feature list.
rdd = (
    txtFile
    .map(lambda line: [float(field) for field in line.split()[0].strip("'").split(",")])
    .map(lambda values: (values[-1] - 1, values[:-1]))
)

In [22]:
import itertools
# Generated column names: Soil_Type_1 .. Soil_Type_40 and WA_1 .. WA_4.
soil_list = ['Soil_Type_' + str(n) for n in range(1, 41)]
WA_list = ['WA_' + str(n) for n in range(1, 5)]
# Column groups in file order; single columns are wrapped in one-element lists
# so that every entry can be flattened the same way.
names = [
    ['Elevation'], ['Aspect'], ['Slope'], ['HDHyrdo'], ['VDHydro'], ['HDRoadways'],
    ['9amHills'], ['NoonHills'], ['3pmHills'], ['HDFirePoints'],
    WA_list,
    soil_list,
    ['Cover_Type'],
]
# Flat list of all column names.
columns = list(itertools.chain.from_iterable(names))

In [40]:
import random
#selecting random features
m = 5 #No of features
# Candidate feature positions: every entry of `columns` except the last one.
# random.shuffle works in place and needs a mutable sequence, so the range is
# materialised as a list first (a bare range only works on Python 2).
indices = list(range(len(columns)-1))
random.shuffle(indices)
# Show the m positions that were drawn; this form prints the same text on
# Python 2 and Python 3.
print(indices[:m])

[31, 38, 24, 30, 23]


In [41]:
# Project every record onto the m randomly drawn feature positions, keeping the
# label. The selection is bound as a default argument so it is computed once,
# here, instead of being re-sliced for every record and instead of depending on
# whatever `indices` / `m` happen to hold when Spark finally evaluates the RDD.
sampled_rdd = rdd.map(lambda x, picked=tuple(indices[:m]): (x[0],[x[1][j] for j in picked]))

#### InfoGain function

In [4]:
from collections import Counter
import numpy as np
def IG(l):
    """Return the Gini impurity ``1 - sum(p_i ** 2)`` of the values in ``l``.

    Despite the name, this is an impurity measure: 0.0 means every element of
    ``l`` is identical, larger values mean a more mixed collection.

    Parameters
    ----------
    l : sized iterable of hashable values (e.g. a tuple of class labels).

    Returns
    -------
    float
        The impurity. For an empty ``l`` the sum is empty and 1.0 is returned.
    """
    length = len(l)
    # Count occurrences first and normalise afterwards; adding 1.0/length once
    # per element accumulates rounding error.
    counts = Counter(l)
    # Plain Python arithmetic over the counts works with the dict views that
    # Python 3 returns from .values(), which np.multiply cannot consume.
    return 1.0 - sum((n / float(length)) ** 2 for n in counts.values())

In [5]:
def infoGain2(sampled_rdd,attr=0):
    """Return the size-weighted impurity of the labels after splitting on ``attr``.

    Records are grouped by the value found at feature position ``attr``. For
    each group the impurity of its labels is computed with ``IG`` and weighted
    by the share of records in that group; the weighted values are summed.
    Lower results therefore indicate a purer split.

    Parameters
    ----------
    sampled_rdd : RDD of ``(label, features)`` pairs.
    attr : position inside ``features`` to split on (default 0).

    Returns
    -------
    The sum over all groups of ``IG(labels) * group_size / total_size``.
    """
    total = float(sampled_rdd.count())

    def weighted_impurity(group):
        # group is (attribute_value, iterable_of_labels)
        labels = tuple(group[1])
        return IG(labels) * (len(labels) / total)

    labels_by_value = sampled_rdd.map(lambda rec: (rec[1][attr], rec[0])).groupByKey()
    return labels_by_value.map(weighted_impurity).reduce(lambda a, b: a + b)

In [76]:
def choose_attribute(sampled_rdd, attributes):
    """Pick the attribute with the lowest ``infoGain2`` score.

    Parameters
    ----------
    sampled_rdd : RDD of ``(label, features)`` pairs.
    attributes : iterable of feature positions to consider.

    Returns
    -------
    tuple
        ``(best_attr, cats)`` where ``best_attr`` is the chosen position and
        ``cats`` is the list of distinct values that position takes in
        ``sampled_rdd``.
    """
    # Score every candidate up front; each call runs Spark jobs on sampled_rdd.
    scores = [(candidate, infoGain2(sampled_rdd, candidate)) for candidate in attributes]

    best_attr = None
    best_gain = float("inf")
    for candidate, score in scores:
        # "<=" rather than "<": when two candidates tie, the later one wins.
        if score <= best_gain:
            best_attr, best_gain = candidate, score

    cats = sampled_rdd.map(lambda row: row[1][best_attr]).distinct().collect()
    return best_attr, cats

In [77]:
#Function returns most frequent value in response variable
def most_frequent(data):
    """Return the label that occurs most often in ``data``.

    Parameters
    ----------
    data : RDD of ``(label, features)`` pairs; only ``x[0]`` is read.

    Returns
    -------
    The most frequent label, or ``None`` when ``data`` is empty. When several
    labels share the highest count, which one is returned is unspecified.
    """
    # A single pass counts every label at once, instead of launching one
    # filter+count job per distinct label.
    counts = data.map(lambda x: x[0]).countByValue()
    highest_freq = 0
    most_freq = None
    for val, freq in counts.items():
        if freq > highest_freq:
            most_freq = val
            highest_freq = freq
    return most_freq

In [118]:
import copy
def createDecisionTree(sub_rdd,attributes,columns):
    """Recursively build a decision tree from an RDD of ``(label, features)``.

    Parameters
    ----------
    sub_rdd : RDD of ``(label, features)`` pairs to split.
    attributes : sequence of feature positions that may still be split on.
        It is not modified.
    columns : list mapping a feature position to its column name.

    Returns
    -------
    Either a bare label (leaf) or a nested dict of the form
    ``{column_name: {attribute_value: subtree_or_label, ...}}``.
    """
    # No attribute left to split on: fall back to the majority label.
    if len(attributes) <= 0:
        return most_frequent(sub_rdd)

    # Collect the distinct labels once; they are reused for the purity check
    # and for the returned leaf.
    labels = sub_rdd.map(lambda x: x[0]).distinct().collect()
    if len(labels) == 1:
        # Return the label itself. Indexing into it would fail for numeric
        # labels and would cut string labels down to their first character.
        return labels[0]

    bestAttr, vals = choose_attribute(sub_rdd, attributes)
    # Work on a copy so the caller's list is left untouched; this also accepts
    # non-list sequences such as a Python 3 range.
    remaining = list(attributes)
    remaining.remove(bestAttr)

    branches = {}
    for val in vals:
        # Bind the attribute and value as defaults so each filter keeps its own
        # value regardless of when Spark evaluates it.
        new_rdd = sub_rdd.filter(
            lambda x, attr=bestAttr, wanted=val: x[1][attr] == wanted)
        # `remaining` is never mutated by the recursion, so it can be shared.
        branches[val] = createDecisionTree(new_rdd, remaining, columns)
    return {columns[bestAttr]: branches}

In [126]:
# Build the tree over all balance-scale feature positions. list(...) hands the
# builder a real list of attribute indices on both Python 2 and Python 3
# (a Python 3 range does not support list operations such as remove()).
tree = createDecisionTree(rdd2,list(range(len(columns2))),columns2)

In [127]:
tree

{'Right-Distance': {'1': {'Left-Distance': {'1': {'Right-Weight': {'1': {'Left-Weight': {'1': 'B',
        '2': 'L',
        '3': 'L',
        '4': 'L',
        '5': 'L'}},
      '2': {'Left-Weight': {'1': 'R', '2': 'B', '3': 'L', '4': 'L', '5': 'L'}},
      '3': {'Left-Weight': {'1': 'R', '2': 'R', '3': 'B', '4': 'L', '5': 'L'}},
      '4': {'Left-Weight': {'1': 'R', '2': 'R', '3': 'R', '4': 'B', '5': 'L'}},
      '5': {'Left-Weight': {'1': 'R',
        '2': 'R',
        '3': 'R',
        '4': 'R',
        '5': 'B'}}}},
    '2': {'Left-Weight': {'1': {'Right-Weight': {'1': 'L',
        '2': 'B',
        '3': 'R',
        '4': 'R',
        '5': 'R'}},
      '2': {'Right-Weight': {'1': 'L',
        '2': 'L',
        '3': 'L',
        '4': 'B',
        '5': 'R'}},
      '3': 'L',
      '4': 'L',
      '5': 'L'}},
    '3': {'Left-Weight': {'1': {'Right-Weight': {'1': 'L',
        '2': 'L',
        '3': 'B',
        '4': 'R',
        '5': 'R'}},
      '2': 'L',
      '3': 'L',
      '4': '

In [128]:
#Function for testing a tree on a set of data
def tree_test(data,columns,tree):
    """Classify one record by walking a decision tree.

    Parameters
    ----------
    data : list of feature values (the second half of a record tuple).
    columns : list of column names; ``columns.index(name)`` gives the position
        of that feature inside ``data``.
    tree : a nested dict ``{column_name: {value: subtree_or_label}}`` or a bare
        label.

    Returns
    -------
    The label stored at the leaf that ``data`` leads to.

    Raises
    ------
    KeyError
        If ``data`` holds a value for which the tree has no branch.
    ValueError
        If the tree refers to a column name missing from ``columns``.
    """
    # Anything that is not a dict is a leaf. Testing for dict, rather than for
    # str, lets numeric (or any other non-string) labels act as leaves too.
    while isinstance(tree, dict):
        # Each internal node has exactly one key: the column it splits on.
        attr = next(iter(tree))
        tree = tree[attr][data[columns.index(attr)]]
    return tree

In [132]:
# Classify every record of rdd2 with the fitted tree, giving pairs of
# (true label, predicted label).
res_rdd = rdd2.map(lambda row: (row[0], tree_test(row[1], columns2, tree)))

In [133]:
#Calculating accuracy
# Percentage of records whose predicted label (x[1]) equals the true label
# (x[0]). The single pre-formatted string prints the same text on Python 2 and
# Python 3.
print("Accuracy: %s" % (100*(res_rdd.filter(lambda x:x[0] == x[1]).count())/float(res_rdd.count())))

Accuracy: 100.0


#### Adding support for continuous variables

In [139]:
##Loading dataset with continuous features
sample_c = sc.textFile('../data/crx.csv')
rdd_c = (
    sample_c
    # Keep the first whitespace-delimited token of the line, drop surrounding
    # single quotes and split it on commas.
    .map(lambda line: line.split()[0].strip("'").split(","))
    # The last field is the label; all earlier fields are kept as raw strings.
    .map(lambda fields: (str(fields[-1]), fields[0:-1]))
)
# Column names A1 .. A15.
columns_c = ['A' + str(k) for k in range(1, 16)]

In [None]:
def dicretize_columns(data,column_ids):