In [1]:
# findspark.init() is deliberately called before `import pyspark`
# (presumably so that pyspark becomes importable -- confirm against the findspark docs).
import findspark
findspark.init()
import pyspark
# Single SparkContext shared by every cell below (created with default settings).
sc = pyspark.SparkContext()

#### Loading the test data 

In [17]:
# Small hand-written data set used for quick experiments.
# Each element is a (label, features) tuple: the label is 0, 1 or 2 and
# features is a list of four small integer values.
sample = sc.parallelize([(1,[1,1,0,1]),
         (0,[0,0,1,1]),
         (1,[1,1,0,2]),
         (0,[1,1,0,1]),
         (0,[0,0,1,0]),
         (2,[1,0,2,1]),
         (2,[2,0,1,1]),
         (2,[1,2,1,1]),
         (0,[0,1,1,1]),
         (1,[1,2,0,2]),
         (0,[1,1,1,1]),
         (0,[0,1,1,0]),
         (2,[2,2,2,1]),
         (2,[2,1,1,1]),
         (2,[2,2,1,1])])

In [134]:
# Balance-scale data: the first comma-separated field of a line is the class
# label, the remaining fields are integer features.
sample2 = sc.textFile('../data/balance-scale.csv')
# Only the first whitespace-delimited token of a line is parsed; surrounding
# single quotes are stripped before splitting on commas.
rdd2 = (
    sample2
    .map(lambda line: line.split()[0].strip("'").split(","))
    .map(lambda fields: (str(fields[0]), [int(value) for value in fields[1:]]))
)
# Feature names, in the same order as the feature list of each row.
columns2 = ['Left-Weight','Left-Distance','Right-Weight','Right-Distance']

In [135]:
# Peek at the first two parsed (label, features) rows.
rdd2.take(2)

[('B', [1, 1, 1, 1]), ('R', [1, 1, 1, 2])]

#### Loading the actual data

In [19]:
txtFile = sc.textFile('../data/covtype.csv')
# Turn every line into a (label, features) tuple:
#   * only the first whitespace-delimited token is parsed, with surrounding
#     single quotes stripped, then split on commas and converted to floats;
#   * the last field, minus 1, is the label; all other fields are the features.
rdd = (
    txtFile
    .map(lambda line: [float(field) for field in line.split()[0].strip("'").split(",")])
    .map(lambda row: (row[-1] - 1, row[0:-1]))
)

In [22]:
import itertools
# Column names for the covtype rows: ten single columns, four 'WA_n' columns,
# forty 'Soil_Type_n' columns and finally the 'Cover_Type' label column.
soil_list = ['Soil_Type_' + str(n) for n in range(1, 41)]
WA_list = ['WA_' + str(n) for n in range(1, 5)]
names = [
    ['Elevation'], ['Aspect'], ['Slope'], ['HDHyrdo'], ['VDHydro'], ['HDRoadways'],
    ['9amHills'], ['NoonHills'], ['3pmHills'], ['HDFirePoints'],
    WA_list,
    soil_list,
    ['Cover_Type'],
]
# Flatten the groups into one list of column names.
columns = list(itertools.chain.from_iterable(names))

In [40]:
import random
# Select m random feature columns. The last entry of `columns` is excluded
# from the candidates (hence the -1).
# NOTE: the random generator is not seeded, so the selection differs between
# runs; call random.seed(...) first if a reproducible selection is needed.
m = 5  # number of features to sample
# Build an explicit list: random.shuffle works in place and needs a mutable sequence.
indices = list(range(len(columns) - 1))
random.shuffle(indices)
print(indices[:m])

[31, 38, 24, 30, 23]


In [41]:
# Keep only the m sampled feature columns of every (label, features) row.
# The selection is bound as a default argument so it is fixed when this cell
# runs, instead of being re-sliced per row and looked up lazily when Spark
# finally evaluates the map (re-running the shuffle cell would otherwise
# silently change what this RDD computes).
sampled_rdd = rdd.map(lambda x, cols=tuple(indices[:m]): (x[0], [x[1][j] for j in cols]))

#### InfoGain functions (implemented as Gini impurity)

In [4]:
from collections import Counter
import numpy as np
def IG(l):
    """Return the Gini impurity of the labels in ``l``.

    Despite the name, this is not an information gain: it computes
    ``1 - sum(p_i ** 2)`` where ``p_i`` is the relative frequency of each
    distinct label. 0.0 means all labels are identical; larger values mean
    a more mixed collection.

    Parameters
    ----------
    l : sized iterable of hashable labels.

    Returns
    -------
    float
        The impurity; 1.0 for an empty collection (no frequencies to subtract).
    """
    length = len(l)
    # Count once and divide once per label, instead of accumulating 1.0/length
    # per element, to avoid compounding floating-point rounding error.
    counts = Counter(l)
    return 1.0 - sum((n / float(length)) ** 2 for n in counts.values())

In [5]:
def infoGain2(sampled_rdd,attr=0):
    """Score a split on feature ``attr`` as the weighted impurity of its groups.

    Rows are (label, features) tuples. They are grouped by the value of
    ``features[attr]``; each group's labels are scored with ``IG`` and the
    scores are summed, weighted by the fraction of rows in the group.
    Lower values therefore indicate purer splits.

    Parameters
    ----------
    sampled_rdd : RDD of (label, features) tuples.
    attr : index into the feature list (default 0).

    Returns
    -------
    The weighted sum of per-group ``IG`` values.
    """
    total_rows = float(sampled_rdd.count())
    # attribute value -> tuple of all labels observed with that value
    labels_by_value = (
        sampled_rdd
        .map(lambda row: (row[1][attr], row[0]))
        .groupByKey()
        .mapValues(tuple)
    )
    # impurity of each group times the group's share of the rows
    weighted_scores = labels_by_value.map(
        lambda kv: IG(kv[1]) * (len(kv[1]) / total_rows)
    )
    return weighted_scores.reduce(lambda acc, term: acc + term)

In [76]:
def choose_attribute(sampled_rdd, attributes):
    """Pick the attribute whose split has the lowest ``infoGain2`` score.

    Parameters
    ----------
    sampled_rdd : RDD of (label, features) tuples.
    attributes : iterable of candidate feature indices.

    Returns
    -------
    (best_attr, cats)
        The chosen feature index and the list of distinct values that
        feature takes in ``sampled_rdd``.
    """
    lowest_score, winner = float("inf"), None
    for candidate in attributes:
        score = infoGain2(sampled_rdd, candidate)
        # '<=' means that among equally good candidates the last one wins.
        if score <= lowest_score:
            lowest_score, winner = score, candidate
    distinct_values = (
        sampled_rdd
        .map(lambda row: row[1][winner])
        .distinct()
        .collect()
    )
    return winner, distinct_values

In [77]:
#Function returns most frequent value in response variable
def most_frequent(data):
    """Return the label that occurs most often in ``data``.

    Parameters
    ----------
    data : RDD of (label, features) tuples.

    Returns
    -------
    The most frequent label, or ``None`` when ``data`` is empty. Ties are
    resolved by the iteration order of the count dictionary (arbitrary).
    """
    # One Spark job counts every label, instead of one filter+count job
    # per distinct label.
    label_counts = data.map(lambda x: x[0]).countByValue()
    highest_freq = 0
    most_freq = None
    for val, freq in label_counts.items():
        if freq > highest_freq:
            most_freq = val
            highest_freq = freq
    return most_freq

In [304]:
import copy
def createDecisionTree(sub_rdd,attributes,columns):
    """Recursively build a decision tree from an RDD of (label, features) rows.

    Parameters
    ----------
    sub_rdd : RDD of (label, features) tuples.
    attributes : sequence of feature indices still available for splitting.
        It is not modified.
    columns : list of column names; ``columns[i]`` names feature index ``i``.

    Returns
    -------
    A leaf or a one-key dict:
      * no attributes left  -> the most frequent label (via ``most_frequent``);
      * only one label left -> that label;
      * no rows             -> 0;
      * otherwise           -> ``{column_name: {attribute_value: subtree}}``.
    """
    if len(attributes) <= 0:
        return most_frequent(sub_rdd)

    # One job yields the distinct labels; both the "pure node" and the
    # "empty node" checks are answered from it.
    labels = sub_rdd.map(lambda x: x[0]).distinct().collect()
    if len(labels) == 1:
        # Return the label itself. Indexing into it (label[0]) would truncate
        # multi-character strings and fail for numeric labels.
        return labels[0]
    if len(labels) == 0:
        return 0

    bestAttr, vals = choose_attribute(sub_rdd, attributes)
    # Build a new list rather than removing in place, so the caller's
    # sequence is left untouched (and non-list sequences are accepted).
    remaining = [a for a in attributes if a != bestAttr]
    print("%s %s" % (bestAttr, vals))

    branches = {}
    for val in vals:
        # Bind val as a default argument so the lazily evaluated filter keeps
        # the value of this iteration.
        new_rdd = sub_rdd.filter(lambda x, val=val: x[1][bestAttr] == val)
        print("%s %s %s" % (val, bestAttr, remaining))
        # Each branch gets its own copy of the remaining attributes.
        branches[val] = createDecisionTree(new_rdd, list(remaining), columns)
    return {columns[bestAttr]: branches}

In [126]:
# Build a tree from the balance-scale rows, offering every feature column
# as a candidate attribute.
tree = createDecisionTree(rdd2,range(len(columns2)),columns2)

In [127]:
# Display the learned tree (nested dicts: column name -> attribute value -> subtree or label).
tree

{'Right-Distance': {'1': {'Left-Distance': {'1': {'Right-Weight': {'1': {'Left-Weight': {'1': 'B',
        '2': 'L',
        '3': 'L',
        '4': 'L',
        '5': 'L'}},
      '2': {'Left-Weight': {'1': 'R', '2': 'B', '3': 'L', '4': 'L', '5': 'L'}},
      '3': {'Left-Weight': {'1': 'R', '2': 'R', '3': 'B', '4': 'L', '5': 'L'}},
      '4': {'Left-Weight': {'1': 'R', '2': 'R', '3': 'R', '4': 'B', '5': 'L'}},
      '5': {'Left-Weight': {'1': 'R',
        '2': 'R',
        '3': 'R',
        '4': 'R',
        '5': 'B'}}}},
    '2': {'Left-Weight': {'1': {'Right-Weight': {'1': 'L',
        '2': 'B',
        '3': 'R',
        '4': 'R',
        '5': 'R'}},
      '2': {'Right-Weight': {'1': 'L',
        '2': 'L',
        '3': 'L',
        '4': 'B',
        '5': 'R'}},
      '3': 'L',
      '4': 'L',
      '5': 'L'}},
    '3': {'Left-Weight': {'1': {'Right-Weight': {'1': 'L',
        '2': 'L',
        '3': 'B',
        '4': 'R',
        '5': 'R'}},
      '2': 'L',
      '3': 'L',
      '4': '

In [128]:
#Function for testing a tree on a set of data
def tree_test(data,columns,tree):
    """Classify one row by walking ``tree`` from the root to a leaf.

    Parameters
    ----------
    data : list of feature values (the second element of a row tuple).
    columns : list of column names; used to map a node's column name to the
        position of that feature in ``data``.
    tree : a tree as returned by ``createDecisionTree`` -- either a leaf
        label or ``{column_name: {attribute_value: subtree}}``.

    Returns
    -------
    The label stored at the leaf that is reached.

    Raises
    ------
    KeyError
        If the row holds an attribute value for which the tree has no branch.
    ValueError
        If a node's column name is not present in ``columns``.
    """
    node = tree
    # Anything that is not a dict is a leaf. Testing for dict rather than for
    # str keeps numeric and unicode labels working.
    while isinstance(node, dict):
        attr = next(iter(node))  # each internal node has a single column key
        node = node[attr][data[columns.index(attr)]]
    return node

In [132]:
# Apply tree_test row-wise: each result is (actual label, predicted label).
res_rdd = rdd2.map(lambda x:(x[0],tree_test(x[1],columns2,tree)))

In [133]:
# Accuracy = percentage of rows whose predicted label equals the actual label.
# NOTE: res_rdd is derived from rdd2, the same rows the tree was built from,
# so this is training accuracy rather than accuracy on held-out data.
# Both count() calls are separate Spark actions over res_rdd.
print "Accuracy:", 100*(res_rdd.filter(lambda x:x[0] == x[1]).count())/float(res_rdd.count())

Accuracy: 100.0


#### Adding support for continuous variables

In [158]:
## Load the crx data set, which mixes categorical and continuous features.
sample_c = sc.textFile('../data/crx.csv')
# Only the first whitespace-delimited token of a line is parsed (surrounding
# single quotes stripped, split on commas). The last field is the label and
# the remaining fields are kept as strings.
rdd_c = (
    sample_c
    .map(lambda line: line.split()[0].strip("'").split(","))
    .map(lambda fields: (str(fields[-1]), list(fields[0:-1])))
)
# Column names 'A1' .. 'A15'.
columns_c = ['A' + str(k) for k in range(1, 16)]

In [295]:
##Provide list of column numbers for columns to be discretized
import operator
def discretize_column(data,column,n_bins):
    """Replace one continuous feature by a string label naming its bin.

    Parameters
    ----------
    data : RDD of (label, features) tuples; ``features[column]`` must be
        convertible to float.
    column : index of the feature to discretize.
    n_bins : number of equally spaced edges placed between the column's
        minimum and maximum (passed straight to ``np.linspace``).

    Returns
    -------
    A new RDD with the same rows, where ``features[column]`` has been
    replaced by the label produced by ``discrete_val``.
    """
    numeric_column = data.map(lambda row: float(row[1][column]))
    col_max = numeric_column.max()
    col_min = numeric_column.min()
    ranges = list(np.linspace(col_min, col_max, n_bins))

    def _with_binned_value(row):
        # features before the column + [bin label] + features after the column
        features = row[1]
        binned = [discrete_val(features[column], ranges)]
        return (row[0], features[0:column] + binned + features[column+1:])

    return data.map(_with_binned_value)

In [296]:
def discretize_columns(data,column_ids,n_bins=10):
    """Discretize several feature columns, one after another.

    Parameters
    ----------
    data : RDD of (label, features) tuples.
    column_ids : iterable of feature indices to discretize.
    n_bins : value forwarded to ``discretize_column`` for every column
        (default 10).

    Returns
    -------
    The RDD obtained by applying ``discretize_column`` for each index in
    ``column_ids``; ``data`` itself when ``column_ids`` is empty.
    """
    # Start from the input so that an empty column_ids returns data unchanged
    # instead of raising UnboundLocalError.
    return_rdd = data
    for column in column_ids:
        return_rdd = discretize_column(return_rdd, column, n_bins)
    return return_rdd

In [297]:
# Discretize the feature columns at positions 1, 2, 7, 10, 13 and 14 using the default n_bins.
# NOTE(review): discretize_column calls discrete_val, which is only defined in
# the next cell; on a fresh kernel this cell appears to depend on that
# definition already existing -- confirm and move the definition above if so.
test_rdd = discretize_columns(rdd_c,[1,2,7,10,13,14])

In [298]:
from bisect import bisect
def discrete_val(data,ranges):
    """Return a string label for the bin of ``ranges`` that contains ``data``.

    Parameters
    ----------
    data : a number, or a string convertible to float.
    ranges : sorted, non-empty list of bin edges.

    Returns
    -------
    str
        ``"<first edge><"`` when the value lies below the first edge,
        ``"<last edge>>"`` when it is at or above the last edge, and
        ``"<lower edge>-<upper edge>"`` otherwise. A value equal to an
        edge falls into the bin that starts at that edge.
    """
    # bisect gives the number of edges that are <= the value. No debug
    # printing here: this function runs once per row on the Spark workers.
    k = bisect(ranges,float(data))
    if k == 0:
        return str(ranges[k])+"<"
    elif k == len(ranges):
        return str(ranges[k-1])+">"
    else:
        return str(ranges[k-1])+ "-" + str(ranges[k])

In [299]:
# Sanity check: 7.5 lies between the edges 7 and 8.
discrete_val(7.5,[1,2,3,4,5,6,7,8,9,10])

7


'7-8'

In [305]:
# Build a tree from the discretized crx rows, offering all 15 columns as candidates.
test_tree = createDecisionTree(test_rdd,range(len(columns_c)),columns_c)

8 [u't', u'f']
t 8 [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14]
9 [u't', u'f']
t 9 [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14]
5 [u'aa', u'c', u'e', u'w', u'cc', u'k', u'm', u'q', u'i', u'ff', u'x', u'r', u'j', u'd']
aa 5 [0, 1, 2, 3, 4, 6, 7, 10, 11, 12, 13, 14]
c 5 [0, 1, 2, 3, 4, 6, 7, 10, 11, 12, 13, 14]
1 ['34.75-41.75', '27.75-34.75', '48.75-55.75', '55.75-62.75', '13.75-20.75', '41.75-48.75', '20.75-27.75', '62.75-69.75']
34.75-41.75 1 [0, 2, 3, 4, 6, 7, 10, 11, 12, 13, 14]
27.75-34.75 1 [0, 2, 3, 4, 6, 7, 10, 11, 12, 13, 14]
48.75-55.75 1 [0, 2, 3, 4, 6, 7, 10, 11, 12, 13, 14]
55.75-62.75 1 [0, 2, 3, 4, 6, 7, 10, 11, 12, 13, 14]
13.75-20.75 1 [0, 2, 3, 4, 6, 7, 10, 11, 12, 13, 14]
11 [u't', u'f']
t 11 [0, 2, 3, 4, 6, 7, 10, 12, 13, 14]
6 [u'h', u'v']
h 6 [0, 2, 3, 4, 7, 10, 12, 13, 14]
v 6 [0, 2, 3, 4, 7, 10, 12, 13, 14]
0 [u'a', u'b']
a 0 [2, 3, 4, 7, 10, 12, 13, 14]
4 [u'g', u'p']
g 4 [2, 3, 7, 10, 12, 13, 14]
p 4 [2, 3, 7, 10, 12, 13, 14]
b 0 [2, 3, 4, 7, 10, 12, 13, 14]

In [306]:
# Display the learned tree (nested dicts: column name -> attribute value -> subtree or label).
test_tree

{'A9': {u'f': {'A5': {u'g': {'A6': {u'aa': '-',
      u'c': {'A7': {u'bb': '-',
        u'h': {'A14': {'0.0-222.222222222': {'A8': {'0.0-3.16666666667': '+',
            '6.33333333333-9.5': '-'}},
          '222.222222222-444.444444444': '-'}},
        u'j': '+',
        u'n': '-',
        u'v': {'A2': {'13.75-20.75': {'A3': {'0.0-3.11111111111': {'A1': {u'a': '-',
              u'b': {'A15': {'0.0-11111.1111111': {'A14': {'0.0-222.222222222': {'A13': {u'g': {'A12': {u'f': {'A11': {'0.0-7.44444444444': {'A10': {u'f': {'A8': {'0.0-3.16666666667': {'A4': {u'u': '-'}}}}}}}}}}}}}}}}}},
            '3.11111111111-6.22222222222': '-',
            '6.22222222222-9.33333333333': '-',
            '9.33333333333-12.4444444444': '-'}},
          '20.75-27.75': {'A3': {'0.0-3.11111111111': {'A12': {u'f': {'A14': {'0.0-222.222222222': {'A10': {u'f': {'A15': {'0.0-11111.1111111': {'A13': {u'g': {'A11': {'0.0-7.44444444444': {'A8': {'0.0-3.16666666667': {'A4': {u'u': {'A1': {u'b': '+'}}}}}}}}}}}},
 

In [310]:
# Apply tree_test row-wise: each result is (actual label, predicted label).
res_rdd_c = test_rdd.map(lambda x:(x[0],tree_test(x[1],columns_c,test_tree)))




In [312]:
# Accuracy = percentage of rows whose predicted label equals the actual label.
# NOTE: res_rdd_c is derived from test_rdd, the same rows test_tree was built
# from, so this is training accuracy rather than accuracy on held-out data.
# Both count() calls are separate Spark actions over res_rdd_c.
print "Accuracy:", 100*(res_rdd_c.filter(lambda x:x[0] == x[1]).count())/float(res_rdd_c.count())

Accuracy: 99.5405819296
