In [1]:
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext()

In [18]:
import line_profiler
import IPython
ip = IPython.get_ipython()
ip.define_magic('lprun', line_profiler.magic_lprun)

#### Loading the test data 

In [2]:
sample = sc.parallelize([(1,[1,1,0,1]),
         (0,[0,0,1,1]),
         (1,[1,1,0,2]),
         (0,[1,1,0,1]),
         (0,[0,0,1,0]),
         (2,[1,0,2,1]),
         (2,[2,0,1,1]),
         (2,[1,2,1,1]),
         (0,[0,1,1,1]),
         (1,[1,2,0,2]),
         (0,[1,1,1,1]),
         (0,[0,1,1,0]),
         (2,[2,2,2,1]),
         (2,[2,1,1,1]),
         (2,[2,2,1,1])])

In [3]:
sample2 = sc.textFile('../data/balance-scale.csv')
rdd2=sample2.map(lambda x:x.split()).map(lambda x: x[0].strip("'").split(","))\
            .map(lambda x:[v for v in x])\
            .map(lambda x: (str(x[0]),[int(k) for k in x[1:]]))
columns2 = ['Left-Weight','Left-Distance','Right-Weight','Right-Distance']

In [4]:
rdd2.take(2)

[('B', [1, 1, 1, 1]), ('R', [1, 1, 1, 2])]

#### Loading Actual data

In [56]:
txtFile=sc.textFile('../data/covtype.csv')
#Convert it into RDD of lists 
rdd=(txtFile.map(lambda x:x.split())
    .map(lambda x: x[0].strip("'").split(","))
    .map(lambda x:[float(v) for v in x])
    .map(lambda x: (x[-1]-1,x[0:-1])))

In [57]:
import itertools
soil_list =[]
for k in range(40):
    string = 'Soil_Type_' + str(k+1)
    soil_list.append(string)
WA_list =[]
for k in range(4):
    string = 'WA_' + str(k+1)
    WA_list.append(string)
names = [['Elevation'], ['Aspect'], ['Slope'], ['HDHyrdo'], ['VDHydro'], ['HDRoadways'], \
         ['9amHills'],['NoonHills'], ['3pmHills'], ['HDFirePoints'], WA_list,\
         soil_list, ['Cover_Type']]
columns = list(itertools.chain(*names))

In [58]:
import random
#selecting random features
m = 5 #No of features
indices = range(len(columns)-1)
random.shuffle(indices)
print indices[:m]

[7, 26, 47, 44, 38]


In [70]:
sampled_rdd = rdd.map(lambda x: (str(x[0]),[x[1][j] for j in indices[:m]]))

In [71]:
sampled_rdd = discretize_columns(sampled_rdd,[0])

In [72]:
col_sampled = [columns[i] for i in indices[:m]]

#### InfoGain function

In [92]:
from collections import Counter
import numpy as np
def IG(l):
    length=len(l)
    c=Counter()
    for v in l:
        c[v] += 1.0/length
    return 1-sum(np.multiply(c.values(),c.values()))

In [82]:
def infoGain2(sampled_rdd,count,attr=0):
    sampled_rdd.cache()
    output = sampled_rdd.map(lambda x: (x[1][attr],x[0]))\
            .groupByKey().mapValues(lambda x: tuple(x))\
            .map(lambda x: (x[0],x[1], len(x[1])/float(count)))\
            .map(lambda x: (x[0],IG(x[1]),x[2]))\
            .map(lambda x: x[1]*x[2]).reduce(lambda a,b:a+b)
    return output

In [83]:
def choose_attribute(sampled_rdd,attributes, count):
    best_gain = float("inf")
    best_attr = None
    for att in attributes:
        gain = infoGain2(sampled_rdd,count,att)
        #print att,gain
        if gain <= best_gain:
            best_gain = gain
            best_attr = att
    cats=sampled_rdd.map(lambda x:x[1][best_attr]).distinct().collect()
    return best_attr,cats

In [84]:
#Function returns most frequent value in response variable
def most_frequent(data):
    highest_freq = 0
    most_freq = None
    vals = data.map(lambda x:x[0]).distinct().collect()
    for val in vals:
        freq = data.filter(lambda x:x[0] == val)\
                    .map(lambda x:x[0]).count()
        if freq > highest_freq:
            most_freq = val
            highest_freq = freq
    return most_freq

In [85]:
import copy
def createDecisionTree(sub_rdd,attributes,columns):
    if len(attributes) <= 0:
        return most_frequent(sub_rdd)
    elif sub_rdd.map(lambda x:x[0]).distinct().count() == 1:
        return sub_rdd.map(lambda x:x[0]).distinct().collect()[0][0]
    ##changes
    elif sub_rdd.count() == 0:
        return 0
    ##changes
    else:
        bestAttr,vals = choose_attribute(sub_rdd,attributes,sub_rdd.count())
        attributes.remove(bestAttr)
        #print bestAttr,vals
        tree = {columns[bestAttr]:{}}
        for val in vals:
            new_rdd = sub_rdd.filter(lambda x:x[1][bestAttr] == val)\
            .map(lambda x:(x[0],x[1]))
            #print val,bestAttr,attributes
            new_attributes = copy.deepcopy(attributes)
            subtree = createDecisionTree(new_rdd,new_attributes,columns)
            tree[columns[bestAttr]][val] = subtree
    return tree

In [94]:
%lprun -f IG tree = createDecisionTree(rdd2,range(len(columns2)),columns2)

In [93]:
%time tree = createDecisionTree(rdd2,range(len(columns2)),columns2)

CPU times: user 17.2 s, sys: 4.04 s, total: 21.2 s
Wall time: 1min 45s


In [128]:
#Function for testing a tree on a set of data
def tree_test(data,columns,tree):
    #data will be the list stored in the second half of the tuple
    #columns contains the name of columns for referencing the tree
    if type(tree) == type("string"):
        return tree
    else:
        attr = tree.keys()[0]
        t = tree[attr][data[columns.index(attr)]]
        return tree_test(data,columns,t)

In [132]:
#Applying tree test row wise to the RDD
res_rdd = rdd2.map(lambda x:(x[0],tree_test(x[1],columns2,tree)))

In [133]:
#Calculating accuracy
print "Accuracy:", 100*(res_rdd.filter(lambda x:x[0] == x[1]).count())/float(res_rdd.count())

Accuracy: 100.0


#### Adding support for continuous variables

In [158]:
##Loading dataset with continuous features
sample_c = sc.textFile('../data/crx.csv')
rdd_c=sample_c.map(lambda x:x.split()).map(lambda x: x[0].strip("'").split(","))\
            .map(lambda x:[v for v in x])\
            .map(lambda x: (str(x[-1]),[k for k in x[0:-1]]))
columns_c = []
for k in range(1,16):
    columns_c.append('A'+str(k))

In [63]:
from bisect import bisect
def discrete_val(data,ranges):
    k = bisect(ranges,float(data))
    print k
    if k == 0:
        return str(ranges[k])+"<"
    elif k == len(ranges):
        return str(ranges[k-1])+">"
    else:
        return str(ranges[k-1])+ "-" + str(ranges[k])    

In [61]:
##Provide list of column numbers for columns to be discretized
import operator
def discretize_column(data,column,n_bins):
    col_max = data.map(lambda x:float(x[1][column])).max()
    col_min = data.map(lambda x:float(x[1][column])).min()
    ranges = list(np.linspace(col_min, col_max, n_bins))
    new_data = data.map(lambda x:(x[0],[x[1][0:column],[discrete_val(x[1][column],ranges)],x[1][column+1:]]))
    new_data = new_data.map(lambda x:(x[0],reduce(operator.add, x[1])))
    data = new_data
    return data

In [62]:
def discretize_columns(data,column_ids,n_bins=10):
    col_count = 0
    for column in column_ids:
        if col_count == 0:
            return_rdd = discretize_column(data,column,n_bins)
        else:
            return_rdd = discretize_column(return_rdd,column,n_bins)
        col_count += 1
    return return_rdd

In [297]:
test_rdd = discretize_columns(rdd_c,[1,2,7,10,13,14])

In [299]:
discrete_val(7.5,[1,2,3,4,5,6,7,8,9,10])

7


'7-8'

In [310]:
res_rdd_c = test_rdd.map(lambda x:(x[0],tree_test(x[1],columns_c,test_tree)))




In [312]:
#Calculating accuracy
print "Accuracy:", 100*(res_rdd_c.filter(lambda x:x[0] == x[1]).count())/float(res_rdd_c.count())

Accuracy: 99.5405819296


For Main dataset

In [73]:
sampled_rdd.take(2)

[('4.0', ['225.777777778-254.0', 0.0, 0.0, 0.0, 0.0]),
 ('4.0', ['225.777777778-254.0', 0.0, 0.0, 0.0, 0.0])]

In [None]:
%time tree_test = createDecisionTree(sampled_rdd,range(len(col_sampled)),col_sampled)