In [2]:
import findspark
# findspark.init() has to run before `import pyspark`: it locates the local
# Spark installation and makes the pyspark package importable.
findspark.init()
import pyspark
# getOrCreate() returns the already-running SparkContext if there is one, so
# re-running this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()

In [16]:
from pyspark.ml import Pipeline
from pyspark.mllib.tree import RandomForest
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.util import MLUtils

#### Subset of dataset

In [23]:
import csv
import itertools

# Upper bound on how many CSV rows are loaded for this subset experiment.
num_rows = 40000

# Each entry of `data` is (label, features):
#   * label    -- last CSV column as an int, shifted down by one so classes
#                 run 0-6 instead of 1-7
#   * features -- every other column, left as the raw strings read by csv
with open('rForests/data/covtype.csv', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    # islice stops after num_rows rows (or at end of file, whichever is first).
    data = [(int(fields[-1]) - 1, fields[0:-1])
            for fields in itertools.islice(spamreader, num_rows)]

# Number of rows actually read.
row_num = len(data)

# Names for the 40 soil-type columns: Soil_Type_1 .. Soil_Type_40.
soil_list = ['Soil_Type_' + str(k) for k in range(1, 41)]
# Names for the 4 wilderness-area columns: WA_1 .. WA_4.
WA_list = ['WA_' + str(k) for k in range(1, 5)]
# Column-name groups; presumably in the same order as the CSV columns
# (TODO confirm against the data file). 'HDHydro' was previously misspelled
# 'HDHyrdo', inconsistent with its sibling 'VDHydro'.
names = [['Elevation'], ['Aspect'], ['Slope'], ['HDHydro'], ['VDHydro'], ['HDRoadways'],
         ['9amHills'], ['NoonHills'], ['3pmHills'], ['HDFirePoints'], WA_list,
         soil_list, ['Cover_Type']]
# Flatten the groups into one flat list of 55 column names.
columns_data = list(itertools.chain.from_iterable(names))

In [24]:
# Distribute the in-memory list of (label, features) tuples as a Spark RDD.
rdd = sc.parallelize(data)

In [25]:
from pyspark.mllib.regression import LabeledPoint


def _to_labeled_point(pair):
    """Wrap one (label, features) tuple in an MLlib LabeledPoint."""
    return LabeledPoint(pair[0], pair[1])


# Turn the RDD of tuples into an RDD of LabeledPoints. The labels were already
# shifted from 1-7 down to 0-6 when `data` was built, so nothing is subtracted here.
rdd = rdd.map(_to_labeled_point)

In [41]:
%time model = RandomForest.trainClassifier(rdd, numClasses=7, categoricalFeaturesInfo={},\
                                             numTrees=30, featureSubsetStrategy="auto",\
                                             impurity='gini', maxDepth=30, maxBins=32)

CPU times: user 19.1 ms, sys: 6.69 ms, total: 25.8 ms
Wall time: 36 s


#### For the full dataset

In [48]:
# Read the full CSV as an RDD of raw text lines (one string per row);
# the lines are parsed into numbers in the following cell.
txtFile=sc.textFile('rForests/data/covtype.csv')

In [49]:
def _parse_covtype_line(line):
    """Parse one raw text line into a (label, features) tuple.

    The first whitespace-delimited token of the line is taken, surrounding
    single quotes are stripped, and the remainder is split on commas and
    converted to floats. The last value minus one is the label; all the
    preceding values are the features.
    """
    first_token = line.split()[0]
    values = [float(field) for field in first_token.strip("'").split(",")]
    return (values[-1] - 1, values[0:-1])


# Convert the raw lines into an RDD of (label, features) tuples.
rdd = txtFile.map(_parse_covtype_line)



In [50]:
# Column names, kept for future reference: ten named measurement columns,
# then Wilderness_Area0_i..3_i, then Soil_Type0_i..39_i, then the target.
columns = (
    [
        'Elevation',
        'Aspect',
        'Slope',
        'Horizontal_Distance_To_Hydrology',
        'Vertical_Distance_To_Hydrology',
        'Horizontal_Distance_To_Roadways',
        'Hillshade_9am',
        'Hillshade_Noon',
        'Hillshade_3pm',
        'Horizontal_Distance_To_Fire_Points',
    ]
    + ['Wilderness_Area%d_i' % idx for idx in range(4)]
    + ['Soil_Type%d_i' % idx for idx in range(40)]
    + ['Cover_Type']
)

In [51]:
from pyspark.mllib.regression import LabeledPoint
# Wrap every (label, features) pair in an MLlib LabeledPoint. The label was
# already shifted from 1-7 down to 0-6 when the lines were parsed, so it is
# passed through unchanged here.
rdd = rdd.map(lambda pair: LabeledPoint(label=pair[0], features=pair[1]))

In [52]:
# 70/30 train/test split. The fixed seed keeps the split identical across
# runs so the reported test error can be reproduced.
(trainingData, testData) = rdd.randomSplit([0.7, 0.3], seed=42)

In [57]:
%time model = RandomForest.trainClassifier(trainingData, numClasses=7, \
                                           categoricalFeaturesInfo={},\
                                     numTrees=20, featureSubsetStrategy="auto",\
                                     impurity='gini', maxDepth=30, maxBins=32)

CPU times: user 24.8 ms, sys: 9.24 ms, total: 34 ms
Wall time: 3min 3s


In [180]:
# Predict on the held-out features, then pair each true label with its prediction.
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Test error = fraction of (label, prediction) pairs that disagree.
# The pair is indexed rather than unpacked in the lambda signature:
# `lambda (v, p): ...` is Python-2-only syntax (removed by PEP 3113).
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
# NOTE(review): the saved output reports a 5-tree ensemble while the training
# cell requests numTrees=20 -- the cells were run out of order; re-run all
# cells from a fresh kernel before trusting the printed numbers.
print(model.toDebugString())

Test Error = 0.319556474779
Learned classification forest model:
TreeEnsembleModel classifier with 5 trees

  Tree 0:
    If (feature 0 <= 3049.0)
     If (feature 15 <= 0.0)
      If (feature 0 <= 2558.0)
       If (feature 13 <= 0.0)
        Predict: 1.0
       Else (feature 13 > 0.0)
        Predict: 2.0
      Else (feature 0 > 2558.0)
       If (feature 0 <= 2953.0)
        Predict: 1.0
       Else (feature 0 > 2953.0)
        Predict: 1.0
     Else (feature 15 > 0.0)
      If (feature 9 <= 1345.0)
       If (feature 9 <= 309.0)
        Predict: 5.0
       Else (feature 9 > 309.0)
        Predict: 2.0
      Else (feature 9 > 1345.0)
       If (feature 0 <= 2620.0)
        Predict: 2.0
       Else (feature 0 > 2620.0)
        Predict: 2.0
    Else (feature 0 > 3049.0)
     If (feature 45 <= 0.0)
      If (feature 12 <= 0.0)
       If (feature 38 <= 0.0)
        Predict: 0.0
       Else (feature 38 > 0.0)
        Predict: 1.0
      Else (feature 12 > 0.0)
       If (feature 33 <= 0.0

In [34]:
### Plot for comparison

In [35]:
import matplotlib.pyplot as plt

In [None]:
# Forest sizes that were compared: 5, 10, 15, 20, 25 trees.
numtrees = list(range(5, 30, 5))
# Measurements for MLlib and for the hand-written implementation, one entry per
# forest size above. Presumably wall-clock training times in seconds, recorded
# by hand -- TODO confirm units and provenance.
mllib = [8.59, 11.5, 15, 31, 36.5]
our_code = [11.59, 23.83, 32.98, 34.2, 41.2]