In [21]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer

In [22]:
sc = SparkContext(appName="m3_w3_t1")

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=m3_w3_t1, master=local[*]) created by __init__ at <ipython-input-2-861f245ef866>:1 

In [23]:
spark = SparkSession.Builder().getOrCreate()

In [24]:
trainDf = spark.read.option("inferSchema", "true").csv("/data/covertype2/train.csv", header=True)
trainDf.show(5)

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+---------+--------------------+------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm|Horizontal_Distance_To_Fire_Points|Wild_Type|           Soil_Type|Target|
+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+---------+--------------------+------+
|     3122|   266|   10|                             433|                            75|                           3069|          195|           245|          188|                               451| Comanche|Catamount family ...|     1|
|     3018|   308|   15|                            

In [25]:
wildTypeIndexer = StringIndexer(inputCol="Wild_Type", outputCol="Wild_Type_Ind")
wtModel = wildTypeIndexer.fit(trainDf)
trainDf2 = wtModel.transform(trainDf)

soilTypeIndexer = StringIndexer(inputCol="Soil_Type", outputCol="Soil_Type_Ind")
stModel = soilTypeIndexer.fit(trainDf2)
trainDfInd = stModel.transform(trainDf2)

trainDfInd.show(5)

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+---------+--------------------+------+-------------+-------------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm|Horizontal_Distance_To_Fire_Points|Wild_Type|           Soil_Type|Target|Wild_Type_Ind|Soil_Type_Ind|
+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+---------+--------------------+------+-------------+-------------+
|     3122|   266|   10|                             433|                            75|                           3069|          195|           245|          188|                               451| Comanc

In [26]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

featureAssembler = VectorAssembler(
    inputCols=["Elevation","Aspect","Slope","Horizontal_Distance_To_Hydrology","Vertical_Distance_To_Hydrology",
               "Horizontal_Distance_To_Roadways","Hillshade_9am","Hillshade_Noon","Hillshade_3pm",
               "Horizontal_Distance_To_Fire_Points", "Wild_Type_Ind", "Soil_Type_Ind"], outputCol="features")

data = featureAssembler.transform(trainDfInd).select(col("features"),col("target").alias("label"))

data.show()


+--------------------+-----+
|            features|label|
+--------------------+-----+
|[3122.0,266.0,10....|    1|
|[3018.0,308.0,15....|    1|
|[3146.0,151.0,12....|    2|
|[2980.0,163.0,6.0...|    2|
|[2972.0,187.0,16....|    2|
|[2768.0,17.0,13.0...|    2|
|[2948.0,319.0,9.0...|    1|
|[2127.0,320.0,31....|    6|
|[2968.0,322.0,19....|    1|
|[2983.0,295.0,10....|    1|
|[2947.0,54.0,22.0...|    2|
|[2987.0,130.0,16....|    2|
|[2748.0,8.0,14.0,...|    2|
|[3330.0,81.0,18.0...|    7|
|[2944.0,12.0,13.0...|    2|
|[3381.0,203.0,10....|    7|
|[3083.0,199.0,13....|    1|
|[3451.0,279.0,17....|    7|
|[3202.0,51.0,19.0...|    1|
|[2900.0,221.0,14....|    2|
+--------------------+-----+
only showing top 20 rows



In [27]:
(trainData, testData) = data.randomSplit([0.7, 0.3])

In [28]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [29]:
randomForest = RandomForestClassifier(labelCol="label", maxBins=45, maxDepth=9)
model = randomForest.fit(trainData)

In [35]:
prediction = model.transform(testData)

prediction.select("label", "prediction").show()

+-----+----------+
|label|prediction|
+-----+----------+
|    2|       1.0|
|    3|       3.0|
|    3|       3.0|
|    3|       3.0|
|    6|       6.0|
|    6|       6.0|
|    3|       3.0|
|    6|       6.0|
|    3|       3.0|
|    6|       3.0|
|    6|       6.0|
|    6|       3.0|
|    6|       3.0|
|    6|       3.0|
|    3|       3.0|
|    6|       6.0|
|    6|       6.0|
|    6|       6.0|
|    3|       4.0|
|    3|       3.0|
+-----+----------+
only showing top 20 rows



In [32]:
model.featureImportances

SparseVector(12, {0: 0.4733, 1: 0.0136, 2: 0.0102, 3: 0.0266, 4: 0.0188, 5: 0.0585, 6: 0.0116, 7: 0.0225, 8: 0.0086, 9: 0.0466, 10: 0.1414, 11: 0.1684})

In [34]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

evaluator.evaluate(prediction)

0.767912571132954