In [1]:
from __future__ import division, print_function, unicode_literals # For the compatibility with Python 2

In [2]:
from pyspark.sql import SparkSession
spark_session = SparkSession.builder\
                            .enableHiveSupport()\
                            .appName("spark sql")\
                            .master("local[4]")\
                            .getOrCreate()

In [3]:
from pyspark import SparkContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline 
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

data  = spark_session.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .option("inferschema", "true")\
        .load("/data/covertype2/train.csv")\
        .repartition(100)

In [4]:
data.head()

Row(Elevation=2807, Aspect=150, Slope=22, Horizontal_Distance_To_Hydrology=285, Vertical_Distance_To_Hydrology=-15, Horizontal_Distance_To_Roadways=660, Hillshade_9am=242, Hillshade_Noon=233, Hillshade_3pm=109, Horizontal_Distance_To_Fire_Points=485, Wild_Type='Rawah', Soil_Type='Como family - Rock land - Legault family complex, extremely stony.', Target=5)

In [5]:
stringIndexer = [StringIndexer(inputCol = column, outputCol = column+"Indexed").fit(data) 
                 for column in ['Soil_Type','Wild_Type']]
pipeline = Pipeline(stages=stringIndexer)
df = pipeline.fit(data).transform(data)

In [6]:
oneHotEncoder = [OneHotEncoder(inputCol = column, outputCol = column+"OneHot") 
                 for column in ['Soil_TypeIndexed','Wild_TypeIndexed']]
pipeline = Pipeline(stages=oneHotEncoder)
df1 = pipeline.fit(df).transform(df)

In [7]:
vector_assembler = VectorAssembler(inputCols=['Soil_TypeIndexedOneHot',
                                              'Wild_TypeIndexedOneHot', 
                                              'Elevation',
                                              'Aspect',
                                              'Slope',
                                              'Horizontal_Distance_To_Hydrology',
                                              'Vertical_Distance_To_Hydrology',
                                              'Horizontal_Distance_To_Roadways',
                                              'Hillshade_9am',
                                              'Hillshade_Noon',
                                              'Hillshade_3pm',
                                              'Horizontal_Distance_To_Fire_Points'
                                              ], outputCol='features')

In [8]:
finalDF = vector_assembler.transform(df1)

In [9]:
rf = RandomForestClassifier(labelCol='Target',featuresCol='features')

pipeline_rf = Pipeline(stages = [rf])

paramGrid = ParamGridBuilder()\
    .addGrid(rf.maxDepth, [9, 10])\
    .addGrid(rf.numTrees, [120])\
    .build()
    
evaluator = MulticlassClassificationEvaluator(labelCol = 'Target', predictionCol = "prediction", metricName = "accuracy") 

crossval = CrossValidator(estimator = pipeline_rf,
                          estimatorParamMaps = paramGrid,
                          evaluator = evaluator,
                          numFolds = 3)

In [10]:
cvModel = crossval.fit(finalDF)

In [11]:
cvModel.avgMetrics

[0.727139432281734, 0.739654821084856]

In [12]:
print(cvModel.bestModel.stages[0])

RandomForestClassificationModel (uid=rfc_702821d7b8d1) with 120 trees


In [13]:
data_test = spark_session.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .option("inferschema", "true")\
        .load("/data/covertype2/test.csv")\
        .repartition(100)

In [14]:
pipeline = Pipeline(stages=stringIndexer)
df_test = pipeline.fit(data_test).transform(data_test)

In [15]:
pipeline = Pipeline(stages=oneHotEncoder)
df1_test = pipeline.fit(df_test).transform(df_test)

In [16]:
finalDF_test = vector_assembler.transform(df1_test)

In [17]:
predictions = cvModel.bestModel.transform(finalDF_test)
evaluator = MulticlassClassificationEvaluator(labelCol = "Target", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions)

In [18]:
print(accuracy)

0.7472916702662543
