In [None]:
import os
import sys
import pyspark
import pandas as pd

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [None]:
#1) Demonstrate how to load a dataset suitable for prediction into a PySpark DataFrame and
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType
#setting column names and adding it to the csv file dataframe
ss = SparkSession.builder.appName('folder_read').getOrCreate()
df = ss.read.option("inferSchema", True).option("header", False).csv("covtype.data")
colnames = ["Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways","Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm", "Horizontal_Distance_To_Fire_Points"]
for i in range(0, 4):
    colnames.append('Wilderness_Area_' + str(i))
for i in range(0, 40):
    colnames.append('Soil_Type_' + str(i))
colnames.append("Cover_Type")
df = df.toDF(*colnames).withColumn("Cover_Type", col("Cover_Type").cast(DoubleType()))
df.show()


+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+-----------------+-----------------+-----------------+-----------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+----------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm|Horizontal_Distance_To_Fire_Points|Wilderness

In [None]:
#Display basic statistics and information about the dataset.
print("The schema of the data is")
df.printSchema()
print("Dataframe contains " + str(df.count()) + " entries")

The schema of the data is
root
 |-- Elevation: integer (nullable = true)
 |-- Aspect: integer (nullable = true)
 |-- Slope: integer (nullable = true)
 |-- Horizontal_Distance_To_Hydrology: integer (nullable = true)
 |-- Vertical_Distance_To_Hydrology: integer (nullable = true)
 |-- Horizontal_Distance_To_Roadways: integer (nullable = true)
 |-- Hillshade_9am: integer (nullable = true)
 |-- Hillshade_Noon: integer (nullable = true)
 |-- Hillshade_3pm: integer (nullable = true)
 |-- Horizontal_Distance_To_Fire_Points: integer (nullable = true)
 |-- Wilderness_Area_0: integer (nullable = true)
 |-- Wilderness_Area_1: integer (nullable = true)
 |-- Wilderness_Area_2: integer (nullable = true)
 |-- Wilderness_Area_3: integer (nullable = true)
 |-- Soil_Type_0: integer (nullable = true)
 |-- Soil_Type_1: integer (nullable = true)
 |-- Soil_Type_2: integer (nullable = true)
 |-- Soil_Type_3: integer (nullable = true)
 |-- Soil_Type_4: integer (nullable = true)
 |-- Soil_Type_5: integer (nulla

In [None]:
#Implement a PySpark script to handle missing values and categorical features in the dataset.

column_list_string = [c for c, t in df.dtypes if t.startswith('string')]
#print(column_list_string)
column_list_int = [c for c, t in df.dtypes if t.startswith('int')]
#print(column_list_int)
print("Number of categorical columns are " + str(len(column_list_string)) + ", the number of integer columns are " + str(len(column_list_int)))
#no categorical values in this dataset
#drop missing values and rertain integer values (categorical values would require encoding if they were present?)
df = df.na.drop()
df.show()


Number of categorical columns are 0, the number of integer columns are 54
+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+-----------------+-----------------+-----------------+-----------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+----------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|

In [None]:
#split test and train datat using randomsplit function
 (train_data, test_data) = df.randomSplit([0.9, 0.1])
train_data.cache()
test_data.cache()

DataFrame[Elevation: int, Aspect: int, Slope: int, Horizontal_Distance_To_Hydrology: int, Vertical_Distance_To_Hydrology: int, Horizontal_Distance_To_Roadways: int, Hillshade_9am: int, Hillshade_Noon: int, Hillshade_3pm: int, Horizontal_Distance_To_Fire_Points: int, Wilderness_Area_0: int, Wilderness_Area_1: int, Wilderness_Area_2: int, Wilderness_Area_3: int, Soil_Type_0: int, Soil_Type_1: int, Soil_Type_2: int, Soil_Type_3: int, Soil_Type_4: int, Soil_Type_5: int, Soil_Type_6: int, Soil_Type_7: int, Soil_Type_8: int, Soil_Type_9: int, Soil_Type_10: int, Soil_Type_11: int, Soil_Type_12: int, Soil_Type_13: int, Soil_Type_14: int, Soil_Type_15: int, Soil_Type_16: int, Soil_Type_17: int, Soil_Type_18: int, Soil_Type_19: int, Soil_Type_20: int, Soil_Type_21: int, Soil_Type_22: int, Soil_Type_23: int, Soil_Type_24: int, Soil_Type_25: int, Soil_Type_26: int, Soil_Type_27: int, Soil_Type_28: int, Soil_Type_29: int, Soil_Type_30: int, Soil_Type_31: int, Soil_Type_32: int, Soil_Type_33: int, S

In [None]:
#3) Develop a PySpark script that trains a decision tree model on the training dataset.
from pyspark.ml.feature import VectorAssembler

input_cols = colnames[:-1]
vector_assembler = VectorAssembler(inputCols = input_cols, outputCol = "featureVector")
assembled_train_data = vector_assembler.transform(train_data)
assembled_train_data.select("featureVector").show(truncate = False)

+-----------------------------------------------------------------------------------------------------+
|featureVector                                                                                        |
+-----------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1863.0,37.0,17.0,120.0,18.0,90.0,217.0,202.0,115.0,769.0,1.0,1.0])  |
|(54,[0,1,2,5,6,7,8,9,13,18],[1874.0,18.0,14.0,90.0,208.0,209.0,135.0,793.0,1.0,1.0])                 |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1879.0,28.0,19.0,30.0,12.0,95.0,209.0,196.0,117.0,778.0,1.0,1.0])   |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1888.0,33.0,22.0,150.0,46.0,108.0,209.0,185.0,103.0,735.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,14],[1889.0,28.0,22.0,150.0,23.0,120.0,205.0,185.0,108.0,759.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1889.0,353.0,30.0,95.0,39.0,67.0,153.0,172.0,146.0,600.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1896.0,337.0,12.0,30.0,6.0,175

In [None]:
from pyspark.ml .classification import DecisionTreeClassifier

classifier = DecisionTreeClassifier(seed = 100, labelCol = "Cover_Type", featuresCol = "featureVector", predictionCol = "prediction")
model = classifier.fit(assembled_train_data)
print(model.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2dc9df3ee479, depth=5, numNodes=49, numClasses=8, numFeatures=54
  If (feature 0 <= 3052.5)
   If (feature 0 <= 2473.5)
    If (feature 3 <= 15.0)
     If (feature 13 <= 0.5)
      Predict: 6.0
     Else (feature 13 > 0.5)
      If (feature 23 <= 0.5)
       Predict: 4.0
      Else (feature 23 > 0.5)
       Predict: 3.0
    Else (feature 3 > 15.0)
     If (feature 16 <= 0.5)
      Predict: 3.0
     Else (feature 16 > 0.5)
      If (feature 9 <= 1280.5)
       Predict: 3.0
      Else (feature 9 > 1280.5)
       Predict: 4.0
   Else (feature 0 > 2473.5)
    If (feature 17 <= 0.5)
     If (feature 0 <= 2942.5)
      If (feature 15 <= 0.5)
       Predict: 2.0
      Else (feature 15 > 0.5)
       Predict: 3.0
     Else (feature 0 > 2942.5)
      If (feature 3 <= 125.5)
       Predict: 1.0
      Else (feature 3 > 125.5)
       Predict: 2.0
    Else (feature 17 > 0.5)
     If (feature 0 <= 2688.5)
      Predict: 3.0
     Else (featur

In [None]:
pd_importance = pd.DataFrame(model.featureImportances.toArray(), index=input_cols, columns=['importance']).sort_values(by="importance", ascending = False)
pd_importance

Exception ignored in: <function JavaWrapper.__del__ at 0x7f366c9002f0>
Traceback (most recent call last):
  File "/home/lplab/anaconda3/lib/python3.7/site-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'DecisionTreeClassifier' object has no attribute '_java_obj'


Unnamed: 0,importance
Elevation,0.802719
Horizontal_Distance_To_Hydrology,0.042416
Soil_Type_3,0.039446
Soil_Type_1,0.030948
Soil_Type_31,0.027806
Hillshade_Noon,0.025414
Wilderness_Area_2,0.01148
Horizontal_Distance_To_Roadways,0.008054
Soil_Type_2,0.003457
Wilderness_Area_3,0.00278


In [None]:
predictions = model.transform(assembled_train_data)
predictions.select("Cover_Type", "prediction", "probability").show(10, truncate = False)

+----------+----------+------------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                     |
+----------+----------+------------------------------------------------------------------------------------------------+
|6.0       |3.0       |[0.0,0.0,0.04669703872437358,0.6233519707323808,0.021950714433630152,0.0,0.3080002761096155,0.0]|
|6.0       |4.0       |[0.0,0.0,0.02487198244330651,0.23920994879297733,0.6430138990490124,0.0,0.09290416971470374,0.0]|
|6.0       |3.0       |[0.0,0.0,0.04669703872437358,0.6233519707323808,0.021950714433630152,0.0,0.3080002761096155,0.0]|
|6.0       |3.0       |[0.0,0.0,0.04669703872437358,0.6233519707323808,0.021950714433630152,0.0,0.3080002761096155,0.0]|
|6.0       |3.0       |[0.0,0.0,0.04669703872437358,0.6233519707323808,0.021950714433630152,0.0,0.3080002761096155,0.0]|
|6.0       |3.0       |[0.0,0.0,

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction")
evaluator.setMetricName("accuracy").evaluate(predictions)
evaluator.setMetricName("f1").evaluate(predictions)

0.6880218200876276

In [None]:
#4) Implement code to evaluate the decision tree model using metrics such as accuracy, precision, and recall.
confusion_matrix = predictions.groupBy("Cover_Type").\
pivot("prediction", range(1,8)).count().\
na.fill(0.0).\
orderBy("Cover_Type")
confusion_matrix.show()

+----------+------+------+-----+----+---+---+----+
|Cover_Type|     1|     2|    3|   4|  5|  6|   7|
+----------+------+------+-----+----+---+---+----+
|       1.0|131982| 54742|   73|   0| 40|  2|3593|
|       2.0| 52711|198753| 2659|  34|359| 36| 610|
|       3.0|     0|  4864|26873| 341| 50| 89|   0|
|       4.0|     0|    14| 1298|1150|  0|  0|   0|
|       5.0|    87|  7729|  271|   0|423|  0|   0|
|       6.0|     0|  5147| 9953| 127| 15|404|   0|
|       7.0|  9713|   284|    0|   0|  0|  0|8522|
+----------+------+------+-----+----+---+---+----+



In [None]:
pandas_df = confusion_matrix.toPandas()
print(pandas_df.columns.values.tolist())


['Cover_Type', '1', '2', '3', '4', '5', '6', '7']


In [None]:
accuracies = []
for index, row in pandas_df.iterrows():
    total_entries = pandas_df[str(index + 1)].sum()
    correct = row[str(index + 1)]
    accuracies.append(correct/total_entries)
#print(len(accuracies))
df_accuracy = pd.DataFrame({'Cover Type': [i for i in range(1, 8)], 'Accuracy': accuracies})
df_accuracy


Unnamed: 0,Cover Type,Accuracy
0,1,0.678595
1,2,0.731966
2,3,0.653415
3,4,0.696126
4,5,0.476888
5,6,0.760829
6,7,0.669705
