In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('tree_methods_adv').getOrCreate()

In [3]:
dataset = spark.read.csv('dataset.csv',inferSchema=True,header=True)

In [4]:
data.columns

['accident_index',
 'vehicle_reference',
 'vehicle_type',
 'towing_and_articulation',
 'vehicle_manoeuvre',
 'vehicle_location-restricted_lane',
 'junction_location',
 'skidding_and_overturning',
 'hit_object_in_carriageway',
 'vehicle_leaving_carriageway',
 'hit_object_off_carriageway',
 '1st_point_of_impact',
 'was_vehicle_left_hand_drive?',
 'journey_purpose_of_driver',
 'sex_of_driver',
 'age_of_driver',
 'age_band_of_driver',
 'engine_capacity_(cc)',
 'propulsion_code',
 'age_of_vehicle',
 'driver_imd_decile',
 'driver_home_area_type',
 'vehicle_imd_decile',
 'NUmber_of_Casualities_unique_to_accident_index',
 'No_of_Vehicles_involved_unique_to_accident_index',
 'location_easting_osgr',
 'location_northing_osgr',
 'longitude',
 'latitude',
 'police_force',
 'accident_severity',
 'number_of_vehicles',
 'number_of_casualties',
 'date',
 'day_of_week',
 'time',
 'local_authority_(district)',
 'local_authority_(highway)',
 '1st_road_class',
 '1st_road_number',
 'road_type',
 'speed_lim

In [5]:
my_cols = data.select([ 'weather_conditions'
    ,'age_of_driver',
    'accident_severity'])

In [6]:
finaldata = my_cols.na.drop()

In [7]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [8]:
assembler = VectorAssembler(inputCols=[ 'weather_conditions'
    ,'age_of_driver',
    'accident_severity'],outputCol='features')

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier

In [10]:
rfc = RandomForestClassifier(labelCol='accident_severity',featuresCol='features')

In [11]:
pipeline = Pipeline(stages=[ assembler,rfc])

In [13]:
train_data, test_data = finaldata.randomSplit([0.75,0.25])

In [14]:
fit_model = pipeline.fit(train_data)

In [15]:
rfc_predictions = fit_model.transform(test_data)

In [16]:
rfc_predictions.show()

+------------------+-------------+-----------------+--------------+------------------+-----------------+----------+
|weather_conditions|age_of_driver|accident_severity|      features|     rawPrediction|      probability|prediction|
+------------------+-------------+-----------------+--------------+------------------+-----------------+----------+
|                 1|           -1|                2|[1.0,-1.0,2.0]|[0.0,0.0,20.0,0.0]|[0.0,0.0,1.0,0.0]|       2.0|
|                 1|           -1|                2|[1.0,-1.0,2.0]|[0.0,0.0,20.0,0.0]|[0.0,0.0,1.0,0.0]|       2.0|
|                 1|           -1|                2|[1.0,-1.0,2.0]|[0.0,0.0,20.0,0.0]|[0.0,0.0,1.0,0.0]|       2.0|
|                 1|           -1|                2|[1.0,-1.0,2.0]|[0.0,0.0,20.0,0.0]|[0.0,0.0,1.0,0.0]|       2.0|
|                 1|           -1|                2|[1.0,-1.0,2.0]|[0.0,0.0,20.0,0.0]|[0.0,0.0,1.0,0.0]|       2.0|
|                 1|           -1|                2|[1.0,-1.0,2.0]|[0.0,

In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [19]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="accident_severity", predictionCol="prediction", metricName="accuracy")

In [20]:
rfc_acc = acc_evaluator.evaluate(rfc_predictions)

In [21]:
print('A random forest ensemble has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

A random forest ensemble has an accuracy of: 100.00%
