# Road Safety Dataset
## Random Forest

by Bernardo Augusto and Miguel Cisneiros

In [1]:
# Imports
from pyspark import SparkFiles

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px



from pyspark.sql import functions as F

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


from sklearn import metrics
from sklearn.metrics import classification_report 


import findspark
findspark.init()
import pyspark # Call this only after findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [2]:
# Load the data
dataset = spark.read.csv("file://"+ SparkFiles.get("/Users/bernardoaugusto/Desktop/3ยบ ano/1ยบ semestre/Big Data/Project/2/Road Safety Data - Accidents 2019.csv"),header=True, sep=",", inferSchema=True)

#show top 10 rows
dataset.show(10) 

+--------------+---------------------+----------------------+---------+---------+------------+-----------------+------------------+--------------------+----------+-----------+-----+--------------------------+-------------------------+--------------+---------------+---------+-----------+---------------+----------------+--------------+---------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+-------------------------+
|Accident_Index|Location_Easting_OSGR|Location_Northing_OSGR|Longitude| Latitude|Police_Force|Accident_Severity|Number_of_Vehicles|Number_of_Casualties|      Date|Day_of_Week| Time|Local_Authority_(District)|Local_Authority_(Highway)|1st_Road_Class|1st_Road_Number|Road_Type|Speed_limit|Junction_Detail|Junction_Control|2nd_Road_Class|2nd_Road_Number|Pedestrian_Crossing-Human_

In [3]:
# Drop the -1 values from the spark dataset
dataset = dataset.filter((dataset["Light_Conditions"] != -1) & (dataset["Junction_Control"] != -1) & (dataset["2nd_Road_Class"] != -1)
                        & (dataset["Pedestrian_Crossing-Human_Control"] != -1) & (dataset["Pedestrian_Crossing-Physical_Facilities"] != -1)
                        & (dataset["Road_Surface_COnditions"] != -1) & (dataset["Special_Conditions_at_Site"] != -1)
                        & (dataset["Carriageway_Hazards"] != -1)
                        )

In [4]:
dataset.na.drop("all")

DataFrame[Accident_Index: string, Location_Easting_OSGR: int, Location_Northing_OSGR: int, Longitude: double, Latitude: double, Police_Force: int, Accident_Severity: int, Number_of_Vehicles: int, Number_of_Casualties: int, Date: string, Day_of_Week: int, Time: string, Local_Authority_(District): int, Local_Authority_(Highway): string, 1st_Road_Class: int, 1st_Road_Number: int, Road_Type: int, Speed_limit: int, Junction_Detail: int, Junction_Control: int, 2nd_Road_Class: int, 2nd_Road_Number: int, Pedestrian_Crossing-Human_Control: int, Pedestrian_Crossing-Physical_Facilities: int, Light_Conditions: int, Weather_Conditions: int, Road_Surface_Conditions: int, Special_Conditions_at_Site: int, Carriageway_Hazards: int, Urban_or_Rural_Area: int, Did_Police_Officer_Attend_Scene_of_Accident: int, LSOA_of_Accident_Location: string]

In [5]:
# drop the indexes and the correlated variables
dataset = dataset.drop("Accident_Index", "Location_Easting_OSGR", "Location_Northing_OSGR", "Police_Force", "LSOA_of_Accident_Location", 
                      "Local_Authority_(Highway)", "Time", "Date", "Longitude", "Latitude")

In [6]:
# Schema
dataset.printSchema()

root
 |-- Accident_Severity: integer (nullable = true)
 |-- Number_of_Vehicles: integer (nullable = true)
 |-- Number_of_Casualties: integer (nullable = true)
 |-- Day_of_Week: integer (nullable = true)
 |-- Local_Authority_(District): integer (nullable = true)
 |-- 1st_Road_Class: integer (nullable = true)
 |-- 1st_Road_Number: integer (nullable = true)
 |-- Road_Type: integer (nullable = true)
 |-- Speed_limit: integer (nullable = true)
 |-- Junction_Detail: integer (nullable = true)
 |-- Junction_Control: integer (nullable = true)
 |-- 2nd_Road_Class: integer (nullable = true)
 |-- 2nd_Road_Number: integer (nullable = true)
 |-- Pedestrian_Crossing-Human_Control: integer (nullable = true)
 |-- Pedestrian_Crossing-Physical_Facilities: integer (nullable = true)
 |-- Light_Conditions: integer (nullable = true)
 |-- Weather_Conditions: integer (nullable = true)
 |-- Road_Surface_Conditions: integer (nullable = true)
 |-- Special_Conditions_at_Site: integer (nullable = true)
 |-- Carriag

# ML Pipeline

In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# transformer
vector_assembler = VectorAssembler(inputCols=["Number_of_Vehicles", "Number_of_Casualties", "Day_of_Week", 
                                              "Local_Authority_(District)", "1st_Road_Class", "1st_Road_Number", 
                                              "Road_Type", "Speed_limit", "Junction_Detail", "Junction_Control", "2nd_Road_Class", 
                                              "2nd_Road_Number", "Pedestrian_Crossing-Human_Control", "Pedestrian_Crossing-Physical_Facilities", 
                                              "Light_Conditions", "Weather_Conditions", "Road_Surface_Conditions", "Special_Conditions_at_Site", 
                                              "Carriageway_Hazards", "Urban_or_Rural_Area", "Did_Police_Officer_Attend_Scene_of_Accident"
                                             ],outputCol="features")
output = vector_assembler.transform(dataset)
output = output.withColumn('label', output.Accident_Severity)
output.show(5)

+-----------------+------------------+--------------------+-----------+--------------------------+--------------+---------------+---------+-----------+---------------+----------------+--------------+---------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+--------------------+-----+
|Accident_Severity|Number_of_Vehicles|Number_of_Casualties|Day_of_Week|Local_Authority_(District)|1st_Road_Class|1st_Road_Number|Road_Type|Speed_limit|Junction_Detail|Junction_Control|2nd_Road_Class|2nd_Road_Number|Pedestrian_Crossing-Human_Control|Pedestrian_Crossing-Physical_Facilities|Light_Conditions|Weather_Conditions|Road_Surface_Conditions|Special_Conditions_at_Site|Carriageway_Hazards|Urban_or_Rural_Area|Did_Police_Officer_Attend_Scene_of_Accident|            features|label|
+-----------------+-----

In [8]:
# not scaler

from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=False, withMean=False)

scalerModel = scaler.fit(output).transform(output)

scalerModel.select("features","scaledFeatures").show(5)

+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[2.0,3.0,2.0,1.0,...|[2.0,3.0,2.0,1.0,...|
|[2.0,1.0,3.0,2.0,...|[2.0,1.0,3.0,2.0,...|
|[1.0,1.0,3.0,2.0,...|[1.0,1.0,3.0,2.0,...|
|[2.0,2.0,3.0,28.0...|[2.0,2.0,3.0,28.0...|
|[1.0,1.0,3.0,20.0...|[1.0,1.0,3.0,20.0...|
+--------------------+--------------------+
only showing top 5 rows



In [9]:
# Convert into binary 
from pyspark.ml.feature import StandardScaler
from pyspark.sql import functions as F

scalerModel = output.withColumn("Accident_Severity", F.when(F.col("Accident_Severity")<=2,0).otherwise(F.when(F.col("Accident_Severity")>2,1)))

In [10]:
scalerModel = scalerModel.withColumn("label", scalerModel.Accident_Severity)

scalerModel.select("features", "label").show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[2.0,3.0,2.0,1.0,...|    1|
|[2.0,1.0,3.0,2.0,...|    1|
|[1.0,1.0,3.0,2.0,...|    0|
|[2.0,2.0,3.0,28.0...|    1|
|[1.0,1.0,3.0,20.0...|    1|
+--------------------+-----+
only showing top 5 rows



In [11]:
#Undersampling
from pyspark.sql.functions import col, explode, array, lit

major_df = scalerModel.filter(col("label") == 1)
minor_df = scalerModel.filter(col("label") == 0)
ratio = int(major_df.count()/minor_df.count())
print("ratio: {}".format(ratio))

ratio: 3


In [12]:
sampled_majority_df = major_df.sample(False, 1/ratio)
combined_df_2 = sampled_majority_df.unionAll(minor_df)
scalerModel = combined_df_2
scalerModel.show(5)

+-----------------+------------------+--------------------+-----------+--------------------------+--------------+---------------+---------+-----------+---------------+----------------+--------------+---------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+--------------------+-----+
|Accident_Severity|Number_of_Vehicles|Number_of_Casualties|Day_of_Week|Local_Authority_(District)|1st_Road_Class|1st_Road_Number|Road_Type|Speed_limit|Junction_Detail|Junction_Control|2nd_Road_Class|2nd_Road_Number|Pedestrian_Crossing-Human_Control|Pedestrian_Crossing-Physical_Facilities|Light_Conditions|Weather_Conditions|Road_Surface_Conditions|Special_Conditions_at_Site|Carriageway_Hazards|Urban_or_Rural_Area|Did_Police_Officer_Attend_Scene_of_Accident|            features|label|
+-----------------+-----

In [13]:
#String Indexer

inputDF = scalerModel.select('label', 'features')
inputDF.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|[2.0,1.0,3.0,16.0...|
|    1|[1.0,1.0,3.0,8.0,...|
|    1|[2.0,3.0,3.0,26.0...|
|    1|[2.0,1.0,3.0,12.0...|
|    1|[2.0,1.0,3.0,5.0,...|
|    1|[2.0,1.0,3.0,5.0,...|
|    1|[1.0,2.0,3.0,7.0,...|
|    1|[2.0,1.0,4.0,17.0...|
|    1|[2.0,1.0,4.0,10.0...|
|    1|[2.0,1.0,4.0,10.0...|
|    1|[2.0,1.0,5.0,5.0,...|
|    1|[1.0,1.0,4.0,10.0...|
|    1|[2.0,1.0,5.0,18.0...|
|    1|[2.0,1.0,4.0,16.0...|
|    1|[3.0,1.0,4.0,22.0...|
|    1|[3.0,1.0,5.0,32.0...|
|    1|[1.0,1.0,6.0,28.0...|
|    1|[2.0,1.0,6.0,10.0...|
|    1|[2.0,1.0,5.0,29.0...|
|    1|[2.0,1.0,6.0,9.0,...|
+-----+--------------------+
only showing top 20 rows



In [14]:
from pyspark.ml.feature import StringIndexer

stringIndexer = StringIndexer(inputCol = 'label', outputCol = 'labelIndexed')

In [15]:
inputDF2 = stringIndexer.fit(inputDF).transform(inputDF)

In [16]:
inputDF2.show()

+-----+--------------------+------------+
|label|            features|labelIndexed|
+-----+--------------------+------------+
|    1|[2.0,1.0,3.0,16.0...|         0.0|
|    1|[1.0,1.0,3.0,8.0,...|         0.0|
|    1|[2.0,3.0,3.0,26.0...|         0.0|
|    1|[2.0,1.0,3.0,12.0...|         0.0|
|    1|[2.0,1.0,3.0,5.0,...|         0.0|
|    1|[2.0,1.0,3.0,5.0,...|         0.0|
|    1|[1.0,2.0,3.0,7.0,...|         0.0|
|    1|[2.0,1.0,4.0,17.0...|         0.0|
|    1|[2.0,1.0,4.0,10.0...|         0.0|
|    1|[2.0,1.0,4.0,10.0...|         0.0|
|    1|[2.0,1.0,5.0,5.0,...|         0.0|
|    1|[1.0,1.0,4.0,10.0...|         0.0|
|    1|[2.0,1.0,5.0,18.0...|         0.0|
|    1|[2.0,1.0,4.0,16.0...|         0.0|
|    1|[3.0,1.0,4.0,22.0...|         0.0|
|    1|[3.0,1.0,5.0,32.0...|         0.0|
|    1|[1.0,1.0,6.0,28.0...|         0.0|
|    1|[2.0,1.0,6.0,10.0...|         0.0|
|    1|[2.0,1.0,5.0,29.0...|         0.0|
|    1|[2.0,1.0,6.0,9.0,...|         0.0|
+-----+--------------------+------

# Train/Test Split

In [17]:
(train, test) = inputDF2.randomSplit([0.7, 0.3], seed=100)

# Random Forest
Because Random Forest is better when we have a lot of features

In [18]:
from pyspark .ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [19]:
rfClassifier = RandomForestClassifier(labelCol="labelIndexed", numTrees=100)

In [20]:
rfModel1 = rfClassifier.fit(train)

In [21]:
rfModel1.featureImportances

SparseVector(21, {0: 0.3654, 1: 0.0279, 2: 0.009, 3: 0.0549, 4: 0.005, 5: 0.0085, 6: 0.0196, 7: 0.0634, 8: 0.0426, 9: 0.0135, 10: 0.0226, 11: 0.0224, 12: 0.0019, 13: 0.0038, 14: 0.0096, 15: 0.0107, 16: 0.0047, 17: 0.005, 18: 0.0048, 19: 0.0483, 20: 0.2563})

In [22]:
rfModel1.toDebugString

'RandomForestClassificationModel: uid=RandomForestClassifier_721928b3fdf5, numTrees=100, numClasses=2, numFeatures=21\n  Tree 0 (weight 1.0):\n    If (feature 0 <= 1.5)\n     If (feature 19 <= 1.5)\n      If (feature 11 <= 505.5)\n       If (feature 14 <= 2.5)\n        If (feature 20 <= 1.5)\n         Predict: 1.0\n        Else (feature 20 > 1.5)\n         Predict: 0.0\n       Else (feature 14 > 2.5)\n        If (feature 20 <= 2.5)\n         Predict: 1.0\n        Else (feature 20 > 2.5)\n         Predict: 0.0\n      Else (feature 11 > 505.5)\n       If (feature 7 <= 45.0)\n        If (feature 15 <= 6.0)\n         Predict: 1.0\n        Else (feature 15 > 6.0)\n         Predict: 0.0\n       Else (feature 7 > 45.0)\n        If (feature 2 <= 5.5)\n         Predict: 0.0\n        Else (feature 2 > 5.5)\n         Predict: 1.0\n     Else (feature 19 > 1.5)\n      If (feature 20 <= 1.5)\n       If (feature 11 <= 4052.5)\n        If (feature 15 <= 6.0)\n         Predict: 1.0\n        Else (featu

In [23]:
predictions = rfModel1.transform(test)
predictions.select('label', 'labelIndexed').show()

+-----+------------+
|label|labelIndexed|
+-----+------------+
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
|    1|         0.0|
+-----+------------+
only showing top 20 rows



In [26]:
evaluator = MulticlassClassificationEvaluator(labelCol='labelIndexed', predictionCol='labelIndexed')

accuracy = evaluator.evaluate(predictions)

print(f"Accuracy = {accuracy*100}%")

Accuracy = 100.0%
