# Road Safety Dataset
## Clustering

by Bernardo Augusto and Miguel Cisneiros

In [1]:
# Imports
# findspark.init() has to run before anything from pyspark is imported,
# so it comes first in this cell.
import findspark
findspark.init()

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

import pyspark # Call this only after findspark.init()
from pyspark import SparkFiles
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer

# Reuse the running context/session when one already exists, so that
# re-running this cell does not build a second session.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [2]:
# Load the data
# The CSV is read straight from the local filesystem. SparkFiles.get is meant
# for files distributed with SparkContext.addFile, which is not the case here,
# so the absolute path is used directly.
# NOTE(review): this path is machine-specific; point it at your own copy of the file.
accidents_csv_path = "/Users/bernardoaugusto/Desktop/3º ano/1º semestre/Big Data/Project/2/Road Safety Data - Accidents 2019.csv"

# header=True takes column names from the first row; inferSchema=True lets
# Spark infer each column's type.
dataset = spark.read.csv("file://" + accidents_csv_path, header=True, sep=",", inferSchema=True)

# Show the first 5 rows
dataset.show(5)

+--------------+---------------------+----------------------+---------+---------+------------+-----------------+------------------+--------------------+----------+-----------+-----+--------------------------+-------------------------+--------------+---------------+---------+-----------+---------------+----------------+--------------+---------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+-------------------------+
|Accident_Index|Location_Easting_OSGR|Location_Northing_OSGR|Longitude| Latitude|Police_Force|Accident_Severity|Number_of_Vehicles|Number_of_Casualties|      Date|Day_of_Week| Time|Local_Authority_(District)|Local_Authority_(Highway)|1st_Road_Class|1st_Road_Number|Road_Type|Speed_limit|Junction_Detail|Junction_Control|2nd_Road_Class|2nd_Road_Number|Pedestrian_Crossing-Human_

In [3]:
# Drop the rows that hold the value -1 in any of the columns listed below.
# Column names are spelled exactly as they appear in the schema, so the
# filter does not rely on Spark's case-insensitive column resolution.
sentinel_columns = [
    "Light_Conditions",
    "Junction_Control",
    "2nd_Road_Class",
    "Pedestrian_Crossing-Human_Control",
    "Pedestrian_Crossing-Physical_Facilities",
    "Road_Surface_Conditions",
    "Special_Conditions_at_Site",
    "Carriageway_Hazards",
]

# Successive filters are equivalent to AND-ing all the conditions together.
for sentinel_column in sentinel_columns:
    dataset = dataset.filter(dataset[sentinel_column] != -1)

In [4]:
# na.drop returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back for the all-null rows to actually be removed.
dataset = dataset.na.drop(how="all")

# Display the resulting DataFrame (column names and types)
dataset

DataFrame[Accident_Index: string, Location_Easting_OSGR: int, Location_Northing_OSGR: int, Longitude: double, Latitude: double, Police_Force: int, Accident_Severity: int, Number_of_Vehicles: int, Number_of_Casualties: int, Date: string, Day_of_Week: int, Time: string, Local_Authority_(District): int, Local_Authority_(Highway): string, 1st_Road_Class: int, 1st_Road_Number: int, Road_Type: int, Speed_limit: int, Junction_Detail: int, Junction_Control: int, 2nd_Road_Class: int, 2nd_Road_Number: int, Pedestrian_Crossing-Human_Control: int, Pedestrian_Crossing-Physical_Facilities: int, Light_Conditions: int, Weather_Conditions: int, Road_Surface_Conditions: int, Special_Conditions_at_Site: int, Carriageway_Hazards: int, Urban_or_Rural_Area: int, Did_Police_Officer_Attend_Scene_of_Accident: int, LSOA_of_Accident_Location: string]

In [5]:
# Discard the index-like columns and the variables regarded as correlated
columns_to_drop = [
    "Accident_Index",
    "Location_Easting_OSGR",
    "Location_Northing_OSGR",
    "Police_Force",
    "LSOA_of_Accident_Location",
    "Local_Authority_(Highway)",
    "Time",
    "Date",
    "Longitude",
    "Latitude",
]
dataset = dataset.drop(*columns_to_drop)

In [6]:
# Schema: print each column's name, type and nullability
dataset.printSchema()

root
 |-- Accident_Severity: integer (nullable = true)
 |-- Number_of_Vehicles: integer (nullable = true)
 |-- Number_of_Casualties: integer (nullable = true)
 |-- Day_of_Week: integer (nullable = true)
 |-- Local_Authority_(District): integer (nullable = true)
 |-- 1st_Road_Class: integer (nullable = true)
 |-- 1st_Road_Number: integer (nullable = true)
 |-- Road_Type: integer (nullable = true)
 |-- Speed_limit: integer (nullable = true)
 |-- Junction_Detail: integer (nullable = true)
 |-- Junction_Control: integer (nullable = true)
 |-- 2nd_Road_Class: integer (nullable = true)
 |-- 2nd_Road_Number: integer (nullable = true)
 |-- Pedestrian_Crossing-Human_Control: integer (nullable = true)
 |-- Pedestrian_Crossing-Physical_Facilities: integer (nullable = true)
 |-- Light_Conditions: integer (nullable = true)
 |-- Weather_Conditions: integer (nullable = true)
 |-- Road_Surface_Conditions: integer (nullable = true)
 |-- Special_Conditions_at_Site: integer (nullable = true)
 |-- Carriag

In [131]:
# Count the rows for every (Accident_Severity, Number_of_Casualties) combination
grouping_columns = ["Accident_Severity", "Number_of_Casualties"]
col1_col2 = dataset.groupBy(*grouping_columns).count()

In [132]:
# Printing a Spark DataFrame shows only its column names and types, not its rows
print(col1_col2)

DataFrame[Accident_Severity: int, Number_of_Casualties: int, count: bigint]


# ML Pipeline

In [133]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Transformer: pack the two numeric columns into one vector column named "features"
feature_columns = ["Accident_Severity", "Number_of_Casualties"]
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Apply it to the grouped counts and preview the first 5 rows
output = vector_assembler.transform(col1_col2)
output.show(5)

+-----------------+--------------------+-----+----------+
|Accident_Severity|Number_of_Casualties|count|  features|
+-----------------+--------------------+-----+----------+
|                3|                   1|40078| [3.0,1.0]|
|                2|                   2| 1667| [2.0,2.0]|
|                2|                  19|    1|[2.0,19.0]|
|                1|                   7|    2| [1.0,7.0]|
|                2|                   3|  528| [2.0,3.0]|
+-----------------+--------------------+-----+----------+
only showing top 5 rows



In [134]:
# Train a k-means model with 2 clusters and a fixed seed.
kmeans = KMeans(k=2, seed=1)

# Only the assembled feature vectors are used for fitting and for prediction
features_only = output.select("features")
model = kmeans.fit(features_only)

# Assign every feature vector to one of the fitted clusters
predictions = model.transform(features_only)

In [135]:
# Score the clustering with the Silhouette measure
evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="prediction",
    metricName="silhouette",
)
silhouette = evaluator.evaluate(predictions)

# Report the score
print("Silhouette with squared euclidean distance = ", silhouette, sep="")

Silhouette with squared euclidean distance = 0.7674905938258503


In [136]:
# In this run the silhouette score came out at about 0.77 (the score ranges from -1 to 1), which is fairly high and suggests that the two clusters are well separated.