In [1]:
from road_network import fetch_road_network, extract_road_segments_DF
from accidents_montreal import fetch_accidents_montreal, extract_accidents_montreal_dataframe
from weather import get_weather
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import pow, col, min
import sys
import os

In [2]:
def init_spark():
    """Return the SparkSession used throughout this notebook.

    The session is named "Accident Prediction"; `getOrCreate` hands back an
    already-running session when there is one instead of starting another.
    """
    builder = SparkSession.builder.appName("Accident Prediction")
    return builder.getOrCreate()


spark = init_spark()

In [3]:
# Fetch the road-network data, then load it as a Spark DataFrame of road segments.
# Judging by the messages these helpers print, both steps are skipped when their
# result is already available locally (download / extracted file).
fetch_road_network()
rnd = extract_road_segments_DF(spark)

Skip fetching road network: already downloaded
Skip extraction of road network dataframe: already done, reading from file


In [4]:
# Fetch the Montreal accidents dataset, then load it as a Spark DataFrame.
# Judging by the messages these helpers print, both steps are skipped when their
# result is already available locally (download / extracted file).
fetch_accidents_montreal()
amd = extract_accidents_montreal_dataframe(spark)
# Show the columns of the road-network DataFrame (`rnd`), which the distance
# computation relies on.
# NOTE(review): this cell loads the accidents but displays `rnd.columns`, not
# `amd.columns` - confirm that this is intended.
rnd.columns

Skip fetching montreal accidents dataset: already downloaded
Skip extraction of accidents montreal dataframe: already done, reading from file


['street_name',
 'street_type',
 'center_long',
 'center_lat',
 'coord_long',
 'coord_lat']

In [49]:
from pyspark.sql.functions import monotonically_increasing_id, sin, cos, radians, atan2, sqrt, rank

In [13]:
# Fixed seed so that re-running the notebook draws the same samples (as long as
# the input data and its partitioning are unchanged). Without it every run
# picks different rows and the results shown below cannot be reproduced.
SAMPLE_SEED = 42

# Distinct road-segment centers, down-sampled to 10% and cached with persist().
road_centers = (rnd
                .select(['street_name', 'street_type', 'center_long', 'center_lat'])
                .drop_duplicates()
                .sample(fraction=0.1, seed=SAMPLE_SEED)
                .persist())

# 5% of the accidents, each tagged with an 'ID' column.
# monotonically_increasing_id() yields unique, increasing ids, but they are
# not guaranteed to be consecutive.
sample_accidents = (amd
                    .sample(fraction=0.05, seed=SAMPLE_SEED)
                    .withColumn('ID', monotonically_increasing_id()))

In [50]:
# Source: https://www.movable-type.co.uk/scripts/latlong.html
# Haversine great-circle distance between an accident (LOC_LAT, LOC_LONG) and a
# road center (center_lat, center_long), all in degrees:
#     a = sin^2(dlat / 2) + cos(lat1) * cos(lat2) * sin^2(dlong / 2)
#     d = 2 * R * atan2(sqrt(a), sqrt(1 - a))
# The factor 2 * R is the earth diameter, expressed here in metres, so every
# `distance` derived from it is in metres too.
earth_diameter = 6371 * 2 * 1000
# `a` in the formula above.
distance_inter = (pow(sin(radians(col('LOC_LAT') - col('center_lat'))/2), 2)
            + (pow(sin(radians(col('LOC_LONG') - col('center_long'))/2), 2)
                * cos(radians(col('LOC_LAT'))) * cos(radians(col('center_lat')))))
# Unscaled distance: atan2(sqrt(a), sqrt(1 - a)). It is proportional to the real
# distance, so it can be used directly to order the candidates.
distance_measure = atan2(sqrt(col('distance_inter')), sqrt(1-col('distance_inter')))

# For each accident, rank all road centers from nearest to farthest.
# rank() gives equal distances the same rank, so an accident may keep more than
# 5 rows when candidates are tied.
accidentWindow = Window.partitionBy("ID").orderBy("distance_measure")
cart = (sample_accidents
        .select(['LOC_LAT', 'LOC_LONG', 'ID'])
        .crossJoin(road_centers)
        .withColumn('distance_inter', distance_inter)
        .withColumn('distance_measure', distance_measure)
        .select('ID', 'street_name', 'street_type', 'distance_measure',
               rank().over(accidentWindow).alias('distance_rank'))
        .filter(col('distance_rank') <= 5)
        .withColumn('distance', col('distance_measure')*earth_diameter)
        # Only 'distance_measure' needs dropping: no 'min_distance_measure'
        # column is ever created in this pipeline.
        .drop('distance_measure'))

cart.take(5)

[Row(ID=26, street_name='Boulevard Métropolitain', street_type='Primary', distance_rank=1, distance=89.95913383683441),
 Row(ID=26, street_name='Boulevard Ray-Lawson', street_type='Primary', distance_rank=2, distance=139.34510529889405),
 Row(ID=26, street_name='Avenue Azilda', street_type='Secondary', distance_rank=3, distance=162.81337967265424),
 Row(ID=26, street_name='Boulevard Métropolitain', street_type='Primary', distance_rank=4, distance=191.1244951967038),
 Row(ID=26, street_name='Place Pigeon', street_type='Secondary', distance_rank=5, distance=198.4549155752083)]

# Test of the distance measure

In [45]:
import pyspark.sql.functions as f
from pyspark.sql import Window
import pandas as pd

# Sanity check of the haversine expressions on two known cases:
#   1. identical points                    -> distance 0
#   2. London (51.5, 0) to Arlington (38.8, -77.1) -> about 5918 km
# `earth_diameter` is expressed in metres, so the second distance is expected
# to be about 5 918 185 m.
DF = pd.DataFrame({'LOC_LAT': [0, 51.5],
                   'LOC_LONG': [0, 0],
                   'center_lat': [0, 38.8],
                   'center_long': [0, -77.1]
                  })
df = (spark.createDataFrame(DF)
        .withColumn('distance_inter', distance_inter)
        .withColumn('distance_measure', distance_measure)
        .withColumn('distance', col('distance_measure') * earth_diameter)
         .select('distance'))
df.show()

# Fail loudly instead of relying on a visual check of the table above.
# Sorting makes the check independent of the order in which rows come back.
distances = sorted(row['distance'] for row in df.collect())
assert distances[0] == 0.0, distances
assert abs(distances[1] - 5918185) < 1000, distances

+-----------------+
|         distance|
+-----------------+
|              0.0|
|5918.185064088762|
+-----------------+



# Test of the top k selection

In [44]:
import pyspark.sql.functions as f
from pyspark.sql import Window
import pandas as pd

# Toy check of the "k smallest per group" pattern used for the nearest roads:
# within each group `a`, keep the k rows with the smallest `c`.
k = 2
DF = pd.DataFrame({'a': [1,1,1,2,2,2,3,3,3],
                   'b': [1,2,3,1,2,3,1,2,3],
                   'c': [3,2,1,4,5,6,7,8,9]
                  })

df = spark.createDataFrame(DF)

window = Window.partitionBy("a").orderBy("c")

top_k = df.select('a', 'b', 'c', f.rank().over(window).alias('y')).filter(col('y') <= k)
top_k.show()

# Fail loudly instead of relying on a visual check of the table above.
# Rows are compared as a set because the order of the groups is not fixed.
expected_rows = {(1, 3, 1, 1), (1, 2, 2, 2),
                 (2, 1, 4, 1), (2, 2, 5, 2),
                 (3, 1, 7, 1), (3, 2, 8, 2)}
top_k_rows = [tuple(row) for row in top_k.collect()]
assert len(top_k_rows) == len(expected_rows), top_k_rows
assert set(top_k_rows) == expected_rows, top_k_rows

+---+---+---+---+
|  a|  b|  c|  y|
+---+---+---+---+
|  1|  3|  1|  1|
|  1|  2|  2|  2|
|  3|  1|  7|  1|
|  3|  2|  8|  2|
|  2|  1|  4|  1|
|  2|  2|  5|  2|
+---+---+---+---+

