In [1]:
cd ..

/home/user/Documents/workspace/projects/accident-prediction-montreal


In [2]:
from accidents_montreal import get_accident_df
from preprocess import init_spark
from road_network import get_road_df
from weather import add_weather_columns, extract_year_month_day
from pyspark.sql.functions import row_number, col, rank, avg, split, to_date, min

In [3]:
spark = init_spark()

# accidents

In [4]:
acc_df = get_accident_df(spark)

Skip fetching montreal accidents dataset: already downloaded
Skip extraction of accidents montreal dataframe: already done, reading from file


In [5]:
acc_df.columns

['NO_SEQ_COLL',
 'JR_SEMN_ACCDN',
 'DT_ACCDN',
 'CD_MUNCP',
 'NO_CIVIQ_ACCDN',
 'SFX_NO_CIVIQ_ACCDN',
 'BORNE_KM_ACCDN',
 'RUE_ACCDN',
 'TP_REPRR_ACCDN',
 'ACCDN_PRES_DE',
 'NB_METRE_DIST_ACCD',
 'CD_GENRE_ACCDN',
 'CD_SIT_PRTCE_ACCDN',
 'CD_ETAT_SURFC',
 'CD_ECLRM',
 'CD_ENVRN_ACCDN',
 'NO_ROUTE',
 'CD_CATEG_ROUTE',
 'CD_ETAT_CHASS',
 'CD_ASPCT_ROUTE',
 'CD_LOCLN_ACCDN',
 'CD_POSI_ACCDN',
 'CD_CONFG_ROUTE',
 'CD_ZON_TRAVX_ROUTR',
 'CD_PNT_CDRNL_ROUTE',
 'CD_PNT_CDRNL_REPRR',
 'CD_COND_METEO',
 'NB_VEH_IMPLIQUES_ACCDN',
 'NB_MORTS',
 'NB_BLESSES_GRAVES',
 'NB_BLESSES_LEGERS',
 'HEURE_ACCDN',
 'AN',
 'NB_VICTIMES_TOTAL',
 'GRAVITE',
 'REG_ADM',
 'MRC',
 'nb_automobile_camion_leger',
 'nb_camionLourd_tractRoutier',
 'nb_outil_equipement',
 'nb_tous_autobus_minibus',
 'nb_bicyclette',
 'nb_cyclomoteur',
 'nb_motocyclette',
 'nb_taxi',
 'nb_urgence',
 'nb_motoneige',
 'nb_VHR',
 'nb_autres_types',
 'nb_veh_non_precise',
 'NB_DECES_PIETON',
 'NB_BLESSES_PIETON',
 'NB_VICTIMES_PIETON',
 'NB_

In [6]:
dates_df = extract_year_month_day(acc_df.select('DT_ACCDN')
                                  .withColumn('date', to_date(col('DT_ACCDN'), format='yyyy/MM/dd'))).persist()
rows = dates_df.select('year').summary("min", "max").collect()
min_ = rows[0]['year']
max_ = rows[1]['year']
print(min_, max_)

2012 2017


In [7]:
# function get_random_date
import time
import datetime
import random
start = "01/01/" + str(min_)
end = "01/01/" + str(max_)
print(start, end)
start_stamp = datetime.datetime.strptime(start, "%d/%m/%Y").timestamp()
end_stamp = datetime.datetime.strptime(end, "%d/%m/%Y").timestamp()
random.seed()
random_stamp = datetime.datetime.fromtimestamp(random.randint(start_stamp, end_stamp))
date = random_stamp.strftime("%d/%m/%Y")
hour = random.randint(0,12)
print(date, hour)

01/01/2012 01/01/2017
06/04/2012 4


In [8]:
# generate all dates from start to end
date = datetime.datetime.strptime(start, "%d/%m/%Y")

dates = list()
while(date != datetime.datetime.strptime(end, "%d/%m/%Y")): 
    date += datetime.timedelta(days=1)
    for i in range(24):
        dates.append((date.strftime("%Y-%m-%d"), i))

In [9]:
sc = spark.sparkContext

In [10]:
print("nb_dates: ",len(dates))
spark.createDataFrame(dates, ['date', 'hour']).limit(10).show()
dates_df = spark.createDataFrame(dates, ['date', 'hour']).persist()

nb_dates:  43848
+----------+----+
|      date|hour|
+----------+----+
|2012-01-02|   0|
|2012-01-02|   1|
|2012-01-02|   2|
|2012-01-02|   3|
|2012-01-02|   4|
|2012-01-02|   5|
|2012-01-02|   6|
|2012-01-02|   7|
|2012-01-02|   8|
|2012-01-02|   9|
+----------+----+



# location
location (route) => coordinates of a road 

loc_lat, loc_long

In [11]:
from pyspark.sql.functions import rand, monotonically_increasing_id
from preprocess import generate_dates_df

#dates_df = generate_dates_df("01/01/2012", "01/01/2017", spark)
road_df = (get_road_df(spark)
               .select(['center_long', 'center_lat', 'street_id'])
               .withColumnRenamed('center_lat', 'loc_lat')
               .withColumnRenamed('center_long', 'loc_long')
               .orderBy(rand())
               .persist())
sc = spark.sparkContext
date_rdd = sc.parallelize(dates_df.rdd.take(20)).persist()
road_rdd = sc.parallelize(road_df.rdd.take(20)).persist()


Skip fetching road network: already downloaded
Extracting road network dataframe...
Extracting road network dataframe done


In [12]:
get_road_df(spark).columns

Skip fetching road network: already downloaded
Extracting road network dataframe...
Extracting road network dataframe done


['street_name',
 'street_type',
 'center_long',
 'center_lat',
 'coord_long',
 'coord_lat',
 'street_id']

In [13]:
road_rdd

ParallelCollectionRDD[59] at parallelize at PythonRDD.scala:195

In [14]:
negatives = date_rdd.cartesian(road_rdd)

In [15]:
negatives.take(10)

[(Row(date='2012-01-02', hour=0),
  Row(loc_long=-73.6639965, loc_lat=45.4765787, street_id=1868792317)),
 (Row(date='2012-01-02', hour=0),
  Row(loc_long=-73.5161412, loc_lat=45.63573, street_id=913716593)),
 (Row(date='2012-01-02', hour=1),
  Row(loc_long=-73.6639965, loc_lat=45.4765787, street_id=1868792317)),
 (Row(date='2012-01-02', hour=1),
  Row(loc_long=-73.5161412, loc_lat=45.63573, street_id=913716593)),
 (Row(date='2012-01-02', hour=0),
  Row(loc_long=-73.552167, loc_lat=45.462587, street_id=1417951801)),
 (Row(date='2012-01-02', hour=0),
  Row(loc_long=-73.4552443, loc_lat=45.7506666, street_id=361548870)),
 (Row(date='2012-01-02', hour=1),
  Row(loc_long=-73.552167, loc_lat=45.462587, street_id=1417951801)),
 (Row(date='2012-01-02', hour=1),
  Row(loc_long=-73.4552443, loc_lat=45.7506666, street_id=361548870)),
 (Row(date='2012-01-02', hour=0),
  Row(loc_long=-73.6232114, loc_lat=45.7152018, street_id=1088193442)),
 (Row(date='2012-01-02', hour=0),
  Row(loc_long=-73.77679

In [20]:
negatives_df = negatives.map(lambda row: row[0] + row[1]).toDF(['date','hour','loc_long','loc_lat', 'street_id']).persist()
negatives_df = negatives_df.withColumn('accident_id', monotonically_increasing_id())
negatives_df.show()

+----------+----+-----------+----------+----------+-----------+
|      date|hour|   loc_long|   loc_lat| street_id|accident_id|
+----------+----+-----------+----------+----------+-----------+
|2012-01-02|   0|-73.6639965|45.4765787|1868792317|          0|
|2012-01-02|   0|-73.5161412|  45.63573| 913716593|          1|
|2012-01-02|   1|-73.6639965|45.4765787|1868792317|          2|
|2012-01-02|   1|-73.5161412|  45.63573| 913716593|          3|
|2012-01-02|   0| -73.552167| 45.462587|1417951801| 8589934592|
|2012-01-02|   0|-73.4552443|45.7506666| 361548870| 8589934593|
|2012-01-02|   1| -73.552167| 45.462587|1417951801| 8589934594|
|2012-01-02|   1|-73.4552443|45.7506666| 361548870| 8589934595|
|2012-01-02|   0|-73.6232114|45.7152018|1088193442|17179869184|
|2012-01-02|   0| -73.776798|45.6502617|1430259433|17179869185|
|2012-01-02|   1|-73.6232114|45.7152018|1088193442|17179869186|
|2012-01-02|   1| -73.776798|45.6502617|1430259433|17179869187|
|2012-01-02|   0|-73.8154341|45.7698406|

In [21]:
from pyspark.sql.functions import udf, col, year, month, dayofmonth
from weather import extract_year_month_day
negatives_df.withColumn('year', year(col('date'))).show()

+----------+----+-----------+----------+----------+-----------+----+
|      date|hour|   loc_long|   loc_lat| street_id|accident_id|year|
+----------+----+-----------+----------+----------+-----------+----+
|2012-01-02|   0|-73.6639965|45.4765787|1868792317|          0|2012|
|2012-01-02|   0|-73.5161412|  45.63573| 913716593|          1|2012|
|2012-01-02|   1|-73.6639965|45.4765787|1868792317|          2|2012|
|2012-01-02|   1|-73.5161412|  45.63573| 913716593|          3|2012|
|2012-01-02|   0| -73.552167| 45.462587|1417951801| 8589934592|2012|
|2012-01-02|   0|-73.4552443|45.7506666| 361548870| 8589934593|2012|
|2012-01-02|   1| -73.552167| 45.462587|1417951801| 8589934594|2012|
|2012-01-02|   1|-73.4552443|45.7506666| 361548870| 8589934595|2012|
|2012-01-02|   0|-73.6232114|45.7152018|1088193442|17179869184|2012|
|2012-01-02|   0| -73.776798|45.6502617|1430259433|17179869185|2012|
|2012-01-02|   1|-73.6232114|45.7152018|1088193442|17179869186|2012|
|2012-01-02|   1| -73.776798|45.65

In [22]:
plus_weather = add_weather_columns(spark, negatives_df)


In [25]:
from road_network import get_road_features_df
road_features_df = get_road_features_df(spark, road_df=get_road_df(spark))
plus_weather.drop('loc_long').drop('loc_lat').drop('Wind_Chill').join(road_features_df, 'street_id').show()

Skip fetching road network: already downloaded
Extracting road network dataframe...
Extracting road network dataframe done
Extracting road features...
Extracting road features: done
+---------+--------------+----+---------+---------+-----------+----------+---------+---------+------------+----------+----+------------+-----------------+-----------+
|street_id|Dew_Point_Temp|Hmdx|  Rel_Hum|Stn_Press|       Temp|Visibility| Wind_Dir| Wind_Spd| accident_id|      date|hour|street_level|    street_length|street_type|
+---------+--------------+----+---------+---------+-----------+----------+---------+---------+------------+----------+----+------------+-----------------+-----------+
|785993152|     1.6631544|null|78.631546| 99.20046|  5.0368457|      25.0|21.851366|22.549969| 60129542148|2012-01-02|   0|   Secondary|31.87846443044035|     street|
|785993152|    0.16315441|null|73.631546| 99.19898|  4.5184226|      25.0|22.305481|30.022568| 60129542150|2012-01-02|   1|   Secondary|31.87846443044

# weather
weather => year/month/day date + hour
just query the API