In [1]:
cd ..

In [2]:
from road_network import get_road_df
from accidents_montreal import fetch_accidents_montreal,\
                               extract_accidents_montreal_df,\
                               get_accident_df
from road_network import distance_intermediate_formula,\
                         distance_measure,\
                         get_road_features_df,\
                         get_road_df
from weather import add_weather_columns, extract_year_month_day
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import row_number, col, rank, avg, split, to_date, \
                                  rand, monotonically_increasing_id
from os.path import isdir
from shutil import rmtree
import datetime
from preprocess import init_spark
from preprocess import preprocess_accidents

In [3]:
# Obtain the session object returned by the project's init_spark helper
# (presumably a SparkSession — confirm in preprocess.init_spark).
spark = init_spark()
# Load the road network dataframe. Per the messages this call prints, both the
# download and the extraction are skipped when already done and the dataframe
# is read back from file.
road_df = get_road_df(spark)

Skip fetching road network: already downloaded
Skip extraction of road network dataframe: already done, reading from file


In [4]:
# On-disk parquet location for the positive samples, and a switch that
# controls whether an already existing cache directory is discarded.
cache_path = 'data/positive-samples.parquet'
replace_cache = True

if isdir(cache_path):
    if replace_cache:
        # Remove the stale cache directory. A deletion failure is allowed to
        # propagate as-is rather than being reported as a read failure and
        # blindly retried.
        rmtree(cache_path)
    else:
        print('Keeping existing cache at {}'.format(cache_path))

# Defensive reload in case road_df is bound to None at this point.
if road_df is None:
    road_df = get_road_df(spark)
    

In [5]:
import os
from shutil import rmtree

# Delete any previously written accidents parquet directory before loading
# (presumably so that get_accident_df rebuilds it — confirm).
cache = 'data/accidents-montreal.parquet'
if os.path.isdir(cache):
    print('deleting dir')
    rmtree(cache)
    # Sanity check: prints False once the directory is gone.
    print(os.path.isdir(cache))

# NOTE(review): the meaning of the positional True flag is not visible from
# this cell — confirm against get_accident_df's signature.
accidents_df = get_accident_df(spark, True).persist()

# Cell output: the list of column names of the loaded dataframe.
accidents_df.columns

deleting dir
False
Skip fetching montreal accidents dataset: already downloaded


['NO_SEQ_COLL',
 'JR_SEMN_ACCDN',
 'DT_ACCDN',
 'CD_MUNCP',
 'NO_CIVIQ_ACCDN',
 'SFX_NO_CIVIQ_ACCDN',
 'BORNE_KM_ACCDN',
 'RUE_ACCDN',
 'TP_REPRR_ACCDN',
 'ACCDN_PRES_DE',
 'NB_METRE_DIST_ACCD',
 'CD_GENRE_ACCDN',
 'CD_SIT_PRTCE_ACCDN',
 'CD_ETAT_SURFC',
 'CD_ECLRM',
 'CD_ENVRN_ACCDN',
 'NO_ROUTE',
 'CD_CATEG_ROUTE',
 'CD_ETAT_CHASS',
 'CD_ASPCT_ROUTE',
 'CD_LOCLN_ACCDN',
 'CD_POSI_ACCDN',
 'CD_CONFG_ROUTE',
 'CD_ZON_TRAVX_ROUTR',
 'CD_PNT_CDRNL_ROUTE',
 'CD_PNT_CDRNL_REPRR',
 'CD_COND_METEO',
 'NB_VEH_IMPLIQUES_ACCDN',
 'NB_MORTS',
 'NB_BLESSES_GRAVES',
 'NB_BLESSES_LEGERS',
 'HEURE_ACCDN',
 'AN',
 'NB_VICTIMES_TOTAL',
 'GRAVITE',
 'REG_ADM',
 'MRC',
 'nb_automobile_camion_leger',
 'nb_camionLourd_tractRoutier',
 'nb_outil_equipement',
 'nb_tous_autobus_minibus',
 'nb_bicyclette',
 'nb_cyclomoteur',
 'nb_motocyclette',
 'nb_taxi',
 'nb_urgence',
 'nb_motoneige',
 'nb_VHR',
 'nb_autres_types',
 'nb_veh_non_precise',
 'NB_DECES_PIETON',
 'NB_BLESSES_PIETON',
 'NB_VICTIMES_PIETON',
 'NB_

In [6]:

    
# Reduce the accidents dataframe to the five columns used downstream:
#   accident_id, loc_lat, loc_long : renamed copies of the source columns
#   date : DT_ACCDN parsed with the 'yyyy/MM/dd' pattern
#   hour : text before the first ':' of HEURE_ACCDN, cast to int
# Rows with a null in any of these columns are dropped, then only 20 rows are
# kept and the result is persisted.
# NOTE: this rebinds accidents_df, so re-running the cell without first
# re-running the cell that loads accidents_df fails (DT_ACCDN and HEURE_ACCDN
# no longer exist on the rebound dataframe).
accident_date = to_date(col('DT_ACCDN'), format='yyyy/MM/dd')
accident_hour = split(col('HEURE_ACCDN'), ':')[0].cast("int")

accidents_df = (
    accidents_df
    .select(
        col('ACCIDENT_ID').alias('accident_id'),
        col('LOC_LAT').alias('loc_lat'),
        col('LOC_LONG').alias('loc_long'),
        accident_date.alias('date'),
        accident_hour.alias('hour'),
    )
    .dropna()
    .limit(20)
    .persist()
)

In [7]:
from preprocess import match_accidents_with_roads

# Build the three inputs of the positive-sample table from the project helpers.
road_features_df = get_road_features_df(spark, road_df=road_df)
match_accident_road = match_accidents_with_roads(road_df, accidents_df)
accident_with_weather = add_weather_columns(spark, accidents_df)

# Attach the matched road to each accident (on 'accident_id'), then the
# features of that road (on 'street_id'), and pass the result through
# extract_year_month_day (presumably adds year/month/day columns — confirm).
positive_samples = extract_year_month_day(
    accident_with_weather
    .join(match_accident_road, 'accident_id')
    .join(road_features_df, 'street_id'))

# The parquet writer raises by default when the target path already exists,
# which made this cell fail on a re-run. Overwrite when the cache is meant to
# be replaced (or is absent); otherwise leave the existing cache untouched.
if replace_cache or not isdir(cache_path):
    positive_samples.write.parquet(cache_path, mode='overwrite')

Skip extracting road features: already done


In [8]:
# Preview the joined positive samples: first 20 rows, long cell values
# truncated (show()'s defaults, spelled out).
positive_samples.show(n=20, truncate=True)

+----------+-----------+--------------+---------+---------+----------+-----------+----------+----------+---------+----------+----------+----+---------+----------+------------+------------------+-----------+----+-----+---+
| street_id|accident_id|Dew_Point_Temp|     Hmdx|  Rel_Hum| Stn_Press|       Temp|Visibility|Wind_Chill| Wind_Dir|  Wind_Spd|      date|hour|  loc_lat|  loc_long|street_level|     street_length|street_type|year|month|day|
+----------+-----------+--------------+---------+---------+----------+-----------+----------+----------+---------+----------+----------+----+---------+----------+------------+------------------+-----------+----+-----+---+
|1262420041|         19|    0.88246626|     null| 72.13579| 100.61591|   5.527146|      25.0|      null|27.181822|  7.800327|2012-04-26|   0| 45.50203| -73.81524|     Primary|13.050115618030041|  boulevard|2012|    4| 26|
|1424668385|          7|     7.3442655|     null| 74.98749|  100.2302|  11.638672|  20.46455|      null|22.14739