In [1]:
# Move the kernel's working directory up one level (IPython `cd` automagic).
# NOTE(review): presumably needed so the project-local modules imported in the
# next cell resolve from the repository root -- confirm. Not idempotent:
# re-running this cell climbs one more directory each time.
cd ..

/home/user/Documents/workspace/projects/accident-prediction-montreal


In [2]:
from accidents_montreal import fetch_accidents_montreal,\
                               extract_accidents_montreal_df,\
                               get_accident_df
from road_network import distance_intermediate_formula,\
                         distance_measure,\
                         get_road_features_df,\
                         get_road_df
from weather import add_weather_columns, extract_year_month_day
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import row_number, col, rank, avg, split, to_date, \
                                  rand, monotonically_increasing_id
from os.path import isdir
from shutil import rmtree
import datetime
from preprocess import generate_dates_df, init_spark

In [3]:
# Spark session used by every later cell.
spark = init_spark()

# Parquet directory in which the generated negative samples are cached.
cache_path = 'data/negative-samples.parquet'

# Check that an existing cache can actually be opened. A directory that Spark
# cannot read as parquet (e.g. left behind by an interrupted write) is removed
# so that it can be regenerated instead of blocking the later write.
if isdir(cache_path):
    try:
        spark.read.parquet(cache_path)
    except Exception:
        print('Failed reading from disk cache')
        rmtree(cache_path)

# Only 20 rows of each input are kept, which keeps the cartesian product built
# further down small (at most 20 x 20 rows).
dates_df = generate_dates_df("01/01/2012", "01/01/2017", spark).limit(20)
road_df = get_road_df(spark).limit(20)
# Road features are requested for the same reduced set of road rows.
road_features_df = get_road_features_df(spark, road_df=road_df)

Skip fetching road network: already downloaded
Extracting road network dataframe...
Extracting road network dataframe done
Skip extracting road features: already done


In [4]:
# Show the column names of the road dataframe before it is narrowed down to
# the location columns in the next cell.
road_df.columns

['street_name',
 'street_type',
 'center_long',
 'center_lat',
 'coord_long',
 'coord_lat',
 'street_id']

In [5]:
# Keep only the centre coordinates and the street id, rename the coordinates
# to the loc_* names used for the samples, then shuffle the rows randomly and
# mark the result for caching.
# NOTE: this rebinds `road_df`; re-running the cell on the already-renamed
# frame fails because the center_* columns no longer exist.
kept_columns = ['center_long', 'center_lat', 'street_id']
road_df = road_df.select(kept_columns)
road_df = road_df.withColumnRenamed('center_lat', 'loc_lat')
road_df = road_df.withColumnRenamed('center_long', 'loc_long')
road_df = road_df.orderBy(rand()).persist()

In [None]:
# Build the negative samples, or reuse them from the parquet cache.
#
# Writing with the default save mode fails when the target directory already
# exists, so the cache is read back when present instead of recomputing
# everything and then failing on the final write. Delete `cache_path` to force
# a regeneration (e.g. after changing the inputs above).
if isdir(cache_path):
    negative_samples = spark.read.parquet(cache_path)
else:
    # Pair every row of dates_df with every row of road_df. Each element of
    # the cartesian product is a pair of Rows; adding them concatenates their
    # fields into one flat record.
    # NOTE(review): the column names below assume dates_df yields
    # (date, hour) in that order -- confirm against generate_dates_df.
    negative_samples = (dates_df.rdd
                                .cartesian(road_df.rdd)
                                .map(lambda row: row[0] + row[1])
                                .toDF(['date', 'hour', 'loc_long',
                                       'loc_lat', 'street_id'])
                                # Unique (not consecutive) id per sample.
                                .withColumn('accident_id',
                                            monotonically_increasing_id())
                                .persist())

    # Presumably enriches each sample with weather data (see
    # weather.add_weather_columns), then attaches the road features of the
    # sample's street through an inner join on street_id.
    negative_samples = (add_weather_columns(spark, negative_samples)
                        .join(road_features_df, 'street_id'))

    negative_samples.write.parquet(cache_path)