In [3]:
import os
import sys
import numpy as np
from matplotlib import pyplot as plt

try:
    import pyspark
except ImportError:
    import findspark
    findspark.init()
    import pyspark
    
    
# Give the notebook access to the crymepipelines app modules.
# The checkout location can be overridden through the CRYMEPIPELINES_PATH
# environment variable; the default is the original development checkout.
CRYMEPIPELINES_PATH = os.environ.get(
    'CRYMEPIPELINES_PATH',
    '/home/ben/.envs/cc/CrymeClarity/crymepipelines/src',
)
sys.path.insert(0, CRYMEPIPELINES_PATH)
sys.path.insert(1, os.path.join(CRYMEPIPELINES_PATH, 'tasks'))


# Build the Spark session.
# Going through the class-level builder (instead of constructing a bare
# SparkContext first) lets APP_NAME reach the context that gets created, and
# getOrCreate() makes this cell safe to re-run: a second bare SparkContext()
# call would fail while a context is already active.
APP_NAME = 'CRYME_PIPELINE_DEV'
spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()
# Keep `sc` available for later cells; it is the context backing `spark`.
sc = spark.sparkContext


In [4]:
# import crymepipelines app modules
import csv
from datetime import datetime, timedelta
import os
import pickle as p
import shutil
from pyspark.sql.functions import udf, unix_timestamp
from pyspark.sql.types import StringType

from shared.objects.samples import SamplesManager
from shared.settings import CF_TRUST_DELAY, START_DATE, cf_conn, cp_conn, TMP_DIR, BIN_DIR
from tasks.base import SparkCrymeTask, NativeCrymeTask
from utils import crime_occ_udf, ts_to_minutes_in_day_udf, ts_to_hour_of_day_udf, ts_to_day_of_week_udf, ts_conv, safety_rel_crimes, actb_lat, actb_lon, space_dist
from tasks.mixins import SearchForCrimesMixin
from tasks.constants import crime_group_mapping

In [5]:
# Crime-incident features are read from the parquet file under the app's tmp directory.
crime_incidents_path = CRYMEPIPELINES_PATH+'/tmp/features_crime_incidents.parquet'
crime_incidents = spark.read.parquet(crime_incidents_path)

# Location/time samples are loaded through the task helper's load_df_from_cp.
sample_loader = SparkCrymeTask(spark)
loc_time_samples = sample_loader.load_df_from_cp('location_time_samples')

In [6]:
# Derive the join keys (lat_bb, lon_bb) and a unix timestamp for every sample.
# actb_lat / actb_lon come from utils; presumably they map a coordinate to an
# integer cell index -- TODO confirm against utils.
# NOTE(review): in the sample rows displayed further down this notebook,
# `latitude` holds values near -118 and `longitude` values near 34, the reverse
# of crime_incidents' lat/lon columns. The resulting lat_bb/lon_bb
# (e.g. 27106/9373) do not fall in the range of lat_bb_c/lon_bb_c
# (e.g. 7834/16348), which would explain the empty join below. Confirm whether
# the two columns are swapped upstream.
loc_time_samples = (
    loc_time_samples
    .withColumn('lat_bb', actb_lat(loc_time_samples['latitude']))
    .withColumn('lon_bb', actb_lon(loc_time_samples['longitude']))
    # convert datetime to unix timestamp
    .withColumn('timestamp_unix', unix_timestamp(loc_time_samples['timestamp']))
)

In [7]:
# Join samples to crime incidents once for each of the 9 (i, j) offsets in
# {-1, 0, 1} x {-1, 0, 1} applied to the samples' lat_bb / lon_bb, and collect
# every match in `results`.
# Previously `results` stayed None and each iteration overwrote
# `results_subsample`, so only the final (i=1, j=1) join was kept.
# Note: matched rows carry the *shifted* lat_bb / lon_bb values.
results = None
for i in range(-1, 2):
    for j in range(-1, 2):
        subsample = (
            loc_time_samples
            .withColumn('lat_bb', loc_time_samples.lat_bb + i)
            .withColumn('lon_bb', loc_time_samples.lon_bb + j)
        )

        results_subsample = subsample.join(
            crime_incidents,
            (subsample.lat_bb == crime_incidents.lat_bb_c) & (subsample.lon_bb == crime_incidents.lon_bb_c)
        )

        # Every per-offset join has the same schema, so a positional union is safe.
        if results is None:
            results = results_subsample
        else:
            results = results.union(results_subsample)
        

In [8]:
# Preview up to three joined rows; results_subsample holds the join for the
# last (i, j) offset processed by the loop above.
results_subsample.show(n=3)

+---+--------+---------+---------+------+------+--------------+---+------+-----------+--------+--------+-----------+---+---+-------+----------------+-------------+--------+--------+-----------+
| id|latitude|longitude|timestamp|lat_bb|lon_bb|timestamp_unix|_id|crm_cd|crm_cd_desc|date_occ|time_occ|premis_desc|lon|lat|crm_grp|time_occ_seconds|date_occ_unix|lat_bb_c|lon_bb_c|ts_occ_unix|
+---+--------+---------+---------+------+------+--------------+---+------+-----------+--------+--------+-----------+---+---+-------+----------------+-------------+--------+--------+-----------+
+---+--------+---------+---------+------+------+--------------+---+------+-----------+--------+--------+-----------+---+---+-------+----------------+-------------+--------+--------+-----------+



In [10]:
# Show the three most recent samples (newest timestamp first).
latest_first = loc_time_samples.orderBy('timestamp', ascending=False)
latest_first.show(n=3)

+------+-------------------+-----------------+-------------------+------+------+--------------+
|    id|           latitude|        longitude|          timestamp|lat_bb|lon_bb|timestamp_unix|
+------+-------------------+-----------------+-------------------+------+------+--------------+
|190307|-118.27669872988481|33.99893314222112|2019-04-17 16:58:44| 27106|  9373|    1555545524|
|190444|-118.42547117524487|34.29496886511509|2019-04-17 16:58:29| 27140|  9455|    1555545509|
|190302|-118.36798257059313|34.20741246560202|2019-04-17 16:57:18| 27126|  9431|    1555545438|
+------+-------------------+-----------------+-------------------+------+------+--------------+
only showing top 3 rows



In [9]:
# Preview the first three crime-incident rows to inspect lat_bb_c / lon_bb_c.
crime_incidents.show(n=3)

+------------------+------+--------------------+-------------------+--------+--------------------+---------+-------+--------------------+----------------+-------------+--------+--------+-----------+
|               _id|crm_cd|         crm_cd_desc|           date_occ|time_occ|         premis_desc|      lon|    lat|             crm_grp|time_occ_seconds|date_occ_unix|lat_bb_c|lon_bb_c|ts_occ_unix|
+------------------+------+--------------------+-------------------+--------+--------------------+---------+-------+--------------------+----------------+-------------+--------+--------+-----------+
|row-3p3f~hp6y-yrri|   442|SHOPLIFTING - PET...|2018-11-12 00:00:00|    1210|DIY CENTER (LOWE'...|-118.5928|34.1847|Shoplifting/Pickp...|           43800|   1542009600|    7834|   16348| 1542053400|
|row-3p3h-xkhf.yxkc|   210|             ROBBERY|2018-12-28 00:00:00|    1600|              STREET|-118.2842|34.0524|Robbery (Violent ...|           57600|   1545984000|    7803|   16305| 1546041600|
|row-

In [15]:
# Look for any crime incident whose lat_bb_c equals 27126, the lat_bb of
# sample id 190302 in the preview output above.
matches_sample_lat = crime_incidents.where(crime_incidents['lat_bb_c'] == 27126)
matches_sample_lat.show()

+---+------+-----------+--------+--------+-----------+---+---+-------+----------------+-------------+--------+--------+-----------+
|_id|crm_cd|crm_cd_desc|date_occ|time_occ|premis_desc|lon|lat|crm_grp|time_occ_seconds|date_occ_unix|lat_bb_c|lon_bb_c|ts_occ_unix|
+---+------+-----------+--------+--------+-----------+---+---+-------+----------------+-------------+--------+--------+-----------+
+---+------+-----------+--------+--------+-----------+---+---+-------+----------------+-------------+--------+--------+-----------+

