In [1]:
#import findspark
#findspark.init()
import os
import pyspark
from matplotlib import pyplot as plt
from utils import *
from pyspark.sql.functions import collect_list
APP_NAME = 'Exploration-Notebook'
# Build (or reuse) the session through the class-level builder. Creating a bare
# SparkContext first meant the app name was never applied to the context, and
# re-running this cell raised because a second SparkContext cannot be started
# in the same process. getOrCreate() makes the cell safe to re-run.
spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()
# Keep `sc` available for any cell that still uses the raw context.
sc = spark.sparkContext

In [19]:
# Read the `incidents` collection of the local `crymeclarity` MongoDB database
# into a Spark DataFrame via the MongoDB Spark connector.
ds = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://localhost/crymeclarity.incidents")
    .load()
)

In [20]:
from pyspark.sql.types import ArrayType, StructType, IntegerType, StructField, StringType, FloatType, TimestampType, DecimalType
from pyspark.sql.functions import udf

def assign_coordinate_to_lat_box(latitude, box_size=0.008726950000000073):
    """Map a coordinate to the index of the grid box that contains it.

    The coordinate is divided by ``box_size``, truncated toward zero and made
    non-negative, so coordinates of opposite sign share the same indices.

    Args:
        latitude: Numeric coordinate, or None (Spark passes None for null cells).
        box_size: Extent of one grid box, in the same unit as ``latitude``
            (presumably degrees -- confirm). Defaults to the value that was
            previously hard-coded.

    Returns:
        The non-negative integer box index, or None when ``latitude`` is None,
        so a null input yields a null column value instead of failing the job.
    """
    if latitude is None:
        return None
    return abs(int(latitude / box_size))
def assign_coordinate_to_lon_box(longitude, box_size=0.007254180000003885):
    """Map a coordinate to the index of the grid box that contains it.

    The coordinate is divided by ``box_size``, truncated toward zero and made
    non-negative, so coordinates of opposite sign share the same indices.

    Args:
        longitude: Numeric coordinate, or None (Spark passes None for null cells).
        box_size: Extent of one grid box, in the same unit as ``longitude``
            (presumably degrees -- confirm). Defaults to the value that was
            previously hard-coded.

    Returns:
        The non-negative integer box index, or None when ``longitude`` is None,
        so a null input yields a null column value instead of failing the job.
    """
    if longitude is None:
        return None
    return abs(int(longitude / box_size))

def time_occ_to_seconds(time_occ):
    """Convert an ``HHMM`` time-of-day value to seconds since midnight.

    The first two characters are read as hours and the rest as minutes.

    Args:
        time_occ: The time as a string such as ``"0130"``. Values that lost
            their leading zeros (``"130"``) or arrive as integers (``130``)
            are left-padded to four digits before parsing. None is allowed.

    Returns:
        Seconds since midnight as an int, or None when ``time_occ`` is None,
        so a null input yields a null column value instead of failing the job.

    Raises:
        ValueError: If the hour or minute part is not numeric.
    """
    if time_occ is None:
        return None
    # zfill is a no-op for well-formed 4-character input.
    hhmm = str(time_occ).zfill(4)
    hours, minutes = int(hhmm[:2]), int(hhmm[2:])
    return hours * 60 ** 2 + minutes * 60

# Wrap the plain-Python helpers as Spark UDFs so they can be applied to columns.

# Grid-box assignment: both return an integer box index.
actb_lat = udf(assign_coordinate_to_lat_box, returnType=IntegerType())
actb_lon = udf(assign_coordinate_to_lon_box, returnType=IntegerType())

# Time-of-day string -> integer seconds.
t_occ_conv = udf(time_occ_to_seconds, returnType=IntegerType())

# Raw timestamp value -> Spark timestamp. `cla_timestamp_to_datetime` is not
# defined in this notebook; presumably it comes from `from utils import *`.
ts_conv = udf(cla_timestamp_to_datetime, returnType=TimestampType())

In [21]:
# Add the columns the spatial/temporal join needs to the incidents frame:
#   lat_bb_c / lon_bb_c  - grid-box indices from location_1.coordinates[0] / [1]
#   date_occ             - replaced in place by its converted timestamp
#   time_occ_seconds     - time_occ expressed as integer seconds
# NOTE(review): date_occ is overwritten, so running this cell twice applies
# ts_conv to an already-converted column.
ds = (
    ds
    .withColumn('lat_bb_c', actb_lat(ds.location_1.coordinates[0]))
    .withColumn('lon_bb_c', actb_lon(ds.location_1.coordinates[1]))
    .withColumn('date_occ', ts_conv(ds.date_occ))
    .withColumn('time_occ_seconds', t_occ_conv(ds.time_occ))
)

In [22]:
# Load the `safety_safetyanalysisrequest` table of the local `crymeweb` MySQL
# database over JDBC.
# Credentials are taken from the environment so real secrets never have to be
# written into the notebook; the fallbacks reproduce the previous hard-coded
# local-development values (user "root", empty password).
A = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:mysql://localhost/crymeweb?serverTimezone=UTC",
        driver="com.mysql.jdbc.Driver",
        dbtable="safety_safetyanalysisrequest",
        user=os.environ.get("CRYMEWEB_DB_USER", "root"),
        password=os.environ.get("CRYMEWEB_DB_PASSWORD", ""),
    )
    .load()
)


In [23]:
# Tag every request with the grid box it falls in, using the same UDFs that
# produced lat_bb_c / lon_bb_c on the incidents frame.
# NOTE(review): the sample row printed further down has latitude ~ -118.19 and
# longitude ~ 34.09, so the two columns look swapped in this table -- confirm.
A = (
    A
    .withColumn('lat_bb', actb_lat(A.latitude))
    .withColumn('lon_bb', actb_lon(A.longitude))
)

In [24]:
import pyspark.sql.functions as psf
from pyspark.sql.functions import col
import mpu
def _haversine_scaled(w, x, y, z):
    """Haversine distance between points (w, x) and (y, z), times 0.621371.

    mpu documents each point as a (latitude, longitude) pair and the result as
    kilometres, which would make the factor a km -> miles conversion.
    """
    first_point = (w, x)
    second_point = (y, z)
    return mpu.haversine_distance(first_point, second_point) * 0.621371


# Column-level version of the helper above; yields a float column.
space_dist = udf(_haversine_scaled, FloatType())


# Match each request in A with the incidents in ds that are close in both
# space and time.
#
# Incidents are bucketed into grid boxes, so a request is compared against its
# own box and the 8 surrounding ones: for every offset (i, j) in {-1, 0, 1}^2
# the request's box is shifted and equi-joined to the incident boxes. A given
# (request, incident) pair can satisfy the join for only one offset.
WINDOW_SECONDS = 3600   # incident must start after the request, within this window
MAX_DISTANCE = .5       # upper bound on space_dist (miles, if mpu returns km -- confirm)

results = None
for i in range(-1, 2):
    for j in range(-1, 2):
        # Shift the request's box by (i, j).
        B = A.withColumn('lat_bb', A.lat_bb + i)
        B = B.withColumn('lon_bb', B.lon_bb + j)

        res = B.join(ds, (B.lat_bb == ds.lat_bb_c) & (B.lon_bb == ds.lon_bb_c))

        # Put both moments on the unix-seconds scale. The incident moment is
        # its date plus the time-of-day offset in seconds.
        res = res.withColumn('timestamp_unix', psf.unix_timestamp(res.timestamp))
        res = res.withColumn('date_occ_unix', psf.unix_timestamp(res.date_occ))
        res = res.withColumn('ts_occ_unix', res.date_occ_unix + res.time_occ_seconds)

        # Keep incidents strictly after the request and inside the window.
        elapsed = res.ts_occ_unix - res.timestamp_unix
        res = res.filter((elapsed > 0) & (elapsed < WINDOW_SECONDS))

        # Exact distance check on the surviving candidates.
        # NOTE(review): the argument order pairs the request's `longitude`
        # with coordinates[1]; it relies on how the two sources store their
        # coordinates -- confirm before changing.
        res = res.withColumn('distance', space_dist(
            res.longitude,
            res.latitude,
            res.location_1.coordinates[1],
            res.location_1.coordinates[0],
        ))
        res = res.filter(res.distance < MAX_DISTANCE)

        # Compare against None explicitly instead of relying on the truthiness
        # of a DataFrame object.
        results = res if results is None else results.union(res)


In [26]:
# Run the job and report how many (request, incident) matches passed all filters.
results.count()

214

In [27]:
# Bring only the identifier pair of each match into pandas: ':id' (a column of
# the incidents frame) and 'id' (a column of the requests frame).
# NOTE(review): the name `re` would shadow the standard-library `re` module if
# it is ever imported in this notebook.
re = results.select(':id', 'id').toPandas()

In [46]:
# Spot-check: print the request row whose id is 51777.
A.filter(A.id==51777).show()

+-----+-------------------+-----------------+--------------------+--------+--------+------+------+
|   id|           latitude|        longitude|           timestamp|estimate|model_id|lat_bb|lon_bb|
+-----+-------------------+-----------------+--------------------+--------+--------+------+------+
|51777|-118.19438636182308|34.09015904415087|2019-03-17 04:51:...|    null|    null| 13543|  4699|
+-----+-------------------+-----------------+--------------------+--------+--------+------+------+



In [49]:
# Spot-check: print the incident whose ':id' is 'row-ttw5~9nep~aydw', one of
# the incidents matched to request 51777.
ds.filter(ds[':id']=='row-ttw5~9nep~aydw').show()

+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+--------------------+------------------+--------------------+-----------------+------------------+-------+----------+------+--------+--------+--------------------+------------+-------------------+--------------------+---------+--------------------+--------------------+---------+---------+-----------+--------------------+-----------+------+-----------+--------+--------+------------+--------+-----------+--------------+--------+--------+----------------+
|:@computed_region_2dna_qi2s|:@computed_region_k96s_3jcv|:@computed_region_kqwf_mjcx|:@computed_region_qz3q_ghft|:@computed_region_tatf_ua23|:@computed_region_ur2y_g4cx|         :created_at|               :id|         :updated_at|         :version|               _id|area_id| area_name|crm_cd|crm_cd_1|crm_cd_2|         crm_cd_desc|cross_street|           date_occ|          

In [47]:
# List every matched incident for request 51777.
re[re['id']==51777]

Unnamed: 0,:id,id
186,row-nfx7-2ybp~ti7t,51777
187,row-ttw5~9nep~aydw,51777


In [42]:
# Number of matched incidents per request id, largest first.
re.groupby('id').id.agg('count').sort_values(ascending=False)

id
49959    3
47136    2
51777    2
53186    2
48768    2
54613    2
52195    2
46400    2
52473    2
55996    1
49246    1
49613    1
49553    1
49365    1
49337    1
49256    1
49215    1
49218    1
49766    1
49156    1
48941    1
48881    1
48834    1
48799    1
48651    1
48603    1
48600    1
48587    1
49760    1
49940    1
        ..
51199    1
51187    1
51122    1
55937    1
51830    1
51894    1
53393    1
51915    1
53383    1
53375    1
53351    1
53331    1
53252    1
53236    1
53185    1
53126    1
52959    1
52929    1
52872    1
52759    1
52755    1
52555    1
52522    1
52490    1
52448    1
52333    1
52199    1
52141    1
51947    1
46005    1
Name: id, Length: 204, dtype: int64