## Clean & Pipe Recent Crime Events


---
Objective: Identify valid crimes for our analysis and pipe them into our web app.

In [1]:
import os
import sys

try:
    import pyspark
except ImportError:
    import findspark
    findspark.init()
    import pyspark
    
    
# Give the notebook access to the crymepipelines app modules.
# The checkout location can be overridden with the CRYMEPIPELINES_PATH
# environment variable; the original developer path remains the default so
# existing setups behave exactly as before.
CRYMEPIPELINES_PATH = os.environ.get(
    'CRYMEPIPELINES_PATH',
    '/home/ben/.envs/cc/CrymeClarity/crymepipelines/src',
)
# Keep the path at the front of sys.path, but skip the insert when it is
# already there so re-running this cell does not stack duplicate entries.
if sys.path[:1] != [CRYMEPIPELINES_PATH]:
    sys.path.insert(0, CRYMEPIPELINES_PATH)

# Build (or reuse) the Spark session.
# SparkSession.builder is a class-level attribute, so it is used directly
# rather than through a throwaway SparkSession(sc) instance. getOrCreate()
# hands back the already-running session when this cell is executed again,
# whereas a second bare pyspark.SparkContext() call raises because only one
# context may be active at a time. Setting the app name on the builder before
# any context exists also means the name is applied to the context itself.
APP_NAME = 'CRYME_PIPELINE_DEV'
spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()
# Expose the session's underlying context under the name the notebook uses.
sc = spark.sparkContext


In [None]:
# import crymepipelines app modules
import csv
from datetime import datetime, timedelta
import os
import pickle as p
import shutil

from shared.objects.samples import SamplesManager
from shared.settings import CF_TRUST_DELAY, START_DATE, cf_conn, cp_conn, TMP_DIR, BIN_DIR
from tasks.base import SparkCrymeTask, NativeCrymeTask
from utils import crime_occ_udf, ts_to_minutes_in_day_udf, ts_to_hour_of_day_udf, ts_to_day_of_week_udf, ts_conv, safety_rel_crimes
from tasks.mixins import SearchForCrimesMixin

In [None]:
# Load the "incidents" data from crymefeeder through a SparkCrymeTask bound to
# the notebook's Spark session.
incidents_loader = SparkCrymeTask(spark)
raw_crime_incidents = incidents_loader.load_df_from_crymefeeder("incidents")

In [None]:
# Total number of rows loaded, before any filtering is applied.
raw_crime_incidents.count()

In [None]:
# Print a single row to see which columns the raw data carries.
raw_crime_incidents.show(1)

That's a lotta data, so let's just look at rows whose "date_occ" falls within the last 30 days.

In [None]:
# Run "date_occ" through ts_conv, then keep only the incidents whose converted
# date is later than the cutoff (today's date minus 30 days).
recent_cutoff = datetime.now().date() - timedelta(days=30)
crime_incidents = raw_crime_incidents.withColumn(
    'date_occ',
    ts_conv(raw_crime_incidents.date_occ),
)
is_recent = crime_incidents.date_occ > recent_cutoff
crime_incidents = crime_incidents.filter(is_recent)

#### Crime Types

In [None]:
# One row per crime description: its incident count plus the first crm_cd seen
# for that description, ordered by ascending count.
type_aggregations = {'_id': 'count', 'crm_cd': 'first'}
by_type = (
    crime_incidents
    .groupBy('crm_cd_desc')
    .agg(type_aggregations)
    .orderBy("count(_id)")
)
# Print every group (the row count comes from the frame itself) without
# truncating long descriptions.
by_type.show(by_type.count(), truncate=False)

Not all of these seem like real "safety" threats, for example "Theft of Identity". I think I should create a data structure containing all the crm_cd_desc values or crm_cd codes that could be considered violent or threatening. (Added to utils.py as `safety_rel_crimes`.)

In [None]:
# Keep only the incidents whose crm_cd is one of the keys of safety_rel_crimes.
safety_crm_cds = list(safety_rel_crimes.keys())
is_safety_related = crime_incidents.crm_cd.isin(safety_crm_cds)
crime_incidents = crime_incidents.filter(is_safety_related)

Finally, there are a bunch of columns that don't seem too interesting, so let's drop those and flatten the coordinates.

In [None]:
# Flatten the nested point into top-level Latitude / Longitude columns.
# GeoJSON positions are ordered [longitude, latitude], so index 1 is the
# latitude and index 0 is the longitude (the two indices were previously
# swapped).
# NOTE(review): this assumes location_1 is a GeoJSON Point — confirm against
# the crymefeeder schema.
crime_incidents = crime_incidents.withColumn('Latitude', crime_incidents.location_1.coordinates[1])
crime_incidents = crime_incidents.withColumn('Longitude', crime_incidents.location_1.coordinates[0])

In [None]:
# Spot-check one row of the filtered frame, including the flattened
# Latitude / Longitude columns.
crime_incidents.first()

In [None]:
# Narrow the frame down to the columns listed here.
# NOTE(review): 'longitude' / 'latitude' are lower-case while the columns were
# created as 'Longitude' / 'Latitude'; this relies on Spark resolving column
# names case-insensitively (its default setting) — confirm.
kept_columns = [
    '_id',
    'crm_cd',
    'crm_cd_desc',
    'date_occ',
    'time_occ',
    'premis_desc',
    'longitude',
    'latitude',
]
crime_incidents = crime_incidents.select(kept_columns)

In [None]:
from pyspark.sql.functions import monotonically_increasing_id

# Add a generated "id" column, then rename the source "_id" column to "row_id".
crime_incidents = (
    crime_incidents
    .withColumn("id", monotonically_increasing_id())
    .withColumnRenamed("_id", "row_id")
)

In [None]:
# Look up one specific incident by its row_id and print it.
row_id_matches = crime_incidents.row_id == 'row-zwf3_jww3_st7i'
crime_incidents.filter(row_id_matches).show()

In [None]:
# Hand the cleaned incidents to write_to_cw with 'crime_crimeincident' as the
# target name.
crymeweb_writer = SparkCrymeTask(spark)
crymeweb_writer.write_to_cw(crime_incidents, 'crime_crimeincident')


In [None]:
from shared.settings import CRYMEWEB_DB_URL

In [None]:
# Display the configured CRYMEWEB_DB_URL setting.
# NOTE(review): database URLs often embed credentials — check this cell's
# output and clear it before sharing or committing the notebook.
CRYMEWEB_DB_URL

In [78]:
# Read the features parquet from the tmp directory under CRYMEPIPELINES_PATH.
features_parquet_path = CRYMEPIPELINES_PATH + '/tmp/features_crime_incidents.parquet'
a = spark.read.parquet(features_parquet_path)

In [79]:
# Preview the loaded features frame (show() prints its default number of rows).
a.show()

+------------------+------+--------------------+-------------------+--------+--------------------+---------+---------+-------+----------------+-------------+--------+--------+-----------+
|               _id|crm_cd|         crm_cd_desc|           date_occ|time_occ|         premis_desc|      lon|      lat|crm_grp|time_occ_seconds|date_occ_unix|lat_bb_c|lon_bb_c|ts_occ_unix|
+------------------+------+--------------------+-------------------+--------+--------------------+---------+---------+-------+----------------+-------------+--------+--------+-----------+
|row-4vvu~7t7z-p5wk|   440|THEFT PLAIN - PET...|2018-10-06 00:00:00|    2204|              STREET|-118.2564|  33.9721|      T|           79440|   1538809200|    7785|   16301| 1538888640|
|row-4vvw-q5n2_nyhf|   310|            BURGLARY|2018-12-08 00:00:00|    1930|MULTI-UNIT DWELLI...|-118.3923|  34.1457|      B|           70200|   1544256000|    7825|   16320| 1544326200|
|row-4vvz-xfab.jh5h|   210|             ROBBERY|2019-01-30 0