### Preprocessing the Stop, Question and Frisk Data (2017-2022)

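- Drop selected columns
- Map patrol-borough codes (e.g. `PBBX`) to borough names
- Normalize the year value `2017.00` to `2017`
- Replace placeholder strings (e.g. `(null)`, `NaN`) with real NULL values
- Parse the stop-and-frisk date (`STOP_FRISK_DATE`) into a date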

In [2]:
# Add the parent directory to the import path; the `helpers` package imported
# further down is presumably located there (the config path used later is
# `./../helpers/...`) — confirm against the repository layout.
import sys
sys.path.append("..")

In [25]:
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import when, col
from snowflake.snowpark.functions import expr
from datetime import date
from helpers import SnowflakeHelper
import json
import os
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# Lookup from the codes/abbreviations that appear in the borough columns to
# the canonical borough name. Several codes collapse onto the same borough.
borough_mapping = dict([
    ('PBBX', 'BRONX'),
    ('PBSI', 'STATEN ISLAND'),
    ('PBMN', 'MANHATTAN'),
    ('PBMS', 'MANHATTAN'),
    ('PBBN', 'BROOKLYN'),
    ('PBBS', 'BROOKLYN'),
    ('PBQS', 'QUEENS'),
    ('PBQN', 'QUEENS'),
    ('STATEN IS', 'STATEN ISLAND'),
])

# Placeholder strings that stand in for "no value"; each one maps to None.
null_value_mapping = dict.fromkeys(['(null)', 'NaN', '(', 'NULL'])

In [5]:
# Open a Snowpark session through the project helper, using the JSON config
# file that sits in the helpers directory.
snowflake_helper = SnowflakeHelper()
# NOTE(review): relative path — presumably resolved against the notebook's
# working directory; confirm how the helper opens it.
snowflake_config = './../helpers/snowflake_config.json'
session = snowflake_helper.create_snowpark_session(snowflake_config)
# Make the bronze schema current so unqualified table names resolve against it.
session.use_schema("SAFEGUARDING_NYC_SCHEMA_BRONZE")

[SUCCESS] Config file loaded successfully!
[SUCCESS] Snowspark Session created successfully!


In [6]:
# Reference the SQF table in the current (bronze) schema.
sqf_data = session.table('SQF')

In [7]:
# Preview the table as loaded, before any cleaning.
sqf_data.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
# Row count before any cleaning; useful as a baseline for later filters.
sqf_data.count()

102421

In [9]:
# Columns removed from the working frame: the three Airbyte bookkeeping
# columns plus a set of SQF attributes that are not carried forward.
columns_to_drop = [
    '_AIRBYTE_RAW_ID',
    '_AIRBYTE_EXTRACTED_AT',
    '_AIRBYTE_META',
    'STOP_LOCATION_ZIP_CODE',
    'ID_CARD_IDENTIFIES_OFFICER_FLAG',
    'OFFICER_NOT_EXPLAINED_STOP_DESCRIPTION',
    'SUSPECTS_ACTIONS_CASING_FLAG',
    'SUMMONS_ISSUED_FLAG',
    'VERBAL_IDENTIFIES_OFFICER_FLAG',
    'SEARCH_BASIS_ADMISSION_FLAG',
    'SEARCH_BASIS_OTHER_FLAG',
    'SEARCH_BASIS_CONSENT_FLAG',
    # Spelling ("BACKROUND") is the actual column name; do not correct it.
    'BACKROUND_CIRCUMSTANCES_SUSPECT_KNOWN_TO_CARRY_WEAPON_FLAG',
    'RECORD_STATUS_CODE',
    'PHYSICAL_FORCE_RESTRAINT_USED_FLAG',
    'PHYSICAL_FORCE_HANDCUFF_SUSPECT_FLAG',
    'OTHER_PERSON_STOPPED_FLAG',
    'SUSPECTS_ACTIONS_PROXIMITY_TO_SCENE_FLAG',
    'DEMEANOR_CODE',
    'SUPERVISING_OFFICER_COMMAND_CODE',
    'STOP_ID_ANONY',
    'ISSUING_OFFICER_COMMAND_CODE',
    'LOCATION_IN_OUT_CODE',
    'JURISDICTION_DESCRIPTION',
    'PHYSICAL_FORCE_VERBAL_INSTRUCTION_FLAG',
]
sqf_data = sqf_data.drop(*columns_to_drop)
sqf_data.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
# Frequency of each raw value in the patrol-borough column.
(
    sqf_data
    .group_by('STOP_LOCATION_PATROL_BORO_NAME')
    .count()
    .show()
)

----------------------------------------------
|"STOP_LOCATION_PATROL_BORO_NAME"  |"COUNT"  |
----------------------------------------------
|PBBN                              |16363    |
|PBBX                              |23550    |
|PBSI                              |3640     |
|0238                              |1        |
|0237                              |1        |
|0183                              |1        |
|1011                              |1        |
|0154                              |1        |
|1022                              |1        |
|PBQN                              |8618     |
----------------------------------------------



In [11]:
# Frequency of each raw value in the borough-name column.
(
    sqf_data
    .group_by('STOP_LOCATION_BORO_NAME')
    .count()
    .show()
)

---------------------------------------
|"STOP_LOCATION_BORO_NAME"  |"COUNT"  |
---------------------------------------
|MANHATTAN                  |25673    |
|0208760                    |2        |
|0986759                    |1        |
|PBBN                       |3        |
|PBSI                       |1        |
|PBBX                       |5        |
|QUEENS                     |17440    |
|STATEN ISLAND              |3094     |
|0237177                    |1        |
|0208169                    |1        |
---------------------------------------



In [12]:
def map_borough_codes(frame, column_name, code_to_borough):
    """Return `frame` with the codes in `column_name` replaced by borough names.

    Args:
        frame: Snowpark DataFrame containing `column_name`.
        column_name: Name of the column whose values are translated.
        code_to_borough: Mapping of code -> borough name; each key becomes one
            branch of a single CASE expression.

    Returns:
        A new DataFrame in which `column_name` holds the mapped borough name
        for every value that is a key of the mapping; all other values are
        passed through unchanged.
    """
    source = F.col(column_name)
    case_expr = None
    for code, borough in code_to_borough.items():
        condition = source == code
        # The first branch opens the CASE, the remaining ones extend it, so no
        # key has to be special-cased or listed twice.
        case_expr = (
            F.when(condition, borough)
            if case_expr is None
            else case_expr.when(condition, borough)
        )
    if case_expr is None:  # empty mapping: nothing to translate
        return frame
    return frame.withColumn(column_name, case_expr.otherwise(source))


# Apply the same translation to both borough columns (patrol borough first).
df = sqf_data
for boro_column in ("STOP_LOCATION_PATROL_BORO_NAME", "STOP_LOCATION_BORO_NAME"):
    df = map_borough_codes(df, boro_column, borough_mapping)

In [13]:
# Preview the frame after the borough codes have been mapped.
df.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Keep only rows whose two borough columns both hold one of the mapped
# borough names; rows with any other value are discarded.
df = df.filter(
    col('STOP_LOCATION_PATROL_BORO_NAME').isin(list(borough_mapping.values()))
    & col('STOP_LOCATION_BORO_NAME').isin(list(borough_mapping.values()))
)

In [15]:
# Row count after the borough filter.
df.count()

101989

In [16]:
# Borough-name distribution after mapping and filtering.
(
    df
    .group_by('STOP_LOCATION_BORO_NAME')
    .count()
    .show()
)

---------------------------------------
|"STOP_LOCATION_BORO_NAME"  |"COUNT"  |
---------------------------------------
|MANHATTAN                  |25673    |
|QUEENS                     |17440    |
|STATEN ISLAND              |3640     |
|BRONX                      |23550    |
|BROOKLYN                   |31686    |
---------------------------------------



In [17]:
# Patrol-borough distribution after mapping and filtering.
(
    df
    .group_by('STOP_LOCATION_PATROL_BORO_NAME')
    .count()
    .show()
)

----------------------------------------------
|"STOP_LOCATION_PATROL_BORO_NAME"  |"COUNT"  |
----------------------------------------------
|MANHATTAN                         |25673    |
|BRONX                             |23550    |
|BROOKLYN                          |31686    |
|QUEENS                            |17440    |
|STATEN ISLAND                     |3640     |
----------------------------------------------



In [18]:
# Rows per YEAR2 value, ordered by year, to spot inconsistent year labels.
(
    df
    .group_by('YEAR2')
    .count()
    .sort('YEAR2', ascending=True)
    .show()
)

---------------------
|"YEAR2"  |"COUNT"  |
---------------------
|2017.00  |11197    |
|2018     |11008    |
|2019     |26918    |
|2020     |13715    |
|2021     |8947     |
|2022     |30204    |
---------------------



In [19]:
# Rewrite the '2017.00' label as '2017'; every other YEAR2 value is kept as-is.
mapping_expr_year = when(col('YEAR2') == '2017.00', '2017')
df = df.withColumn(
    "YEAR2",
    mapping_expr_year.otherwise(col("YEAR2")),
)

# Re-check the per-year counts after the rewrite.
(
    df
    .group_by('YEAR2')
    .count()
    .sort('YEAR2', ascending=True)
    .show()
)

---------------------
|"YEAR2"  |"COUNT"  |
---------------------
|2017     |11197    |
|2018     |11008    |
|2019     |26918    |
|2020     |13715    |
|2021     |8947     |
|2022     |30204    |
---------------------



In [20]:
# Replace placeholder strings (the keys of `null_value_mapping`) with their
# mapped value in every column. One CASE expression is built per column and
# all columns are rewritten in a single projection, instead of stacking one
# `withColumn` per column per placeholder.
column_names = df.columns
cleaned_columns = []
for column in column_names:
    source = F.col(column)
    case_expr = None
    for key, value in null_value_mapping.items():
        condition = source == key
        case_expr = (
            F.when(condition, value)
            if case_expr is None
            else case_expr.when(condition, value)
        )
    # With an empty mapping there is no CASE to apply; keep the column as-is.
    cleaned_columns.append(source if case_expr is None else case_expr.otherwise(source))

df = df.with_columns(column_names, cleaned_columns)

In [20]:
# Print the distinct values of every column for a quick visual audit.
# The loop variables are deliberately NOT named `col` / `type`: those names
# would rebind the imported Snowpark `col` function and the builtin `type` at
# notebook scope and break every later cell that calls `col(...)`.
for column_name, _dtype in df.dtypes:
    distinct_values = df.select(column_name).distinct()
    print(f"Distinct values in {column_name}:")
    print('=' * 35)
    distinct_values.show()

Distinct values in PHYSICAL_FORCE_OC_SPRAY_USED_FLAG:
---------------------------------------
|"PHYSICAL_FORCE_OC_SPRAY_USED_FLAG"  |
---------------------------------------
|NULL                                 |
|Y                                    |
---------------------------------------

Distinct values in SEARCH_BASIS_HARD_OBJECT_FLAG:
-----------------------------------
|"SEARCH_BASIS_HARD_OBJECT_FLAG"  |
-----------------------------------
|NULL                             |
|Y                                |
-----------------------------------

Distinct values in SUSPECT_WEIGHT:
--------------------
|"SUSPECT_WEIGHT"  |
--------------------
|225               |
|235               |
|210               |
|190               |
|170               |
|165               |
|280               |
|135               |
|110               |
|182               |
--------------------

Distinct values in STOP_LOCATION_SECTOR_CODE:
-------------------------------
|"STOP_LOCATION_SECTOR_CODE"  

In [21]:
# Re-apply the placeholder -> mapped-value replacement to every column
# (a no-op for values that were already replaced).
# `F.col` / `F.when` are used through the module alias so this cell does not
# depend on the bare name `col`, which an earlier loop in this notebook rebinds
# to a string. All columns are rewritten in one projection.
column_names = df.columns
cleaned_columns = []
for column in column_names:
    source = F.col(column)
    case_expr = None
    for key, value in null_value_mapping.items():
        condition = source == key
        case_expr = (
            F.when(condition, value)
            if case_expr is None
            else case_expr.when(condition, value)
        )
    # With an empty mapping there is no CASE to apply; keep the column as-is.
    cleaned_columns.append(source if case_expr is None else case_expr.otherwise(source))

df = df.with_columns(column_names, cleaned_columns)

In [22]:
# Preview the frame after the placeholder replacement.
df.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [24]:
# Look at distinct STOP_FRISK_DATE values before converting the column.
df.select('STOP_FRISK_DATE').distinct().show()

---------------------
|"STOP_FRISK_DATE"  |
---------------------
|2021-07-29         |
|2021-08-01         |
|2021-07-30         |
|2021-08-03         |
|2021-08-05         |
|2021-08-06         |
|2021-08-04         |
|2021-07-31         |
|2021-08-02         |
|2021-08-07         |
---------------------



In [26]:
# SQL CASE expression that converts STOP_FRISK_DATE to a date:
#   - digits shaped like 4-2-2 separated by '-'  -> parsed as 'YYYY-MM-DD'
#   - digits shaped like (1-2)/(1-2)/4           -> parsed as 'MM/DD/YYYY'
#   - anything else                              -> NULL
# Escaping: this is a normal (non-raw) Python string, so `\\\\d` reaches SQL as
# `\\d`; the SQL string literal is then expected to unescape that to the regex
# token `\d` — confirm against Snowflake's string-literal rules before editing.
sql_expr_for_date_conversion = """
    CASE
        WHEN REGEXP_LIKE(STOP_FRISK_DATE, '\\\\d{4}-\\\\d{2}-\\\\d{2}') THEN TO_DATE(STOP_FRISK_DATE, 'YYYY-MM-DD')
        WHEN REGEXP_LIKE(STOP_FRISK_DATE, '\\\\d{1,2}/\\\\d{1,2}/\\\\d{4}') THEN TO_DATE(STOP_FRISK_DATE, 'MM/DD/YYYY')
        ELSE NULL
    END
"""

# The converted column lives only in `df_with_converted_date`; `df` itself is
# left unchanged by this cell.
df_with_converted_date = df.withColumn("STOP_FRISK_DATE", expr(sql_expr_for_date_conversion))

In [27]:
# Preview the frame with STOP_FRISK_DATE converted.
df_with_converted_date.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [28]:
# Persist the preprocessed data to the silver schema, replacing any existing
# table of the same name. The frame written is `df_with_converted_date`, not
# `df`: only that frame carries the parsed STOP_FRISK_DATE column, so writing
# `df` would silently drop the date-preprocessing step.
table_name = "SAFEGUARDING_NYC_SCHEMA_SILVER.SQF"
df_with_converted_date.write.saveAsTable(table_name, mode="overwrite")