In [1]:
import sys
# Make the parent directory importable so the project-local `helpers` package
# resolves. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [2]:
from snowflake.snowpark import Session, dataframe
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import when, col
from datetime import date
from helpers import SnowflakeHelper
from snowflake.snowpark.functions import col
from snowflake.snowpark.functions import date_format
import json
import os

In [3]:
# Placeholder strings that are replaced with None by the null-mapping step.
null_value_mapping = dict.fromkeys(
    ('(null)', 'Nan', 'NONE', 'nan', 'U', 'UNKNOWN')
)

# Age-group codes that are replaced with None in the age-group columns.
age_mapping = dict.fromkeys(('224', '1020', '940', '1022'))

# Race labels rewritten to a single canonical spelling (no spaces around "/").
race_mapping = dict([
    ('ASIAN / PACIFIC ISLANDER', 'ASIAN/PACIFIC ISLANDER'),
])

In [4]:
# Open a Snowpark session from the shared JSON config file and make the Bronze
# schema the session's current schema, so table names below resolve against it.
snowflake_config = './../helpers/snowflake_config.json'
snowflake_helper = SnowflakeHelper()
session = snowflake_helper.create_snowpark_session(snowflake_config)
session.use_schema("SAFEGUARDING_NYC_SCHEMA_BRONZE")

[SUCCESS] Config file loaded successfully!
[SUCCESS] Snowspark Session created successfully!


In [5]:
# Reference the SHOOTING_INCIDENTS table in the session's current schema
# (set to the Bronze schema when the session was created).
shooting_data = session.table('SHOOTING_INCIDENTS')

In [6]:
# Preview the raw table, including the Airbyte metadata columns.
shooting_data.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"_AIRBYTE_RAW_ID"                     |"_AIRBYTE_EXTRACTED_AT"    |"_AIRBYTE_META"  |"VIC_RACE"      |"OCCUR_TIME"  |"X_COORD_CD"  |"INCIDENT_KEY"  |"VIC_AGE_GROUP"  |"LOC_OF_OCCUR_DESC"  |"LATITUDE"   |"PERP_RACE"     |"Y_COORD_CD"  |"STATISTICAL_MURDER_FLAG"  |"LONGITUDE"   |"VIC_SEX"  |"BORO"     |"LON_LAT"                                      |"PERP_SEX"  |"LOCATION_DESC"            |"OCCUR_DATE"  |"PRECINCT"  |"LOC_CLASSFCTN_DESC"  |"PERP_AGE_GROUP"  |"JURISDICTION_CODE"  |
------------------------------

In [7]:
# Drop columns that are not carried into the cleaned dataset: the Airbyte
# bookkeeping fields, the projected X/Y coordinates and combined LON_LAT value,
# the jurisdiction code, and two location-classification descriptions.

columns_to_drop = [
    "_AIRBYTE_RAW_ID",
    "_AIRBYTE_EXTRACTED_AT",
    "_AIRBYTE_META",
    "X_COORD_CD",
    "Y_COORD_CD",
    "LON_LAT",
    "JURISDICTION_CODE",
    "LOC_OF_OCCUR_DESC",
    "LOC_CLASSFCTN_DESC",
]
shooting_data = shooting_data.drop(*columns_to_drop)
shooting_data.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"VIC_RACE"      |"OCCUR_TIME"  |"INCIDENT_KEY"  |"VIC_AGE_GROUP"  |"LATITUDE"   |"PERP_RACE"     |"STATISTICAL_MURDER_FLAG"  |"LONGITUDE"   |"VIC_SEX"  |"BORO"     |"PERP_SEX"  |"LOCATION_DESC"            |"OCCUR_DATE"  |"PRECINCT"  |"PERP_AGE_GROUP"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|BLACK           |19:13:00      |28832628        |<18              |40.70370632  |BLACK           |FALSE                      |-73.94325706  |F          |BROOKLYN   |M           |MULTI DWELL - PUBLIC HOUS  |05/04/2007    |90          |

In [8]:
# Inspect the column types of the remaining columns.
shooting_data.printSchema()

root
 |-- "VIC_RACE": StringType(16777216) (nullable = True)
 |-- "OCCUR_TIME": StringType(16777216) (nullable = True)
 |-- "INCIDENT_KEY": StringType(16777216) (nullable = True)
 |-- "VIC_AGE_GROUP": StringType(16777216) (nullable = True)
 |-- "LATITUDE": StringType(16777216) (nullable = True)
 |-- "PERP_RACE": StringType(16777216) (nullable = True)
 |-- "STATISTICAL_MURDER_FLAG": StringType(16777216) (nullable = True)
 |-- "LONGITUDE": StringType(16777216) (nullable = True)
 |-- "VIC_SEX": StringType(16777216) (nullable = True)
 |-- "BORO": StringType(16777216) (nullable = True)
 |-- "PERP_SEX": StringType(16777216) (nullable = True)
 |-- "LOCATION_DESC": StringType(16777216) (nullable = True)
 |-- "OCCUR_DATE": StringType(16777216) (nullable = True)
 |-- "PRECINCT": StringType(16777216) (nullable = True)
 |-- "PERP_AGE_GROUP": StringType(16777216) (nullable = True)


In [9]:
# Distinct Values
#
# Print the distinct values of each categorical column before any cleaning,
# to spot placeholder strings and inconsistent spellings.
check_columns = ["VIC_RACE", "VIC_AGE_GROUP", "PERP_RACE", "STATISTICAL_MURDER_FLAG", "VIC_SEX", "BORO", "PERP_SEX", "LOCATION_DESC", "PERP_AGE_GROUP"]

for column in check_columns:
    distinct_values = shooting_data.select(column).distinct()
    print(f"Distinct values in {column}:")
    # show() prints only its first 10 rows by default, which hides most values
    # of higher-cardinality columns such as LOCATION_DESC; ask for enough rows
    # to list every distinct value.
    distinct_values.show(100)


Distinct values in VIC_RACE:
----------------------------------
|"VIC_RACE"                      |
----------------------------------
|BLACK                           |
|WHITE                           |
|NULL                            |
|BLACK HISPANIC                  |
|WHITE HISPANIC                  |
|ASIAN / PACIFIC ISLANDER        |
|AMERICAN INDIAN/ALASKAN NATIVE  |
----------------------------------

Distinct values in VIC_AGE_GROUP:
-------------------
|"VIC_AGE_GROUP"  |
-------------------
|<18              |
|25-44            |
|45-64            |
|1022             |
|NULL             |
|65+              |
|18-24            |
-------------------

Distinct values in PERP_RACE:
----------------------------------
|"PERP_RACE"                     |
----------------------------------
|BLACK HISPANIC                  |
|ASIAN / PACIFIC ISLANDER        |
|WHITE HISPANIC                  |
|AMERICAN INDIAN/ALASKAN NATIVE  |
|NULL                            |
|BLACK              

In [10]:
# Null Value Mapping
#
# Replace every placeholder string listed in null_value_mapping with its mapped
# value (None) in all columns.
#
# One CASE expression is built per column and all columns are rewritten in a
# single projection, rather than issuing one withColumn call per
# (column, key) pair, which stacks a nested projection for every pair.
# The first matching key wins; this is equivalent to applying the keys one
# after another as long as no mapped value is itself a key of the mapping.

column_names = shooting_data.columns
remapped_columns = []
for column in column_names:
    source = col(column)
    case_expr = None
    for key, value in null_value_mapping.items():
        if case_expr is None:
            case_expr = when(source == key, value)
        else:
            case_expr = case_expr.when(source == key, value)
    # With an empty mapping the column is passed through unchanged.
    remapped_columns.append(source if case_expr is None else case_expr.otherwise(source))

shooting_data = shooting_data.with_columns(column_names, remapped_columns)

In [11]:
# Age Mapping
#
# Replace the age-group codes listed in age_mapping with their mapped value
# (None) in the perpetrator and victim age-group columns.
#
# One CASE expression is built per column and both columns are rewritten in a
# single projection instead of one withColumn call per (column, key) pair.
# The first matching key wins; this is equivalent to applying the keys one
# after another as long as no mapped value is itself a key of the mapping.

age_columns = ["PERP_AGE_GROUP", "VIC_AGE_GROUP"]

remapped_columns = []
for column in age_columns:
    source = col(column)
    case_expr = None
    for key, value in age_mapping.items():
        if case_expr is None:
            case_expr = when(source == key, value)
        else:
            case_expr = case_expr.when(source == key, value)
    # With an empty mapping the column is passed through unchanged.
    remapped_columns.append(source if case_expr is None else case_expr.otherwise(source))

shooting_data = shooting_data.with_columns(age_columns, remapped_columns)

In [12]:
# Race Mapping
#
# Rewrite the race labels listed in race_mapping to their canonical spelling
# in the victim and perpetrator race columns.
#
# One CASE expression is built per column and both columns are rewritten in a
# single projection, so the number of projections does not grow with the size
# of the mapping. The first matching key wins; this is equivalent to applying
# the keys one after another as long as no mapped value is itself a key.

race_columns = ["VIC_RACE", "PERP_RACE"]

remapped_columns = []
for column in race_columns:
    source = col(column)
    case_expr = None
    for key, value in race_mapping.items():
        if case_expr is None:
            case_expr = when(source == key, value)
        else:
            case_expr = case_expr.when(source == key, value)
    # With an empty mapping the column is passed through unchanged.
    remapped_columns.append(source if case_expr is None else case_expr.otherwise(source))

shooting_data = shooting_data.with_columns(race_columns, remapped_columns)

In [13]:
# Distinct Values of Preprocessed Data
#
# Re-list the distinct values of the categorical columns to confirm that the
# null, age and race mappings were applied.

for column in check_columns:
    distinct_values = shooting_data.select(column).distinct()
    print(f"Distinct values in {column}:")
    # show() prints only its first 10 rows by default, which hides most values
    # of higher-cardinality columns such as LOCATION_DESC; ask for enough rows
    # to list every distinct value.
    distinct_values.show(100)

Distinct values in VIC_RACE:
----------------------------------
|"VIC_RACE"                      |
----------------------------------
|BLACK HISPANIC                  |
|WHITE HISPANIC                  |
|AMERICAN INDIAN/ALASKAN NATIVE  |
|BLACK                           |
|WHITE                           |
|ASIAN/PACIFIC ISLANDER          |
|NULL                            |
----------------------------------

Distinct values in VIC_AGE_GROUP:
-------------------
|"VIC_AGE_GROUP"  |
-------------------
|<18              |
|25-44            |
|45-64            |
|18-24            |
|NULL             |
|65+              |
-------------------

Distinct values in PERP_RACE:
----------------------------------
|"PERP_RACE"                     |
----------------------------------
|NULL                            |
|BLACK                           |
|ASIAN/PACIFIC ISLANDER          |
|WHITE                           |
|BLACK HISPANIC                  |
|WHITE HISPANIC                  |
|AMER

In [14]:
# Count of Distinct Values
#
# Report how many distinct values each column holds after cleaning. The count
# is taken over distinct rows of the single-column projection, so NULL is
# counted as one value of its own.

for column_name in shooting_data.columns:
    unique_rows = shooting_data.select(column_name).distinct()
    print(f"Number of distinct values in {column_name}: {unique_rows.count()}")

Number of distinct values in OCCUR_TIME: 1421
Number of distinct values in INCIDENT_KEY: 21420
Number of distinct values in LATITUDE: 12552
Number of distinct values in STATISTICAL_MURDER_FLAG: 2
Number of distinct values in LONGITUDE: 12537
Number of distinct values in VIC_SEX: 3
Number of distinct values in BORO: 5
Number of distinct values in PERP_SEX: 3
Number of distinct values in LOCATION_DESC: 39
Number of distinct values in OCCUR_DATE: 5761
Number of distinct values in PRECINCT: 77
Number of distinct values in PERP_AGE_GROUP: 6
Number of distinct values in VIC_AGE_GROUP: 6
Number of distinct values in VIC_RACE: 7
Number of distinct values in PERP_RACE: 7


In [15]:
# Preview the cleaned data before it is written out.
shooting_data.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"OCCUR_TIME"  |"INCIDENT_KEY"  |"LATITUDE"   |"STATISTICAL_MURDER_FLAG"  |"LONGITUDE"   |"VIC_SEX"  |"BORO"     |"PERP_SEX"  |"LOCATION_DESC"            |"OCCUR_DATE"  |"PRECINCT"  |"PERP_AGE_GROUP"  |"VIC_AGE_GROUP"  |"VIC_RACE"      |"PERP_RACE"     |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|19:13:00      |28832628        |40.70370632  |FALSE                      |-73.94325706  |F          |BROOKLYN   |M           |MULTI DWELL - PUBLIC HOUS  |05/04/2007    |90          |<18               |<18              |BLACK          

In [16]:
# Persist the cleaned frame as a table in the Silver schema, using overwrite
# mode so an existing table of the same name is replaced.

table_name = "SAFEGUARDING_NYC_SCHEMA_SILVER.shooting_incidents"
silver_writer = shooting_data.write
silver_writer.saveAsTable(table_name, mode="overwrite")