## Imports

In [1]:
import sys
# Add the parent directory to the module search path so that the local
# `helpers` package imported below can be resolved.
# NOTE(review): assumes the notebook is run from a sub-directory that sits
# next to `helpers/` (the config path used later, './../helpers/...', suggests so).
sys.path.append("..")

In [2]:
from snowflake.snowpark import Session
from snowflake.snowpark.functions import when, col, count
from datetime import date
from helpers import SnowflakeHelper
import json
import os

In [3]:
# Location of the Snowflake connection settings, relative to this notebook.
snowflake_config = './../helpers/snowflake_config.json'

# Build the Snowpark session through the project helper.
snowflake_helper = SnowflakeHelper()
session = snowflake_helper.create_snowpark_session(snowflake_config)

# Pin the session to the bronze schema explicitly, so the table lookups below
# do not depend on whatever default the helper picked.
session.use_schema("SAFEGUARDING_NYC_SCHEMA_BRONZE")

[INFO] No schema passed, using default schema SAFEGUARDING_NYC_SCHEMA_BRONZE for the session
[SUCCESS] Config file loaded successfully!
[SUCCESS] Snowspark Session created successfully on schema SAFEGUARDING_NYC_SCHEMA_BRONZE!


## Extracting Data

In [4]:
# Reference the raw `use_of_force_subjects` table in the session's current schema
# (the bronze schema selected above).
use_of_force_subjects = session.table('use_of_force_subjects')

In [5]:
# Preview a few rows of the raw table to see which columns are available.
use_of_force_subjects.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"_AIRBYTE_RAW_ID"                     |"_AIRBYTE_EXTRACTED_AT"    |"_AIRBYTE_META"  |"SUBJECT INJURED"  |"SUBJECT INJURY LEVEL"  |"SUBJECT RACE"  |"FORCE AGAINST MOS"  |"TRI INCIDENT NUMBER"  |"SUBJECT GENDER"  |"SUBJECT USED FORCE"  |"AGE"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|ae4cc5ae-a408-4d2d-867d-93ffe47ebdec  |2023-12-02 23:15:31-08:00  |{                |N                  |No Injury               |BLACK           |Physical Force       |2020020962067          |MALE              |Y                     |NULL   |
|                   

In [6]:
# Row count of the raw table, as a baseline before any cleaning.
use_of_force_subjects.count()

29961

## Pre-processing

### Dropping columns that are not needed

In [7]:
# Airbyte bookkeeping columns plus the subject fields that are not kept
# in the processed table.
columns_to_drop = [
    "_AIRBYTE_RAW_ID",
    "_AIRBYTE_EXTRACTED_AT",
    "_AIRBYTE_META",
    "SUBJECT INJURED",
    "SUBJECT INJURY LEVEL",
    "SUBJECT USED FORCE",
]
use_of_force_subjects = use_of_force_subjects.drop(columns_to_drop)

In [8]:
# Preview the table again to confirm which columns remain after the drop.
use_of_force_subjects.show()

-------------------------------------------------------------------------------------------
|"SUBJECT RACE"  |"FORCE AGAINST MOS"  |"TRI INCIDENT NUMBER"  |"SUBJECT GENDER"  |"AGE"  |
-------------------------------------------------------------------------------------------
|BLACK           |Physical Force       |2020020962067          |MALE              |NULL   |
|BLACK           |Physical Force       |2020020962143          |MALE              |NULL   |
|BLACK           |Physical Force       |2020024962080          |MALE              |NULL   |
|BLACK           |Physical Force       |2020025962038          |MALE              |NULL   |
|BLACK           |Physical Force       |2020025962049          |MALE              |NULL   |
|BLACK           |Physical Force       |2020025962140          |MALE              |NULL   |
|BLACK           |Physical Force       |2020026962081          |MALE              |NULL   |
|BLACK           |Physical Force       |2020032962033          |MALE            

#### Checking the distinct values of each column and their counts

In [9]:
def get_distinct_counts(dataframe, column_name):
    """Count the rows for each distinct value of one column, most frequent first.

    Args:
        dataframe: The DataFrame to summarise.
        column_name: Name of the column whose values are grouped.

    Returns:
        A DataFrame with one row per distinct value of ``column_name`` and a
        row count aliased ``count``, sorted by that count in descending order.
    """
    grouped = dataframe.groupBy(column_name)
    tallied = grouped.agg(count("*").alias("count"))
    return tallied.sort("count", ascending=False)

# Columns whose value distributions are inspected before cleaning.
columns_to_check = [
    "SUBJECT GENDER",
    "AGE",
    "SUBJECT RACE",
]

for col_name in columns_to_check:
    distinct_counts = get_distinct_counts(use_of_force_subjects, col_name)
    # Label each table so the outputs can be told apart.
    print(f"Distinct counts for {col_name}:")
    distinct_counts.show()

Distinct counts for SUBJECT GENDER:
------------------------------
|"SUBJECT GENDER"  |"COUNT"  |
------------------------------
|MALE              |24863    |
|FEMALE            |4160     |
|UNK               |938      |
------------------------------

Distinct counts for AGE:
-------------------
|"AGE"  |"COUNT"  |
-------------------
|NULL   |1229     |
|27     |1147     |
|28     |1138     |
|30     |1136     |
|24     |1132     |
|25     |1121     |
|31     |1110     |
|29     |1102     |
|26     |1102     |
|23     |1047     |
-------------------

Distinct counts for SUBJECT RACE:
----------------------------
|"SUBJECT RACE"  |"COUNT"  |
----------------------------
|BLACK           |16559    |
|HISPANIC        |8655     |
|WHITE           |2358     |
|OTHER           |1591     |
|ASIAN           |741      |
|AMER INDIAN     |57       |
----------------------------



### Making values of rows compliant

#### Making GENDER compliant

In [10]:
# Treat the "UNK" placeholder as a missing value; every other gender value
# is kept unchanged.
subject_gender = col("SUBJECT GENDER")
use_of_force_subjects = use_of_force_subjects.withColumn(
    "SUBJECT GENDER",
    when(subject_gender == "UNK", None).otherwise(subject_gender),
)

#### Making RACE compliant

In [12]:
# Expand the two abbreviated race labels to their long form; all other
# values pass through unchanged.
subject_race = col("SUBJECT RACE")
race_recoded = (
    when(subject_race == "ASIAN", "ASIAN/PACIFIC ISLANDER")
    .when(subject_race == "AMER INDIAN", "AMERICAN INDIAN/ALASKAN NATIVE")
    .otherwise(subject_race)
)
use_of_force_subjects = use_of_force_subjects.withColumn("SUBJECT RACE", race_recoded)


In [14]:
# Re-check the value distributions to confirm the GENDER and RACE recoding.
# `get_distinct_counts` is already defined in an earlier cell of this notebook,
# so it is reused here instead of being defined a second time (a duplicate
# definition silently shadows the first and the two can drift apart).
columns_to_check = ["SUBJECT GENDER", "SUBJECT RACE"]  # only the columns that were recoded
for col_name in columns_to_check:
    distinct_counts = get_distinct_counts(use_of_force_subjects, col_name)
    print(f"Distinct counts for {col_name}:")
    distinct_counts.show()

Distinct counts for SUBJECT GENDER:
------------------------------
|"SUBJECT GENDER"  |"COUNT"  |
------------------------------
|MALE              |24863    |
|FEMALE            |4160     |
|NULL              |938      |
------------------------------

Distinct counts for SUBJECT RACE:
--------------------------------------------
|"SUBJECT RACE"                  |"COUNT"  |
--------------------------------------------
|BLACK                           |16559    |
|HISPANIC                        |8655     |
|WHITE                           |2358     |
|OTHER                           |1591     |
|ASIAN/PACIFIC ISLANDER          |741      |
|AMERICAN INDIAN/ALASKAN NATIVE  |57       |
--------------------------------------------



## Exporting the processed data to Silver Medallion

In [25]:
# Fully qualified target: the silver schema, same table name as the source.
table_name = "SAFEGUARDING_NYC_SCHEMA_SILVER.use_of_force_subjects"

# mode="overwrite" replaces any existing table contents, so re-running the
# notebook does not append duplicate rows.
silver_writer = use_of_force_subjects.write
silver_writer.saveAsTable(table_name, mode="overwrite")