## Imports

In [18]:
import sys
sys.path.append("..")

In [19]:
from snowflake.snowpark import Session
from snowflake.snowpark.functions import when, col, count, to_date
from datetime import date
from helpers import SnowflakeHelper
import json
import os

In [20]:
# Build the Snowpark session from the JSON config file. The config path is
# relative, so it resolves against the kernel's current working directory.
snowflake_helper = SnowflakeHelper()
snowflake_config = './../helpers/snowflake_config.json'
session = snowflake_helper.create_snowpark_session(snowflake_config)
# Explicitly select the bronze schema that the table lookups below rely on.
session.use_schema("SAFEGUARDING_NYC_SCHEMA_BRONZE")

[INFO] No schema passed, using default schema SAFEGUARDING_NYC_SCHEMA_BRONZE for the session
[SUCCESS] Config file loaded successfully!
[SUCCESS] Snowspark Session created successfully on schema SAFEGUARDING_NYC_SCHEMA_BRONZE!


## Extracting Data

In [21]:
# Reference the raw use-of-force table through the session created above.
use_of_force_incidents = session.table('use_of_force_incidents')

In [22]:
# Preview a few rows to inspect the raw columns before any cleaning.
use_of_force_incidents.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"_AIRBYTE_RAW_ID"                     |"_AIRBYTE_EXTRACTED_AT"    |"_AIRBYTE_META"  |"FORCETYPE"     |"INCIDENT PCT"  |"OCCURRENCE DATE"  |"BASISFORENCOUNTER"          |"TRI INCIDENT NUMBER"  |"YEARMONTHSHORT"  |"PATROL BOROUGH"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|54af0531-ebec-4624-a195-2d67a0d2cdbd  |2023-12-02 23:15:30-08:00  |{                |Physical Force  |75              |09/21/2023         |CRIME/VIOLATION IN PROGRESS  |2023094962399          |2023 Sep          |PBBN              |
|                                      |                           |

In [23]:
# Baseline row count before pre-processing, for comparison with later counts.
use_of_force_incidents.count()

28531

## Pre-processing

### Dropping non-important columns

In [24]:
# Remove the Airbyte bookkeeping columns and the fields not used downstream.
columns_to_drop = [
    "_AIRBYTE_RAW_ID",
    "_AIRBYTE_EXTRACTED_AT",
    "_AIRBYTE_META",
    "BASISFORENCOUNTER",
    "YEARMONTHSHORT",
]
use_of_force_incidents = use_of_force_incidents.drop(columns_to_drop)

In [25]:
# Confirm that only the columns kept for analysis remain.
use_of_force_incidents.show()

--------------------------------------------------------------------------------------------------
|"FORCETYPE"     |"INCIDENT PCT"  |"OCCURRENCE DATE"  |"TRI INCIDENT NUMBER"  |"PATROL BOROUGH"  |
--------------------------------------------------------------------------------------------------
|Physical Force  |75              |09/21/2023         |2023094962399          |PBBN              |
|Physical Force  |40              |09/13/2023         |2023059962344          |PBBX              |
|Physical Force  |45              |09/13/2023         |2023064962120          |PBBX              |
|Physical Force  |121             |09/08/2023         |2023140962087          |PBSI              |
|Physical Force  |34              |08/27/2023         |2023053962127          |PBMN              |
|Physical Force  |48              |08/24/2023         |2023067962183          |PBBX              |
|Physical Force  |42              |08/23/2023         |2023061962210          |PBBX              |
|Physical 

### Dropping null values

In [26]:
# Keep only the rows whose patrol borough is populated.
has_patrol_borough = col("PATROL BOROUGH").is_not_null()
use_of_force_incidents = use_of_force_incidents.filter(has_patrol_borough)

In [27]:
# Row count after removing rows with a null patrol borough.
use_of_force_incidents.count()

28529

### Borough-Mapping

In [28]:
# Patrol-borough code -> borough name. Several codes share one borough, so the
# mapping is many-to-one.
borough_mapping = {
    # Single-code boroughs
    "PBBX": "BRONX",
    "PBSI": "STATEN ISLAND",
    # Manhattan
    "PBMN": "MANHATTAN",
    "PBMS": "MANHATTAN",
    # Brooklyn
    "PBBN": "BROOKLYN",
    "PBBS": "BROOKLYN",
    # Queens
    "PBQS": "QUEENS",
    "PBQN": "QUEENS",
}

In [29]:
# Build a single CASE expression directly from borough_mapping so the dict is
# the only source of truth. The first entry seeds the expression (a CASE needs
# an initial `when`), and the remaining entries are chained onto it; no branch
# is hard-coded or duplicated. Requires borough_mapping to be non-empty.
remaining_codes = iter(borough_mapping.items())
first_code, first_borough = next(remaining_codes)
mapping_expr = when(col("PATROL BOROUGH") == first_code, first_borough)
for key, value in remaining_codes:
    mapping_expr = mapping_expr.when(col("PATROL BOROUGH") == key, value)

# Codes that are not in the mapping fall through unchanged via `otherwise`.
use_of_force_incidents = use_of_force_incidents.withColumn("PATROL BOROUGH", mapping_expr.otherwise(col("PATROL BOROUGH")))

In [30]:
# Check that patrol-borough codes were replaced by borough names.
use_of_force_incidents.show()

--------------------------------------------------------------------------------------------------
|"FORCETYPE"     |"INCIDENT PCT"  |"OCCURRENCE DATE"  |"TRI INCIDENT NUMBER"  |"PATROL BOROUGH"  |
--------------------------------------------------------------------------------------------------
|Physical Force  |75              |09/21/2023         |2023094962399          |BROOKLYN          |
|Physical Force  |40              |09/13/2023         |2023059962344          |BRONX             |
|Physical Force  |45              |09/13/2023         |2023064962120          |BRONX             |
|Physical Force  |121             |09/08/2023         |2023140962087          |STATEN ISLAND     |
|Physical Force  |34              |08/27/2023         |2023053962127          |MANHATTAN         |
|Physical Force  |48              |08/24/2023         |2023067962183          |BRONX             |
|Physical Force  |42              |08/23/2023         |2023061962210          |BRONX             |
|Physical 

### Making Date Compliant

In [31]:
# Parse the MM/DD/YYYY text into a date value and replace the column with it.
occurrence_date = to_date(col("OCCURRENCE DATE"), 'MM/DD/YYYY')
use_of_force_incidents = use_of_force_incidents.withColumn("OCCURRENCE DATE", occurrence_date)

In [32]:
# Check that OCCURRENCE DATE was parsed as expected.
use_of_force_incidents.show()

--------------------------------------------------------------------------------------------------
|"FORCETYPE"     |"INCIDENT PCT"  |"TRI INCIDENT NUMBER"  |"PATROL BOROUGH"  |"OCCURRENCE DATE"  |
--------------------------------------------------------------------------------------------------
|Physical Force  |75              |2023094962399          |BROOKLYN          |2023-09-21         |
|Physical Force  |40              |2023059962344          |BRONX             |2023-09-13         |
|Physical Force  |45              |2023064962120          |BRONX             |2023-09-13         |
|Physical Force  |121             |2023140962087          |STATEN ISLAND     |2023-09-08         |
|Physical Force  |34              |2023053962127          |MANHATTAN         |2023-08-27         |
|Physical Force  |48              |2023067962183          |BRONX             |2023-08-24         |
|Physical Force  |42              |2023061962210          |BRONX             |2023-08-23         |
|Physical 

### Checking Distinct Values

In [16]:
def get_distinct_counts(dataframe, column_name):
    """Return the number of rows for each distinct value of ``column_name``.

    The result contains the grouping column plus a "count" column and is
    sorted so the most frequent value comes first.
    """
    grouped = dataframe.groupBy(column_name)
    tallied = grouped.agg(count("*").alias("count"))
    return tallied.sort("count", ascending=False)


# Columns whose value distribution is printed below.
columns_to_check = ["FORCETYPE", "PATROL BOROUGH"]
for col_name in columns_to_check:
    distinct_counts = get_distinct_counts(use_of_force_incidents, col_name)
    print(f"Distinct counts for {col_name}:")
    distinct_counts.show()


Distinct counts for FORCETYPE:
--------------------------------------
|"FORCETYPE"               |"COUNT"  |
--------------------------------------
|Physical Force            |22914    |
|Electrical Weapon         |4759     |
|OC Spray                  |479      |
|Impact Weapon             |242      |
|Firearm                   |120      |
|Restraining Mesh Blanket  |11       |
|Police Canine             |4        |
--------------------------------------

Distinct counts for PATROL BOROUGH:
------------------------------
|"PATROL BOROUGH"  |"COUNT"  |
------------------------------
|BROOKLYN          |8200     |
|BRONX             |7133     |
|MANHATTAN         |6874     |
|QUEENS            |5085     |
|STATEN ISLAND     |1237     |
------------------------------



## Exporting the processed data to Silver Medallion

In [17]:
# Write the processed frame to the silver schema; "overwrite" mode replaces
# any table already stored under this name.
table_name = "SAFEGUARDING_NYC_SCHEMA_SILVER.use_of_force_incidents"
use_of_force_incidents.write.saveAsTable(
    table_name,
    mode="overwrite",
)