# All installs

In [11]:
# Uncomment to install PySpark (left disabled so a normal run does not reinstall it).
# !pip install pyspark

In [12]:
# Uncomment to write the currently installed package versions to requirements.txt.
# !pip freeze > requirements.txt

# DB info

In [20]:
# Connection settings for the target PostgreSQL database.
#
# Each value is read from an environment variable when it is set, so real
# credentials do not have to be stored in the notebook:
#   CRIMES_DB_USER, CRIMES_DB_PASSWORD, CRIMES_DB_URL
# The literals below are only fallbacks for a local development database;
# edit them (or export the variables) to match your setup.
import os

Properties = {
    'user': os.environ.get('CRIMES_DB_USER', 'amrit'),
    'password': os.environ.get('CRIMES_DB_PASSWORD', '1234'),
}
URL = os.environ.get(
    'CRIMES_DB_URL',
    'jdbc:postgresql://localhost:5432/crimes_in_boston',
)


# Imports

In [13]:
from pyspark.sql import SparkSession , functions as F

from pyspark.sql.window import Window

# Build (or reuse) the Spark session for this notebook. The PostgreSQL JDBC
# jar is added to the driver classpath; the notebook later writes a DataFrame
# through JDBC.
spark = (
    SparkSession.builder
    .appName('Crimes_in_Boston')
    .config(
        'spark.driver.extraClassPath',
        '/usr/lib/jvm/java-17-openjdk-amd64/lib/postgresql-42.5.0.jar',
    )
    .getOrCreate()
)

# Load data into spark

In [14]:
# All three CSV files are read the same way: first row is the header and
# column types are inferred from the data.
csv_read_opts = {'header': True, 'inferSchema': True}

crimes_df = spark.read.csv('DATA/crimes.csv', **csv_read_opts)
offense_codes_df = spark.read.csv('DATA/offense_codes.csv', **csv_read_opts)
police_district_codes_df = spark.read.csv('DATA/police_district_codes.csv', **csv_read_opts)

# Preprocessing

Perform all preprocessing here.

In [15]:
# SHOOTING: substitute 'N' wherever the value is null.
shooting_filled = F.coalesce(F.col('SHOOTING'), F.lit('N'))

# Apply the fill and discard the OFFENSE_DESCRIPTION column in one pass.
crimes_df = (
    crimes_df
    .withColumn('SHOOTING', shooting_filled)
    .drop('OFFENSE_DESCRIPTION')
)

crimes_df.show(5)

# Keep a single row per CODE in offense_codes_df.
# NOTE(review): dropDuplicates does not promise *which* of the duplicate rows
# is kept - confirm that "first" is not actually required here.
offense_codes_df = offense_codes_df.dropDuplicates(['CODE'])
offense_codes_df.sort('CODE').show(5)

+---------------+------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-----------+-----------+------------+--------------------+
|INCIDENT_NUMBER|OFFENSE_CODE|  OFFENSE_CODE_GROUP|DISTRICT|REPORTING_AREA|SHOOTING|   OCCURRED_ON_DATE|YEAR|MONTH|DAY_OF_WEEK|HOUR|  UCR_PART|     STREET|        Lat|        Long|            Location|
+---------------+------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-----------+-----------+------------+--------------------+
|     I182070945|         619|             Larceny|     D14|           808|       N|2018-09-02 13:00:00|2018|    9|     Sunday|  13|  Part One| LINCOLN ST|42.35779134|-71.13937053|(42.35779134, -71...|
|     I182070943|        1402|           Vandalism|     C11|           347|       N|2018-08-21 00:00:00|2018|    8|    Tuesday|   0|  Part Two|   HECLA ST|42.30682138|-71.06030035|(42.30682138

## 1. (Window function) Partition by district, order by year, then count the offenses, including a rolling count for each district

In [21]:
## 1. (window function) Per-district yearly incident counts plus a running total

# Window over each district's rows, ordered by year; used for the running sum.
window = Window.partitionBy('DISTRICT').orderBy('YEAR')

window_df = (
    crimes_df
    # only the columns needed for the counts
    .select('INCIDENT_NUMBER', 'DISTRICT', 'YEAR')
    # rows with no district are left out
    .where(F.col('DISTRICT').isNotNull())
    # one row per (district, year) holding that year's incident count
    .groupBy('DISTRICT', 'YEAR')
    .agg(F.count('INCIDENT_NUMBER').alias('count_incidents'))
    # running total of the yearly counts within each district
    .withColumn('rolling_count_incidents', F.sum('count_incidents').over(window))
)

window_df.show()


################### SAVE to POSTGRES #######################
# Replaces the 'rolling_count' table with the result (mode='overwrite').
window_df.write.jdbc(url=URL, table='rolling_count', mode='overwrite', properties=Properties)



+--------+----+---------------+-----------------------+
|DISTRICT|YEAR|count_incidents|rolling_count_incidents|
+--------+----+---------------+-----------------------+
|      A1|2015|           6015|                   6015|
|      A1|2016|          10923|                  16938|
|      A1|2017|          11375|                  28313|
|      A1|2018|           7404|                  35717|
|     A15|2015|           1027|                   1027|
|     A15|2016|           1986|                   3013|
|     A15|2017|           2167|                   5180|
|     A15|2018|           1325|                   6505|
|      A7|2015|           2426|                   2426|
|      A7|2016|           4130|                   6556|
|      A7|2017|           4264|                  10820|
|      A7|2018|           2724|                  13544|
|      B2|2015|           8687|                   8687|
|      B2|2016|          15706|                  24393|
|      B2|2017|          15680|                 