In [23]:
  ! pip install -r test1.txt



In [24]:
pip install pyspark

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [25]:
from pyspark.sql import SparkSession , functions as F

spark = SparkSession.builder.appName('Crime')\
    .config('spark.driver.extraClassPath', '/usr/lib/jvm/java-11-openjdk-amd64/lib/postgresql-42.5.0.jar')\
    .getOrCreate()

In [26]:
police_df=spark.read.csv('police_district_code.csv')

In [27]:
police_df.show()

+-------------+-------------+
|          _c0|          _c1|
+-------------+-------------+
|District_Code|District_Name|
|           A1|     Downtown|
|          A15|  Charlestown|
|           A7|  East Boston|
|           B2|      Roxbury|
|           B3|     Mattapan|
|           C6| South Boston|
|          C11|   Dorchester|
|           D4|    South End|
|          D14|     Brighton|
|           E5| West Roxbury|
|          E13|Jamaica Plain|
|          E18|    Hyde Park|
+-------------+-------------+



In [28]:
Properties={'user': 'shijal', 'password': 'shijal@123'}
URL='jdbc:postgresql://localhost:5432/postgres'

In [29]:
police_df.write.jdbc(url=URL, table='police', mode='overwrite', properties=Properties)

In [30]:
crime_df=spark.read.csv('crimes.csv', header=True)

In [31]:
crime_df.show()

+---------------+------------+--------------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-----------------+-----------+------------+--------------------+
|INCIDENT_NUMBER|OFFENSE_CODE|  OFFENSE_CODE_GROUP| OFFENSE_DESCRIPTION|DISTRICT|REPORTING_AREA|SHOOTING|   OCCURRED_ON_DATE|YEAR|MONTH|DAY_OF_WEEK|HOUR|  UCR_PART|           STREET|        Lat|        Long|            Location|
+---------------+------------+--------------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-----------------+-----------+------------+--------------------+
|     I182070945|       00619|             Larceny|  LARCENY ALL OTHERS|     D14|           808|    null|2018-09-02 13:00:00|2018|    9|     Sunday|  13|  Part One|       LINCOLN ST|42.35779134|-71.13937053|(42.35779134, -71...|
|     I182070943|       01402|           Vandalism|           VANDALISM|     C11|   

In [32]:
crimes_df = spark.read.csv('/home/shijal/Desktop/Crime/crimes.csv', header=True, inferSchema=True)
offense_codes_df = spark.read.csv('/home/shijal/Desktop/Crime/offense_codes.csv', header=True, inferSchema=True)
police_district_codes_df = spark.read.csv('/home/shijal/Desktop/Crime/police_district_code.csv', header=True, inferSchema=True)

                                                                                

In [33]:
# for SHOOTING column replace null values with 'N'
crimes_df = crimes_df.withColumn('SHOOTING', F.when(F.col('SHOOTING').isNull(), 'N')\
    .otherwise(F.col('SHOOTING')))

# remove OFFENSE_DESCRIPTION column
crimes_df = crimes_df.drop('OFFENSE_DESCRIPTION')
crimes_df.show(5)

# keep only first duplicate value in offence_code_df
offense_codes_df = offense_codes_df.dropDuplicates(['CODE'])
offense_codes_df.sort ('CODE').show(5)

+---------------+------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-----------+-----------+------------+--------------------+
|INCIDENT_NUMBER|OFFENSE_CODE|  OFFENSE_CODE_GROUP|DISTRICT|REPORTING_AREA|SHOOTING|   OCCURRED_ON_DATE|YEAR|MONTH|DAY_OF_WEEK|HOUR|  UCR_PART|     STREET|        Lat|        Long|            Location|
+---------------+------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-----------+-----------+------------+--------------------+
|     I182070945|         619|             Larceny|     D14|           808|       N|2018-09-02 13:00:00|2018|    9|     Sunday|  13|  Part One| LINCOLN ST|42.35779134|-71.13937053|(42.35779134, -71...|
|     I182070943|        1402|           Vandalism|     C11|           347|       N|2018-08-21 00:00:00|2018|    8|    Tuesday|   0|  Part Two|   HECLA ST|42.30682138|-71.06030035|(42.30682138

11. Find the number of crime happened  for each year.

In [39]:
## 11. Find the  number of crime happened  for each year

year_crime_count = crime_df.groupBy('YEAR').count().sort('YEAR')

year_crime_count.show()


################### SAVE to POSTGRES #######################
year_crime_count.write.jdbc(url=URL, table='year1_crime_count', mode='overwrite', properties=Properties)


+----+------+
|YEAR| count|
+----+------+
|2015| 53388|
|2016| 99114|
|2017|100886|
|2018| 65685|
+----+------+



12. How many Verbal Disputes crimes were committed in 2018.

In [41]:
##  12. How many Verbal Disputes crimes were committed in 2018.

verbal_dispute = crimes_df.filter(F.col('OFFENSE_CODE_GROUP')== 'Verbal Disputes')

verbal_dispute = verbal_dispute.filter(F.col('YEAR')== 2018)

verbal_dispute = verbal_dispute.count()

verbal_dispute_df = spark.createDataFrame([(verbal_dispute,)], ['count'])

verbal_dispute_df.show()

################### SAVE to POSTGRES #######################

verbal_dispute_df.write.jdbc(url=URL, table='verbal_dispute', mode='overwrite', properties=Properties)

+-----+
|count|
+-----+
| 3055|
+-----+



13. Find how many times ‘Auto Theft’ happened in each year.

In [42]:
## 13. Find how many times ‘Auto Theft’ happened in each year

auto_theft = crimes_df.filter(F.col('OFFENSE_CODE_GROUP')== 'Auto Theft')

auto_theft = auto_theft.groupBy('YEAR').count().sort('YEAR')

auto_theft.show()

################### SAVE to POSTGRES #######################
auto_theft.write.jdbc(url=URL, table='auto_theft', mode='overwrite', properties=Properties)

+----+-----+
|YEAR|count|
+----+-----+
|2015|  988|
|2016| 1537|
|2017| 1393|
|2018|  933|
+----+-----+



14. Reporting Area having highest Shooting incident. 

In [43]:
## 14.  Reporting Area having highest Shooting incident. 

shooting_df = crimes_df.where (crimes_df.SHOOTING == 'Y')

shooting_df = shooting_df.groupBy('REPORTING_AREA').count().sort('count', ascending=False)

shooting_df = shooting_df.filter(F.col('REPORTING_AREA').isNotNull())

shooting_df.show()

################### SAVE to POSTGRES #######################
shooting_df.write.jdbc(url=URL, table='shooting_df', mode='overwrite', properties=Properties)


+--------------+-----+
|REPORTING_AREA|count|
+--------------+-----+
|              |   29|
|           238|   18|
|           326|   18|
|           912|   17|
|           329|   17|
|           909|   16|
|           427|   16|
|           332|   16|
|           911|   15|
|           328|   13|
|           465|   13|
|           281|   11|
|           316|   10|
|           444|   10|
|           279|    9|
|           319|    9|
|           936|    9|
|           327|    9|
|           450|    9|
|           906|    9|
+--------------+-----+
only showing top 20 rows



15. Arrange street based on high “Homicide” incident

In [44]:
## 15. Arrange street based on high “Homicide” incident

homicide_df = crimes_df.where (crimes_df.OFFENSE_CODE_GROUP == 'Homicide')

homicide_df = homicide_df.groupBy('STREET').count().sort('count', ascending=False)

homicide_df = homicide_df.filter(F.col('STREET').isNotNull())

homicide_df.show()

################### SAVE to POSTGRES #######################
homicide_df.write.jdbc(url=URL, table='homicide_df', mode='overwrite', properties=Properties)


+-----------------+-----+
|           STREET|count|
+-----------------+-----+
|        CENTRE ST|    4|
|    WASHINGTON ST|    4|
|         RIVER ST|    3|
|       BOWDOIN ST|    3|
|    BLUE HILL AVE|    3|
|      HARTFORD ST|    2|
|   SOUTHAMPTON ST|    2|
|     HOMESTEAD ST|    2|
|        PARKER ST|    2|
|MASSACHUSETTS AVE|    2|
|     INTERVALE ST|    2|
|       HANCOCK ST|    2|
|   NIGHTINGALE ST|    2|
|      DEARBORN ST|    2|
|        ALPINE ST|    2|
|        HOSMER ST|    2|
|       TREMONT ST|    2|
|   DORCHESTER AVE|    2|
|       WINDSOR ST|    2|
|        NAZING ST|    2|
+-----------------+-----+
only showing top 20 rows

