In [22]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import udf,col,countDistinct,date_format,row_number
from pyspark.sql.window import Window

In [50]:
spark = SparkSession.builder.appName("crime_boston").getOrCreate()

crime_df = spark.read.csv("/home/saurav/Downloads/crime.csv",header=True,inferSchema=True)

crime_df.show()

                                                                                

+---------------+------------+--------------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-----------------+-----------+------------+--------------------+
|INCIDENT_NUMBER|OFFENSE_CODE|  OFFENSE_CODE_GROUP| OFFENSE_DESCRIPTION|DISTRICT|REPORTING_AREA|SHOOTING|   OCCURRED_ON_DATE|YEAR|MONTH|DAY_OF_WEEK|HOUR|  UCR_PART|           STREET|        Lat|        Long|            Location|
+---------------+------------+--------------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-----------------+-----------+------------+--------------------+
|     I182070945|         619|             Larceny|  LARCENY ALL OTHERS|     D14|           808|    null|2018-09-02 13:00:00|2018|    9|     Sunday|  13|  Part One|       LINCOLN ST|42.35779134|-71.13937053|(42.35779134, -71...|
|     I182070943|        1402|           Vandalism|           VANDALISM|     C11|   

## 1. Find all the list of dates in 2017 where ‘VANDALISM’ happened.

In [61]:
vandalism_2017 = crime_df.filter(crime_df['OFFENSE_DESCRIPTION']=='VANDALISM').select(crime_df['OCCURRED_ON_DATE'],crime_df['OFFENSE_DESCRIPTION'])
vandalism_2017.show()

#saving O/P in postgres
vandalism_2017.write.format('jdbc').options(url='jdbc:postgresql://localhost:5432/postgres', driver='org.postgresql.Driver',
                                   dbtable='Q1', user='fusemachines', password='hello123').mode('overwrite').save()

+-------------------+-------------------+
|   OCCURRED_ON_DATE|OFFENSE_DESCRIPTION|
+-------------------+-------------------+
|2018-08-21 00:00:00|          VANDALISM|
|2018-09-01 12:00:00|          VANDALISM|
|2018-09-03 15:00:00|          VANDALISM|
|2018-08-31 17:00:00|          VANDALISM|
|2018-09-03 13:44:00|          VANDALISM|
|2018-08-17 12:10:00|          VANDALISM|
|2018-09-03 05:30:00|          VANDALISM|
|2018-09-03 07:44:00|          VANDALISM|
|2018-08-31 07:00:00|          VANDALISM|
|2018-09-02 23:20:00|          VANDALISM|
|2018-09-02 21:57:00|          VANDALISM|
|2018-09-02 20:08:00|          VANDALISM|
|2018-09-02 18:58:00|          VANDALISM|
|2018-09-02 17:15:00|          VANDALISM|
|2018-09-02 16:58:00|          VANDALISM|
|2018-09-01 14:30:00|          VANDALISM|
|2018-09-02 17:12:00|          VANDALISM|
|2018-09-01 23:00:00|          VANDALISM|
|2018-07-30 12:51:00|          VANDALISM|
|2018-09-02 09:26:00|          VANDALISM|
+-------------------+-------------

## 2.Show the data frame where the District is  null and then fill the null District with “District not Verified”. (udf) 

In [62]:
def remove_na(replacenull):
    return "District Not Verifed"
udf_name = udf(remove_na)

null_district = crime_df.filter(crime_df['DISTRICT'].isNull())
null_district.show()

fill_na = null_district.select(crime_df['INCIDENT_NUMBER'],crime_df['OFFENSE_CODE'],crime_df['OFFENSE_CODE_GROUP'],crime_df['OFFENSE_DESCRIPTION'],udf_name(null_district['DISTRICT']))\
.withColumnRenamed('remove_na(DISTRICT)','DISTRICT')

fill_na.show()

# *******ALTERNATIVE**********
# null_district.na.fill('District not verified',subset=['DISTRICT']).show()

# Saving O/P in Postgres
fill_na.write.format('jdbc').options(url='jdbc:postgresql://localhost:5432/postgres', driver='org.postgresql.Driver',
                                   dbtable='Q2', user='fusemachines', password='hello123').mode('overwrite').save()

+---------------+------------+--------------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-------+-----------+------------+--------------------+
|INCIDENT_NUMBER|OFFENSE_CODE|  OFFENSE_CODE_GROUP| OFFENSE_DESCRIPTION|DISTRICT|REPORTING_AREA|SHOOTING|   OCCURRED_ON_DATE|YEAR|MONTH|DAY_OF_WEEK|HOUR|  UCR_PART| STREET|        Lat|        Long|            Location|
+---------------+------------+--------------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-------+-----------+------------+--------------------+
|     I182070920|        3006|  Medical Assistance|SICK/INJURED/MEDI...|    null|              |    null|2018-09-03 19:43:00|2018|    9|     Monday|  19|Part Three|   null|42.35287456| -71.0738297|(42.35287456, -71...|
|     I182070913|        3006|  Medical Assistance|SICK/INJURED/MEDI...|    null|              |    null|2018-09-03 18:46:00

## 3.Show the year and total number of Robbery happens in each year.

In [64]:
filtering_robbery = crime_df.filter((crime_df.OFFENSE_CODE_GROUP=="Robbery")).select(crime_df.YEAR,crime_df.OFFENSE_CODE_GROUP)
total_robbery = filtering_robbery.groupBy("YEAR").count().orderBy("Year").withColumnRenamed("count","Total Robbery in Year")
total_robbery.show()

# Saving O/P in Postgres
total_robbery.write.format('jdbc').options(url='jdbc:postgresql://localhost:5432/postgres', driver='org.postgresql.Driver',
                                   dbtable='Q3', user='fusemachines', password='hello123').mode('overwrite').save()

+----+---------------------+
|YEAR|Total Robbery in Year|
+----+---------------------+
|2015|                  948|
|2016|                 1506|
|2017|                 1376|
|2018|                  794|
+----+---------------------+



## 4.Show all  Offense_codes and names which are not listed in crime.csv but in offense_code.csv.

In [65]:
dfj1 = spark.read.csv("/home/saurav/Downloads/crime.csv",header=True,inferSchema=True)
dfj2 = spark.read.csv("/home/saurav/Downloads/offense_codes.csv",header=True,inferSchema=True)

result = dfj2.join(dfj1,dfj1.OFFENSE_CODE==dfj2.CODE,"left_anti")
result.show()

# Saving O/P in Postgres
result.write.format('jdbc').options(url='jdbc:postgresql://localhost:5432/postgres', driver='org.postgresql.Driver',
                                   dbtable='Q4', user='fusemachines', password='hello123').mode('overwrite').save()

                                                                                

+----+--------------------+
|CODE|                NAME|
+----+--------------------+
|1731|              INCEST|
|1711|OPEN & GROSS LEWD...|
| 242|RAPE - ATTEMPT - ...|
| 254|RAPE - COMPLETE -...|
| 271|RAPE - COMPLETE -...|
| 244|RAPE - ATTEMPT - ...|
| 251|RAPE - COMPLETE -...|
| 241|RAPE - ATTEMPT - ...|
| 243|RAPE - ATTEMPT - ...|
| 261|RAPE - ATTEMPT - ...|
| 252|RAPE - COMPLETE -...|
| 253|RAPE - COMPLETE -...|
|1730|SEXUAL ASSAULT IN...|
|1704|      STATUTORY RAPE|
|2915| VAL - MISCELLANEOUS|
|1721|FAILURE TO REGIST...|
|1702|INDECENT ASSAULT ...|
|1703|INDECENT EXPOSURE...|
| 114|KILLING OF POLICE...|
|1902|GAMBLING - OPERAT...|
+----+--------------------+
only showing top 20 rows



## 5.List offense_description which is occurred on Sunday around time ‘21:30:00’

In [66]:
filter_day = crime_df.filter((crime_df.DAY_OF_WEEK == 'Sunday')).select(crime_df.OFFENSE_DESCRIPTION,crime_df.DAY_OF_WEEK,crime_df.OCCURRED_ON_DATE)
select_time = filter_day.select(crime_df.OFFENSE_DESCRIPTION,date_format('OCCURRED_ON_DATE','HH:mm:ss')).withColumnRenamed('date_format(OCCURRED_ON_DATE, HH:mm:ss)','Time')
final_result =select_time.filter(select_time.Time == '21:30:00')
final_result.show()

# Saving O/P in Postgres
final_result.write.format('jdbc').options(url='jdbc:postgresql://localhost:5432/postgres', driver='org.postgresql.Driver',
                                   dbtable='Q5', user='fusemachines', password='hello123').mode('overwrite').save()

+--------------------+--------+
| OFFENSE_DESCRIPTION|    Time|
+--------------------+--------+
|INVESTIGATE PROPERTY|21:30:00|
|M/V - LEAVING SCE...|21:30:00|
|    ROBBERY - STREET|21:30:00|
|LARCENY THEFT FRO...|21:30:00|
|M/V - LEAVING SCE...|21:30:00|
|LARCENY THEFT FRO...|21:30:00|
|  PROPERTY - MISSING|21:30:00|
|M/V ACCIDENT - PR...|21:30:00|
|M/V - LEAVING SCE...|21:30:00|
|M/V - LEAVING SCE...|21:30:00|
|    ROBBERY - STREET|21:30:00|
|INVESTIGATE PROPERTY|21:30:00|
|M/V - LEAVING SCE...|21:30:00|
|        SUDDEN DEATH|21:30:00|
|LARCENY THEFT FRO...|21:30:00|
|M/V ACCIDENT - PR...|21:30:00|
|M/V ACCIDENT INVO...|21:30:00|
|VIOL. OF RESTRAIN...|21:30:00|
|           VANDALISM|21:30:00|
|LARCENY THEFT FRO...|21:30:00|
+--------------------+--------+
only showing top 20 rows

