-- Notepad to myself --

# Challenge: DataFrame API

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 

In [2]:
from pyspark.sql.functions import to_timestamp, col, lit
df = spark.read.csv('data/Crimes-2021.csv', header=True, inferSchema=True) \
    .withColumn('Date', to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a'))
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: integer (nullable = true)
 |-- Y Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



### 1. Add the reported crimes for an additional day, 12-Nov-2021, to our dataset

In [5]:
crimes = df.filter(col('Date') == lit('2021-11-12'))
crimes.count()

17

In [6]:
df.union(crimes).orderBy('Date', ascending=False).show(5)

+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|               Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|12609426|   JF133671|2021-12-31 23:59:00|     023XX W 64TH ST|0810|               THEFT|           OVER $500|SCHOOL - PUBLIC B

### 2. What are the top 10 number of reported crimes by Primary Type, in descending order of occurence?

In [7]:
df.groupBy('Primary Type').count().orderBy('count', ascending=False).show(10)

+-------------------+-----+
|       Primary Type|count|
+-------------------+-----+
|              THEFT|40749|
|            BATTERY|40447|
|    CRIMINAL DAMAGE|25090|
|            ASSAULT|20333|
| DECEPTIVE PRACTICE|16995|
|      OTHER OFFENSE|13908|
|MOTOR VEHICLE THEFT|10591|
|  WEAPONS VIOLATION| 8969|
|            ROBBERY| 7920|
|           BURGLARY| 6655|
+-------------------+-----+
only showing top 10 rows



### 3. What percentage of reported crimes resulted in an arrest?

In [8]:
df.select('Arrest').distinct().show()

+------+
|Arrest|
+------+
|  true|
| false|
+------+



In [10]:
100 * df.filter(col('Arrest') == 'true').count() / df.select('Arrest').count()

12.4307890647925

### 4. What are the top 3 locations for reported crimes?

In [11]:
df.groupBy('Location Description').count().orderBy('count', ascending=False).show(3)

+--------------------+-----+
|Location Description|count|
+--------------------+-----+
|              STREET|52060|
|           APARTMENT|44455|
|           RESIDENCE|32036|
+--------------------+-----+
only showing top 3 rows

