# Working with rows

## Check for JAVA_HOME

In [1]:
import os
print(os.getenv("JAVA_HOME")) # check for the correct java version (should be 1.8 for using spark)

/Library/Java/JavaVirtualMachines/jdk1.8.0_202.jdk/Contents/Home


## Setup environment

In [2]:
import findspark
findspark.init()
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 

## Downloading and preprocessing Chicago's Reported Crime Data

In [3]:
from pyspark.sql.functions import to_timestamp,col,lit
rc = spark.read.csv('Crimes.csv',header=True).withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a'))

## Working with rows

**Duplicate (From the csv file) the reported crimes for an additional day, 12-Nov-2018, to our dataset.**

##### Get 12-Nov crimes 

In [53]:
from pyspark.sql.functions import lit
duplicate_crimes = spark.read.csv(
    'Crimes.csv',header=True).withColumn(
    'Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a')).filter(col('Date') == lit('2018-11-12')
)
duplicate_crimes.count()

3

##### Merge datframes

In [55]:
rc.filter(col('Date') == lit('2018-11-12')).count()

3

In [56]:
rc = rc.union(duplicate_crimes)
rc.filter(col('Date') == lit('2018-11-12')).count() # should be 3*2=6 rows 

6

**What are the top 10 number of reported crimes by Primary type, in descending order of occurence?**

In [57]:
rc.columns

['ID',
 'Case Number',
 'Date',
 'Block',
 'IUCR',
 'Primary Type',
 'Description',
 'Location Description',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'FBI Code',
 'X Coordinate',
 'Y Coordinate',
 'Year',
 'Updated On',
 'Latitude',
 'Longitude',
 'Location']

In [60]:
rc.groupBy('Primary Type').count().orderBy('count', ascending=False).show(10)

+-------------------+-------+
|       Primary Type|  count|
+-------------------+-------+
|              THEFT|1435837|
|            BATTERY|1245191|
|    CRIMINAL DAMAGE| 778458|
|          NARCOTICS| 714982|
|            ASSAULT| 423891|
|      OTHER OFFENSE| 423570|
|           BURGLARY| 391000|
|MOTOR VEHICLE THEFT| 316981|
| DECEPTIVE PRACTICE| 268268|
|            ROBBERY| 258102|
+-------------------+-------+
only showing top 10 rows

