# Apache Spark by Example

#### Install Spark

In [0]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.1-bin-hadoop2.7"

In [0]:
!ls

sample_data		       spark-2.3.1-bin-hadoop2.7.tgz.1
spark-2.3.1-bin-hadoop2.7      spark-2.3.1-bin-hadoop2.7.tgz.2
spark-2.3.1-bin-hadoop2.7.tgz  spark-2.3.1-bin-hadoop2.7.tgz.3


In [1]:
import findspark
findspark.init()
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
sc

In [2]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 
spark

## Downloading and preprocessing Chicago's Reported Crime Data

In [62]:
from pyspark.sql.functions import to_timestamp,col,lit
rc = spark.read.csv('reported-crimes.csv',header=True).withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm')). \
filter(col('Date') <= lit('2020-11-11'))
rc.toPandas().head(4)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,11866088,JC478887,2019-10-18,011XX N SPRINGFIELD AVE,281,CRIMINAL SEXUAL ASSAULT,NON-AGGRAVATED,RESIDENCE,False,False,...,37,23,2,1150179,1907222,2019,8/5/2020 15:51,41.90132451,-87.723822,"(41.901324508, -87.723822)"
1,12120312,JD314345,2019-11-28,058XX N PAULINA ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,...,40,77,11,1164169,1938740,2019,8/1/2020 15:46,41.98752728,-87.67154152,"(41.987527284, -87.671541524)"
2,12118728,JD308178,2019-12-10,005XX E 32ND ST,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,RESIDENCE,False,False,...,4,35,26,1180622,1883679,2019,7/31/2020 15:45,41.83607329,-87.61272873,"(41.836073285, -87.612728731)"
3,12118262,JD312068,2019-11-25,008XX N LATROBE AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,...,37,25,11,1141201,1905176,2019,7/30/2020 15:44,41.89588035,-87.75684996,"(41.895880347, -87.756849962)"


In [9]:
rc.count()

2390

In [11]:
rc1 = spark.read.csv('reported-crimes.csv',header=True).filter(col('Primary Type') == 'THEFT')
rc1.count()

62413

## Schemas

In [12]:
rc.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: string (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Location: string (nullable = true)



In [13]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType, BooleanType

In [14]:
labels = [
          ('ID', StringType()),
          ('Case Number', StringType()),
          ('Date', TimestampType()),
          ('Block', StringType()),
          ('IUCR', StringType()),
          ('Primary Type', StringType()),
          ('Description', StringType()),
          ('Location Description', StringType()),
          ('Arrest', BooleanType()),
          ('Domestic', BooleanType()),
          ('Beat', StringType()),
          ('District', StringType()),
          ('Ward', StringType()),
          ('Community Area', StringType()),
          ('FBI Code', StringType()),
          ('X Coordinate', StringType()),
          ('Y Coordinate', StringType()),
          ('Year', IntegerType()),
          ('Updated On', StringType()),
          ('Latitude', DoubleType()),
          ('Longitude', DoubleType()),
          ('Location', StringType()),
]

In [15]:
schema = StructType([StructField(x[0], x[1], True) for x in labels])
schema

StructType(List(StructField(ID,StringType,true),StructField(Case Number,StringType,true),StructField(Date,TimestampType,true),StructField(Block,StringType,true),StructField(IUCR,StringType,true),StructField(Primary Type,StringType,true),StructField(Description,StringType,true),StructField(Location Description,StringType,true),StructField(Arrest,BooleanType,true),StructField(Domestic,BooleanType,true),StructField(Beat,StringType,true),StructField(District,StringType,true),StructField(Ward,StringType,true),StructField(Community Area,StringType,true),StructField(FBI Code,StringType,true),StructField(X Coordinate,StringType,true),StructField(Y Coordinate,StringType,true),StructField(Year,IntegerType,true),StructField(Updated On,StringType,true),StructField(Latitude,DoubleType,true),StructField(Longitude,DoubleType,true),StructField(Location,StringType,true)))

In [16]:
rc = spark.read.csv('reported-crimes.csv', schema=schema)
rc.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



## Working with columns

In [17]:
rc.select('IUCR').show(5)

+----+
|IUCR|
+----+
|IUCR|
|2017|
|1185|
|2014|
|2826|
+----+
only showing top 5 rows



In [18]:
rc.select(rc.IUCR).show(5)

+----+
|IUCR|
+----+
|IUCR|
|2017|
|1185|
|2014|
|2826|
+----+
only showing top 5 rows



In [19]:
rc.select(col('IUCR')).show(5)

+----+
|IUCR|
+----+
|IUCR|
|2017|
|1185|
|2014|
|2826|
+----+
only showing top 5 rows



**Display only the first 4 rows of the column names Case Number, Date and Arrest**

In [23]:
rc.select('Case Number', 'Date', 'Arrest').show(4)

+-----------+-------------------+------+
|Case Number|               Date|Arrest|
+-----------+-------------------+------+
|   JC478887|2019-10-18 00:00:00| FALSE|
|   JD312016|2019-10-10 00:00:00| FALSE|
|   JC516490|2019-10-30 00:00:00|  TRUE|
|   JC475135|2019-10-16 00:00:00|  TRUE|
+-----------+-------------------+------+
only showing top 4 rows



In [24]:
rc.select(col('Case Number'), col('Date'), col('Arrest')).show(4)

+-----------+-------------------+------+
|Case Number|               Date|Arrest|
+-----------+-------------------+------+
|   JC478887|2019-10-18 00:00:00| FALSE|
|   JD312016|2019-10-10 00:00:00| FALSE|
|   JC516490|2019-10-30 00:00:00|  TRUE|
|   JC475135|2019-10-16 00:00:00|  TRUE|
+-----------+-------------------+------+
only showing top 4 rows



**Add a column with name One, with entries all 1s**

In [32]:
from pyspark.sql.functions import lit
rc=rc.withColumn('One', lit(1))

In [33]:
rc.select('One', 'Date', 'Arrest').show(4)

+---+-------------------+------+
|One|               Date|Arrest|
+---+-------------------+------+
|  1|2019-10-18 00:00:00| FALSE|
|  1|2019-10-10 00:00:00| FALSE|
|  1|2019-10-30 00:00:00|  TRUE|
|  1|2019-10-16 00:00:00|  TRUE|
+---+-------------------+------+
only showing top 4 rows



**Remove the column IUCR**

In [34]:
rc.drop('IUCR').columns

['ID',
 'Case Number',
 'Date',
 'Block',
 'Primary Type',
 'Description',
 'Location Description',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'FBI Code',
 'X Coordinate',
 'Y Coordinate',
 'Year',
 'Updated On',
 'Latitude',
 'Longitude',
 'Location',
 'One']

## Working with rows

**Get 12-Nov crimes**

In [51]:
from pyspark.sql.functions import lit
duplicate_crimes = spark.read.csv(
    'reported-crimes.csv',header=True).withColumn(
    'Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm')).filter(col('Date') == lit('2019-11-12'))
duplicate_crimes.count()

93

In [56]:
rc.filter(col('Date') == lit('2019-11-12')).count()

93

In [57]:
rc = rc.union(duplicate_crimes)
rc.filter(col('Date') == lit('2019-11-12')).count() # should be 93*2=186 rows

186

**What are the top 10 number of reported crimes by Primary type, in descending order of occurence?**

In [58]:
rc.groupBy('Primary Type').count().orderBy('count', ascending=False).show(10)

+-------------------+-----+
|       Primary Type|count|
+-------------------+-----+
|              THEFT| 1835|
|            BATTERY| 1008|
| DECEPTIVE PRACTICE|  801|
|            ASSAULT|  548|
|      OTHER OFFENSE|  481|
|          NARCOTICS|  471|
|    CRIMINAL DAMAGE|  416|
|           BURGLARY|  247|
|  CRIMINAL TRESPASS|  182|
|MOTOR VEHICLE THEFT|  168|
+-------------------+-----+
only showing top 10 rows



**What percentage of reported crimes resulted in an arrest?**

In [66]:
rc.select('Arrest').distinct().show()

+------+
|Arrest|
+------+
| FALSE|
|  TRUE|
+------+



In [65]:
import pyspark.sql.functions as func

count_arrests = rc.groupBy("Arrest").count().withColumn('percent', func.round(col('count')/rc.count(),3))
count_arrests.show(5)

+------+-----+-------+
|Arrest|count|percent|
+------+-----+-------+
| FALSE| 5229|  0.797|
|  TRUE| 1333|  0.203|
+------+-----+-------+



**What are the top 3 locations for reported crimes?**

In [67]:
rc.groupby('Location Description').count().orderBy("count",ascending=False).show(3)

+--------------------+-----+
|Location Description|count|
+--------------------+-----+
|           RESIDENCE| 1226|
|              STREET| 1045|
|           APARTMENT|  898|
+--------------------+-----+
only showing top 3 rows



In [70]:
from pyspark.sql import functions
fn=dir(functions)
for i in fn:
    print(i)

Column
DataFrame
DataType
PandasUDFType
PythonEvalType
SparkContext
StringType
UserDefinedFunction
__all__
__builtins__
__cached__
__doc__
__file__
__loader__
__name__
__package__
__spec__
_binary_mathfunctions
_collect_list_doc
_collect_set_doc
_create_binary_mathfunction
_create_column_from_literal
_create_column_from_name
_create_function
_create_function_over_column
_create_udf
_create_window_function
_functions
_functions_1_4_over_column
_functions_1_6_over_column
_functions_2_1_over_column
_functions_2_4
_functions_deprecated
_functions_over_column
_lit_doc
_options_to_str
_string_functions
_test
_to_java_column
_to_seq
_window_functions
_wrap_deprecated_function
abs
acos
add_months
approx_count_distinct
array
array_contains
array_distinct
array_except
array_intersect
array_join
array_max
array_min
array_position
array_remove
array_repeat
array_sort
array_union
arrays_overlap
arrays_zip
asc
asc_nulls_first
asc_nulls_last
ascii
asin
atan
atan2
avg
base64
basestring
bin
bitwiseNOT


## Built-in Functions

### String Functions
**Display the Primary Type column in lower and upper characters, and the first 4 characters of the column**

In [80]:
from pyspark.sql.functions import lower, upper, substring
help(substring)

Help on function substring in module pyspark.sql.functions:

substring(str, pos, len)
    Substring starts at `pos` and is of length `len` when str is String type or
    returns the slice of byte array that starts at `pos` in byte and is of length `len`
    when str is Binary type.
    
    .. note:: The position is not zero based, but 1 based index.
    
    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
    [Row(s='ab')]
    
    .. versionadded:: 1.5



In [83]:
rc.select(lower(col('Primary Type')), upper(col('Primary Type')), substring(col('Primary Type'),1,4)).show(7, truncate=False)

+--------------------------+--------------------------+-----------------------------+
|lower(Primary Type)       |upper(Primary Type)       |substring(Primary Type, 1, 4)|
+--------------------------+--------------------------+-----------------------------+
|criminal sexual assault   |CRIMINAL SEXUAL ASSAULT   |CRIM                         |
|deceptive practice        |DECEPTIVE PRACTICE        |DECE                         |
|other offense             |OTHER OFFENSE             |OTHE                         |
|deceptive practice        |DECEPTIVE PRACTICE        |DECE                         |
|burglary                  |BURGLARY                  |BURG                         |
|criminal damage           |CRIMINAL DAMAGE           |CRIM                         |
|offense involving children|OFFENSE INVOLVING CHILDREN|OFFE                         |
+--------------------------+--------------------------+-----------------------------+
only showing top 7 rows



### Numeric Functions


Show the oldest date and the most recent date

In [78]:
from pyspark.sql.functions import min, max
rc.select(min("Date"), max("Date")).show()

+-------------------+-------------------+
|          min(Date)|          max(Date)|
+-------------------+-------------------+
|2019-10-10 00:00:00|2019-12-31 00:00:00|
+-------------------+-------------------+



**Date**


**What is 3 days earlier that the oldest date and 3 days later than the most recent date?**

In [87]:
from pyspark.sql.functions import date_add, lit, date_sub
rc.select(date_add(min("Date"),-3)).show()
rc.select(date_add(max("Date"),3)).show()

+-----------------------+
|date_add(min(Date), -3)|
+-----------------------+
|             2019-10-07|
+-----------------------+

+----------------------+
|date_add(max(Date), 3)|
+----------------------+
|            2020-01-03|
+----------------------+



In [88]:
rc.select(date_sub(min(col('Date')), 3), date_add(max(col('Date')), 3)).show()

+----------------------+----------------------+
|date_sub(min(Date), 3)|date_add(max(Date), 3)|
+----------------------+----------------------+
|            2019-10-07|            2020-01-03|
+----------------------+----------------------+

