-- Notepad to myself --

# Working with Joins

In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [2]:
from pyspark.sql.functions import to_timestamp, col
# Load the 2021 reported-crimes CSV (first row is the header, column types are
# inferred), then replace the textual 'Date' column with a parsed timestamp.
crimes_csv = spark.read.csv(path='data/Crimes-2021.csv', header=True, inferSchema=True)
parsed_date = to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a')
df = crimes_csv.withColumn('Date', parsed_date)
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: integer (nullable = true)
 |-- Y Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



Download the police station dataset from the same Chicago data portal and load it as an additional dataframe -> https://data.cityofchicago.org/Public-Safety/Police-Stations/z8bn-74gv

Notice that in the reported crimes dataset, we have the *district numbers*, but we don't have the *district names*. It just so happens that the police station dataset has both the district numbers and the names. What we'll do then is join the two tables together and use the police station dataframe to look up the district name associated with each district number in our original reported crimes dataframe.

In [3]:
# Load the police-station CSV with the same reader settings as the crimes data:
# first row is the header and column types are inferred.
stations_reader = spark.read.options(header=True, inferSchema=True)
ps = stations_reader.csv('data/Police-Stations.csv')
ps.printSchema()

root
 |-- DISTRICT: string (nullable = true)
 |-- DISTRICT NAME: string (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- ZIP: integer (nullable = true)
 |-- WEBSITE: string (nullable = true)
 |-- PHONE: string (nullable = true)
 |-- FAX: string (nullable = true)
 |-- TTY: string (nullable = true)
 |-- X COORDINATE: double (nullable = true)
 |-- Y COORDINATE: double (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- LOCATION: string (nullable = true)



The reported crimes dataset has only the *district number*. Add the *district name* by joining with the police station dataset

In [4]:
# Mark the crimes dataframe for caching and count its rows in one expression;
# cache() returns the dataframe itself, so the displayed result is the count.
df.cache().count()

207879

In [5]:
# Peek at the distinct district identifiers on the police-station side.
ps.select('DISTRICT').distinct().show(n=10)

+--------+
|DISTRICT|
+--------+
|       7|
|      15|
|      11|
|       3|
|       8|
|      22|
|      16|
|       5|
|      18|
|      17|
+--------+
only showing top 10 rows



In [6]:
# Peek at the distinct district numbers on the reported-crimes side.
df.select(df['District']).distinct().show(n=10)

+--------+
|District|
+--------+
|      31|
|      12|
|      22|
|       1|
|       6|
|      16|
|       3|
|      20|
|       5|
|      19|
+--------+
only showing top 10 rows



Joining two tables

In [7]:
# Preview the schema produced by left-joining crimes to stations on district;
# nothing is kept here, the joined frame is only inspected.
district_match = df['District'] == ps['DISTRICT']
df.join(ps, on=district_match, how='left_outer').printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: integer (nullable = true)
 |-- Y Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- DISTRICT: string (nullable = true)
 |-- DISTRICT NAME: string (nullable = true)
 |-- ADDR

In [8]:
# Column names of the reported-crimes dataframe (compare with the station columns).
df.columns

['ID',
 'Case Number',
 'Date',
 'Block',
 'IUCR',
 'Primary Type',
 'Description',
 'Location Description',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'FBI Code',
 'X Coordinate',
 'Y Coordinate',
 'Year',
 'Updated On',
 'Latitude',
 'Longitude',
 'Location']

In [9]:
# Column names of the police-station dataframe; several differ from the crimes
# columns only by letter case (e.g. DISTRICT, LATITUDE, LOCATION).
ps.columns

['DISTRICT',
 'DISTRICT NAME',
 'ADDRESS',
 'CITY',
 'STATE',
 'ZIP',
 'WEBSITE',
 'PHONE',
 'FAX',
 'TTY',
 'X COORDINATE',
 'Y COORDINATE',
 'LATITUDE',
 'LONGITUDE',
 'LOCATION']

In [10]:
# Attach the district name to every reported crime.
#
# Only two police-station columns are needed: the join key and the name.
# Projecting and renaming them *before* the join means no station column can
# collide with a crime column that differs only by letter case
# (District/DISTRICT, Latitude/LATITUDE, Location/LOCATION, ...). That removes
# the need to flip the session-wide `spark.sql.caseSensitive` setting just to
# make drop() precise, and the need to list every unwanted column by hand.
station_names = ps.select(
    ps['DISTRICT'].alias('station_district'),
    ps['DISTRICT NAME'].alias('District Name'),
)

# A left outer join keeps every crime row; a district with no matching station
# ends up with a null 'District Name'.
# NOTE(review): `District` is an integer while the station key was inferred as
# a string, so this comparison relies on Spark's implicit cast -- confirm that
# is acceptable for the Spark version / ANSI setting in use.
df_joined = df.join(
    station_names,
    df['District'] == station_names['station_district'],
    'left_outer',
).drop('station_district')

In [11]:
# Reorder the joined columns alphabetically by name.
alphabetical_columns = sorted(df_joined.columns)
df_final = df_joined.select(*alphabetical_columns)

In [12]:
# Confirm the final column order and that 'District Name' is now present.
df_final.columns

['Arrest',
 'Beat',
 'Block',
 'Case Number',
 'Community Area',
 'Date',
 'Description',
 'District',
 'District Name',
 'Domestic',
 'FBI Code',
 'ID',
 'IUCR',
 'Latitude',
 'Location',
 'Location Description',
 'Longitude',
 'Primary Type',
 'Updated On',
 'Ward',
 'X Coordinate',
 'Y Coordinate',
 'Year']

In [13]:
# List every distinct (district number, district name) pair; show up to 100
# rows so the table is not truncated.
district_pairs = df_final.select(['District', 'District Name']).distinct()
district_pairs.show(n=100)

+--------+--------------+
|District| District Name|
+--------+--------------+
|      31|          null|
|      15|        Austin|
|      22|   Morgan Park|
|       2|     Wentworth|
|       1|       Central|
|       6|       Gresham|
|      11|      Harrison|
|      12|     Near West|
|       5|       Calumet|
|      14|   Shakespeare|
|      18|    Near North|
|      17|   Albany Park|
|       8|  Chicago Lawn|
|      19|     Town Hall|
|       7|     Englewood|
|      24|   Rogers Park|
|       4| South Chicago|
|      25| Grand Central|
|       9|       Deering|
|      10|         Ogden|
|       3|Grand Crossing|
|      20|       Lincoln|
|      16|Jefferson Park|
+--------+--------------+



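District 31 appears in the reported crimes dataset but is missing from the police station dataset, so its district name is null after the left outer join.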