# Working with joins

## Download and install Spark

In [1]:
!ls -l ../datasets/sparkbyexamples/

total 63068
-rw-rw-rw- 1 jovyan root    143130 Dec  5 09:23 appl_stock.csv
-rw-rw-rw- 1 jovyan root    884736 Dec  5 09:23 chinook.db
-rw-rw-rw- 1 jovyan root    305596 Dec  5 09:23 chinook.zip
-rw-rw-rw- 1 jovyan root        73 Dec  5 09:23 people.json
-rw-rw-rw- 1 jovyan users 57191790 Dec  4 11:59 reported-crimes.csv
-rw-rw-rw- 1 jovyan root       196 Dec  5 09:23 sales_info.csv
-rw-rw-rw- 1 jovyan root   6043326 Dec  5 09:23 sqlite_latest.jar


## Downloading and preprocessing Chicago's Reported Crime Data

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 

In [6]:
from pyspark.sql.functions import to_timestamp,col,lit
crimes_path ="../datasets/sparkbyexamples/reported-crimes.csv"
rc = spark.read.csv(crimes_path,header=True)\
.withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a'))\
.filter(col('Date') <= lit('2023-11-11'))
rc.show(5)

+--------+-----------+-------------------+--------------------+----+-----------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|               Block|IUCR|     Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+--------------------+----+-----------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|12592454|   JF113025|2022-01-14 15:55:00|   067XX S MORGAN ST|2826|    OTHER OFFENSE|HARASSMENT BY ELE...|           RESIDENCE| false| 

## Joins

**Download police station data**

In [8]:
!wget -O police-stations.csv https://data.cityofchicago.org/api/views/z8bn-74gv/rows.csv?accessType=DOWNLOAD
!mv police-stations.csv ../datasets/sparkbyexamples/
stations_path ="../datasets/sparkbyexamples/police-stations.csv"

--2024-12-05 10:09:40--  https://data.cityofchicago.org/api/views/z8bn-74gv/rows.csv?accessType=DOWNLOAD
Resolving data.cityofchicago.org (data.cityofchicago.org)... 100.28.82.57, 174.129.43.10, 35.170.133.124
Connecting to data.cityofchicago.org (data.cityofchicago.org)|100.28.82.57|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/csv]
Saving to: ‘police-stations.csv’

police-stations.csv     [ <=>                ]   5.57K  --.-KB/s    in 0s      

2024-12-05 10:09:40 (836 MB/s) - ‘police-stations.csv’ saved [5699]



In [9]:
ps = spark.read.csv(stations_path,header=True)
ps.show(5)

+------------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+
|    DISTRICT|DISTRICT NAME|            ADDRESS|   CITY|STATE|  ZIP|             WEBSITE|       PHONE|         FAX|         TTY|X COORDINATE|Y COORDINATE|   LATITUDE|   LONGITUDE|            LOCATION|
+------------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+
|Headquarters| Headquarters|3510 S Michigan Ave|Chicago|   IL|60653|http://home.chica...|        NULL|        NULL|        NULL| 1177731.401| 1881697.404|41.83070169|-87.62339535|(41.8307016873, -...|
|          18|   Near North| 1160 N Larrabee St|Chicago|   IL|60610|http://home.chica...|312-742-5870|312-742-5771|312-742-5773| 1172080.029| 1908086.527|41.90324165|-87.64335214|(41.9032416531, -

**The reported crimes dataset has only the district number. Add the district name by joining with the police station dataset**

In [10]:
rc.cache()
rc.count()

239618

In [11]:
ps.select( col('DISTRICT')).distinct().show(7)

+--------+
|DISTRICT|
+--------+
|       7|
|      15|
|      11|
|       3|
|       8|
|      22|
|      16|
+--------+
only showing top 7 rows



In [16]:
rc.select( col('district')).distinct().show(10)

+--------+
|district|
+--------+
|     009|
|     012|
|     024|
|    null|
|     031|
|     015|
|     006|
|     019|
|     020|
|     011|
+--------+
only showing top 10 rows



In [17]:
from pyspark.sql.functions import lpad

In [20]:
ps.select( lpad(col('DISTRICT'),3, '0')).show(10)

+--------------------+
|lpad(DISTRICT, 3, 0)|
+--------------------+
|                 Hea|
|                 018|
|                 019|
|                 020|
|                 022|
|                 024|
|                 025|
|                 001|
|                 002|
|                 003|
+--------------------+
only showing top 10 rows



In [22]:
ps = ps.withColumn('Formated_district', lpad(col('DISTRICT'),3, '0'))
ps.show(5)

+------------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+-----------------+
|    DISTRICT|DISTRICT NAME|            ADDRESS|   CITY|STATE|  ZIP|             WEBSITE|       PHONE|         FAX|         TTY|X COORDINATE|Y COORDINATE|   LATITUDE|   LONGITUDE|            LOCATION|Formated_district|
+------------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+-----------------+
|Headquarters| Headquarters|3510 S Michigan Ave|Chicago|   IL|60653|http://home.chica...|        null|        null|        null| 1177731.401| 1881697.404|41.83070169|-87.62339535|(41.8307016873, -...|              Hea|
|          18|   Near North| 1160 N Larrabee St|Chicago|   IL|60610|http://home.chica...|312-742-5870|312-742-5771|312-742-5

In [27]:
rc.join(ps, rc.District == ps.Formated_district , 'left_outer').show(5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+--------+-------------+--------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+-----------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|DISTRICT|DISTRICT NAME|             ADDRESS|   CITY|STATE|  ZIP|             WEBSITE|       PHONE|         FAX|         TTY|X COORDINATE|Y COORDINATE|   LATITUDE|   LONGITUDE|            LOCATION|Formated_district|
+-----

In [28]:
ps.columns

['DISTRICT',
 'DISTRICT NAME',
 'ADDRESS',
 'CITY',
 'STATE',
 'ZIP',
 'WEBSITE',
 'PHONE',
 'FAX',
 'TTY',
 'X COORDINATE',
 'Y COORDINATE',
 'LATITUDE',
 'LONGITUDE',
 'LOCATION',
 'Formated_district']

In [30]:
rc.join(ps, rc.District == ps.Formated_district , 'left_outer').drop(
 'ADDRESS',
 'CITY',
 'STATE',
 'ZIP',
 'WEBSITE',
 'PHONE',
 'FAX',
 'TTY',
 'X COORDINATE',
 'Y COORDINATE',
 'LATITUDE',
 'LONGITUDE',
 'LOCATION',
 'Formated_district'
).show(5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+----+--------------------+--------+-------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|Year|          Updated On|DISTRICT|DISTRICT NAME|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+----+--------------------+--------+-------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|           RESIDENCE| false|    true|0924|     009|  12|            61|     08B|2015|02/10/2018 03:50:...|       9|      Deering|
|10224739|   HY411615|2015-09-04 11:30:00| 008XX N CENTRAL AVE|0870|    