# RDDs

## Check for JAVA_HOME

In [1]:
import os

# Spark needs a compatible JDK (1.8 for this setup); show which one JAVA_HOME points to.
print(os.environ.get("JAVA_HOME"))

/Library/Java/JavaVirtualMachines/jdk1.8.0_202.jdk/Contents/Home


## Setup environment

In [2]:
# findspark.init() is called before the pyspark imports on purpose: it is what
# makes the local Spark installation importable when pyspark is not already on
# sys.path, so keep this statement order.
import findspark
findspark.init()
from pyspark import SparkContext
# Low-level entry point used for the RDD examples (sc.textFile);
# getOrCreate() reuses an already running context instead of starting a second one.
sc = SparkContext.getOrCreate()

import pyspark
from pyspark.sql import SparkSession
# High-level entry point used for the DataFrame examples (spark.read.csv).
spark = SparkSession.builder.getOrCreate() 

In [3]:
from pyspark.sql.functions import to_timestamp,col
# DataFrame way: the first line of the file supplies the column names
ps = spark.read.option('header', True).csv('Police_Stations.csv')

# RDD way: the same file as an RDD of raw text lines (the header line is still in there)
ps_rdd = sc.textFile('Police_Stations.csv')

In [4]:
ps_rdd.first()  # the raw-text RDD keeps the CSV header as its first record

'DISTRICT,DISTRICT NAME,ADDRESS,CITY,STATE,ZIP,WEBSITE,PHONE,FAX,TTY,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION'

In [5]:
# Remember the header line so it can be filtered out of the data records later
ps_rdd_header = ps_rdd.first() 

In [6]:
# Data records only: drop every line that is identical to the header line
ps_rdd_rest = ps_rdd.filter(lambda line: line != ps_rdd_header)
ps_rdd_rest.first()  # first data record of the RDD (still one unsplit CSV line)

'1,Central,1718 S State St,Chicago,IL,60616,http://home.chicagopolice.org/community/districts/1st-district-central/,312-745-4290,312-745-3694,312-745-3693,1176569.052,1891771.704,41.85837259,-87.62735617,"(41.8583725929, -87.627356171)"'

In [6]:
# Column names of the DataFrame, taken from the CSV header row (header=True on read)
ps.columns

['DISTRICT',
 'DISTRICT NAME',
 'ADDRESS',
 'CITY',
 'STATE',
 'ZIP',
 'WEBSITE',
 'PHONE',
 'FAX',
 'TTY',
 'X COORDINATE',
 'Y COORDINATE',
 'LATITUDE',
 'LONGITUDE',
 'LOCATION']

**How many police stations are there?**

In [7]:
# DataFrame way: number of rows in the stations DataFrame
ps.count()

24

In [16]:
# RDDs way
# The header line was already filtered out of ps_rdd_rest, so every remaining
# line is one station record and counting the lines answers the question.
# Splitting each line into cells first is unnecessary for a count, and a bare
# .map(...) on its own line is lazy: it computes nothing unless an action
# consumes its result.
# NOTE(review): assumes no record spans several physical lines (no quoted
# newlines in the CSV) -- holds for the sample shown, confirm for other files.
ps_rdd_rest.count()

24

**Display the District ID, District name, Address and Zip for the police station with District ID 7**



In [9]:
# DataFrame way: keep only district 7, then project the four requested columns
(
    ps
    .where(col('DISTRICT') == 7)
    .select('DISTRICT', 'DISTRICT NAME', 'ADDRESS', 'ZIP')
    .show()
)

+--------+-------------+--------------+-----+
|DISTRICT|DISTRICT NAME|       ADDRESS|  ZIP|
+--------+-------------+--------------+-----+
|       7|    Englewood|1438 W 63rd St|60636|
+--------+-------------+--------------+-----+



In [20]:
# RDDs way
import csv  # stdlib CSV parser; understands quoted fields

(
    ps_rdd_rest
    # Parse every line exactly once. A plain split(',') is re-run for each
    # field it is used for, cuts the quoted LOCATION value "(lat, lon)" in two,
    # and would shift the indexes of any record whose earlier fields contain a
    # quoted comma.
    .map(lambda line: next(csv.reader([line])))
    # bool(fields) guards against blank lines, which parse to an empty list
    .filter(lambda fields: bool(fields) and fields[0] == '7')
    # Positions follow the header: 0=DISTRICT, 1=DISTRICT NAME, 2=ADDRESS, 5=ZIP
    .map(lambda fields: (fields[0], fields[1], fields[2], fields[5]))
    .collect()
)

[('7', 'Englewood', '1438 W 63rd St', '60636')]

**Police stations 10 and 11 are geographically close to each other. Display the District ID, District name, address and zip code of both stations.**

In [15]:
# DataFrame way: keep districts 10 and 11, then project the four requested columns
(
    ps
    .where(col('DISTRICT').isin(10, 11))
    .select('DISTRICT', 'DISTRICT NAME', 'ADDRESS', 'ZIP')
    .show()
)

+--------+-------------+------------------+-----+
|DISTRICT|DISTRICT NAME|           ADDRESS|  ZIP|
+--------+-------------+------------------+-----+
|      10|        Ogden|  3315 W Ogden Ave|60623|
|      11|     Harrison|3151 W Harrison St|60612|
+--------+-------------+------------------+-----+



In [21]:
# RDDs way
import csv  # stdlib CSV parser; understands quoted fields

(
    ps_rdd_rest
    # Parse every line exactly once. A plain split(',') is re-run for each
    # field it is used for, cuts the quoted LOCATION value "(lat, lon)" in two,
    # and would shift the indexes of any record whose earlier fields contain a
    # quoted comma.
    .map(lambda line: next(csv.reader([line])))
    # bool(fields) guards against blank lines, which parse to an empty list
    .filter(lambda fields: bool(fields) and fields[0] in ('10', '11'))
    # Positions follow the header: 0=DISTRICT, 1=DISTRICT NAME, 2=ADDRESS, 5=ZIP
    .map(lambda fields: (fields[0], fields[1], fields[2], fields[5]))
    .collect()
)

[('10', 'Ogden', '3315 W Ogden Ave', '60623'),
 ('11', 'Harrison', '3151 W Harrison St', '60612')]