# Working with columns

## Setup environment

In [1]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 
spark

## Downloading and preprocessing Chicago's Reported Crime Data

In [2]:
from pyspark.sql.functions import to_timestamp,col,lit
path ="../datasets/sparkbyexamples/police-stations.csv"
rc = spark.read.csv(path,header=True)
rc.show(5)

+------------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+
|    DISTRICT|DISTRICT NAME|            ADDRESS|   CITY|STATE|  ZIP|             WEBSITE|       PHONE|         FAX|         TTY|X COORDINATE|Y COORDINATE|   LATITUDE|   LONGITUDE|            LOCATION|
+------------+-------------+-------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+
|Headquarters| Headquarters|3510 S Michigan Ave|Chicago|   IL|60653|http://home.chica...|        NULL|        NULL|        NULL| 1177731.401| 1881697.404|41.83070169|-87.62339535|(41.8307016873, -...|
|          18|   Near North| 1160 N Larrabee St|Chicago|   IL|60610|http://home.chica...|312-742-5870|312-742-5771|312-742-5773| 1172080.029| 1908086.527|41.90324165|-87.64335214|(41.9032416531, -

## Working with columns

**Display only the first 5 rows of the column name IUCR **

**Spark can infer the schema by default.
Spark takes a look at a couple of rows of the data, and tries to determine what kind of column each should be.**

**in a production environment, you want to explicitly define your schemas.**

In [3]:
rc.printSchema()

root
 |-- DISTRICT: string (nullable = true)
 |-- DISTRICT NAME: string (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- ZIP: string (nullable = true)
 |-- WEBSITE: string (nullable = true)
 |-- PHONE: string (nullable = true)
 |-- FAX: string (nullable = true)
 |-- TTY: string (nullable = true)
 |-- X COORDINATE: string (nullable = true)
 |-- Y COORDINATE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- LOCATION: string (nullable = true)



**in a production environment, you want to explicitly define your schemas.**

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BooleanType, DoubleType, IntegerType

In [6]:
labels = [
('ID',StringType()),
('Case Number',StringType()), 
('Block',StringType()), 
('IUCR',StringType()),
('Primary Type',StringType()), 
('Description',StringType()), 
('Location Description',StringType()), 
('Arrest',StringType()), 
('Domestic',BooleanType()), 
('Beat',StringType()), 
('District',StringType()), 
('Ward',StringType()),
('Community Area',StringType()), 
('FBI Code',StringType()),
('X Coordinate',StringType()), 
('Y Coordinate',StringType()), 
('Year',IntegerType()),
('Updated On',StringType()), 
('Latitude',DoubleType()), 
('Longitude',DoubleType()),
('Location',StringType())
]


In [7]:
Myschema = StructType([StructField (x[0] ,x[1], True) for x in labels])
Myschema

StructType([StructField('ID', StringType(), True), StructField('Case Number', StringType(), True), StructField('Block', StringType(), True), StructField('IUCR', StringType(), True), StructField('Primary Type', StringType(), True), StructField('Description', StringType(), True), StructField('Location Description', StringType(), True), StructField('Arrest', StringType(), True), StructField('Domestic', BooleanType(), True), StructField('Beat', StringType(), True), StructField('District', StringType(), True), StructField('Ward', StringType(), True), StructField('Community Area', StringType(), True), StructField('FBI Code', StringType(), True), StructField('X Coordinate', StringType(), True), StructField('Y Coordinate', StringType(), True), StructField('Year', IntegerType(), True), StructField('Updated On', StringType(), True), StructField('Latitude', DoubleType(), True), StructField('Longitude', DoubleType(), True), StructField('Location', StringType(), True)])

In [16]:
rc2 = spark.read.csv(path,schema=Myschema)
rc2.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



In [10]:
rc.show(3, truncate=False)

+------------+-------------+-------------------+-------+-----+-----+---------------------------------------------------------------------------+------------+------------+------------+------------+------------+-----------+------------+-------------------------------+
|DISTRICT    |DISTRICT NAME|ADDRESS            |CITY   |STATE|ZIP  |WEBSITE                                                                    |PHONE       |FAX         |TTY         |X COORDINATE|Y COORDINATE|LATITUDE   |LONGITUDE   |LOCATION                       |
+------------+-------------+-------------------+-------+-----+-----+---------------------------------------------------------------------------+------------+------------+------------+------------+------------+-----------+------------+-------------------------------+
|Headquarters|Headquarters |3510 S Michigan Ave|Chicago|IL   |60653|http://home.chicagopolice.org                                              |NULL        |NULL        |NULL        |1177731.401 |188

**Display only the first 5 rows of the column name IUCR**



In [17]:
print(rc2.columns)

['ID', 'Case Number', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location']


In [19]:
rc2.select('IUCR').show(5)

+-------+
|   IUCR|
+-------+
|   CITY|
|Chicago|
|Chicago|
|Chicago|
|Chicago|
+-------+
only showing top 5 rows



  **Display only the first 4 rows of the column names Case Number, Date and Arrest**

In [21]:
rc2.select('Case Number', 'Arrest' ).show(4)

+-------------+------------+
|  Case Number|      Arrest|
+-------------+------------+
|DISTRICT NAME|       PHONE|
| Headquarters|        NULL|
|   Near North|312-742-5870|
|    Town Hall|312-744-8320|
+-------------+------------+
only showing top 4 rows



** Add a column with name One, with entries all 1s **

In [22]:
rc2.withColumn('One', lit(1)).show(5)

+------------+-------------+-------------------+-------+------------+-----------+--------------------+------------+--------+------------+------------+------------+--------------+------------+--------------------+------------+----+----------+--------+---------+--------+---+
|          ID|  Case Number|              Block|   IUCR|Primary Type|Description|Location Description|      Arrest|Domestic|        Beat|    District|        Ward|Community Area|    FBI Code|        X Coordinate|Y Coordinate|Year|Updated On|Latitude|Longitude|Location|One|
+------------+-------------+-------------------+-------+------------+-----------+--------------------+------------+--------+------------+------------+------------+--------------+------------+--------------------+------------+----+----------+--------+---------+--------+---+
|    DISTRICT|DISTRICT NAME|            ADDRESS|   CITY|       STATE|        ZIP|             WEBSITE|       PHONE|    NULL|         TTY|X COORDINATE|Y COORDINATE|      LATITUDE|

** Remove the column IUCR **

In [24]:
rc2= rc.drop('IUCR')
print(rc2.columns)

['DISTRICT', 'DISTRICT NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP', 'WEBSITE', 'PHONE', 'FAX', 'TTY', 'X COORDINATE', 'Y COORDINATE', 'LATITUDE', 'LONGITUDE', 'LOCATION']
