# Working with columns

## Setup environment

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 
spark

## Downloading and preprocessing Chicago's Reported Crime Data

In [5]:
!wget -O police-stations.csv https://data.cityofchicago.org/api/views/9hwr-2zxp/rows.csv?accessType=DOWNLOAD
!mv police-stations.csv ../datasets/sparkbyexamples/

--2024-11-26 21:57:52--  https://data.cityofchicago.org/api/views/9hwr-2zxp/rows.csv?accessType=DOWNLOAD
Resolving data.cityofchicago.org (data.cityofchicago.org)... 35.170.133.124, 100.28.82.57, 174.129.43.10
Connecting to data.cityofchicago.org (data.cityofchicago.org)|35.170.133.124|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/csv]
Saving to: ‘police-stations.csv’

police-stations.csv     [    <=>             ]  54.54M  2.41MB/s    in 24s     

2024-11-26 21:58:17 (2.26 MB/s) - ‘police-stations.csv’ saved [57189093]



In [6]:
from pyspark.sql.functions import to_timestamp,col,lit
path ="../datasets/sparkbyexamples/police-stations.csv"
rc = spark.read.csv(path,header=True)
rc.show(5)

+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|                Date|               Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|12651978|   JF185322|03/24/2022 05:07:...|  012XX S WABASH AVE|0910| MOTOR VEHICLE THEFT|          AUTOMOBILE|         GAS 

## Working with columns

**Display only the first 5 rows of the column name IUCR **

**Spark can infer the schema by default.
Spark takes a look at a couple of rows of the data, and tries to determine what kind of column each should be.**

**in a production environment, you want to explicitly define your schemas.**

In [6]:
rc.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: string (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Location: string (nullable = true)



**in a production environment, you want to explicitly define your schemas.**

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BooleanType, DoubleType, IntegerType

In [9]:
labels = [
('ID',StringType()),
('Case Number',StringType()), 
('Block',StringType()), 
('IUCR',StringType()),
('Primary Type',StringType()), 
('Description',StringType()), 
('Location Description',StringType()), 
('Arrest',StringType()), 
('Domestic',BooleanType()), 
('Beat',StringType()), 
('District',StringType()), 
('Ward',StringType()),
('Community Area',StringType()), 
('FBI Code',StringType()),
('X Coordinate',StringType()), 
('Y Coordinate',StringType()), 
('Year',IntegerType()),
('Updated On',StringType()), 
('Latitude',DoubleType()), 
('Longitude',DoubleType()),
('Location',StringType())
]


In [10]:
Myschema = StructType([StructField (x[0] ,x[1], True) for x in labels])
Myschema

StructType([StructField('ID', StringType(), True), StructField('Case Number', StringType(), True), StructField('Block', StringType(), True), StructField('IUCR', StringType(), True), StructField('Primary Type', StringType(), True), StructField('Description', StringType(), True), StructField('Location Description', StringType(), True), StructField('Arrest', StringType(), True), StructField('Domestic', BooleanType(), True), StructField('Beat', StringType(), True), StructField('District', StringType(), True), StructField('Ward', StringType(), True), StructField('Community Area', StringType(), True), StructField('FBI Code', StringType(), True), StructField('X Coordinate', StringType(), True), StructField('Y Coordinate', StringType(), True), StructField('Year', IntegerType(), True), StructField('Updated On', StringType(), True), StructField('Latitude', DoubleType(), True), StructField('Longitude', DoubleType(), True), StructField('Location', StringType(), True)])

In [14]:
rc1 = spark.read.csv(path,schema=Myschema)
rc1.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



In [17]:
df= spark.read.option('infeerSchema', True).csv(path, header=True)
df.printSchema()
df.show(3, truncate=False)

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: string (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Location: string (nullable = true)

+--------+-----------+----------------------+--------------------+----+--------------------------+--------

In [18]:
print(df.columns)

['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location']


**Display only the first 5 rows of the column name IUCR**



In [23]:
df.select(col('IUCR')).show(5)

+----+
|IUCR|
+----+
|0910|
|2826|
|1752|
|1544|
|1562|
+----+
only showing top 5 rows



  **Display only the first 4 rows of the column names Case Number, Date and Arrest**

In [21]:
df.select('Case Number', 'Arrest','Date' ).show(4)

+-----------+------+--------------------+
|Case Number|Arrest|                Date|
+-----------+------+--------------------+
|   JF185322| false|03/24/2022 05:07:...|
|   JF113025| false|01/14/2022 03:55:...|
|   JF124024| false|01/13/2022 04:00:...|
|   JF346553|  true|08/05/2022 09:00:...|
+-----------+------+--------------------+
only showing top 4 rows



** Add a column with name One, with entries all 1s **

In [22]:
df.withColumn('One', lit(1)).show(5)

+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+---+
|      ID|Case Number|                Date|               Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|One|
+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+---+
|12651978|   JF185322|03/24/2022 05:07:...|  012XX S WABASH AVE|0910| MOTOR VEHICLE THEFT|          AUTOMOBILE| 

** Remove the column IUCR **

In [16]:
rc2= rc.drop('IUCR')
print(rc2.columns)

['ID', 'Case Number', 'Date', 'Block', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location']
