-- Notepad to myself --

# Working with Rows

In [1]:
from pyspark.sql import SparkSession
# Reuse the already-running SparkSession if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate() 

In [2]:
from pyspark.sql.functions import to_timestamp, col
# Load the 2021 crimes CSV (first line is the header, column types are inferred),
# then replace the textual 'Date' column with a parsed timestamp.
df = (
    spark.read
    .csv('data/Crimes-2021.csv', header=True, inferSchema=True)
    .withColumn('Date', to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a'))
)
# Print the resulting column names and types.
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: integer (nullable = true)
 |-- Y Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



### 1. Filtering Rows in PySpark

We can filter rows based on a condition. In PySpark we call DataFrame.filter() and pass it the condition we want to filter by. pandas is very similar: we put a boolean condition on a column (for example, DataFrame.column == value) inside the square brackets of the DataFrame.

In [10]:
# Keep only the rows whose ID matches, then fetch at most one of them as a list of Row objects.
df.filter(col('ID') == 12343475).take(1)

[Row(ID=12343475, Case Number='JE202728', Date=datetime.datetime(2021, 4, 16, 20, 45), Block='056XX N RIDGE AVE', IUCR='0820', Primary Type='THEFT', Description='$500 AND UNDER', Location Description='OTHER (SPECIFY)', Arrest=False, Domestic=False, Beat=2013, District=20, Ward=48, Community Area=77, FBI Code='06', X Coordinate=None, Y Coordinate=None, Year=2021, Updated On='04/23/2021 04:51:47 PM', Latitude=None, Longitude=None, Location=None)]

### 1. Filtering Rows in pandas

In [None]:
import pandas as pd
# Convert the Spark DataFrame into a pandas DataFrame for the pandas examples.
# NOTE(review): this cell has no recorded execution count ("In [None]"); re-run it on a
# fresh kernel before the cells that use df2.
df2 = df.toPandas()

In [11]:
# A boolean mask on the ID column, placed inside the square brackets, selects the matching row(s).
df2[df2['ID'] == 12343475]

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,12343475,JE202728,2021-04-16 20:45:00,056XX N RIDGE AVE,820,THEFT,$500 AND UNDER,OTHER (SPECIFY),False,False,...,48.0,77,6,,,2021,04/23/2021 04:51:47 PM,,,


### 2. Get Unique Rows in PySpark

In [12]:
# Project the IUCR column, drop duplicate values, and print five of the distinct codes.
df.select('IUCR').distinct().show(5)

+----+
|IUCR|
+----+
|1090|
|1512|
|0558|
|0895|
|0461|
+----+
only showing top 5 rows



### 2. Get Unique Rows in pandas

In [28]:
# Distinct IUCR codes as an array; slice off the first five.
df2['IUCR'].unique()[:5]

array(['0820', '500E', '1752', '0850', '0486'], dtype=object)

### 3. Sort Rows in PySpark

Sorting is a very common operation. In PySpark we use orderBy(); in pandas we use the sort_values() function, either passing it the column name or calling it on the column itself.

In [31]:
# Sort by ID, largest first, and print the five highest IDs.
df.orderBy(col('ID').desc()).select('ID').show(5)

+--------+
|      ID|
+--------+
|12847891|
|12847621|
|12847574|
|12847342|
|12846733|
+--------+
only showing top 5 rows



### 3. Sort Rows in pandas

In [32]:
# Attribute-style column access: sort the ID column in descending order and keep the top five.
df2.ID.sort_values(ascending=False).head(5)

195268    12847891
195236    12847621
195241    12847574
195247    12847342
207847    12846733
Name: ID, dtype: int32

In [33]:
# Bracket-style column access; otherwise the same descending sort of the ID column.
df2['ID'].sort_values(ascending=False).head(5)

195268    12847891
195236    12847621
195241    12847574
195247    12847342
207847    12846733
Name: ID, dtype: int32

### 4. Append Rows in PySpark

Since Spark DataFrames are immutable, we cannot add rows to an existing DataFrame in place. Instead, we union the original DataFrame with a new one. This concatenates the two DataFrames, so we need to make sure that both have the same number of columns and the same schema; otherwise the union will fail. Note that union() matches columns by position, not by name. In pandas we would use the concat() function.

In [42]:
# Take five rows from the full DataFrame to act as the rows we will append.
df3 = df.limit(5)
df3.show()

+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|               Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|12343475|   JE202728|2021-04-16 20:45:00|   056XX N RIDGE AVE|0820|               THEFT|      $500 AND UNDER|     OTHER (SPECI

In [46]:
# Show just the IDs of the five rows in df3 so they are easy to spot after the union.
df3.select('ID').show()

+--------+
|      ID|
+--------+
|12343475|
|12602803|
|12565821|
|12540388|
|12541139|
+--------+



In [47]:
# Append df3's rows to df and look at the last ten IDs; the final five are the appended rows.
df.union(df3).select('ID').tail(10)

[Row(ID=12845432),
 Row(ID=12845249),
 Row(ID=12846092),
 Row(ID=12841050),
 Row(ID=12839669),
 Row(ID=12343475),
 Row(ID=12602803),
 Row(ID=12565821),
 Row(ID=12540388),
 Row(ID=12541139)]

### 4. Append Rows in pandas

In [52]:
# First five pandas rows, used as the rows to append.
df4 = df2.head(5)
df4

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,12343475,JE202728,2021-04-16 20:45:00,056XX N RIDGE AVE,0820,THEFT,$500 AND UNDER,OTHER (SPECIFY),False,False,...,48.0,77,06,,,2021,04/23/2021 04:51:47 PM,,,
1,12602803,JF125633,2021-10-21 11:00:00,083XX S STONY ISLAND AVE,500E,OTHER OFFENSE,EAVESDROPPING,OTHER (SPECIFY),False,False,...,8.0,45,26,1188260.0,1849805.0,2021,02/27/2022 03:46:31 PM,41.742941,-87.585783,"(41.74294124, -87.585783412)"
2,12565821,JE475344,2021-12-11 20:00:00,077XX S MAY ST,1752,OFFENSE INVOLVING CHILDREN,AGGRAVATED CRIMINAL SEXUAL ABUSE BY FAMILY MEMBER,RESIDENCE,False,True,...,17.0,71,17,1169996.0,1853419.0,2021,09/27/2022 04:46:33 PM,41.753274,-87.652598,"(41.753274445, -87.652598299)"
3,12540388,JE444591,2021-11-14 06:00:00,086XX S COTTAGE GROVE AVE,0850,THEFT,ATTEMPT THEFT,CONVENIENCE STORE,False,False,...,6.0,44,06,1183071.0,1847869.0,2021,11/21/2021 03:48:03 PM,41.737751,-87.604856,"(41.737750767, -87.604855911)"
4,12541139,JE445494,2021-11-14 04:00:00,034XX W 38TH ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,...,12.0,58,08B,1154073.0,1879187.0,2021,11/21/2021 03:48:03 PM,41.824317,-87.710266,"(41.824316537, -87.710266215)"


In [54]:
# Stack df4 underneath df2 and show the last ten IDs. The appended rows keep their
# original index labels (0-4); pass ignore_index=True to concat() to renumber them.
pd.concat([df2, df4]).ID.tail(10)

207874    12845432
207875    12845249
207876    12846092
207877    12841050
207878    12839669
0         12343475
1         12602803
2         12565821
3         12540388
4         12541139
Name: ID, dtype: int32