# Spark SQL

In [2]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
# Get (or create) the Spark session used throughout the notebook and keep a
# handle on its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName('Spark DataFrames')
    .getOrCreate()
)
sc = spark.sparkContext

In [16]:
# Small labelled dataset: the dict keys become the pandas index and the values
# fill the single `position` column. (`pd` is imported in the first cell.)
data_dict = {'a': 1, 'b': 2, 'c': 3, 'd': 3, 'e': 1}
pandas_df = pd.DataFrame.from_dict(
    data_dict, orient='index', columns=['position'])
pandas_df

Unnamed: 0,position
a,1
b,2
c,3
d,3
e,1


In [17]:
# Build a Spark DataFrame from the pandas one. Only the `position` column
# appears in the resulting schema; the pandas index labels are not carried over.
spark_df = spark.createDataFrame(pandas_df)
spark_df

DataFrame[position: bigint]

In [18]:
# Print the DataFrame's rows as a text table.
spark_df.show()

+--------+
|position|
+--------+
|       1|
|       2|
|       3|
|       3|
|       1|
+--------+



### Running sql queries against DataFrames

In [19]:
# Register the DataFrame as a temporary view named 'my_table' so that SQL
# queries can refer to it by that name.
spark_df.createOrReplaceTempView('my_table')

In [45]:
# Query the temporary view with plain SQL and display the result.
spark.sql("SELECT * FROM my_table LIMIT 5").show()

+--------+
|position|
+--------+
|       1|
|       2|
|       3|
|       3|
|       1|
+--------+



Multi-line SQL statements are written inside a triple-quoted string (`"""`).

In [46]:
# List the tables known to this session; the temporary view 'my_table'
# registered earlier is expected to show up here.
spark.sql("SHOW TABLES;").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        | my_table|       true|
+--------+---------+-----------+



In [21]:
# The triple-quoted string lets the SQL statement span several lines.
spark.sql("""
    SELECT position
    FROM my_table
    LIMIT 2
""").show()

+--------+
|position|
+--------+
|       1|
|       2|
+--------+



That's convenient, but we can also use the PySpark DataFrame API to perform the same operations.

#### `.limit(num)`
Like SQL's `LIMIT`.  
Limit the DataFrame to `num` rows.

In [22]:
# DataFrame API counterpart of the SQL `LIMIT 2` query: keep two rows, then
# display them.
spark_df.limit(2).show()

+--------+
|position|
+--------+
|       1|
|       2|
+--------+



#### `.filter(...)`

In [23]:
# Keep only the rows whose `position` is at least 3; the condition is built
# from the `position` column of spark_df.
spark_df.filter(spark_df.position >= 3).show()

+--------+
|position|
+--------+
|       3|
|       3|
+--------+



--- 
> 💡 We can even mix both APIs

---

In [24]:
# selectExpr takes SQL expression strings, so SQL snippets can be mixed into
# a DataFrame method chain.
spark_df.limit(2).selectExpr("position * 2", "abs(position)").show()

+--------------+-------------+
|(position * 2)|abs(position)|
+--------------+-------------+
|             2|            1|
|             4|            2|
+--------------+-------------+



#### `.dropDuplicates(...)`

In [26]:
# Show the full DataFrame again for reference: the values 1 and 3 each
# appear twice.
spark_df.show()

+--------+
|position|
+--------+
|       1|
|       2|
|       3|
|       3|
|       1|
+--------+



In [27]:
# Remove duplicate rows. In the recorded output the surviving rows do not
# come back in the input order, so do not rely on row order here.
spark_df.dropDuplicates().show()

+--------+
|position|
+--------+
|       1|
|       3|
|       2|
+--------+



#### `.distinct()`

In [28]:
# Gives the same rows as dropDuplicates() on this single-column DataFrame.
spark_df.distinct().show()

+--------+
|position|
+--------+
|       1|
|       3|
|       2|
+--------+



#### `.orderBy(...)`
Alias of `.sort(...)`.

In [29]:
# Sort by the `position` column, referenced by name; the result is ascending.
spark_df.orderBy('position').show()

+--------+
|position|
+--------+
|       1|
|       1|
|       2|
|       3|
|       3|
+--------+



We can call `.desc()` to get a descending order, but that means we need an actual `Column` object to call it on. 

In [30]:
# Descending sort: .desc() is called on the Column object spark_df.position
# rather than on a column-name string.
spark_df.orderBy(spark_df.position.desc()).show()

+--------+
|position|
+--------+
|       3|
|       3|
|       2|
|       1|
|       1|
+--------+



#### `.groupBy(...)`

In [31]:
# groupBy on its own only returns a GroupedData object; no rows are produced.
spark_df.groupBy('position')

<pyspark.sql.group.GroupedData at 0x7fcd417cfb50>

Returns a `GroupedData` object. We need to apply an aggregation to it (such as `.count()`) to get a DataFrame back.

In [32]:
# count() on GroupedData is an aggregation: it returns a new DataFrame
# (position, count) and displays nothing until .show() is called on it.
spark_df.groupBy('position').count()

DataFrame[position: bigint, count: bigint]

⚠️ When applied to a DataFrame, `.count()` is an action. When applied to a `GroupedData` object, as here, it returns a `DataFrame`, i.e. it is still waiting for an action.

In [33]:
# Calling .show() on the aggregated DataFrame finally displays the number of
# rows per position.
spark_df.groupBy('position').count().show()

+--------+-----+
|position|count|
+--------+-----+
|       1|    2|
|       3|    2|
|       2|    1|
+--------+-----+



#### Chaining everything together

In [34]:
# Count the rows per position among positions below 2, sort by that count
# and display at most five groups.
(
    spark_df
    .filter(spark_df.position < 2)
    .groupBy('position')
    .count()
    .orderBy('count')
    .limit(5)
    .show()
)

+--------+-----+
|position|count|
+--------+-----+
|       1|    2|
+--------+-----+



Question: what if we want to order by descending count?

In [36]:
# Same pipeline with a wider filter; passing ascending=False to orderBy puts
# the largest counts first.
(
    spark_df
    .filter(spark_df.position < 10)
    .groupBy('position')
    .count()
    .orderBy('count', ascending=False)
    .limit(5)
    .show()
)

+--------+-----+
|position|count|
+--------+-----+
|       1|    2|
|       3|    2|
|       2|    1|
+--------+-----+



In [38]:
# Variant that sorts with a Column object and .desc().
# NOTE(review): this orders the groups by `position` (descending), not by
# `count`, so it does not answer the "descending count" question.
(
    spark_df
    .filter(spark_df.position < 10)
    .groupBy('position')
    .count()
    .orderBy(spark_df.position.desc())
    .limit(5)
    .show()
)

+--------+-----+
|position|count|
+--------+-----+
|       3|    2|
|       2|    1|
|       1|    2|
+--------+-----+



### Adding columns
Using a plain `select` is possible, but can feel tedious.

In [39]:
# '*' keeps every existing column; the aliased copy of `position` is appended
# as `newColumn`.
spark_df.select('*', spark_df.position.alias('newColumn')).show()

+--------+---------+
|position|newColumn|
+--------+---------+
|       1|        1|
|       2|        2|
|       3|        3|
|       3|        3|
|       1|        1|
+--------+---------+



#### `.withColumn(...)`
It's usually easier to use `.withColumn` for the same effect.

In [40]:
# Same result as the select/alias version: append `newColumn` holding the
# values of `position`.
spark_df.withColumn('newColumn', spark_df.position).show()

+--------+---------+
|position|newColumn|
+--------+---------+
|       1|        1|
|       2|        2|
|       3|        3|
|       3|        3|
|       1|        1|
+--------+---------+



#### `.withColumnRenamed(...)`

In [41]:
# Display the DataFrame with `position` renamed to `newName`; the result is
# not assigned, so spark_df itself keeps its original column name.
spark_df.withColumnRenamed('position', 'newName').show()

+-------+
|newName|
+-------+
|      1|
|      2|
|      3|
|      3|
|      1|
+-------+



### Displaying the DataFrame
For when `.show()` won't cut it...

#### Converting to pandas
Using `toPandas()`: this is an action, so it triggers the computation and loads the result into local memory.  
Hence, do **NOT** forget to `limit` first, or you may run out of memory (unless the DataFrame is small, like the result of an aggregate).

In [42]:
# Cap the number of rows before converting to pandas.
# NOTE(review): this rebinds `pandas_df`, replacing the frame built earlier
# from `data_dict`; re-running the earlier createDataFrame cell afterwards
# would pick up this frame instead.
pandas_df = (
    spark_df
    .limit(5)
    .toPandas()
)
pandas_df

Unnamed: 0,position
0,1
1,2
2,3
3,3
4,1
