In [3]:
# a SparkSession object can perform the most common data processing tasks
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
spark = SparkSession.builder.appName('test').getOrCreate() # will return existing session if one was
                                                           # created before and was not closed

In [16]:
# let PySpark infer the schema
df = spark.read.csv('heart.csv', inferSchema=True, header=True)

# run an SQL query on the data
df.createOrReplaceTempView("df") # tell PySpark how the table will be called in the SQL query
spark.sql("""SELECT * from df""").show(2)

# we also choose columns using SQL sytnx, with a command that combins '.select()' and '.sql()'
df.selectExpr("age >= 40 as older", "age").show(2)

+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
|Age|Sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
| 40|  M|          ATA|      140|        289|        0|    Normal|  172|             N|    0.0|      Up|           0|
| 49|  F|          NAP|      160|        180|        0|    Normal|  156|             N|    1.0|    Flat|           1|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
only showing top 2 rows

+-----+---+
|older|age|
+-----+---+
| true| 40|
| true| 49|
+-----+---+
only showing top 2 rows



In [14]:
x = spark.sql("""SELECT COUNT(*) from df""").collect()[0][0]
print(x, type(x))

918 <class 'int'>


# your turn

use spark.sql() to perform the following actions
1. get the max and min cholesterol
2. group by ChestPainType and get the mean Cholesterol
3. how many incidents of each ChestPainType for each sex?


# solution

In [27]:
### 1 get the max and min cholesterol

# solution A (with spark.sql)
answer = spark.sql("""SELECT MIN(Cholesterol), MAX(Cholesterol) from df""")
answer.explain()
print(answer) # the answer is somewhere out there
answer.show()  # this is the action
min_, max_ = answer.collect()[0]
print(min_, max_)

# solution B (with selectExpr)
df.selectExpr("MIN(Cholesterol)", "MAX(Cholesterol)").show()

# solution C (with pyspark dataframes: select and F.min and F.max)
answer = df.select(F.min("Cholesterol"), F.max("Cholesterol"))
answer.explain()
answer.show()  # this is the action

== Physical Plan ==
*(2) HashAggregate(keys=[], functions=[min(Cholesterol#507), max(Cholesterol#507)])
+- Exchange SinglePartition
   +- *(1) HashAggregate(keys=[], functions=[partial_min(Cholesterol#507), partial_max(Cholesterol#507)])
      +- *(1) FileScan csv [Cholesterol#507] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/naya/notebooks/05_spark/02_dataframes/heart.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Cholesterol:int>
DataFrame[min(Cholesterol): int, max(Cholesterol): int]
+----------------+----------------+
|min(Cholesterol)|max(Cholesterol)|
+----------------+----------------+
|               0|             603|
+----------------+----------------+

0 603
+----------------+----------------+
|min(Cholesterol)|max(Cholesterol)|
+----------------+----------------+
|               0|             603|
+----------------+----------------+

== Physical Plan ==
*(2) HashAggregate(keys=[], functions=[min(Cholesterol#507), max(Cholesterol#5

In [29]:
### 2. group by ChestPainType and get the mean Cholesterol

# Solution A (with spark.sql)
spark.sql("""SELECT ChestPainType, AVG(Cholesterol) from df GROUP BY ChestPainType""").show()


# Solution B (with dataframes)
df.groupBy("ChestPainType").mean("Cholesterol").show()



+-------------+------------------+
|ChestPainType|  avg(Cholesterol)|
+-------------+------------------+
|          NAP| 197.4384236453202|
|          ATA|233.04624277456648|
|           TA|207.06521739130434|
|          ASY| 186.6451612903226|
+-------------+------------------+

+-------------+------------------+
|ChestPainType|  avg(Cholesterol)|
+-------------+------------------+
|          NAP| 197.4384236453202|
|          ATA|233.04624277456648|
|           TA|207.06521739130434|
|          ASY| 186.6451612903226|
+-------------+------------------+



In [30]:
# 3. how many incidents of each ChestPainType for each sex?
spark.sql("""
    SELECT sex, COUNT(ChestPainType)
    FROM df
    GROUP BY sex
    """).show()

+---+--------------------+
|sex|count(ChestPainType)|
+---+--------------------+
|  F|                 193|
|  M|                 725|
+---+--------------------+

