- ["Quickstart: DataFrame"](https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html) -- a one-page guide from the official docs

- [`pyspark.sql.functions`](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html) -- discover useful column-wise functions here

In [14]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise create a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Question 1

In [15]:
# Single-column integer DataFrame holding 1 through 4
# (the upper bound of range() is exclusive).
df = spark.createDataFrame(
    [(value,) for value in range(1, 5)],
    schema='x int',
)

In [16]:
# Import the functions module under a namespace instead of importing `pow`
# directly: a bare `pow` import rebinds Python's built-in pow() for every
# later cell that runs in this kernel.
from pyspark.sql import functions as F

# Add a derived column "x squared" holding x raised to the power 2.
df.withColumn("x squared", F.pow(df.x, 2)).show()

+---+---------+
|  x|x squared|
+---+---------+
|  1|      1.0|
|  2|      4.0|
|  3|      9.0|
|  4|     16.0|
+---+---------+



Question 2

In [17]:
# Import the functions module under a namespace instead of importing `max`
# directly: a bare `max` import rebinds Python's built-in max(), so a later
# `max([1, 2])` in this kernel would call Spark's aggregate instead.
from pyspark.sql import functions as F

# Aggregate the whole DataFrame into one row holding the largest x.
df.select(F.max(df.x)).show()

+------+
|max(x)|
+------+
|     4|
+------+



Question 3

In [18]:
from pyspark.sql.functions import avg

# Collapse the whole DataFrame into one row holding the mean of x.
df.agg(avg(df.x)).show()

+------+
|avg(x)|
+------+
|   2.5|
+------+



Question 4

In [19]:
file_path = 'data/foo.csv'

# Write df as CSV with a header row; 'overwrite' replaces the output if it already exists.
df.write.mode('overwrite').option('header', True).csv(file_path)

# Read the CSV back, treating the first line as column names, and display it.
spark.read.option('header', True).csv(file_path).show()

+---+
|  x|
+---+
|  1|
|  2|
|  3|
|  4|
+---+



Question 5

In [20]:
# Preview only the first 3 rows (show() prints 20 when no count is given).
df.show(n=3)

# Print each column's name, type, and nullability as a tree.
df.printSchema()

+---+
|  x|
+---+
|  1|
|  2|
|  3|
+---+
only showing top 3 rows

root
 |-- x: integer (nullable = true)



Question 6

In [21]:
# Summary statistics (count, mean, stddev, min, max) for column x.
df.select(df.x).describe().show()

# an equivalent, shorter spelling is
# df.describe('x').show()

+-------+------------------+
|summary|                 x|
+-------+------------------+
|  count|                 4|
|   mean|               2.5|
| stddev|1.2909944487358056|
|    min|                 1|
|    max|                 4|
+-------+------------------+

