In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [3]:
df = spark.createDataFrame([
    ('apple', 'red', 3),
    ('banana', 'yellow', 5),
    ('strawberry', 'red', 7)
], schema='fruit string, color string, quantity int')

Question 1

In [4]:
df.filter(df.color == 'red').show()

df.withColumn('quantity squared for no reason', pow(df.quantity, 2)).show()

                                                                                

+----------+-----+--------+
|     fruit|color|quantity|
+----------+-----+--------+
|     apple|  red|       3|
|strawberry|  red|       7|
+----------+-----+--------+

+----------+------+--------+------------------------------+
|     fruit| color|quantity|quantity squared for no reason|
+----------+------+--------+------------------------------+
|     apple|   red|       3|                           9.0|
|    banana|yellow|       5|                          25.0|
|strawberry|   red|       7|                          49.0|
+----------+------+--------+------------------------------+



Question 2

In [5]:
print(df.count(), 'rows')
df.show()

3 rows
+----------+------+--------+
|     fruit| color|quantity|
+----------+------+--------+
|     apple|   red|       3|
|    banana|yellow|       5|
|strawberry|   red|       7|
+----------+------+--------+



Question 3

In [6]:
from pyspark.sql.functions import sum, avg

df.select(avg(df.quantity), sum(df.quantity)).show()

+-------------+-------------+
|avg(quantity)|sum(quantity)|
+-------------+-------------+
|          5.0|           15|
+-------------+-------------+



Question 4

In [7]:
file_path = 'data/foo.csv'

df.write.csv(file_path, header=True, mode='overwrite') # 'overwrite' if the file alr exists

Question 5

[Relevant StackOverflow post](https://stackoverflow.com/questions/48927271/count-number-of-words-in-a-spark-dataframe)

`read.text()` will read the text file into a DataFrame. Each line will be stored in a separate row (as a string).

[`functions.split()`](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.functions.split) will split each line into an array (`Row`) of words.
[`functions.size()`](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.functions.size) will measure the size of this `Row` instance for each line. 

Finally sum the word-counts for all individual lines using [`functions.sum()`](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.functions.size). 

In [8]:
from pyspark.sql.functions import size, split, sum

file_path = 'data/shakespeare.txt'

df = spark.read.text(file_path) 

df = df.withColumn('words', size(split(df.value, ' '))) # append a column for words
df.select(sum(df.words)).show()

+----------+
|sum(words)|
+----------+
|       256|
+----------+



In [9]:
from pyspark.sql.functions import udf

@udf
def word_count(string):
    return len(string.split())

df.select(sum(word_count(df.value)).alias("word count")).show() # usage

+----------+
|word count|
+----------+
|     256.0|
+----------+



                                                                                

In [10]:
spark.stop()