In [19]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [20]:
df = spark.createDataFrame([
    ('apple', 'red', 3),
    ('banana', 'yellow', 5),
    ('strawberry', 'red', 7)
], schema='fruit string, color string, quantity int')

Question 1

In [21]:
df.filter(df.color == 'red').show()

df.withColumn('quantity squared for no reason', pow(df.quantity, 2)).show()

+----------+-----+--------+
|     fruit|color|quantity|
+----------+-----+--------+
|     apple|  red|       3|
|strawberry|  red|       7|
+----------+-----+--------+

+----------+------+--------+------------------------------+
|     fruit| color|quantity|quantity squared for no reason|
+----------+------+--------+------------------------------+
|     apple|   red|       3|                           9.0|
|    banana|yellow|       5|                          25.0|
|strawberry|   red|       7|                          49.0|
+----------+------+--------+------------------------------+



Question 2

In [22]:
print(df.count(), 'rows')
df.show()

3 rows
+----------+------+--------+
|     fruit| color|quantity|
+----------+------+--------+
|     apple|   red|       3|
|    banana|yellow|       5|
|strawberry|   red|       7|
+----------+------+--------+



Question 3

In [23]:
from pyspark.sql.functions import sum, avg

df.select(avg(df.quantity), sum(df.quantity)).show()

+-------------+-------------+
|avg(quantity)|sum(quantity)|
+-------------+-------------+
|          5.0|           15|
+-------------+-------------+



Question 4

In [24]:
file_path = 'data/foo.csv'
df.write.csv(file_path, header=True, mode='overwrite') # 'overwrite' if the file alr exists

Question 5 using DataFrames

In [25]:
from pyspark.sql.functions import udf, sum

file_path = 'data/shakespeare.txt'
df = spark.read.text(file_path) 

@udf
def word_count(string):
    return len(string.split())

df.select(sum(word_count(df.value)).alias("word count")).show() # usage

+----------+
|word count|
+----------+
|     256.0|
+----------+



Question 5 using RDDs

In [39]:
rdd = sc.textFile(file_path)

In [43]:
# count of individual words
word_frequencies = rdd.flatMap(lambda line: line.split()) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda x, y: x + y)

# displaying 5 most frequent words
word_frequencies.top(5, key=lambda t: t[1])

[('of', 14), ('the', 14), ('to', 8), ('and', 7), ('To', 5)]

In [42]:
# count of all words
rdd.flatMap(lambda line: line.split()).count()

256