In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

In [2]:
# Start (or reuse an already-running) Spark session for this demo; every
# later cell reaches Spark through this `spark` handle.
spark = (
    SparkSession.builder
    .appName("DataFrame-Demo")
    .getOrCreate()
)

# Using RDDs

In [3]:
# Read the input file as an RDD of text lines.
rdd = spark.sparkContext.textFile("./data/data.txt")

# Word count with the RDD API: split each line on single spaces, pair every
# token with 1, add up the ones per token, then sort by the summed count from
# highest to lowest. Tokens with equal counts come back in no guaranteed order.
result_rdd = (
    rdd.flatMap(lambda text: text.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
)

In [4]:
# Pull the first ten (word, count) tuples of the sorted RDD to the driver,
# i.e. the ten most frequent words.
result_rdd.take(10)

[('the', 12),
 ('of', 7),
 ('a', 7),
 ('in', 5),
 ('distributed', 5),
 ('Spark', 4),
 ('is', 3),
 ('as', 3),
 ('API', 3),
 ('on', 3)]

# Using DataFrames

In [5]:
# Load the same file as a DataFrame; the text of each line is read from the
# `value` column below.
df = spark.read.text("./data/data.txt")

# Word count with the DataFrame API: split every line on single spaces,
# explode the resulting array into one row per word, count the rows per word,
# and order by that count from highest to lowest. Words with equal counts come
# back in no guaranteed order, so ties may be listed differently than in the
# RDD version.
result_df = (
    df.selectExpr("explode(split(value, ' ')) as word")
    .groupBy("word")
    .count()
    .orderBy(desc("count"))
)

In [6]:
# Pull the first ten Row(word, count) records of the ordered DataFrame to the
# driver, i.e. the ten most frequent words.
result_df.take(10)

[Row(word='the', count=12),
 Row(word='of', count=7),
 Row(word='a', count=7),
 Row(word='in', count=5),
 Row(word='distributed', count=5),
 Row(word='Spark', count=4),
 Row(word='API', count=3),
 Row(word='RDD', count=3),
 Row(word='is', count=3),
 Row(word='on', count=3)]

In [7]:
# Shut down the Spark session now that the demo is finished.
spark.stop()