In [1]:
# Importing required packages
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

In [2]:
# Build the notebook's SparkSession (an already-running one is reused by getOrCreate);
# the application is registered under the name 'DFRDD'.
spark = (
    SparkSession.builder
    .appName('DFRDD')
    .getOrCreate()
)

### Word Count example using the 'PySpark_Sample.txt' file
- First, the job is done using an RDD, created as `rdd`.
- Second, the same job is executed after loading the sample data as a DataFrame.
- In both cases, the top 10 words with the most occurrences are shown.

In [3]:
# Create an RDD from the 'PySpark_Sample.txt' file.
# NOTE(review): the path is relative — presumably resolved against the working
# directory / Spark's default filesystem; confirm before running on a cluster.
rdd = spark.sparkContext.textFile('PySpark_Sample.txt')

In [6]:
# Word count on the RDD, stored as result_rdd:
#   1. split every line on a single space,
#   2. pair each token with a count of 1,
#   3. add up the counts per token,
#   4. order the (token, count) pairs by count, largest first.
result_rdd = (
    rdd.flatMap(lambda text: text.split(" "))
       .map(lambda token: (token, 1))
       .reduceByKey(lambda left, right: left + right)
       .sortBy(lambda pair: pair[1], ascending=False)
)

In [7]:
# Get the 10 most frequent words from result_rdd (it is already sorted by count,
# descending). Words with equal counts may come back in a different relative order
# than in the DataFrame version of this job.
result_rdd.take(10)

[('the', 34),
 ('and', 22),
 ('of', 18),
 ('a', 16),
 ('in', 11),
 ('was', 11),
 ('with', 11),
 ('to', 11),
 ('The', 8),
 ('that', 8)]

In [8]:
# Create a DataFrame from the 'PySpark_Sample.txt' file; the text of the file is
# exposed through the `value` column that the word-count expression reads.
df = spark.read.text('PySpark_Sample.txt')

In [9]:
# Same word count expressed with the DataFrame API: explode each line into one row
# per space-separated word, count the rows per word, and order by that count,
# largest first.
result_df = (
    df.selectExpr("explode(split(value, ' ')) as word")
      .groupBy("word")
      .count()
      .orderBy(desc("count"))
)

In [10]:
# Get the 10 most frequent words from result_df (already ordered by count,
# descending); the result is displayed as a list of Row(word, count) objects.
result_df.take(10)

[Row(word='the', count=34),
 Row(word='and', count=22),
 Row(word='of', count=18),
 Row(word='a', count=16),
 Row(word='was', count=11),
 Row(word='in', count=11),
 Row(word='with', count=11),
 Row(word='to', count=11),
 Row(word='The', count=8),
 Row(word='that', count=8)]

In [11]:
# Stop the SparkSession now that both word counts have been displayed.
# NOTE(review): any cell that needs Spark after this point presumably has to
# create a new session first.
spark.stop()