In [1]:
import os
# Point this kernel's process environment at the Java, Spark and Python
# installations to use. The paths are specific to the machine the notebook
# was written on and must be adjusted when it is run elsewhere.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-1.8.0-openjdk-amd64",
        "SPARK_HOME": "/home/hadoop/work/spark-3.2.0-bin-hadoop2.7",
        # Same interpreter for the workers and the driver.
        "PYSPARK_PYTHON": "/usr/bin/python3.8",
        "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3.8",
    }
)

In [2]:
# findspark is a third-party helper package. If it is not installed yet,
# uncomment the line below and run it once (inside a notebook, `%pip` is
# preferable to `!pip` because it targets the running kernel's environment).
#!pip install -q findspark
import findspark
# Must run before the first `import pyspark` in the cells below.
# NOTE(review): per the findspark docs, init() locates the Spark installation
# (via SPARK_HOME when it is set) and adds PySpark to sys.path - confirm
# against the installed findspark version.
findspark.init()

In [3]:
from pyspark.sql import SparkSession

# Build the SparkSession that the following cells use, or reuse one that is
# already running: "local" master, fixed application name, and the Spark UI
# port set to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count RDD")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

21/10/31 11:09:29 WARN util.Utils: Your hostname, hadoop-Lenovo-G50-80 resolves to a loopback address: 127.0.1.1; using 192.168.189.28 instead (on interface wlp3s0)
21/10/31 11:09:29 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/31 11:09:30 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import  split,explode,col,count

sc = spark.sparkContext
# Load the input text file as a DataFrame; its text column is named `value`.
df = spark.read.text("word_cnt.txt")
print(type(df))
df.show(5, truncate=False)

# Word count with the DataFrame API:
#   1. split each line on single spaces into an array column `words`,
#   2. explode that array so every word gets its own row in `word`,
#   3. drop the helper columns, group by word and count,
#   4. order by descending count.
# The transformed DataFrame is bound to `dfwords` first and displayed in a
# separate statement: show() only prints and returns None, so chaining it
# onto the assignment would leave `dfwords` set to None.
dfwords = (
    df.withColumn('words', split(col('value'), ' '))
    .withColumn('word', explode(col('words')))
    .drop('value', 'words')
    .groupby('word')
    .agg(count('word').alias('count'))
    .orderBy('count', ascending=False)
)
dfwords.show(10, truncate=False)

print("Word Count DataFrame Spark SQL")
# Expose the raw lines to Spark SQL under the view name `lines`.
# createOrReplaceTempView() registers the view as a side effect and returns
# None, so its result is deliberately not bound to a name.
df.createOrReplaceTempView('lines')

# Same word count expressed in SQL: the inner query explodes each line into
# words, the outer query counts per word and orders by descending count.
spark.sql("""select word, count(word) count from
 (select explode(split(value,' '))  word from lines) words group by word order by count desc""").show(10)


<class 'pyspark.sql.dataframe.DataFrame'>


                                                                                

+---------------------------------------------------------------------------------------------------------+
|value                                                                                                    |
+---------------------------------------------------------------------------------------------------------+
|it was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness|
+---------------------------------------------------------------------------------------------------------+

+-----------+-----+
|word       |count|
+-----------+-----+
|the        |4    |
|of         |4    |
|was        |4    |
|it         |4    |
|times      |2    |
|age        |2    |
|worst      |1    |
|wisdom     |1    |
|foolishness|1    |
|best       |1    |
+-----------+-----+

Word Count DataFrame Spark SQL
+-----------+-----+
|       word|count|
+-----------+-----+
|        the|    4|
|         of|    4|
|        was|    4|
|         it|    4|
|      

In [5]:
from pyspark.sql.functions import split, col, explode, count

from pyspark import SparkConf, SparkContext

# Configuration carrying the application name for the RDD examples.
conf = SparkConf()
conf.setAppName(" Word Count RDD ")
# NOTE(review): a SparkSession was already created in an earlier cell, so
# getOrCreate() presumably hands back that existing context and this conf
# (including the app name) may not take effect - confirm.
sc = SparkContext.getOrCreate(conf=conf)

# Read the same input file through the RDD API.
text = sc.textFile('word_cnt.txt')
print(type(text))
# collect() brings every element back to the driver; fine for this tiny file.
lines_collected = text.collect()
print(lines_collected)

<class 'pyspark.rdd.RDD'>
['it was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness']


In [6]:
# Step 1: split every line on single spaces; flatMap merges the per-line
# word lists into one flat RDD of words.
rdd2 = text.flatMap(lambda line: line.split(' '))
words_collected = rdd2.collect()
print(words_collected)

['it', 'was', 'the', 'best', 'of', 'times', 'it', 'was', 'the', 'worst', 'of', 'times', 'it', 'was', 'the', 'age', 'of', 'wisdom', 'it', 'was', 'the', 'age', 'of', 'foolishness']


In [7]:
# Step 2: pair every word with an initial count of 1.
rdd3 = rdd2.map(lambda word: (word, 1))
pairs_collected = rdd3.collect()
print(pairs_collected)

[('it', 1), ('was', 1), ('the', 1), ('best', 1), ('of', 1), ('times', 1), ('it', 1), ('was', 1), ('the', 1), ('worst', 1), ('of', 1), ('times', 1), ('it', 1), ('was', 1), ('the', 1), ('age', 1), ('of', 1), ('wisdom', 1), ('it', 1), ('was', 1), ('the', 1), ('age', 1), ('of', 1), ('foolishness', 1)]


In [8]:
# Step 2 (repeated so this cell can run directly after rdd2 exists):
# pair every word with an initial count of 1.
rdd3 = rdd2.map(lambda word: (word, 1))
pairs_collected = rdd3.collect()
print(pairs_collected)

# Step 3: add up the 1s of identical words to get the count per word.
rdd4 = rdd3.reduceByKey(lambda left, right: left + right)
totals_collected = rdd4.collect()
print(totals_collected)

# The same three steps (split, pair, reduce) written as a single chain.
print("Word Count in 1 line ")

counts = (
    text
    .flatMap(lambda row: row.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

counts_collected = counts.collect()
print(counts_collected)

[('it', 1), ('was', 1), ('the', 1), ('best', 1), ('of', 1), ('times', 1), ('it', 1), ('was', 1), ('the', 1), ('worst', 1), ('of', 1), ('times', 1), ('it', 1), ('was', 1), ('the', 1), ('age', 1), ('of', 1), ('wisdom', 1), ('it', 1), ('was', 1), ('the', 1), ('age', 1), ('of', 1), ('foolishness', 1)]
[('it', 4), ('was', 4), ('the', 4), ('best', 1), ('of', 4), ('times', 2), ('worst', 1), ('age', 2), ('wisdom', 1), ('foolishness', 1)]
Word Count in 1 line 
[('it', 4), ('was', 4), ('the', 4), ('best', 1), ('of', 4), ('times', 2), ('worst', 1), ('age', 2), ('wisdom', 1), ('foolishness', 1)]
