In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain (or reuse) the Spark session for this notebook, running with a local[*] master.
spark = (
    SparkSession.builder
    .appName("wordcountdf")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

# Build a tiny single-column test DataFrame from plain Python collections:
# every word is wrapped in a 1-tuple so that it becomes one row.
wordsDF = spark.createDataFrame(
    [(word,) for word in ('look', 'spark', 'tutorial', 'spark', 'look', 'python')],
    schema=['word'],
)

# Inspect the rows, the Python type of the object and its schema.
wordsDF.show()
print(type(wordsDF))
wordsDF.printSchema()

+--------+
|    word|
+--------+
|    look|
|   spark|
|tutorial|
|   spark|
|    look|
|  python|
+--------+

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- word: string (nullable = true)



In [3]:
# This uses lazy evaluation: results are not computed right away – Spark remembers the set of transformations applied to the base DataFrame. Think of this as a recipe for creating the result.
#     Spark Actions like show(), collect() or count() then cause Spark to execute the recipe to transform the source. It is the mechanism for getting results out of Spark.
# Length of each word
# You can create a new DataFrame from our base DF wordsDF by calling the select DataFrame function and pass in the appropriate recipe: we can use the SQL length function to find the number of characters in each word.
# The length function is found in the pyspark.sql.functions module.
from pyspark.sql.functions import length
# Transformation: describes a one-column DataFrame holding the character count of each word.
wordsLengthsDF = wordsDF.select(
    length(wordsDF['word']).alias('lengths')
)
# Action: prints the resulting rows.
wordsLengthsDF.show()


+-------+
|lengths|
+-------+
|      4|
|      5|
|      8|
|      5|
|      4|
|      6|
+-------+



In [6]:
# Now, let's count the number of times a particular word appears in the 'word' column. There are multiple ways to perform the counting, but some are much less efficient than others.
# A naive approach would be to call collect on all of the elements and count them in the driver program. While this approach could work for small datasets, we want an approach that will work for any size dataset including terabyte- or petabyte-sized datasets. In addition, performing all of the work in the driver program is slower than performing it in parallel in the workers. For these reasons, we will use data parallel operations.

#
# Using groupBy and count
# Using DataFrames, we can perform aggregations by grouping the data using the groupBy function on the DataFrame. Using groupBy returns a GroupedData object and we can use the functions available for GroupedData to aggregate the groups. For example, we can call avg or count on a GroupedData object to obtain the average of the values in the groups or the number of occurrences in the groups, respectively.
# To find the counts of words, we group by the words and then use the count function to find the number of times that words occur.


# Group identical words together, then count the rows that fall into each group.
wordCountsDF = (
    wordsDF
    .groupBy('word')
    .count()
)
# Print one row per distinct word together with its count.
wordCountsDF.show()

+--------+-----+
|    word|count|
+--------+-----+
|tutorial|    1|
|   spark|    2|
|    look|    2|
|  python|    1|
+--------+-----+



In [8]:
# Unique words
# Calculate the number of unique words in wordsDF.
# wordCountsDF is the result of grouping by 'word', so it holds one row per
# distinct word; counting its rows gives the number of unique words.
uniqueWordsCount = wordCountsDF.count()
print(uniqueWordsCount)

4


In [9]:
# Mean number of occurrences per distinct word.
# groupBy() without keys aggregates over the whole DataFrame, producing a single
# row whose only column is the average of 'count'; [0] extracts that value.
averageCount = (
    wordCountsDF
    .groupBy()
    .avg('count')
    .first()[0]
)
print(averageCount)

1.5


In [10]:
def wordCount(wordListDF):
    """Count how often each word occurs in a DataFrame of words.

    Args:
        wordListDF (DataFrame of str): A DataFrame with a single string column named 'word'.

    Returns:
        DataFrame of (str, int): One row per distinct word, with the columns 'word' and 'count'.
    """
    # Bucket the rows by the value of the 'word' column ...
    groupedByWord = wordListDF.groupBy('word')
    # ... and reduce every bucket to the number of rows it contains.
    countsPerWord = groupedByWord.count()
    return countsPerWord


# Sanity check: running the helper on the words DataFrame should reproduce
# the counts obtained earlier with groupBy('word').count().
wordCount(wordListDF=wordsDF).show()

+--------+-----+
|    word|count|
+--------+-----+
|tutorial|    1|
|   spark|    2|
|    look|    2|
|  python|    1|
+--------+-----+



+------------------------------------------+
|sentence                                  |
+------------------------------------------+
|Hi, you                                   |
| Look! [No under_score]?!                 |
| *      Remove punctuation then spaces  * |
+------------------------------------------+

+------------------------------------------+
|sentence                                  |
+------------------------------------------+
|Hi, you                                   |
| Look! [No under_score]?!                 |
| *      Remove punctuation then spaces  * |
+------------------------------------------+



In [86]:
import os

from pyspark.sql.functions import col
# Load a text file
# For the next part, we use "Pride and Prejudice" by Jane Austen from Project Gutenberg.
# spark.read.text() turns the text file into a DataFrame whose single column is named
# 'value'; the select() below renames that column to 'sentence'. No punctuation removal
# or lower-casing is applied at this stage, so the text is kept exactly as it is in the file.
#
# The location of the file can be overridden with the SPARK_JOURNEY_BOOK_PATH environment
# variable, so the notebook is not tied to a single machine; when the variable is not set,
# the original path is used.
fileName = os.environ.get(
    "SPARK_JOURNEY_BOOK_PATH",
    "/Users/tech/codes/SparkJourney/data/PrideandPrejudice.txt",
)

bookDF = spark.read.text(fileName).select(col("value").alias("sentence"))
# Since the file is large, print only the first 15 lines; truncate=False keeps them unabridged.
bookDF.show(15, truncate=False)

+------------------------------------------------------------------------+
|sentence                                                                |
+------------------------------------------------------------------------+
|The Project Gutenberg eBook of Pride and Prejudice, by Jane Austen      |
|                                                                        |
|This eBook is for the use of anyone anywhere in the United States and   |
|most other parts of the world at no cost and with almost no restrictions|
|whatsoever. You may copy it, give it away or re-use it under the terms  |
|of the Project Gutenberg License included with this eBook or online at  |
|www.gutenberg.org. If you are not located in the United States, you     |
|will have to check the laws of the country where you are located before |
|using this eBook.                                                       |
|                                                                        |
|Title: Pride and Prejudi

In [87]:
from pyspark.sql.functions import split, explode
# Words from lines
# Before we can use the wordCount() function, we have to address two issues with the format of the DataFrame:
#     The first issue is that we need to split each line by its spaces.
# The second issue is we need to filter out empty lines or words.
# We apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row. To accomplish these two tasks you can use the split and explode functions found in pyspark.sql.functions.
# Once we have a DataFrame with one word per row we can apply the DataFrame operation where to remove the rows that contain ''.
#


# Step 1: turn every line into an array of tokens by splitting it on single spaces.
bookWordsSplitDF = bookDF.select(
    split(bookDF['sentence'], ' ').alias('split')
)
bookWordsSplitDF.show(15)

# Step 2: flatten the arrays so that every token ends up in its own row.
bookWordsSingleDF = bookWordsSplitDF.select(
    explode(bookWordsSplitDF['split']).alias('word')
)

# Step 3: discard the empty-string tokens (blank lines, for example, produce them).
bookWordsDF = bookWordsSingleDF.filter(bookWordsSingleDF['word'] != '')
bookWordsDF.show()

# Total number of non-empty tokens in the book.
bookWordsDFCount = bookWordsDF.count()
print(bookWordsDFCount)

+--------------------+
|               split|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[most, other, par...|
|[whatsoever., You...|
|[of, the, Project...|
|[www.gutenberg.or...|
|[will, have, to, ...|
|[using, this, eBo...|
|                  []|
|[Title:, Pride, a...|
|                  []|
|[Author:, Jane, A...|
|                  []|
|[Release, Date:, ...|
+--------------------+
only showing top 15 rows

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     eBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|      This|
|     eBook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
|  anywhere|
+----------+
only showing top 20 rows

124749
