# Word count

In [49]:
import findspark
findspark.init()


import pyspark
import pyspark.sql.functions as f

from pyspark.sql           import SparkSession
from pyspark.sql.functions import col, explode, udf, upper, lower
from pyspark.sql.types     import Row, StringType, ShortType, IntegerType, LongType, ArrayType



In [96]:
# Reuse the active Spark session if there is one, otherwise start a new one
# registered under the application name 'App_word_count'.
spark = SparkSession.builder.appName('App_word_count').getOrCreate()


In [97]:
# Location of the plain-text lyrics file whose words are counted below.
file_path = "/home/art/data/songs/eye_of_the_tiger.txt"

In [98]:
# Load the file with the text source: one row per line, in a single string
# column named 'value' (see the output of the next cell).
df = spark.read.format('text').load(file_path)

In [99]:
# Peek at the first three raw lines.
df.show(n=3)

+--------------------+
|               value|
+--------------------+
|Rising up, back o...|
|Did my time, took...|
|Went the distance...|
+--------------------+
only showing top 3 rows



In [100]:
# Give the single text column a meaningful name. withColumnRenamed is a no-op
# when the source column is absent, so re-running this cell is harmless.
df = df.withColumnRenamed(existing='value', new='line')

In [101]:
# Confirm that the column is now called 'line'.
df.show(n=3)

+--------------------+
|                line|
+--------------------+
|Rising up, back o...|
|Did my time, took...|
|Went the distance...|
+--------------------+
only showing top 3 rows



### Clean data: cast to lower case and remove commas

In [102]:
def fun_clean( line ):
    """Normalize one line of text for word counting.

    Removes every comma and converts the remaining text to lower case.

    Args:
        line: Raw text of a single line, or None.

    Returns:
        The cleaned string, or None when `line` is None.
    """
    # This function is wrapped as a Spark UDF below; Spark hands SQL NULL to
    # Python UDFs as None, so propagate it instead of raising AttributeError.
    if line is None:
        return None
    return line.replace( ',', '' ).lower()

# Expose fun_clean to Spark as a column-level UDF that returns a string.
udf_clean = udf(f=fun_clean, returnType=StringType())


In [103]:


# Apply the cleaning UDF to every line; the result keeps the column name 'line'.
df_clean = df.select(udf_clean(col('line')).alias('line'))



In [104]:
# Inspect the first five cleaned lines.
df_clean.show(n=5)

+--------------------+
|                line|
+--------------------+
|rising up back on...|
|did my time took ...|
|went the distance...|
|just a man and hi...|
|so many times it ...|
+--------------------+
only showing top 5 rows



### Split each line into a list of words

In [105]:
# Turn each cleaned line into an array of words.
# The split pattern is a regular expression: use a run of whitespace rather
# than a single literal space, so repeated spaces or tabs between words do not
# produce empty-string tokens.
df_lists = (df_clean
            .select( f.split( 'line', r'\s+' ).alias( 'words' ) )
           )


In [106]:
# Show the word arrays of the first three lines.
df_lists.show(n=3)

+--------------------+
|               words|
+--------------------+
|[rising, up, back...|
|[did, my, time, t...|
|[went, the, dista...|
+--------------------+
only showing top 3 rows



In [109]:
# Flatten the arrays to one row per word.
# Blank lines and leading or repeated separators leave empty-string tokens
# after the split; drop them so that '' is not counted as a word.
df_words = (df_lists
            .select( f.explode( 'words' ).alias( 'word' ) )
            .where( col( 'word' ) != '' )
           )

In [110]:
# Show the first three individual words.
df_words.show(n=3)

+------+
|  word|
+------+
|rising|
|    up|
|  back|
+------+
only showing top 3 rows



In [114]:
# Count occurrences of each distinct word; the result has columns
# 'word' and 'count'.
df_count = df_words.groupBy(col('word')).count()

In [116]:
# Sample five (word, count) rows; the row order is not sorted at this point.
df_count.show(n=5)

+------+-----+
|  word|count|
+------+-----+
|  guts|    1|
|   did|    1|
|   got|    1|
|    us|    3|
|hungry|    1|
+------+-----+
only showing top 5 rows



In [117]:
# Rank words from most to least frequent.
df_top_words = df_count.orderBy(col('count').desc())

In [118]:
# Display the 20 most frequent words (20 is show()'s default row limit).
df_top_words.show(n=20)

+---------+-----+
|     word|count|
+---------+-----+
|      the|   48|
|       of|   17|
|       to|   11|
|      eye|   10|
|    tiger|   10|
|      and|    8|
|     it's|    6|
|       up|    5|
|   rising|    5|
|      his|    5|
|     with|    4|
|       in|    4|
|    fight|    4|
| watching|    3|
|    night|    3|
| survivor|    3|
|       on|    3|
|challenge|    3|
|       my|    3|
|  survive|    3|
+---------+-----+
only showing top 20 rows



### Links

A simple word count application

Databricks

https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/3328674740105987/4033840715400609/6441317451288404/latest.html
    