In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length, element_at, size, split, udf, explode, desc
from pyspark.sql.types import StringType
import re

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .getOrCreate()

eiffel_df = spark.read.json('file:///home/ec2-user/eiffel-tower-reviews.json').select('text')
eiffel_df.show()

Setting default log level to "

22/12/08 22:51:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+--------------------+
|                text|
+--------------------+
|This is the most ...|
|My significant ot...|
|We had a tour to ...|
|Visited with my w...|
|We went in the ni...|
|Dont hesitate and...|
|I enjoyed the tow...|
|Read through the ...|
|This by far was o...|
|Something you hav...|
|The views are bea...|
|Worth spending a ...|
|Took the tour to ...|
|A fantastic fusio...|
|Whatever you do i...|
|Not to miss..beau...|
|We visited in the...|
|Go for sunset and...|
|We booked weeks a...|
|Eiffel Tower is j...|
+--------------------+
only showing top 20 rows



In [2]:
from pyspark.sql.functions import col, explode, split, desc, lower
from pyspark.ml.feature import StopWordsRemover


stop = StopWordsRemover(inputCol='words', outputCol='most_used_words')
stop.loadDefaultStopWords('english')

eiffel_df2 = eiffel_df.select(split(lower(eiffel_df.text),' ').alias('words')).withColumnRenamed('words', 'words').na.drop()

#Remove stopwords
eiffel_df3 = stop.transform(eiffel_df2)
eiffel_df4 = eiffel_df3.drop('words')

eiffel_df5 = eiffel_df4.select(explode(eiffel_df4.most_used_words)).withColumnRenamed('col', 'most_used_words')
eiffel_df5 = eiffel_df5.filter(eiffel_df5.most_used_words != '')
eiffel_df6 = eiffel_df5.groupBy(eiffel_df5.most_used_words).count().orderBy(desc('count'))
eiffel_df6.show()


+---------------+-----+
|most_used_words|count|
+---------------+-----+
|          tower| 4184|
|         eiffel| 3246|
|             go| 2594|
|          paris| 1997|
|            see| 1973|
|            top| 1876|
|            get| 1833|
|           time| 1661|
|           view| 1581|
|          visit| 1322|
|          views| 1239|
|        tickets| 1222|
|          worth| 1164|
|           must| 1147|
|           went| 1120|
|          night| 1107|
|         tower.| 1039|
|            day| 1038|
|          great| 1038|
|            one| 1024|
+---------------+-----+
only showing top 20 rows



In [3]:
spark.stop()