In [0]:
1. Count the number of reviews per year

In [1]:


%pyspark
# Imported explicitly so this paragraph runs on a fresh interpreter instead of
# relying on names left behind by some earlier paragraph.
from pyspark.sql.functions import col, date_format

# Count how many reviews were written in each calendar year of rev_date.
df = spark.table("review")

# date_format(..., "yyyy") yields the year as a string, which is what the
# "Year" column is sorted on below.
result = (
    df.groupBy(date_format(col("rev_date"), "yyyy").alias("Year"))
    .count()
    .withColumnRenamed("count", "Review Count")
    .orderBy("Year")
)

z.show(result)



In [2]:
2. Count the number of useful, funny, and cool reviews

In [3]:


%pyspark
# Use the functions module through an alias: without an import, a bare sum()
# is Python's builtin (which raises TypeError on a column name), and importing
# `sum` by name would shadow that builtin for every later paragraph.
from pyspark.sql import functions as F

df = spark.table("review")

# Totals of the three vote columns over the whole review table.
# NOTE(review): this adds up the vote values; if the task wants the number of
# reviews that received at least one vote of each kind, count rows where the
# column is > 0 instead -- confirm the intended meaning.
result = df.select(
    F.sum("rev_useful").alias("Total Useful"),
    F.sum("rev_funny").alias("Total Funny"),
    F.sum("rev_cool").alias("Total Cool"),
)

z.show(result)



In [4]:
3. Rank users by the total number of reviews each year

In [5]:


%pyspark
# Imported explicitly so this paragraph runs on a fresh interpreter.
from pyspark.sql.functions import col, desc, year

review_df = spark.table("review")
users_df = spark.table("users")

# Count per (user id, year) BEFORE joining. Grouping by user_name would merge
# different users who share a display name into a single row, and aggregating
# first also means only the small per-user summary is joined to the users table.
yearly_counts = (
    review_df.groupBy(col("rev_user_id"), year(col("rev_date")).alias("Year"))
    .count()
    .withColumnRenamed("count", "Review Count")
)

# Inner join: reviews whose author is missing from `users` are dropped.
# Two rows may show the same "Review User" in one year; they are distinct
# users that happen to share a name.
# Assumes users.user_id is unique per user -- TODO confirm.
result = (
    yearly_counts.join(users_df, yearly_counts["rev_user_id"] == users_df["user_id"])
    .select(users_df["user_name"].alias("Review User"), "Year", "Review Count")
    .orderBy("Year", desc("Review Count"))
)

z.show(result)



In [6]:
4. Extract the Top 20 most common words from all reviews

In [7]:


%pyspark
from pyspark.sql.functions import split, explode, lower, col

df = spark.table("review")

# Top 20 most frequent lower-cased tokens across all review texts.
# Tokens are the pieces between runs of non-word characters. split() keeps an
# empty string when the text starts or ends with such a run (e.g. a trailing
# "."), so empty tokens are dropped before counting; otherwise "" shows up as
# one of the most common "words".
result = (
    df.select("rev_text")
    .withColumn("word", explode(split(lower(col("rev_text")), "\\W+")))
    .filter(col("word") != "")
    .groupBy("word")
    .count()
    .withColumnRenamed("word", "Word")
    .withColumnRenamed("count", "Frequency")
    .orderBy("Frequency", ascending=False)
    .limit(20)
)

z.show(result)



In [8]:
5. Extract the Top 10 words from positive reviews (rating > 3)

In [9]:


%pyspark
from pyspark.sql.functions import split, explode, lower, col

df = spark.table("review")

# Top 10 most frequent lower-cased tokens in reviews with rev_stars > 3.
# split() keeps an empty string when the text starts or ends with a run of
# non-word characters (e.g. a trailing "!"), so empty tokens are dropped before
# counting; otherwise "" is counted as a word.
result = (
    df.filter(col("rev_stars") > 3)
    .withColumn("word", explode(split(lower(col("rev_text")), "\\W+")))
    .filter(col("word") != "")
    .groupBy("word")
    .count()
    .withColumnRenamed("word", "Word")
    .withColumnRenamed("count", "Frequency")
    .orderBy("Frequency", ascending=False)
    .limit(10)
)

z.show(result)



In [10]:
6. Extract the Top 10 words from negative reviews (rating ≤ 3)

In [11]:


%pyspark
from pyspark.sql.functions import split, explode, lower, col

df = spark.table("review")

# Top 10 most frequent lower-cased tokens in reviews with rev_stars <= 3.
# split() keeps an empty string when the text starts or ends with a run of
# non-word characters (e.g. a trailing "."), so empty tokens are dropped before
# counting; otherwise "" is counted as a word.
result = (
    df.filter(col("rev_stars") <= 3)
    .withColumn("word", explode(split(lower(col("rev_text")), "\\W+")))
    .filter(col("word") != "")
    .groupBy("word")
    .count()
    .withColumnRenamed("word", "Word")
    .withColumnRenamed("count", "Frequency")
    .orderBy("Frequency", ascending=False)
    .limit(10)
)

z.show(result)



In [12]:
7. Perform word cloud analysis by filtering words based on part-of-speech tagging

In [13]:


%pyspark
import nltk
import os
from pyspark.sql.functions import udf, explode, col, desc
from pyspark.sql.types import ArrayType, StringType
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Put the per-user NLTK data directory at the front of NLTK's search path.
nltk_data_dir = os.path.expanduser("~/nltk_data")
# Drop an existing entry first so that re-running this paragraph does not keep
# prepending duplicates to nltk.data.path.
if nltk_data_dir in nltk.data.path:
    nltk.data.path.remove(nltk_data_dir)
nltk.data.path.insert(0, nltk_data_dir)

# English stop words as a set for fast membership tests inside the UDF.
stop_words = set(stopwords.words('english'))

@udf(returnType=ArrayType(StringType()))
def extract_filtered_words(text):
    """Tokenize a review text and keep only the informative words.

    Args:
        text: The review text, or None when the column value is NULL.

    Returns:
        A list of lower-cased tokens that are purely alphabetic, longer than
        two characters and not in ``stop_words``. Returns an empty list for a
        NULL text.
    """
    # NOTE(review): the task heading mentions part-of-speech filtering, but no
    # POS tagging happens here -- only stop-word/length/alphabetic filtering.
    # Confirm whether a POS filter is still required.
    if text is None:
        # A NULL review would otherwise raise AttributeError on .lower() and
        # fail the whole Spark job.
        return []
    tokens = word_tokenize(text.lower())
    return [
        token
        for token in tokens
        if token.isalpha() and len(token) > 2 and token not in stop_words
    ]

# Source table holding the review texts.
reviews_df = spark.table("review")

# Attach the filtered token list produced by the UDF to every review.
words_df = reviews_df.withColumn(
    "filtered_words",
    extract_filtered_words(col("rev_text")),
)

# One output row per token.
exploded_df = words_df.select(explode("filtered_words").alias("word"))

# Occurrences of each token, most frequent first.
word_cloud_df = (
    exploded_df
    .groupBy("word")
    .count()
    .withColumnRenamed("count", "frequency")
    .orderBy(col("frequency").desc())
)

# Display-friendly column names for the word cloud table.
result = (
    word_cloud_df
    .withColumnRenamed("word", "Word")
    .withColumnRenamed("frequency", "Frequency")
)

z.show(result)



In [14]:
8. Construct a word association graph

In [15]:


%pyspark
from pyspark.sql.functions import array_remove, explode, split, lower, col, expr, size

df = spark.table("review")

# Lower-cased, whitespace-separated tokens of each review. Leading or trailing
# whitespace makes split() emit empty strings; remove them so "" never becomes
# a node of the association graph.
words_df = df.withColumn(
    "words",
    array_remove(split(lower(col("rev_text")), "\\s+"), ""),
)

# Adjacent (word1, word2) pairs. A review needs at least two tokens to form a
# pair: for shorter (or NULL) arrays, sequence(0, size(words) - 2) would count
# DOWN from 0 and index outside the array, producing pairs with NULL members.
word_pairs = words_df.filter(size(col("words")) >= 2).select(
    explode(expr("transform(sequence(0, size(words) - 2), i -> struct(words[i] as word1, words[i+1] as word2))")).alias("pair")
)

# Count how often each ordered pair occurs, most frequent first.
result = (
    word_pairs.select(
        col("pair.word1").alias("Word 1"),
        col("pair.word2").alias("Word 2"),
    )
    .groupBy("Word 1", "Word 2")
    .count()
    .withColumnRenamed("count", "Count")
    .orderBy("Count", ascending=False)
)

z.show(result)

