In [0]:
1. Analyze the distribution of ratings (1-5 stars).

In [1]:

%pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

# getOrCreate() hands back the already-running session when there is one,
# so this is safe to execute more than once.
spark = SparkSession.builder.appName("ReviewAnalysis").getOrCreate()

review_df = spark.table("review")

# Count reviews per star value. The display headers are assigned directly in
# the grouping/aggregation step, and rows are listed by ascending star value.
result = (
    review_df
    .groupBy(review_df["rev_stars"].alias("Stars"))
    .agg(count("*").alias("Number of Ratings"))
    .orderBy("Stars")
)

z.show(result)



In [2]:
2. Analyze the weekly rating frequency (Monday to Sunday).

In [3]:


%pyspark
from pyspark.sql.functions import date_format, to_date, col, lit, create_map
from itertools import chain

df = spark.table("review")

# Sort rank for each weekday name. The analysis is specified as
# "Monday to Sunday", so Monday ranks first and Sunday last.
day_order = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7
}

# create_map expects a flat sequence of alternating keys and values
# ('Monday', 1, 'Tuesday', 2, ...), which chain(*items()) produces.
mapping_expr = create_map([lit(x) for x in chain(*day_order.items())])

# 'EEEE' formats a date as its full weekday name; those names are the keys
# looked up in mapping_expr to obtain the sort rank.
# NOTE(review): rows whose rev_date cannot be converted by to_date would
# presumably fall into a null weekday group — confirm against the data.
result = (
    df.groupBy(date_format(to_date(col("rev_date")), 'EEEE').alias("day_of_week"))
      .count()
      .withColumnRenamed("count", "Rating Count")
      .withColumnRenamed("day_of_week", "Day of Week")
      .withColumn("day_order", mapping_expr[col("Day of Week")])
      .orderBy("day_order")
      # The rank column is only needed for sorting; drop it from the output.
      .select("Day of Week", "Rating Count")
)

z.show(result)



In [4]:
3. Identify the top businesses with the most five-star ratings.

In [5]:


%pyspark
# Import the functions this paragraph uses so it does not raise NameError:
# `desc` is not imported anywhere else, and `count` would otherwise only be
# available if an earlier paragraph had been run first.
from pyspark.sql.functions import count, desc

review_df = spark.table("review")
business_df = spark.table("business")

# Keep only five-star reviews before joining so the join handles fewer rows.
five_star_reviews = review_df.filter(review_df["rev_stars"] == 5)

# Attach each five-star review to its business, count reviews per business
# name, and keep the 20 names with the highest counts.
# NOTE(review): grouping by name alone combines businesses that share a name
# (e.g. several locations of one chain) — confirm that is the intended result.
result = (
    five_star_reviews
    .join(business_df, five_star_reviews["rev_business_id"] == business_df["business_id"])
    # Bracket access avoids any clash between the column "name" and
    # DataFrame attributes.
    .groupBy(business_df["name"].alias("Business Name"))
    .agg(count("*").alias("Five Star Count"))
    .orderBy(desc("Five Star Count"))
    .limit(20)
)

z.show(result)

