In [0]:
1. Identify the 20 most common merchants in the U.S.

In [1]:


%pyspark
# Rank merchant names by how many rows of the `business` table carry them
# and display the 20 most frequent ones.
df = spark.table("business")

# groupBy().count() yields the columns [name, count]; each is renamed once
# for display.
# Sorting on the count alone leaves rows with equal counts in an unspecified
# order, so the rows surviving limit(20) could change between runs. The
# secondary ascending key on "Name" makes the cut-off deterministic.
result = (
    df.groupBy("name")
    .count()
    .withColumnRenamed("name", "Name")
    .withColumnRenamed("count", "Number of Businesses")
    .orderBy(["Number of Businesses", "Name"], ascending=[False, True])
    .limit(20)
)

z.show(result)



In [2]:
2. Identify the top 10 cities with the most merchants in the U.S.

In [3]:


%pyspark
# Count the rows of the `business` table per city and show the ten largest.
df = spark.table("business")

# groupBy().count() produces exactly two columns, [city, count]; toDF()
# assigns both display names positionally in one step.
result = (
    df.groupBy("city")
    .count()
    .toDF("City", "Number of Businesses")
    .sort("Number of Businesses", ascending=False)
    .limit(10)
)

z.show(result)



In [4]:
3. Identify the top 5 states with the most merchants in the U.S.

In [5]:


%pyspark
# Count the rows of the `business` table per state and show the five largest.
df = spark.table("business")

# The aggregate has the columns [state, count]; both receive their display
# names positionally via toDF().
result = (
    df.groupBy("state")
    .count()
    .toDF("State", "Number of Businesses")
    .sort("Number of Businesses", ascending=False)
    .limit(5)
)

z.show(result)



In [6]:
4. Identify the 20 most common merchants in the U.S. and display their average ratings.

In [7]:


%pyspark
# For the 20 most frequent merchant names in the `business` table, display
# the mean of `stars` over all rows sharing that name.
business_df = spark.table("business")

# The dict form of agg() names its outputs "avg(stars)" and "count(name)".
# The count is only used for ranking and is dropped by the select() below.
# Ordering by the count alone leaves equal counts in an unspecified order,
# which makes the limit(20) cut-off unstable between runs; the ascending
# "name" key breaks those ties deterministically.
result = (
    business_df.groupBy("name")
    .agg({"stars": "avg", "name": "count"})
    .withColumnRenamed("avg(stars)", "Average Rating")
    .withColumnRenamed("count(name)", "name_count")
    .orderBy(["name_count", "name"], ascending=[False, True])
    .select("name", "Average Rating")
    .withColumnRenamed("name", "Business Name")
    .limit(20)
)

z.show(result)



In [8]:
5. Identify the top 10 cities with the highest average ratings.

In [9]:


%pyspark
# Rank cities by the mean `stars` value of their rows in the `business`
# table and display the ten highest.
business_df = spark.table("business")

# Several cities can share the same mean. With a single sort key their
# relative order is unspecified, so limit(10) could keep different cities on
# different runs. The ascending "city" key makes the selection reproducible.
# NOTE(review): the mean is taken over however many rows a city has, so a
# city with very few businesses can rank first — consider a minimum
# business-count threshold if that is not intended.
result = (
    business_df.groupBy("city")
    .agg({"stars": "avg"})
    .withColumnRenamed("avg(stars)", "Average Rating")
    .orderBy(["Average Rating", "city"], ascending=[False, True])
    .selectExpr("city as City", "`Average Rating`")
    .limit(10)
)

z.show(result)



In [10]:
6. Count the number of different categories.

In [11]:


%pyspark
# Count how many distinct category labels occur in `business.categories`.
business_df = spark.table("business")

# Rows with a NULL `categories` value are removed before splitting.
business_df_filtered = business_df.where(business_df["categories"].isNotNull())

# `categories` is split on commas; explode() emits one row per piece.
categories_exploded = business_df_filtered.selectExpr(
    "explode(split(categories, ',')) as category"
)

# Surrounding whitespace is trimmed before de-duplicating, so " Bars" and
# "Bars" count as one label. The dict-style count over "*" is named
# "count(1)" and is renamed for display.
result = (
    categories_exploded.selectExpr("trim(category) as category")
    .distinct()
    .agg({"*": "count"})
    .withColumnRenamed("count(1)", "The Number of Different Categories")
)

z.show(result)



In [12]:
7. Identify the top 10 most frequent categories and their count.

In [13]:


%pyspark
# Show the ten category labels that appear on the most `business` rows.
business_df = spark.table("business")

# Pipeline: drop NULL category strings, split on commas into one row per
# label, trim whitespace, then count rows per label. The aggregate's two
# columns [category, count] are given display names positionally by toDF().
result = (
    business_df.where(business_df["categories"].isNotNull())
    .selectExpr("explode(split(categories, ',')) as category")
    .selectExpr("trim(category) as category")
    .groupBy("category")
    .count()
    .toDF("Category", "Category Count")
    .sort("Category Count", ascending=False)
    .limit(10)
)

z.show(result)



In [14]:
8. Identify the top 20 merchants that received the most five-star reviews. 

In [15]:


%pyspark
# Show the 20 businesses with the most reviews whose `rev_stars` equals 5.
review_df = spark.table("review")
business_df = spark.table("business")

# Number of five-star reviews per business id.
five_star_counts = (
    review_df.filter(review_df["rev_stars"] == 5)
    .groupBy("rev_business_id")
    .count()
    .withColumnRenamed("count", "Five-Star Reviews")
)

# The join condition refers to the aggregated frame's own key column rather
# than to a column object taken from the pre-aggregation `review_df`.
# Businesses can tie on the review count; the ascending "Business Name" key
# keeps the limit(20) cut-off the same on every run.
result = (
    five_star_counts.join(
        business_df,
        five_star_counts["rev_business_id"] == business_df["business_id"],
    )
    .select("name", "Five-Star Reviews")
    .withColumnRenamed("name", "Business Name")
    .orderBy(["Five-Star Reviews", "Business Name"], ascending=[False, True])
    .limit(20)
)

z.show(result)



In [16]:
9. Count the number of restaurants of each type (Chinese, American, Mexican).

In [17]:


%pyspark
# Count `business` rows whose categories mention "Restaurants", bucketed by
# cuisine label.
business_df = spark.table("business")

# The CASE expression assigns exactly one label per row and the first
# matching WHEN wins: a row mentioning both "American" and "Mexican" is
# counted as 'American'. Rows matching none of the three fall into 'Other'.
# The aggregate's columns [cuisine_type, count] are renamed positionally.
result = (
    business_df.where(business_df["categories"].like("%Restaurants%"))
    .selectExpr(
        "CASE "
        "  WHEN categories LIKE '%American%' THEN 'American' "
        "  WHEN categories LIKE '%Mexican%' THEN 'Mexican' "
        "  WHEN categories LIKE '%Chinese%' THEN 'Chinese' "
        "  ELSE 'Other' "
        "END AS cuisine_type"
    )
    .groupBy("cuisine_type")
    .count()
    .toDF("Cuisine Type", "Restaurant Count")
    .sort("Restaurant Count", ascending=False)
)

z.show(result)



In [18]:
10. Count the number of reviews for each restaurant type (Chinese, American, Mexican).

In [19]:


%pyspark
# Count reviews per cuisine label for businesses whose categories mention
# "Restaurants".
business_df = spark.table("business")
review_df = spark.table("review")

# The inner join yields one row per (restaurant, review) pair, so counting
# rows per label counts reviews. The CASE expression is first-match-wins: a
# restaurant mentioning several cuisines contributes its reviews to the
# first matching label only; anything else is 'Other'.
result = (
    business_df.where(business_df["categories"].like("%Restaurants%"))
    .join(review_df, business_df["business_id"] == review_df["rev_business_id"])
    .selectExpr(
        "CASE "
        "  WHEN categories LIKE '%American%' THEN 'American' "
        "  WHEN categories LIKE '%Mexican%' THEN 'Mexican' "
        "  WHEN categories LIKE '%Chinese%' THEN 'Chinese' "
        "  ELSE 'Other' "
        "END AS cuisine_type"
    )
    .groupBy("cuisine_type")
    .count()
    .toDF("Cuisine Type", "Review Count")
    .sort("Review Count", ascending=False)
)

z.show(result)



In [20]:
11. Analyze the rating distribution for different restaurant types (Chinese, American, Mexican).

In [21]:


%pyspark
# Mean review star value per cuisine label for businesses whose categories
# mention "Restaurants".
business_df = spark.table("business")
review_df = spark.table("review")

# Each joined row is one review of one restaurant; `rev_stars` is averaged
# per label. The CASE expression is first-match-wins, so a restaurant
# mentioning several cuisines is attributed to the first matching label and
# everything else to 'Other'.
# NOTE(review): this reports a single mean per label, not a per-star
# breakdown — confirm that this is what "rating distribution" should mean.
result = (
    business_df.where(business_df["categories"].like("%Restaurants%"))
    .join(review_df, business_df["business_id"] == review_df["rev_business_id"])
    .selectExpr(
        (
            "CASE "
            "  WHEN categories LIKE '%American%' THEN 'American' "
            "  WHEN categories LIKE '%Mexican%' THEN 'Mexican' "
            "  WHEN categories LIKE '%Chinese%' THEN 'Chinese' "
            "  ELSE 'Other' "
            "END AS cuisine_type"
        ),
        "rev_stars",
    )
    .groupBy("cuisine_type")
    .avg("rev_stars")
    .toDF("Cuisine Type", "Average Rating")
    .sort("Average Rating", ascending=False)
)

z.show(result)

