In [0]:
1. Identify the top 5 merchants in each city based on rating frequency, average rating, and check-in frequency.

In [1]:


%pyspark
from pyspark.sql import HiveContext
from pyspark.sql.functions import avg, count, col, row_number, explode, split
from pyspark.sql.window import Window

# Goal: for every city, list the five businesses that rank highest when
# compared by (1) number of ratings, (2) average rating and (3) number of
# check-ins, in that order of priority.

# `sc` is not defined in this cell; it is expected to be the SparkContext
# provided by the notebook's %pyspark interpreter.
hc = HiveContext(sc)

business_df = hc.table('business')
checkin_df = hc.table('checkin')
review_df = hc.table("review")

# Left join so that businesses without a single review are kept.
business_with_reviews_df = business_df.join(
    review_df,
    business_df.business_id == review_df.rev_business_id,
    "left",
)

# count("rev_stars") only counts non-null values, so a business whose join
# produced no review row gets a Rating Frequency of 0 (its average is null).
review_stats_df = business_with_reviews_df.groupBy("business_id", "name", "city").agg(
    avg("rev_stars").alias("Average Rating"),
    count("rev_stars").alias("Rating Frequency"),
)

# `checkin_dates` is split on ", " and every resulting token is counted as one
# check-in (presumably one timestamp per token -- confirm against the table).
# Empty tokens are discarded: split("") yields [""], which would otherwise be
# counted as a check-in that never happened.
checkin_counts_df = (
    checkin_df
    .withColumn("checkin_date", explode(split(col("checkin_dates"), ", ")))
    .filter(col("checkin_date") != "")
    .groupBy("business_id")
    .agg(count("*").alias("checkin_count"))
)

# Attach the check-in counts; businesses with no check-in row get 0 instead of
# null so that they still sort sensibly.
business_stats_df = (
    review_stats_df
    .join(checkin_counts_df, "business_id", "left")
    .na.fill(0, subset=["checkin_count"])
    .withColumnRenamed("checkin_count", "Check-in Frequency")
)

# row_number() needs a total order to be reproducible: without the trailing
# business_id key, businesses tied on all three metrics would be ranked
# arbitrarily and the top-5 cut could differ from run to run.
window_spec = Window.partitionBy("city").orderBy(
    col("Rating Frequency").desc(),
    col("Average Rating").desc(),
    col("Check-in Frequency").desc(),
    col("business_id").asc(),
)

ranked_businesses_df = business_stats_df.withColumn("Rank", row_number().over(window_spec))

top_5_businesses_df = ranked_businesses_df.filter(col("Rank") <= 5)

# Reference the real column name (`city`) and alias it for display, rather than
# relying on case-insensitive resolution of "City". The final sort gives the
# displayed table a stable, readable order.
final_result_df = top_5_businesses_df.select(
    col("Rank"),
    col("city").alias("City"),
    col("name").alias("Business Name"),
    col("Average Rating"),
    col("Rating Frequency"),
    col("Check-in Frequency"),
).orderBy(col("City"), col("Rank"))

z.show(final_result_df)

