In [None]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
# Create the session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Q1")
    .getOrCreate()
)

In [3]:
from pyspark import SparkFiles
# Load question_one.csv from S3 into a DataFrame.
url = "https://s3.amazonaws.com/dataviz-curriculum/day_3/checkpoints/question_one.csv"
# Register the remote file with Spark so SparkFiles.get() can resolve its local copy by name.
spark.sparkContext.addFile(url)

# In Spark's datetime patterns "MM" is month-of-year; lowercase "mm" is
# minute-of-hour, so a month/day/year pattern must be written "MM/dd/yy".
df = spark.read.csv(
    SparkFiles.get("question_one.csv"),
    header=True,
    inferSchema=True,
    sep=',',
    timestampFormat="MM/dd/yy",
)
df.show(10)

+--------------------+----------+
|    coffee_shop_name|num_rating|
+--------------------+----------+
|The Factory - Caf...|         5|
|The Factory - Caf...|         4|
|The Factory - Caf...|         4|
|The Factory - Caf...|         2|
|The Factory - Caf...|         4|
|The Factory - Caf...|         4|
|The Factory - Caf...|         4|
|The Factory - Caf...|         5|
|The Factory - Caf...|         3|
|The Factory - Caf...|         5|
+--------------------+----------+
only showing top 10 rows



 # What is the average rating for the coffee shop with the most reviews?

In [4]:
from pyspark.sql.functions import desc
# One row per coffee shop: its review count and its mean rating.
# The dict-style agg() yields the columns "count(coffee_shop_name)" and
# "avg(num_rating)"; the sort in the following cell refers to the first
# of these by name.
coffee_ratings_df = (
    df.select("coffee_shop_name", "num_rating")
    .groupBy("coffee_shop_name")
    .agg({"num_rating": "avg", "coffee_shop_name": "count"})
)
coffee_ratings_df.show(truncate=False)

+------------------------------+-----------------------+------------------+
|coffee_shop_name              |count(coffee_shop_name)|avg(num_rating)   |
+------------------------------+-----------------------+------------------+
|Flitch Coffee                 |28                     |4.821428571428571 |
|Apanas Coffee & Beer          |136                    |4.580882352941177 |
|Arturo's Underground Cafe     |100                    |4.3               |
|Lola Savannah Coffee Downtown |4                      |5.0               |
|Lola Savannah Coffee Lounge   |100                    |4.11              |
|Epoch Coffee                  |400                    |3.8125            |
|Caffe Medici                  |243                    |4.1193415637860085|
|Figure 8 Coffee Purveyors     |100                    |4.5               |
|Hot Mama's Cafe               |100                    |4.27              |
|Sorrento's Coffee             |100                    |4.26              |
|The Steepin

In [5]:
# Most-reviewed shops first; the top row answers the question above.
(
    coffee_ratings_df
    .sort(desc("count(coffee_shop_name)"))
    .show(truncate=False)
)

+-------------------------------+-----------------------+------------------+
|coffee_shop_name               |count(coffee_shop_name)|avg(num_rating)   |
+-------------------------------+-----------------------+------------------+
|Epoch Coffee                   |400                    |3.8125            |
|Halcyon                        |300                    |3.82              |
|The Factory - Cafe With a Soul |244                    |4.360655737704918 |
|Caffe Medici                   |243                    |4.1193415637860085|
|Houndstooth Coffee             |200                    |4.175             |
|Venezia Italian Gelato         |200                    |4.81              |
|Apanas Coffee & Beer           |136                    |4.580882352941177 |
|Arturo's Underground Cafe      |100                    |4.3               |
|Flightpath Coffeehouse         |100                    |4.23              |
|La Patisserie                  |100                    |4.05              |