In [None]:
import os
# Environment bootstrap: installs Java 8 and a Spark binary distribution, then
# points JAVA_HOME / SPARK_HOME at them so findspark can locate Spark.
#
# Pick a Spark 3.2.x release listed at http://www.apache.org/dist/spark/ and
# enter it below as the Spark version.
# For example:
# spark_version = 'spark-3.2.2'
# NOTE(review): the placeholder below must be edited before running; as written
# the download URL built from it will not point at a real release.
spark_version = 'spark-3.<enter version>'

# Export the version so the `!` shell commands below can expand $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
# NOTE(review): SPARK_HOME assumes the tarball above was extracted under
# /content (presumably the Colab working directory) -- confirm when running elsewhere.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Initialise findspark (no SparkSession is created in this cell).
# The import sits here rather than at the top because findspark is only
# installed by the pip step above.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession, registered under the application name "Q1".
spark = (
    SparkSession.builder
    .appName("Q1")
    .getOrCreate()
)

In [4]:
from pyspark import SparkFiles
# Load in helpful_sentences.csv from S3 into a DataFrame

url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.1/22-big-data/day_3/helpful_sentences.csv"
# Register the remote file with Spark; SparkFiles.get() below resolves the
# downloaded local copy by its file name.
spark.sparkContext.addFile(url)

# Spark datetime patterns are case-sensitive: "MM" is month-of-year, while the
# lowercase "mm" means minute-of-hour. The month/day/year layout intended here
# therefore needs "MM/dd/yy".
df = spark.read.csv(
    SparkFiles.get("helpful_sentences.csv"),
    header=True,
    inferSchema=True,
    sep=',',
    timestampFormat="MM/dd/yy",
)
# Preview the first 10 rows to sanity-check the parsed columns.
df.show(10)

+----------+--------------------+------------------+--------------------+--------------------+
|      asin|            sentence|           helpful|      main_image_url|       product_title|
+----------+--------------------+------------------+--------------------+--------------------+
|B000AO3L84|this flash is a s...|1.7000000000000002|http://ecx.images...|Canon 430EX Speed...|
|B001SEQPGK|The pictures were...|               1.3|http://ecx.images...|Sony Cyber-shot D...|
|0553386697|A very good resou...|               1.9|http://ecx.images...|The Whole-Brain C...|
|B006SUWZH2|We have it in a c...|              0.25|http://ecx.images...|Memorex Portable ...|
|B000W7F5SS|Again the makers ...|               0.9|http://ecx.images...|Harry Potter and ...|
|B000AO3L84|This flash is a g...|               2.0|http://ecx.images...|Canon 430EX Speed...|
|B00081NX5U|So I've had these...|              0.73|http://ecx.images...|iPod Detachable R...|
|B00000F1D3|"they're cd's or ...|               0.

 # What is the average rating for the "asin" with the most reviews? (An Amazon Standard Identification Number, or ASIN, is a ten-digit alphanumeric code that identifies a product on Amazon.)

In [6]:
from pyspark.sql.functions import desc

# For every product (asin): number of review sentences and mean "helpful" score.
# The dict form of agg() produces the columns "count(asin)" and "avg(helpful)".
asin_ratings_df = (
    df.select("asin", "helpful")
    .groupby("asin")
    .agg({"helpful": "avg", "asin": "count"})
)
asin_ratings_df.show(truncate=False)

+----------+-----------+------------------+
|asin      |count(asin)|avg(helpful)      |
+----------+-----------+------------------+
|B001K5UQX0|57         |1.2210526315789478|
|B00004UFOO|215        |1.3140654205607474|
|B0006ZOV5E|106        |1.1395833333333334|
|B00VG90446|148        |1.356081081081081 |
|B003T90WY8|189        |1.3260962566844918|
|0064430170|58         |1.1074999999999997|
|0375703764|951        |1.0754918918918923|
|B00B0DWB62|360        |1.1378333333333328|
|B003G9ZQQA|190        |1.1575661375661372|
|0553391135|89         |1.0204545454545453|
|B00ITOAYOQ|171        |1.0711176470588244|
|B00000JZC7|103        |1.0443298969072166|
|B00007E7K9|154        |1.3640131578947365|
|B001SEQPGK|99         |1.302323232323232 |
|B0018QROM2|305        |1.2875247524752478|
|0670012335|125        |1.2020161290322575|
|B000002KB8|349        |1.0742342342342341|
|B000P0J09C|148        |1.0914383561643843|
|B004BFVKSQ|50         |1.0499999999999998|
|B00KU9LQUO|101        |1.230300

In [8]:
# Rank products by review count, largest first; the top row is the most-reviewed asin.
(
    asin_ratings_df
    .sort(desc("count(asin)"))
    .show(truncate=False)
)

+----------+-----------+------------------+
|asin      |count(asin)|avg(helpful)      |
+----------+-----------+------------------+
|B00C30FCUI|952        |1.3146575342465765|
|0375703764|951        |1.0754918918918923|
|B00000J0JF|744        |1.2193396226415092|
|0671015206|518        |1.0766081871345023|
|B00000FCBH|481        |0.9858547008547006|
|B00081NX5U|419        |1.2174820143884884|
|B00B0DWB62|360        |1.1378333333333328|
|B0000658L4|352        |1.156914285714285 |
|B000002KB8|349        |1.0742342342342341|
|B0018QROM2|305        |1.2875247524752478|
|B003IHUHGE|305        |1.278708609271524 |
|1402241372|304        |1.1392642140468225|
|B00NAPJ8DM|266        |1.2249429657794673|
|B00L5LDWP8|257        |1.0062890624999998|
|B002ZG99CC|243        |1.0348936170212766|
|B00000F1D3|242        |1.0215859030837005|
|B00IUAAJY4|235        |1.0041276595744681|
|B0007U00XK|232        |1.360344827586207 |
|0451232852|231        |1.152610619469026 |
|B0015450T6|220        |1.214678