In [2]:
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.0.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (91.189.91.38)] [1 InRelease 14.2 kB/88.7                                                                                Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Waiting for headers] [1 InRelease 14.2 kB/88.7 kB 16%] [2 InRelease 3,626 B0% [Waiting for headers] [1 InRelease 43.1 kB/88.7 kB 49%] [Connecting to ppa.l                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [Waiting for headers] [1 InRelease 43.1 kB/88.7 kB 49%] [Connecting to ppa.l0% [2 InRelease gpgv 3,626 B] [Waiting for headers] [1 InRelease 43.1 kB/88.7 k                                                                               Get:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:5 http://archive.ubuntu.com/ubuntu

In [3]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-04-23 20:25:47--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2022-04-23 20:25:47 (5.71 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("M16-Amazon-Challenge").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

### Load Amazon Data into Spark DataFrame

In [5]:
from pyspark import SparkFiles
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Pet_Products_v1_00.tsv.gz"
spark.sparkContext.addFile(url)
df = spark.read.option("encoding", "UTF-8").csv(SparkFiles.get(""), sep="\t", header=True, inferSchema=True)
df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   28794885| REAKC26P07MDN|B00Q0K9604|     510387886|(8-Pack) EZwhelp ...|    Pet Products|          5|            0|          0|   N|                Y|A great purchase ...|Best belly bands ...| 2015-08-31|
|         US|   11488901|R3NU7OMZ4HQIEG|B00MBW5O9W|     912374672|Warren Eckstein's...|    Pet Products|          2|    

### Create DataFrames to match tables

In [6]:
# Create the vine_table. DataFrame
vine_df = df.select(["review_id","star_rating","helpful_votes","total_votes","vine","verified_purchase"])
vine_df.show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
| REAKC26P07MDN|          5|            0|          0|   N|                Y|
|R3NU7OMZ4HQIEG|          2|            0|          1|   N|                Y|
|R14QJW3XF8QO1P|          5|            0|          0|   N|                Y|
|R2HB7AX0394ZGY|          5|            0|          0|   N|                Y|
| RGKMPDQGSAHR3|          5|            0|          0|   N|                Y|
|R1DJCVPQGCV66E|          5|            0|          0|   N|                Y|
|R3V52EAWLPBFQG|          3|            0|          0|   N|                Y|
|R3DKO8J1J28QBI|          2|            0|          0|   N|                Y|
| R764DBXGRNECG|          5|            1|          1|   N|                N|
| RW1853GAT0Z9F|          5|            0|          0|   N|     

In [7]:
#create dataframe to retrieve rows total_votes equal or greater than 20
vine_tvotes_df = vine_df.filter("total_votes>=20")
vine_tvotes_df.show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R21KC552Y6HL8X|          1|           27|         31|   N|                Y|
| RX9WC9FTIR1XR|          5|           25|         25|   N|                Y|
| RGDCOU1KBHMNG|          3|           29|         31|   N|                Y|
| RVTYWID2TPMMY|          2|           35|         42|   N|                Y|
|R2CMPZ5VESGRLY|          4|           27|         28|   N|                Y|
|R3VQPJZ54B55BA|          5|           62|         64|   N|                N|
|R24QM6D7FEDZ5M|          2|           36|         43|   N|                Y|
|R3A1VQ3SQDXEJ3|          5|           20|         20|   N|                Y|
|  R39GSNW76GYF|          1|           20|         23|   N|                Y|
| RPJLR6MFDPXXE|          5|           35|         36|   N|     

In [8]:
#create dataframe to retrieve rows where number of helpful_votes/total_votes is equal to or greater than 50%
vine_help_total_df = vine_tvotes_df.filter("helpful_votes/total_votes>=0.5")
vine_help_total_df.show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R21KC552Y6HL8X|          1|           27|         31|   N|                Y|
| RX9WC9FTIR1XR|          5|           25|         25|   N|                Y|
| RGDCOU1KBHMNG|          3|           29|         31|   N|                Y|
| RVTYWID2TPMMY|          2|           35|         42|   N|                Y|
|R2CMPZ5VESGRLY|          4|           27|         28|   N|                Y|
|R3VQPJZ54B55BA|          5|           62|         64|   N|                N|
|R24QM6D7FEDZ5M|          2|           36|         43|   N|                Y|
|R3A1VQ3SQDXEJ3|          5|           20|         20|   N|                Y|
|  R39GSNW76GYF|          1|           20|         23|   N|                Y|
| RPJLR6MFDPXXE|          5|           35|         36|   N|     

In [9]:
# filter paid helpful review 
vine_paidr_df = vine_help_total_df.filter(vine_help_total_df.vine == 'Y')
vine_paidr_df.show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R3A71VR1JZD8WF|          2|           27|         30|   Y|                N|
|R16OMUJIGI18JZ|          5|           72|         72|   Y|                N|
|R3TS8ZP2FHQ9XR|          5|           39|         42|   Y|                N|
|R2MHP919VZN7DI|          5|           29|         30|   Y|                N|
| RD2BCTVS59A5L|          2|           20|         20|   Y|                N|
|R1JUJLXQ2CMWKF|          4|           25|         26|   Y|                N|
|R2T7YE0IFI6N9L|          3|           24|         25|   Y|                N|
|R25FI3J7WWOYXM|          3|           55|         58|   Y|                N|
|R3P3U4EOWIUD50|          3|           27|         36|   Y|                N|
|R3C7MKX7MO6XUC|          5|          216|        219|   Y|     

In [10]:
# filter unpaid helpful review
vine_unpaidr_df = vine_help_total_df.filter(vine_help_total_df.vine == 'N')
vine_unpaidr_df.show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R21KC552Y6HL8X|          1|           27|         31|   N|                Y|
| RX9WC9FTIR1XR|          5|           25|         25|   N|                Y|
| RGDCOU1KBHMNG|          3|           29|         31|   N|                Y|
| RVTYWID2TPMMY|          2|           35|         42|   N|                Y|
|R2CMPZ5VESGRLY|          4|           27|         28|   N|                Y|
|R3VQPJZ54B55BA|          5|           62|         64|   N|                N|
|R24QM6D7FEDZ5M|          2|           36|         43|   N|                Y|
|R3A1VQ3SQDXEJ3|          5|           20|         20|   N|                Y|
|  R39GSNW76GYF|          1|           20|         23|   N|                Y|
| RPJLR6MFDPXXE|          5|           35|         36|   N|     

In [11]:
# determine the total number of reviews, 
total_review = vine_help_total_df.select("review_id").count()
total_review

38010

In [12]:
# get total number of 5-star reviews
total_five_review = vine_help_total_df.filter("star_rating == 5").select("review_id").count()
total_five_review

20677

In [14]:
# get the percentage of 5-stars reviews for unpaid review
unpaid_review = vine_unpaidr_df.select("review_id").count()
five_unpaid_review = vine_unpaidr_df.filter("star_rating == 5").select("review_id").count()
percentage_five_unpaid = five_unpaid_review/unpaid_review *100
percentage_five_unpaid

54.471458773784356

In [15]:
# get the percentage of 5-stars reviews for paid review
paid_review = vine_paidr_df.select("review_id").count()
five_paid_review = vine_paidr_df.filter("star_rating == 5").select("review_id").count()
percentage_five_paid = five_paid_review/paid_review *100
percentage_five_paid

38.23529411764706

In [16]:
#number of paid and unpaid reviews
print(unpaid_review, paid_review)

37840 170


In [17]:
#number of 5 star paid and unpaid reviews
print(five_unpaid_review,five_paid_review)

20612 65


In [82]:
# get the percentage of 4-stars reviews for unpaid review

four_unpaid_review = vine_unpaidr_df.filter("star_rating == 4").select("review_id").count()
percentage_four_unpaid = four_unpaid_review/unpaid_review *100
percentage_four_unpaid

12.949260042283298

In [83]:
# get the percentage of 4-stars reviews for paid review

four_paid_review = vine_paidr_df.filter("star_rating == 4").select("review_id").count()
percentage_four_paid = four_paid_review/paid_review *100
percentage_four_paid

32.94117647058823

In [84]:
# get the percentage of 3-stars reviews for unpaid review

three_unpaid_review = vine_unpaidr_df.filter("star_rating == 3").select("review_id").count()
percentage_three_unpaid = three_unpaid_review/unpaid_review *100
percentage_three_unpaid

7.164376321353065

In [85]:
# get the percentage of 3-stars reviews for paid review
paid_review = vine_paidr_df.select("review_id").count()
three_paid_review = vine_paidr_df.filter("star_rating == 3").select("review_id").count()
percentage_three_paid = three_paid_review/paid_review *100
percentage_three_paid

15.88235294117647

In [86]:
# get the percentage of 2-stars reviews for unpaid review

two_unpaid_review = vine_unpaidr_df.filter("star_rating == 2").select("review_id").count()
percentage_two_unpaid = two_unpaid_review/unpaid_review *100
percentage_two_unpaid

5.412262156448203

In [87]:
# get the percentage of 2-stars reviews for paid review

two_paid_review = vine_paidr_df.filter("star_rating == 2").select("review_id").count()
percentage_two_paid = two_paid_review/paid_review *100
percentage_two_paid

9.411764705882353

In [88]:
# get the percentage of 1-stars reviews for unpaid review

one_unpaid_review = vine_unpaidr_df.filter("star_rating == 1").select("review_id").count()
percentage_one_unpaid = one_unpaid_review/unpaid_review *100
percentage_one_unpaid

20.002642706131077

In [89]:
# get the percentage of 1-stars reviews for paid review

one_paid_review = vine_paidr_df.filter("star_rating == 1").select("review_id").count()
percentage_one_paid = one_paid_review/paid_review *100
percentage_one_paid

3.5294117647058822

In [90]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
data3 = [("5",percentage_five_paid, percentage_five_unpaid),
         ("4",percentage_four_paid, percentage_four_unpaid),
         ("3",percentage_three_paid, percentage_four_unpaid),
         ("2",percentage_two_paid, percentage_four_unpaid),
         ("1",percentage_one_paid, percentage_four_unpaid),
         ]
schema2 = StructType([\
                     StructField("Star_rating", StringType(), True),\
                     StructField("Paid_percentage", FloatType(), True),\
                     StructField("Unpaid_percentage", FloatType(), True)\
                     ])
summary_df = spark.createDataFrame(data=data3, schema=schema2)
summary_df.printSchema()
summary_df.show(truncate=False)


root
 |-- Star_rating: string (nullable = true)
 |-- Paid_percentage: float (nullable = true)
 |-- Unpaid_percentage: float (nullable = true)

+-----------+---------------+-----------------+
|Star_rating|Paid_percentage|Unpaid_percentage|
+-----------+---------------+-----------------+
|5          |38.235294      |54.47146         |
|4          |32.941177      |12.94926         |
|3          |15.882353      |12.94926         |
|2          |9.411765       |12.94926         |
|1          |3.5294118      |12.94926         |
+-----------+---------------+-----------------+

