In [1]:
import os
# Find the latest version of spark 2.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.0'
spark_version = 'spark-3.1.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (91.189.91.39)] [Co                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (91.189.91.39)] [Co0% [1 InRelease gpgv 15.9 kB] [Waiting for headers] [Connecting to security.ubu                                                                               Hit:3 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
0% [1 InRelease gpgv 15.9 kB] [Waiting for headers] [Connecting to security.ubu                                                                               Get:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
0% [1 InRelease gpgv 15.9 kB] [4 InRelease 14.2 kB/88.7 kB 16%] [Connecting to                                                        

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook.
# NOTE(review): the driver classpath points at a PostgreSQL JDBC jar under
# /content, but no visible cell downloads that jar -- confirm it is fetched elsewhere.
spark = (
    SparkSession.builder
    .appName("BigData-Challenge")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [3]:
from pyspark import SparkFiles
from pyspark.sql.functions import to_date

# Read in the Review dataset as a DataFrame
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
spark.sparkContext.addFile(url)

# Look the file up by its own name. SparkFiles.get("") resolves to the whole
# SparkFiles directory, so the reader would pick up every file placed there
# rather than only the dataset added above.
reviews_file_name = url.rsplit("/", 1)[-1]
df = spark.read.option("encoding", "UTF-8").csv(
    SparkFiles.get(reviews_file_name), sep="\t", header=True, inferSchema=True
)
df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+-------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|   product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+-------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   45610553| RMDCHWD0Y5OZ9|B00HH62VB6|     618218723|AGPtek® 10 Isolat...|Musical Instruments|          3|            0|          1|   N|                N|         Three Stars|Works very good, ...| 2015-08-31|
|         US|   14640079| RZSL0BALIYUNU|B003LRN53I|     986692292|Sennheiser HD203 ...|Musical Instruments| 

In [4]:
# Build the vine_table DataFrame: a narrow projection of the review data.
# NOTE(review): total_votes is not selected here, yet the following steps
# filter on it -- confirm whether it should be part of this table.
vine_df = df.select(
    "review_id",
    "star_rating",
    "helpful_votes",
    "vine",
    "verified_purchase",
)
vine_df.show()

+--------------+-----------+-------------+----+-----------------+
|     review_id|star_rating|helpful_votes|vine|verified_purchase|
+--------------+-----------+-------------+----+-----------------+
| RMDCHWD0Y5OZ9|          3|            0|   N|                N|
| RZSL0BALIYUNU|          5|            0|   N|                Y|
| RIZR67JKUDBI0|          3|            0|   N|                Y|
|R27HL570VNL85F|          5|            0|   N|                Y|
|R34EBU9QDWJ1GD|          5|            0|   N|                Y|
|R1WCUI4Z1SIQEO|          5|            0|   N|                N|
| RL5LNO26GAVJ1|          2|            3|   N|                Y|
|R3GYQ5W8JHP8SB|          5|            0|   N|                Y|
|R30SHYQXGG5EYC|          5|            0|   N|                Y|
|R14YLXA56NP51I|          5|            1|   N|                N|
|R1ZH0HSH38IOTZ|          5|            0|   N|                Y|
|R3H53KLLC210XI|          4|            0|   N|                Y|
|R3OOR877N

In [5]:
# Step 1: keep only reviews with 20 or more total votes.
# NOTE(review): total_votes is not one of vine_df's selected columns; this
# presumably works because Spark resolves it from the underlying data -- verify.
vine_df_filter1 = vine_df.where("total_votes >= 20")
vine_df_filter1.show()

+--------------+-----------+-------------+----+-----------------+
|     review_id|star_rating|helpful_votes|vine|verified_purchase|
+--------------+-----------+-------------+----+-----------------+
|R2243Y3OD8U6KQ|          5|           47|   N|                N|
|R2TGT0CDTCAAHW|          5|           21|   N|                Y|
| RX4D22YSXEF4P|          1|           37|   N|                Y|
|R3FL2NTLFUSPTQ|          5|           33|   N|                N|
|R3QTP3YNZXAPPF|          3|           23|   N|                Y|
|R36V6V42VN5AS5|          5|           34|   N|                Y|
|R27LZWE27BJPOB|          5|           22|   N|                N|
|  RMRD6SMF2AUQ|          3|            4|   N|                N|
| RMPCXKWX3T57Y|          1|            1|   N|                N|
|R1P7GJ0IN2BRNH|          5|           37|   N|                Y|
|R2R6JPF9KOD2HJ|          5|           19|   N|                Y|
|R3JM8Z4SP9N3H2|          1|            4|   N|                N|
|R2J0ZZGFX

In [6]:
# Step 2: keep reviews whose helpful votes make up at least 50% of total votes
# (ratio helpful_votes / total_votes >= 0.5).
vine_df_filter2 = vine_df_filter1.where("helpful_votes/total_votes >= 0.5")
vine_df_filter2.show()

+--------------+-----------+-------------+----+-----------------+
|     review_id|star_rating|helpful_votes|vine|verified_purchase|
+--------------+-----------+-------------+----+-----------------+
|R2243Y3OD8U6KQ|          5|           47|   N|                N|
|R2TGT0CDTCAAHW|          5|           21|   N|                Y|
| RX4D22YSXEF4P|          1|           37|   N|                Y|
|R3FL2NTLFUSPTQ|          5|           33|   N|                N|
|R3QTP3YNZXAPPF|          3|           23|   N|                Y|
|R36V6V42VN5AS5|          5|           34|   N|                Y|
|R27LZWE27BJPOB|          5|           22|   N|                N|
|R1P7GJ0IN2BRNH|          5|           37|   N|                Y|
|R2R6JPF9KOD2HJ|          5|           19|   N|                Y|
|R2J0ZZGFXKM8KR|          2|           21|   N|                Y|
|R238ZSG6TSUBNX|          5|           27|   N|                Y|
| RC6Y8GYIQZ8YU|          3|           72|   N|                Y|
| RFL3TJED

In [7]:
# Step 3: paid reviews -- rows from step 2 flagged as Vine program (vine == 'Y').
vine_df_paid = vine_df_filter2.where("vine == 'Y'")
vine_df_paid.show()

+--------------+-----------+-------------+----+-----------------+
|     review_id|star_rating|helpful_votes|vine|verified_purchase|
+--------------+-----------+-------------+----+-----------------+
|R1R9RU7JW0MFR2|          4|           20|   Y|                N|
|R19EFYNN3W8Q07|          5|           26|   Y|                N|
|R34DJ1R8AEU0SG|          5|           29|   Y|                N|
|R25P5CXK5L9RHF|          5|          146|   Y|                N|
|R2E9VZB3I4LSN5|          5|           55|   Y|                N|
| RKYLHZL7EPELX|          4|           19|   Y|                N|
|R1U13EKGQD3ZE6|          5|           22|   Y|                N|
| RYW05F1MUEF01|          5|           87|   Y|                N|
|R2SW4NXNO7HZJ5|          4|           28|   Y|                N|
|R2016NFLSUR97Y|          2|           26|   Y|                N|
| RFCV9QXJ3D79X|          3|           20|   Y|                N|
|R36779ZAY4ZT0Y|          3|           46|   Y|                N|
|R13PPTBWW

In [8]:
# Step 4: unpaid reviews -- rows from step 2 not in the Vine program (vine == 'N').
vine_df_unpaid = vine_df_filter2.where("vine == 'N'")
vine_df_unpaid.show()

+--------------+-----------+-------------+----+-----------------+
|     review_id|star_rating|helpful_votes|vine|verified_purchase|
+--------------+-----------+-------------+----+-----------------+
|R2243Y3OD8U6KQ|          5|           47|   N|                N|
|R2TGT0CDTCAAHW|          5|           21|   N|                Y|
| RX4D22YSXEF4P|          1|           37|   N|                Y|
|R3FL2NTLFUSPTQ|          5|           33|   N|                N|
|R3QTP3YNZXAPPF|          3|           23|   N|                Y|
|R36V6V42VN5AS5|          5|           34|   N|                Y|
|R27LZWE27BJPOB|          5|           22|   N|                N|
|R1P7GJ0IN2BRNH|          5|           37|   N|                Y|
|R2R6JPF9KOD2HJ|          5|           19|   N|                Y|
|R2J0ZZGFXKM8KR|          2|           21|   N|                Y|
|R238ZSG6TSUBNX|          5|           27|   N|                Y|
| RC6Y8GYIQZ8YU|          3|           72|   N|                Y|
| RFL3TJED

In [9]:
# Step 5: for each review type (paid vs unpaid), report the total number of
# reviews, the number of 5-star reviews, and the share of 5-star reviews.
# Start with summary statistics for the paid (Vine) reviews.
(
    vine_df_paid
    .describe()
    .show()
)


+-------+--------------+------------------+------------------+----+-----------------+
|summary|     review_id|       star_rating|     helpful_votes|vine|verified_purchase|
+-------+--------------+------------------+------------------+----+-----------------+
|  count|            60|                60|                60|  60|               60|
|   mean|          null| 4.383333333333334| 40.21666666666667|null|             null|
| stddev|          null|0.8044719642367519|26.964566621908546|null|             null|
|    min|R113X83B4OZYF8|                 2|                14|   Y|                N|
|    max| RYW05F1MUEF01|                 5|               146|   Y|                N|
+-------+--------------+------------------+------------------+----+-----------------+



In [10]:
# Paid (Vine) reviews: total count, 5-star count, and 5-star share.
total_review_paid = vine_df_paid.count()
print(total_review_paid)
five_star_review_paid = vine_df_paid.filter("star_rating == 5").count()
print(five_star_review_paid)
# Despite the name, this is stored as a fraction between 0 and 1 (not x100).
# Guard the division so an empty paid subset yields 0.0 instead of raising
# ZeroDivisionError.
percentage_five_star_review_paid = (
    five_star_review_paid / total_review_paid if total_review_paid else 0.0
)
print(percentage_five_star_review_paid)

60
34
0.5666666666666667


In [11]:
# Summary statistics for the unpaid (non-Vine) reviews.
(
    vine_df_unpaid
    .describe()
    .show()
)

+-------+-------------+------------------+-----------------+-----+-----------------+
|summary|    review_id|       star_rating|    helpful_votes| vine|verified_purchase|
+-------+-------------+------------------+-----------------+-----+-----------------+
|  count|        14477|             14477|            14477|14477|            14477|
|   mean|         null| 4.056365269047455|47.50183048974235| null|             null|
| stddev|         null|1.3469133590849638|90.68383751472832| null|             null|
|    min|R101479YHVWJ7|                 1|               10|    N|                N|
|    max|RZZU4EG8610O6|                 5|             4709|    N|                Y|
+-------+-------------+------------------+-----------------+-----+-----------------+



In [12]:
# Unpaid (non-Vine) reviews: total count, 5-star count, and 5-star share.
total_review_unpaid = vine_df_unpaid.count()
print(total_review_unpaid)
five_star_review_unpaid = vine_df_unpaid.filter("star_rating == 5").count()
print(five_star_review_unpaid)
# Despite the name, this is stored as a fraction between 0 and 1 (not x100).
# Guard the division so an empty unpaid subset yields 0.0 instead of raising
# ZeroDivisionError.
percentage_five_star_review_unpaid = (
    five_star_review_unpaid / total_review_unpaid if total_review_unpaid else 0.0
)
print(percentage_five_star_review_unpaid)

14477
8212
0.5672445948746287
