In [13]:
import findspark
findspark.init()
from pyspark import SparkContext
sc = SparkContext()
from pyspark.sql import SparkSession, SQLContext
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName("amazon-reviews-project").getOrCreate()

In [14]:
#for now, only reading reviews for items in the "Kitchen" category
reviews = sqlContext.read.parquet("s3://amazon-reviews-pds/parquet/product_category=Kitchen/")

***

## Data Extraction
Obtaining sentiment polarity from review string contents

In [97]:
reviews = reviews.na.fill({'review_body': '', 'review_headline': ''})

In [112]:
from pyspark.sql import Row
from pyspark.sql.functions import udf
from textblob import TextBlob

polarity = udf(lambda x: TextBlob(x).sentiment.polarity)

reviews = reviews.withColumn('headline_polarity', polarity('review_headline'))\
                 .withColumn('body_polarity', polarity('review_body'))

## Exploratory Data Analysis

In [114]:
reviews.count()

4882831

In [115]:
reviews.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = false)
 |-- review_body: string (nullable = false)
 |-- review_date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- headline_polarity: string (nullable = true)
 |-- body_polarity: string (nullable = true)



In [116]:
reviews.head(5)

[Row(marketplace='US', customer_id='17050420', review_id='R3QP7FJW6GAB3M', product_id='B007T0CIVS', product_parent='224029078', product_title='Preethi Eco Twin Jar Mixer Grinder, 550-Watt', star_rating=5, helpful_votes=3, total_votes=4, vine='N', verified_purchase='Y', review_headline='Finally something I can use', review_body="I always believed that the American blenders went toe to toe with the Indian ones, at least the affordable ones. I was getting tired of the blenders that only seemed to work when water was added or spice mixers that just tossed stuff around without actually grinding anything.<br /><br />Finally here's something that is pretty basic by Indian standards but so very effective. It comes with 2 jar and 4 lids. there are 2 lids for each jar. One adds a lot of room and the other one reduces it. I use the Large lid when I'm blending something like Dosa batter and the smaller lid for chutneys.<br /><br />There's also one extra blade (am not sure if this is for a special 

### Summary statistics for numerical columns

In [None]:
reviews.select("star_rating", "helpful_votes", "total_votes", "year", "headline_polarity", "body_polarity").describe().show()

### Summary statistics for categorical columns

In [29]:
reviews.groupBy("marketplace").count().show()

+-----------+-------+
|marketplace|  count|
+-----------+-------+
|         DE|    120|
|         US|4882309|
|         FR|     67|
|         UK|     21|
|         JP|    314|
+-----------+-------+



https://www.amazon.com/gp/vine/help for more info on what `vine` column means

In [38]:
reviews.groupBy("vine").count().show()

+----+-------+
|vine|  count|
+----+-------+
|   Y|  24434|
|   N|4858397|
+----+-------+



In [39]:
reviews.groupBy("verified_purchase").count().show()

+-----------------+-------+
|verified_purchase|  count|
+-----------------+-------+
|                Y|4101350|
|                N| 781481|
+-----------------+-------+

