# Data Exploration - Books

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
sc = SparkContext()
from pyspark.sql import SparkSession, SQLContext
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName("amazon-reviews-project").getOrCreate()

In [2]:
#for now, only reading reviews for items in the "Kitchen" category
reviews = sqlContext.read.parquet("s3://amazon-reviews-pds/parquet/product_category=Books/")

***

## Data Extraction
Obtaining sentiment polarity from review string contents

In [3]:
reviews = reviews.na.fill({'review_body': '', 'review_headline': ''})

In [4]:
from pyspark.sql import Row
from pyspark.sql.functions import udf
from textblob import TextBlob

polarity = udf(lambda x: TextBlob(x).sentiment.polarity)
reviewLength = udf(lambda x: len(x))

reviews = reviews.withColumn('headline_polarity', polarity('review_headline'))\
                 .withColumn('body_polarity', polarity('review_body'))\
                 .withColumn('headline_length', reviewLength('review_headline'))\
                 .withColumn('body_length', reviewLength('review_body'))

Creating "helpful?" variable - a review is helpful if at least 75% of 'total_votes' have been 'helpful_votes'.

In [5]:
import pyspark.sql.functions as f
reviews = reviews.withColumn("helpful-ratio", reviews.helpful_votes/reviews.total_votes)

In [6]:
reviews = reviews.withColumn("helpful?", f.when(reviews["helpful-ratio"] > 0.75, 1).otherwise(0))

In [7]:
reviews = reviews.withColumn("verified_purchase", f.when(reviews["verified_purchase"] == "Y", 1).otherwise(reviews.verified_purchase))
reviews = reviews.withColumn("verified_purchase", f.when(reviews["verified_purchase"] == "N", 0).otherwise(reviews.verified_purchase))
reviews = reviews.withColumn("vine", f.when(reviews["vine"] == "Y", 1).otherwise(reviews.vine))
reviews = reviews.withColumn("vine", f.when(reviews["vine"] == "N", 0).otherwise(reviews.vine))

## Exploratory Data Analysis

In [8]:
reviews.count()

20726160

In [9]:
reviews.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = false)
 |-- review_body: string (nullable = false)
 |-- review_date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- headline_polarity: string (nullable = true)
 |-- body_polarity: string (nullable = true)
 |-- headline_length: string (nullable = true)
 |-- body_length: string (nullable = true)
 |-- helpful-ratio: double (nullable = true)
 |-- helpful?: integer (nullable = false)



In [10]:
reviews.head(5)

[Row(marketplace='US', customer_id='15444933', review_id='R1WWG70WK9VUCH', product_id='1848192576', product_parent='835940987', product_title='Standing Qigong for Health and Martial Arts - Zhan Zhuang', star_rating=5, helpful_votes=9, total_votes=10, vine='0', verified_purchase='1', review_headline='Informative AND interesting!', review_body="After attending a few Qigong classes, I wanted to have a book to read and re-read the instructions so I could practice at home.  I also wanted to gain more of an understanding of the purpose and benefit of the movements in order to practice them with a more focused purpose.<br /><br />The book exceeded my expectations.  The explanations are very clear and are paired with photos showing the correct form.  The book itself is more than just the Qigong, it's a very interesting read.  I read the whole book in two days and will read it again. I rarely read books twice!  The book has provided the information and additional instruction that I was looking 

### Summary statistics for numerical columns

In [11]:
reviews.describe("star_rating", "helpful_votes", "total_votes", "helpful-ratio", "helpful?").show()

+-------+------------------+-----------------+------------------+-------------------+-------------------+
|summary|       star_rating|    helpful_votes|       total_votes|      helpful-ratio|           helpful?|
+-------+------------------+-----------------+------------------+-------------------+-------------------+
|  count|          20726160|         20726160|          20726160|           13125056|           20726160|
|   mean| 4.340540167594962|3.836474725660711| 5.332146620502785| 0.7072656836756213| 0.3642755339146277|
| stddev|1.1256043311867316|22.40866963313652|25.810179471168826|0.35802059020169846|0.48122643368699736|
|    min|                 1|                0|                 0|                0.0|                  0|
|    max|                 5|            27550|             28727|                1.0|                  1|
+-------+------------------+-----------------+------------------+-------------------+-------------------+



### Summary statistics for categorical columns

In [12]:
reviews.groupBy("marketplace").count().show()

+-----------+--------+
|marketplace|   count|
+-----------+--------+
|         DE|   63860|
|         US|20370130|
|         FR|   21462|
|         UK|  258057|
|         JP|   12651|
+-----------+--------+



https://www.amazon.com/gp/vine/help for more info on what `vine` column means

In [13]:
reviews.groupBy("vine").count().show()

+----+--------+
|vine|   count|
+----+--------+
|   0|20445054|
|   1|  281106|
+----+--------+



In [14]:
reviews.groupBy("verified_purchase").count().show()

+-----------------+--------+
|verified_purchase|   count|
+-----------------+--------+
|                0|10136108|
|                1|10590052|
+-----------------+--------+



In [15]:
reviews.groupBy("year").count().sort("year").show()

+----+-------+
|year|  count|
+----+-------+
|1995|    211|
|1996|   5024|
|1997|  39582|
|1998| 129068|
|1999| 262997|
|2000| 571135|
|2001| 461224|
|2002| 444260|
|2003| 459694|
|2004| 461265|
|2005| 539531|
|2006| 587516|
|2007| 788724|
|2008| 859497|
|2009|1050246|
|2010|1161966|
|2011|1357646|
|2012|1736442|
|2013|3106099|
|2014|3710219|
+----+-------+
only showing top 20 rows



In [16]:
reviews.groupBy("helpful?").count().show()

+--------+--------+
|helpful?|   count|
+--------+--------+
|       1| 7550033|
|       0|13176127|
+--------+--------+



In [17]:
spark.stop()