# Amazon Reviews - Model

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
sc = SparkContext()
from pyspark.sql import SparkSession, SQLContext
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName("amazon-reviews-project").getOrCreate()

In [2]:
#for now, only reading reviews for items in the "Kitchen" category
reviews = sqlContext.read.parquet("s3://amazon-reviews-pds/parquet/product_category=Electronics/")

***

## Data Schema

In [3]:
reviews.count()

3120938

In [4]:
reviews.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: date (nullable = true)
 |-- year: integer (nullable = true)



## Data Extraction
Applying a filter - keeping only reviews with higher than 100 total votes received

In [5]:
reviews = reviews.filter(reviews.total_votes > 100)

In [6]:
reviews.count()

6627

Obtaining sentiment polarity from review string contents

In [7]:
reviews = reviews.na.fill({'review_body': '', 'review_headline': ''})

In [8]:
from pyspark.sql import Row
from pyspark.sql.functions import udf
from textblob import TextBlob

polarity = udf(lambda x: TextBlob(x).sentiment.polarity)
reviewLength = udf(lambda x: len(x))

reviews = reviews.withColumn('headline_polarity', polarity('review_headline'))\
                 .withColumn('body_polarity', polarity('review_body'))\
                 .withColumn('headline_length', reviewLength('review_headline'))\
                 .withColumn('body_length', reviewLength('review_body'))

Creating "helpful?" variable - a review is helpful if at least 75% of 'total_votes' have been 'helpful_votes'.

In [9]:
import pyspark.sql.functions as f
reviews = reviews.withColumn("helpful-ratio", reviews.helpful_votes/reviews.total_votes)

In [10]:
reviews = reviews.withColumn("helpful?", f.when(reviews["helpful-ratio"] > 0.75, 1).otherwise(0))

In [11]:
reviews = reviews.withColumn("verified_purchase", f.when(reviews["verified_purchase"] == "Y", 1).otherwise(reviews.verified_purchase))
reviews = reviews.withColumn("verified_purchase", f.when(reviews["verified_purchase"] == "N", 0).otherwise(reviews.verified_purchase))
reviews = reviews.withColumn("vine", f.when(reviews["vine"] == "Y", 1).otherwise(reviews.vine))
reviews = reviews.withColumn("vine", f.when(reviews["vine"] == "N", 0).otherwise(reviews.vine))

In [12]:
reviews.take(1)

[Row(marketplace='US', customer_id='50815760', review_id='R9M3OW6DMOSVZ', product_id='B004J6DLD4', product_parent='210015229', product_title='La Crosse Technology BC1000 Alpha Power Battery Charger', star_rating=1, helpful_votes=108, total_votes=115, vine='0', verified_purchase='1', review_headline='Abysmal customer service', review_body="I own four of these chargers. A few months back one of the chargers (only a year old) stopped working. I was able to find out through swapping with my other chargers that that the power adapter of the unit was faulty. I tried contacting La Crosse Technology about it and quickly learned that the company neither has any workable customer service line, nor dives a damn about its customers.<br /><br />As it turns out La Crosse Technology is a company in Wisconsin whose primary products are weather stations. These chargers that they ship are re-branded chargers produced by a different manufacturer (in other words La Crosse is not the OEM for these). Appare

## Model Building

In [13]:
import pyspark.ml.evaluation as ev
from pyspark.ml import Pipeline
import pyspark.ml.regression as rg
import pyspark.sql.functions as f
import pyspark.ml.feature as feat
import pyspark.ml.classification as cl

In [14]:
# running bucketizer for pickup_longitude and adding it in the dataset
splits = [-float("inf"), 0, 5, float("inf")]

bucketizer = feat.Bucketizer(splits=splits, inputCol="year", outputCol="year_bkt")

reviews = bucketizer.transform(reviews)

In [15]:
reviews = reviews.drop('customer_id','review_id','product_id','parent_product','product title', 'helpful_votes', 'review_headline', 'review_body', 'review_date', 'year', 'helpful-ratio')

In [16]:
reviews.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- headline_polarity: string (nullable = true)
 |-- body_polarity: string (nullable = true)
 |-- headline_length: string (nullable = true)
 |-- body_length: string (nullable = true)
 |-- helpful?: integer (nullable = false)
 |-- year_bkt: double (nullable = true)



In [17]:
from pyspark.sql.types import FloatType
from pyspark.sql.types import IntegerType  
reviews = reviews.withColumn("headline_polarity", reviews["headline_polarity"].cast(FloatType()))
reviews = reviews.withColumn("body_polarity", reviews["body_polarity"].cast(FloatType()))
reviews = reviews.withColumn("headline_length", reviews["headline_polarity"].cast(FloatType()))
reviews = reviews.withColumn("body_length", reviews["body_polarity"].cast(FloatType()))
reviews = reviews.withColumn("vine", reviews["vine"].cast(IntegerType()))
reviews = reviews.withColumn("verified_purchase", reviews["verified_purchase"].cast(IntegerType()))

In [18]:
reviews=reviews.drop('features') #removes the column 'features' if it already exists
#selects all numeric columns to be combined into column 'features'
Cols_to_Select = reviews["star_rating", "total_votes", "headline_polarity", "body_polarity", "headline_length", "body_length", "year_bkt", "vine", "verified_purchase"]
assembler = feat.VectorAssembler(inputCols=Cols_to_Select.columns, outputCol="features") #creates the VectorAssembler object

In [19]:
# running the VectorAssembler transformation onto the dataframe to create the 'features' column
reviews=assembler.setHandleInvalid("skip").transform(reviews)

In [20]:
#splitting the data into train, test, and predict datasets
splitted_data = reviews.randomSplit([0.7, 0.3], 199)
train_data = splitted_data[0]
test_data = splitted_data[1]

In [21]:
# creating the logistic regression object 
logReg_obj = cl.LogisticRegression(
    labelCol="helpful?"
    , featuresCol = "features",
    maxIter=5, regParam=0.3, elasticNetParam=0.8
)
# using pipeline to run the logistic regression, plus all other objects intially created
pipeline = Pipeline(
    stages=[
        logReg_obj
    ])

pipelineModel = pipeline.fit(train_data) #running the model on training dataset


In [22]:
trainingSummary = pipelineModel.stages[-1].summary

print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

areaUnderROC: 0.5


In [23]:
import pyspark.ml.evaluation as ev
#evaluating the model created against test dataset
results_logReg = (
    pipelineModel
    .transform(test_data)
    .select('helpful?', 'probability', 'prediction')
)

In [24]:
evaluator = ev.MulticlassClassificationEvaluator(
    predictionCol='prediction'
    , labelCol='helpful?')

In [25]:
(
    evaluator.evaluate(results_logReg)
    , evaluator.evaluate(
        results_logReg
        , {evaluator.metricName: 'weightedPrecision'}
    ) 
    , evaluator.evaluate(
        results_logReg
        , {evaluator.metricName: 'accuracy'}
    )
)

(0.9048029630884138, 0.8757706295077597, 0.9358261748357757)

In [26]:
spark.stop()