# Amazon Reviews - Model

In [1]:
# findspark.init() is called before the pyspark imports so that pyspark can be located.
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext

# Entry points, created in this order: Spark context, SQL context, Spark session.
sc = SparkContext()
sqlContext = SQLContext(sc)
spark = (
    SparkSession.builder
    .appName("amazon-reviews-project")
    .getOrCreate()
)

In [2]:
# For now, only reading reviews for items in the "Electronics" category
# (the category is selected by the product_category=... segment of the S3 path).
reviews = sqlContext.read.parquet("s3://amazon-reviews-pds/parquet/product_category=Electronics/")

KeyboardInterrupt: 

***

## Data Schema

In [None]:
# Number of rows in the reviews DataFrame as loaded.
reviews.count()

In [None]:
# Print column names and types of the loaded data.
reviews.printSchema()

## Data Extraction
Applying a filter: keeping only reviews that received more than 100 total votes.

In [None]:
# Keep only reviews that received more than 100 votes in total.
reviews = reviews.where(reviews["total_votes"] > 100)

In [None]:
# Row count after the total_votes filter.
reviews.count()

Obtaining sentiment polarity from review string contents

In [None]:
# Replace missing review text with empty strings so the text-based
# feature functions applied later always receive a string.
reviews = reviews.fillna({'review_body': '', 'review_headline': ''})

In [None]:
from pyspark.sql import Row
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, IntegerType
from textblob import TextBlob

# Declare the UDF return types explicitly: when no return type is given,
# udf() defaults to StringType and the numeric features end up as strings.
# float(...) guarantees a Python float, which is what DoubleType expects.
polarity = udf(lambda x: float(TextBlob(x).sentiment.polarity), DoubleType())
reviewLength = udf(lambda x: len(x), IntegerType())

# Sentiment polarity and character length for both the headline and the body.
reviews = reviews.withColumn('headline_polarity', polarity('review_headline'))\
                 .withColumn('body_polarity', polarity('review_body'))\
                 .withColumn('headline_length', reviewLength('review_headline'))\
                 .withColumn('body_length', reviewLength('review_body'))

Creating "helpful?" variable - a review is helpful if at least 75% of 'total_votes' have been 'helpful_votes'.

In [None]:
import pyspark.sql.functions as f

# Fraction of a review's votes that were "helpful" votes.
helpful_share = reviews["helpful_votes"] / reviews["total_votes"]
reviews = reviews.withColumn("helpful-ratio", helpful_share)

In [None]:
# Binary label: 1 when at least 75% of the votes were helpful votes, else 0.
# ">=" (not ">") so that a ratio of exactly 0.75 counts as helpful, matching
# the "at least 75%" definition of the label.
reviews = reviews.withColumn("helpful?", f.when(reviews["helpful-ratio"] >= 0.75, 1).otherwise(0))

In [None]:
# Recode the "Y"/"N" flag columns as 1/0. Any other value is left untouched.
# Columns and replacements are applied in the same order as the unrolled form:
# verified_purchase (Y, N), then vine (Y, N).
for flag_col in ("verified_purchase", "vine"):
    for flag_value, encoded in (("Y", 1), ("N", 0)):
        reviews = reviews.withColumn(
            flag_col,
            f.when(reviews[flag_col] == flag_value, encoded).otherwise(reviews[flag_col]),
        )

In [None]:
# Fetch a single row to inspect the columns added so far.
reviews.take(1)

## Model Building

In [None]:
import pyspark.ml.evaluation as ev
from pyspark.ml import Pipeline
import pyspark.ml.regression as rg
import pyspark.sql.functions as f
import pyspark.ml.feature as feat
import pyspark.ml.classification as cl

In [None]:
# Running the bucketizer on the "year" column and adding the result to the
# dataset as "year_bkt".
# The splits define three buckets: (-inf, 0), [0, 5) and [5, +inf).
# NOTE(review): if "year" holds calendar years (e.g. 2010), every row lands in
# the last bucket and year_bkt is constant - confirm the intended split points.
splits = [-float("inf"), 0, 5, float("inf")]

bucketizer = feat.Bucketizer(splits=splits, inputCol="year", outputCol="year_bkt")

reviews = bucketizer.transform(reviews)

In [None]:
# Drop identifiers, raw text, and the columns the label was derived from
# (helpful_votes, helpful-ratio) so they cannot leak into the model.
# DataFrame.drop() silently ignores names that do not exist, so the column
# names must match the schema exactly: 'product_parent' and 'product_title'
# (previously misspelled as 'parent_product' and 'product title').
reviews = reviews.drop(
    'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title',
    'helpful_votes', 'review_headline', 'review_body', 'review_date', 'year',
    'helpful-ratio',
)

In [None]:
# Print the schema again to check which columns remain after the drop.
reviews.printSchema()

In [None]:
from pyspark.sql.types import FloatType
from pyspark.sql.types import IntegerType

# Cast every engineered feature from its OWN column. The previous version cast
# headline_polarity into headline_length and body_polarity into body_length,
# which overwrote the length features with copies of the polarity features.
for float_col in ("headline_polarity", "body_polarity", "headline_length", "body_length"):
    reviews = reviews.withColumn(float_col, reviews[float_col].cast(FloatType()))

# The Y/N flags were recoded to 1/0 earlier; make them proper integers.
for int_col in ("vine", "verified_purchase"):
    reviews = reviews.withColumn(int_col, reviews[int_col].cast(IntegerType()))

In [None]:
# Remove a stale 'features' column, if any, so this cell can be re-run.
reviews = reviews.drop('features')

# Numeric columns that are combined into the 'features' vector.
feature_names = (
    "star_rating", "total_votes",
    "headline_polarity", "body_polarity",
    "headline_length", "body_length",
    "year_bkt", "vine", "verified_purchase",
)
Cols_to_Select = reviews.select(*feature_names)

# VectorAssembler that packs the selected columns into 'features'.
assembler = feat.VectorAssembler(
    inputCols=Cols_to_Select.columns,
    outputCol="features",
)

In [None]:
# Configure the assembler to skip rows with invalid values, then apply it to
# the dataframe to create the 'features' column.
assembler.setHandleInvalid("skip")
reviews = assembler.transform(reviews)

In [None]:
# Split the data 70/30 into train and test datasets (seed 199).
splitted_data = reviews.randomSplit([0.7, 0.3], 199)
train_data, test_data = splitted_data

In [None]:
# Logistic regression classifier predicting the "helpful?" label from "features".
logReg_obj = cl.LogisticRegression(
    featuresCol="features",
    labelCol="helpful?",
    maxIter=5,
    regParam=0.3,
    elasticNetParam=0.8,
)

# The pipeline contains a single stage: the classifier itself.
pipeline = Pipeline(stages=[logReg_obj])

# Fit the pipeline on the training dataset.
pipelineModel = pipeline.fit(train_data)


In [None]:
# The fitted classifier is the last pipeline stage; read its training summary.
fitted_classifier = pipelineModel.stages[-1]
trainingSummary = fitted_classifier.summary

print(f"areaUnderROC: {trainingSummary.areaUnderROC}")

In [None]:
import pyspark.ml.evaluation as ev

# Score the test dataset with the fitted pipeline and keep only the label
# together with the model outputs.
scored_test = pipelineModel.transform(test_data)
results_logReg = scored_test.select('helpful?', 'probability', 'prediction')

In [None]:
# Evaluator comparing the 'prediction' column against the 'helpful?' label.
evaluator = ev.MulticlassClassificationEvaluator(
    labelCol='helpful?',
    predictionCol='prediction',
)

In [None]:
# Evaluate the test predictions three times: with the evaluator's default
# metric, then with weighted precision, then with accuracy.
default_metric = evaluator.evaluate(results_logReg)
weighted_precision = evaluator.evaluate(
    results_logReg, {evaluator.metricName: 'weightedPrecision'}
)
accuracy = evaluator.evaluate(
    results_logReg, {evaluator.metricName: 'accuracy'}
)

# Displayed as the cell output, in the same order as computed.
(default_metric, weighted_precision, accuracy)

In [None]:
# Stop the Spark session created in the first cell.
spark.stop()