# Review NLP Helpfulness Prediction PretrainedModels (DVD)

## Creating Spark Session & Importing All Necessary Libraries

In [1]:
import findspark
findspark.init()

In [2]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql import functions as F

import sparknlp

# Start SparkSession with Spark NLP
# start() functions has 5 parameters: gpu, spark23, spark24, spark32, and memory
# sparknlp.start(gpu=True) will start the session with GPU support
# sparknlp.start(spark23=True) is when you have Apache Spark 2.3.x installed
# sparknlp.start(spark24=True) is when you have Apache Spark 2.4.x installed
# sparknlp.start(spark32=True) is when you have Apache Spark 3.2.x installed
# sparknlp.start(memory="16G") to change the default driver memory in SparkSession
spark = sparknlp.start(gpu = True, spark32= True)

### Reading in Data

In [3]:
df = spark.read \
    .option("quote", "\"")  \
    .option("escape", "\"") \
    .option("ignoreLeadingWhiteSpace",True) \
    .csv(r"C:\Users\kenne\OneDrive\Desktop\UChicago\Python\Final Project Big Data Amazon Review\Amazon Review Kaggle Data\amazon_reviews_us_Video_DVD_v1_00.tsv",inferSchema=True,header=True, sep='\t' )

## Code Cleaning

In [4]:
df.show(1, vertical = True, truncate = False)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 marketplace       | US                                                                                                                                                                                                                  
 customer_id       | 27288431                                                                                                                                                                                                            
 review_id         | R33UPQQUZQEM8                                                                                                                                                                                                       
 product_id        | B005T4ND06                                 

In [5]:
df = df.select('product_id', 'star_rating', 'product_category', 'review_headline', 'review_body', 'helpful_votes', 'total_votes')

In [6]:
df.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B005T4ND06                                                                                                                                                                                                          
 star_rating      | 5                                                                                                                                                                                                                   
 product_category | Video DVD                                                                                                                                                                                                           
 review_headline  | This was a gift for my aunt who has Parkinson's 

In [7]:
df.dropna().count()

5068532

In [8]:
from pyspark.sql.functions import col,isnan, when, count
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]
   ).show()

+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|product_id|star_rating|product_category|review_headline|review_body|helpful_votes|total_votes|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|         0|          4|               4|            163|        456|            4|          4|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+



In [9]:
df2 = df.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns])
df2.count()
df2.show(5, vertical = True)

-RECORD 0-----------------
 product_id       | 0     
 star_rating      | 4     
 product_category | 4     
 review_headline  | 760   
 review_body      | 16722 
 helpful_votes    | 4     
 total_votes      | 4     



In [10]:
df = df.filter(col('total_votes') > 10)
df.count()

478432

In [11]:
df = df.fillna("", "review_body")
df = df.fillna("", "review_headline")

In [12]:
df = df.withColumn('review_text', F.concat('review_headline', F.lit(" "), 'review_body'))
df.show(1, vertical = True, truncate = False)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B00ZGPZR9I                                                                                                                                                                                                                                                                                                                                                                                                                                                              
 star_rating      | 4                         

In [13]:
df.filter((col("review_text").isNull()) | ( col("review_text") == "")).show(1, vertical = True, truncate = False)

(0 rows)



In [14]:
df.select([count(when((col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns]).show(5, vertical = True)

-RECORD 0---------------
 product_id       | 0   
 star_rating      | 0   
 product_category | 0   
 review_headline  | 5   
 review_body      | 14  
 helpful_votes    | 0   
 total_votes      | 0   
 review_text      | 0   



In [15]:
df = df.withColumn('helpful_ratio', F.col('helpful_votes') / F.col('total_votes'))
df.show(1, vertical = True, truncate = False)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B00ZGPZR9I                                                                                                                                                                                                                                                                                                                                                                                                                                                              
 star_rating      | 4                         

In [16]:
df.filter(col('helpful_ratio') < 0).count()

0

In [17]:
df = df.withColumn('helpful', when(col("helpful_ratio") < 0.5, 0).otherwise(1))

In [18]:
df.show(5, vertical = True)

-RECORD 0--------------------------------
 product_id       | B00ZGPZR9I           
 star_rating      | 4                    
 product_category | Video DVD            
 review_headline  | Unfair Accusation... 
 review_body      | Amazed after Wode... 
 helpful_votes    | 32                   
 total_votes      | 39                   
 review_text      | Unfair Accusation... 
 helpful_ratio    | 0.8205128205128205   
 helpful          | 1                    
-RECORD 1--------------------------------
 product_id       | B001JAHSI2           
 star_rating      | 1                    
 product_category | Video DVD            
 review_headline  | Buy Chinese bootlegs 
 review_body      | I do not have cab... 
 helpful_votes    | 12                   
 total_votes      | 15                   
 review_text      | Buy Chinese bootl... 
 helpful_ratio    | 0.8                  
 helpful          | 1                    
-RECORD 2--------------------------------
 product_id       | B00XUV1B4U    

In [19]:
df = df.sample(False, 0.5, seed = 0)

## USE Pipeline

In [20]:
#USE (Universal Sentence Encoder) Sentence Embedding
document = DocumentAssembler()\
    .setInputCol("review_text")\
    .setOutputCol("document")
    
embeddingsSentence = UniversalSentenceEncoder.load('tfhub_use_en_2.4.0_2.4_1587136330099') \
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

classifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("prediction")\
  .setLabelColumn("helpful")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

use_clf_pipeline = Pipeline(
    stages = [
        document,
        embeddingsSentence,
        classifierdl
    ])

In [21]:
train, test = df.randomSplit([0.8, 0.2], seed=12345)

In [22]:
%%time
pipeline2 = use_clf_pipeline.fit(train)

Wall time: 2min 14s


In [23]:
%%time
final = pipeline2.transform(test)

Wall time: 49.5 ms


In [24]:
metrics = final.select('helpful','product_id',"prediction.result")

## USE Results

In [25]:
%%time
from sklearn.metrics import classification_report, accuracy_score
metrics_final = metrics.toPandas()
metrics_final['result'] = metrics_final['result'].apply(lambda x: x[0])
metrics_final['result'] = metrics_final['result'].astype('int')

print(classification_report(metrics_final.helpful, metrics_final.result))
print(accuracy_score(metrics_final.helpful, metrics_final.result))

              precision    recall  f1-score   support

           0       0.69      0.66      0.67     15431
           1       0.84      0.86      0.85     32661

    accuracy                           0.80     48092
   macro avg       0.77      0.76      0.76     48092
weighted avg       0.79      0.80      0.80     48092

0.796847708558596
Wall time: 13.6 s


## USE Inference

In [26]:
light_model = LightPipeline(pipeline2)
#Using a review that was stated Helpful on Amazon
text="I learned 10 dances on the first disc. Easy dances to follow. Instructor was very good. The best part for me was that at the end, they combined all the dances into a 30 minute workout. Haven't gone to the 2nd disc yet, but I will. Great value for the money."
light_model.annotate(text)['prediction'][0]

'1'

In [27]:
#Using a review that has not beed stated Helpful on Amazon YET
text="Was good"
light_model.annotate(text)['prediction'][0]

'0'

In [37]:
#Using a review that has not beed stated Helpful on Amazon YET
text="No complaints, was good, could be better, not gonna buy again"
light_model.annotate(text)['prediction'][0]

'0'

## BERT_uncased Embedding Pipeline

In [29]:
#BERT Sentence Embedding
document = DocumentAssembler()\
    .setInputCol("review_text")\
    .setOutputCol("document")
    
bert_cmlm = BertSentenceEmbeddings.load('sent_bert_base_uncased_en_2.6.0_2.4_1598346203624')\
.setInputCols(["document"])\
.setOutputCol("sentence_embeddings")

classifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("prediction")\
  .setLabelColumn("helpful")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

use_clf_pipeline = Pipeline(
    stages = [
        document,
        embeddingsSentence,
        classifierdl
    ])

In [30]:
%%time
pipeline2 = use_clf_pipeline.fit(train)

Wall time: 2min 6s


In [31]:
%%time
final = pipeline2.transform(test)

Wall time: 39.5 ms


In [32]:
metrics = final.select('helpful','product_id',"prediction.result")

## BERT_uncased Embedding Results

In [33]:
%%time
from sklearn.metrics import classification_report, accuracy_score
metrics_final = metrics.toPandas()
metrics_final['result'] = metrics_final['result'].apply(lambda x: x[0])
metrics_final['result'] = metrics_final['result'].astype('int')

print(classification_report(metrics_final.helpful, metrics_final.result))
print(accuracy_score(metrics_final.helpful, metrics_final.result))

              precision    recall  f1-score   support

           0       0.72      0.63      0.67     15431
           1       0.83      0.88      0.86     32661

    accuracy                           0.80     48092
   macro avg       0.78      0.76      0.76     48092
weighted avg       0.80      0.80      0.80     48092

0.8017133826831906
Wall time: 14 s


## BERT Inference

In [34]:
light_model = LightPipeline(pipeline2)
#Using a review that was stated Helpful on Amazon
text="I learned 10 dances on the first disc. Easy dances to follow. Instructor was very good. The best part for me was that at the end, they combined all the dances into a 30 minute workout. Haven't gone to the 2nd disc yet, but I will. Great value for the money."
light_model.annotate(text)['prediction'][0]

'1'

In [35]:
#Using a review that has not beed stated Helpful on Amazon YET
text="Was good"
light_model.annotate(text)['prediction'][0]

'0'

In [38]:
#Using a review that has not beed stated Helpful on Amazon YET
text="No complaints, was good, could be better, not gonna buy again"
light_model.annotate(text)['prediction'][0]

'0'