# Review NLP Helpfulness Prediction PretrainedModels (Digital Video)

## Creating Spark Session & Importing All Necessary Libraries

In [1]:
import findspark
findspark.init()

In [2]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql import functions as F

import sparknlp

# Start SparkSession with Spark NLP
# start() functions has 5 parameters: gpu, spark23, spark24, spark32, and memory
# sparknlp.start(gpu=True) will start the session with GPU support
# sparknlp.start(spark23=True) is when you have Apache Spark 2.3.x installed
# sparknlp.start(spark24=True) is when you have Apache Spark 2.4.x installed
# sparknlp.start(spark32=True) is when you have Apache Spark 3.2.x installed
# sparknlp.start(memory="16G") to change the default driver memory in SparkSession
spark = sparknlp.start(gpu = True, spark32= True)

### Reading in Data

In [3]:
df = spark.read \
    .option("quote", "\"")  \
    .option("escape", "\"") \
    .option("ignoreLeadingWhiteSpace",True) \
    .csv(r"C:\Users\kenne\OneDrive\Desktop\UChicago\Python\Final Project Big Data Amazon Review\Amazon Review Kaggle Data\amazon_reviews_us_Digital_Video_Download_v1_00.tsv",inferSchema=True,header=True, sep='\t' )

## Code Cleaning

In [4]:
df.show(1, vertical = True, truncate = False)

-RECORD 0-----------------------------------------------------------------------------------------------------
 marketplace       | US                                                                                       
 customer_id       | 12190288                                                                                 
 review_id         | R3FU16928EP5TC                                                                           
 product_id        | B00AYB1482                                                                               
 product_parent    | 668895143                                                                                
 product_title     | Enlightened: Season 1                                                                    
 product_category  | Digital_Video_Download                                                                   
 star_rating       | 5                                                                                        
 

In [5]:
df = df.select('product_id', 'star_rating', 'product_category', 'review_headline', 'review_body', 'helpful_votes', 'total_votes')

In [6]:
df.show(1, vertical = True, truncate = False)

-RECORD 0----------------------------------------------------------------------------------------------------
 product_id       | B00AYB1482                                                                               
 star_rating      | 5                                                                                        
 product_category | Digital_Video_Download                                                                   
 review_headline  | I loved it and I wish there was a season 3                                               
 review_body      | I loved it and I wish there was a season 3... I watched season 2 and loved that as well! 
 helpful_votes    | 0                                                                                        
 total_votes      | 0                                                                                        
only showing top 1 row



In [7]:
df.dropna().count()

4056163

In [8]:
from pyspark.sql.functions import col,isnan, when, count
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]
   ).show()

+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|product_id|star_rating|product_category|review_headline|review_body|helpful_votes|total_votes|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|         0|          0|               0|            369|        616|            0|          0|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+



In [9]:
df2 = df.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns])
df2.count()
df2.show(5, vertical = True)

-RECORD 0----------------
 product_id       | 19   
 star_rating      | 0    
 product_category | 0    
 review_headline  | 601  
 review_body      | 4460 
 helpful_votes    | 0    
 total_votes      | 0    



In [10]:
df = df.filter(col('total_votes') > 10)
df.count()

62710

In [11]:
df = df.fillna("", "review_body")
df = df.fillna("", "review_headline")

In [12]:
df = df.withColumn('review_text', F.concat('review_headline', F.lit(" "), 'review_body'))
df.show(1, vertical = True, truncate = False)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B01489L5LQ                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [13]:
df.filter((col("review_text").isNull()) | ( col("review_text") == "")).show(1, vertical = True, truncate = False)

(0 rows)



In [14]:
df.select([count(when((col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns]).show(5, vertical = True)

-RECORD 0---------------
 product_id       | 0   
 star_rating      | 0   
 product_category | 0   
 review_headline  | 1   
 review_body      | 23  
 helpful_votes    | 0   
 total_votes      | 0   
 review_text      | 0   



In [15]:
df = df.withColumn('helpful_ratio', F.col('helpful_votes') / F.col('total_votes'))
df.show(1, vertical = True, truncate = False)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B01489L5LQ                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [16]:
df.filter(col('helpful_ratio') < 0).count()

0

In [17]:
df = df.withColumn('helpful', when(col("helpful_ratio") < 0.5, 0).otherwise(1))

In [18]:
df.show(5, vertical = True)

-RECORD 0--------------------------------
 product_id       | B01489L5LQ           
 star_rating      | 4                    
 product_category | Digital_Video_Dow... 
 review_headline  | Charming movie       
 review_body      | This movie isn't ... 
 helpful_votes    | 17                   
 total_votes      | 18                   
 review_text      | Charming movie Th... 
 helpful_ratio    | 0.9444444444444444   
 helpful          | 1                    
-RECORD 1--------------------------------
 product_id       | B00SZT6I3G           
 star_rating      | 1                    
 product_category | Digital_Video_Dow... 
 review_headline  | If it can't be br... 
 review_body      | If it can't be br... 
 helpful_votes    | 11                   
 total_votes      | 18                   
 review_text      | If it can't be br... 
 helpful_ratio    | 0.6111111111111112   
 helpful          | 1                    
-RECORD 2--------------------------------
 product_id       | B00VO8D13K    

## USE Pipeline

In [19]:
#USE (Universal Sentence Encoder) Sentence Embedding
document = DocumentAssembler()\
    .setInputCol("review_text")\
    .setOutputCol("document")
    
embeddingsSentence = UniversalSentenceEncoder.load('tfhub_use_en_2.4.0_2.4_1587136330099') \
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

classifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("prediction")\
  .setLabelColumn("helpful")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

use_clf_pipeline = Pipeline(
    stages = [
        document,
        embeddingsSentence,
        classifierdl
    ])

##### Train & Test Split

In [21]:
train, test = df.randomSplit([0.8, 0.2], seed=12345)

In [22]:
%%time
pipeline2 = use_clf_pipeline.fit(train)

Wall time: 33.5 s


In [23]:
%%time
final = pipeline2.transform(test)

Wall time: 47.5 ms


In [24]:
metrics = final.select('helpful','product_id',"prediction.result")

## USE Results

In [25]:
%%time
from sklearn.metrics import classification_report, accuracy_score
metrics_final = metrics.toPandas()
metrics_final['result'] = metrics_final['result'].apply(lambda x: x[0])
metrics_final['result'] = metrics_final['result'].astype('int')

print(classification_report(metrics_final.helpful, metrics_final.result))
print(accuracy_score(metrics_final.helpful, metrics_final.result))

              precision    recall  f1-score   support

           0       0.78      0.79      0.78      6812
           1       0.75      0.75      0.75      5924

    accuracy                           0.77     12736
   macro avg       0.77      0.77      0.77     12736
weighted avg       0.77      0.77      0.77     12736

0.7675094221105527
Wall time: 4.58 s


## USE Inference

In [27]:
light_model = LightPipeline(pipeline2)
#Using a review that was stated Helpful on Amazon
text="The show is smart and awkwardly, yet deliciously, inappropriate. Miss, miss, miss Steve Carell but after a weak season 8, the Office has rebounded with season 9 and will end its run with high marks. Season 8 had its moments but the show seemed rudderless without Michael – Robert California and Nellie were just weird and Andy is no Michael. When it seemed all hope was lost, the show shifts to a more ensemble – no superstar- approach in season 9 which, with Michael gone, really works. With such wonderful characters in Dwight, Jim, Meridith, Stanley, Angela, Kevin, Oscar, Darrell and Phyllis it’s nice to have all the story lines going at once – Nellie fits in much better this year too. Andy and Erin are fine in the mix but are much better in doses than in being the main focus. A little of Andy goes a long way. That shift was a game changer in a good way."
light_model.annotate(text)['prediction'][0]

'1'

In [38]:
#Using a review that has not beed stated Helpful on Amazon YET
text="Liked it"
light_model.annotate(text)['prediction'][0]

'0'

In [42]:
#Using a review that has not beed stated Helpful on Amazon YET
text="No complaints, could keep watching"
light_model.annotate(text)['prediction'][0]

'0'

In [52]:
#Using a review that has not beed stated Helpful on Amazon YET
text="I tossed it in the trash. It smelled so bad."
light_model.annotate(text)['prediction'][0]

'0'

## BERT_uncased Embedding Pipeline

In [43]:
#BERT Sentence Embedding
document = DocumentAssembler()\
    .setInputCol("review_text")\
    .setOutputCol("document")
    
bert_cmlm = BertSentenceEmbeddings.load('sent_bert_base_uncased_en_2.6.0_2.4_1598346203624')\
.setInputCols(["document"])\
.setOutputCol("sentence_embeddings")

classifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("prediction")\
  .setLabelColumn("helpful")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

use_clf_pipeline = Pipeline(
    stages = [
        document,
        embeddingsSentence,
        classifierdl
    ])

In [44]:
%%time
pipeline2 = use_clf_pipeline.fit(train)

Wall time: 33.3 s


In [45]:
%%time
final = pipeline2.transform(test)

Wall time: 36.5 ms


In [46]:
metrics = final.select('helpful','product_id',"prediction.result")

## BERT_uncased Embedding Results

In [47]:
%%time
from sklearn.metrics import classification_report, accuracy_score
metrics_final = metrics.toPandas()
metrics_final['result'] = metrics_final['result'].apply(lambda x: x[0])
metrics_final['result'] = metrics_final['result'].astype('int')

print(classification_report(metrics_final.helpful, metrics_final.result))
print(accuracy_score(metrics_final.helpful, metrics_final.result))

              precision    recall  f1-score   support

           0       0.75      0.85      0.80      6812
           1       0.79      0.68      0.73      5924

    accuracy                           0.77     12736
   macro avg       0.77      0.76      0.76     12736
weighted avg       0.77      0.77      0.77     12736

0.7681375628140703
Wall time: 3.92 s


## BERT_uncased Inference

In [48]:
light_model = LightPipeline(pipeline2)
#Using a review that was stated Helpful on Amazon
text="The show is smart and awkwardly, yet deliciously, inappropriate. Miss, miss, miss Steve Carell but after a weak season 8, the Office has rebounded with season 9 and will end its run with high marks. Season 8 had its moments but the show seemed rudderless without Michael – Robert California and Nellie were just weird and Andy is no Michael. When it seemed all hope was lost, the show shifts to a more ensemble – no superstar- approach in season 9 which, with Michael gone, really works. With such wonderful characters in Dwight, Jim, Meridith, Stanley, Angela, Kevin, Oscar, Darrell and Phyllis it’s nice to have all the story lines going at once – Nellie fits in much better this year too. Andy and Erin are fine in the mix but are much better in doses than in being the main focus. A little of Andy goes a long way. That shift was a game changer in a good way."
light_model.annotate(text)['prediction'][0]

'1'

In [50]:
#Using a review that has not beed stated Helpful on Amazon YET
text="Liked it"
light_model.annotate(text)['prediction'][0]

'0'

In [51]:
#Using a review that has not beed stated Helpful on Amazon YET
text="No complaints, could keep watching"
light_model.annotate(text)['prediction'][0]

'0'

In [53]:
#Using a review that has not beed stated Helpful on Amazon YET
text="I tossed it in the trash. It smelled so bad."
light_model.annotate(text)['prediction'][0]

'0'