# Review NLP Helpfulness Prediction PretrainedModels (Music)

## Creating Spark Session & Importing All Necessary Libraries

In [1]:
import findspark
findspark.init()

In [2]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql import functions as F

import sparknlp

# Start SparkSession with Spark NLP
# start() functions has 5 parameters: gpu, spark23, spark24, spark32, and memory
# sparknlp.start(gpu=True) will start the session with GPU support
# sparknlp.start(spark23=True) is when you have Apache Spark 2.3.x installed
# sparknlp.start(spark24=True) is when you have Apache Spark 2.4.x installed
# sparknlp.start(spark32=True) is when you have Apache Spark 3.2.x installed
# sparknlp.start(memory="16G") to change the default driver memory in SparkSession
spark = sparknlp.start(gpu = True, spark32= True)

### Reading in Data

In [3]:
df = spark.read \
    .option("quote", "\"")  \
    .option("escape", "\"") \
    .option("ignoreLeadingWhiteSpace",True) \
    .csv(r"C:\Users\kenne\OneDrive\Desktop\UChicago\Python\Final Project Big Data Amazon Review\Amazon Review Kaggle Data\amazon_reviews_us_Music_v1_00.tsv",inferSchema=True,header=True, sep='\t' )

## Code Cleaning

In [4]:
df.show(1, vertical = True, truncate = False)

-RECORD 0--------------------------------------------------------------------
 marketplace       | US                                                      
 customer_id       | 10140119                                                
 review_id         | R3LI5TRP3YIDQL                                          
 product_id        | B00TXH4OLC                                              
 product_parent    | 384427924                                               
 product_title     | Whatever's for Us: Remastered                           
 product_category  | Music                                                   
 star_rating       | 5                                                       
 helpful_votes     | 0                                                       
 total_votes       | 0                                                       
 vine              | N                                                       
 verified_purchase | Y                                          

In [5]:
df = df.select('product_id', 'star_rating', 'product_category', 'review_headline', 'review_body', 'helpful_votes', 'total_votes')

In [6]:
df.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------
 product_id       | B00TXH4OLC                                              
 star_rating      | 5                                                       
 product_category | Music                                                   
 review_headline  | Five Stars                                              
 review_body      | Love this CD along with other CDs by the same musician. 
 helpful_votes    | 0                                                       
 total_votes      | 0                                                       
only showing top 1 row



In [7]:
df.dropna().count()

4751006

In [8]:
from pyspark.sql.functions import col,isnan, when, count
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]
   ).show()

+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|product_id|star_rating|product_category|review_headline|review_body|helpful_votes|total_votes|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|         0|         37|              35|            219|        389|           37|         37|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+



In [9]:
df2 = df.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns])
df2.count()
df2.show(5, vertical = True)

-RECORD 0-----------------
 product_id       | 198   
 star_rating      | 37    
 product_category | 35    
 review_headline  | 950   
 review_body      | 20498 
 helpful_votes    | 37    
 total_votes      | 37    



In [10]:
df = df.filter(col('total_votes') > 10)
df.count()

406095

In [11]:
df = df.fillna("", "review_body")
df = df.fillna("", "review_headline")

In [12]:
df = df.withColumn('review_text', F.concat('review_headline', F.lit(" "), 'review_body'))
df.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
df.filter((col("review_text").isNull()) | ( col("review_text") == "")).show(1, vertical = True, truncate = False)

(0 rows)



In [14]:
df.select([count(when((col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns]).show(5, vertical = True)

-RECORD 0---------------
 product_id       | 0   
 star_rating      | 0   
 product_category | 0   
 review_headline  | 9   
 review_body      | 7   
 helpful_votes    | 0   
 total_votes      | 0   
 review_text      | 0   



In [15]:
df = df.withColumn('helpful_ratio', F.col('helpful_votes') / F.col('total_votes'))
df.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [16]:
df.filter(col('helpful_ratio') < 0).count()

0

In [17]:
df = df.withColumn('helpful', when(col("helpful_ratio") < 0.5, 0).otherwise(1))

In [18]:
df.show(5, vertical = True)

-RECORD 0--------------------------------
 product_id       | B010FP0WRU           
 star_rating      | 3                    
 product_category | Music                
 review_headline  | Up it by a half-s... 
 review_body      | A solid collectio... 
 helpful_votes    | 25                   
 total_votes      | 26                   
 review_text      | Up it by a half-s... 
 helpful_ratio    | 0.9615384615384616   
 helpful          | 1                    
-RECORD 1--------------------------------
 product_id       | B00ZGJ85Y8           
 star_rating      | 5                    
 product_category | Music                
 review_headline  | Awesome Soundtrack   
 review_body      | Yes, I love every... 
 helpful_votes    | 14                   
 total_votes      | 15                   
 review_text      | Awesome Soundtrac... 
 helpful_ratio    | 0.9333333333333333   
 helpful          | 1                    
-RECORD 2--------------------------------
 product_id       | B00ZYBH6M0    

## USE Pipeline

In [19]:
#USE (Universal Sentence Encoder) Sentence Embedding
document = DocumentAssembler()\
    .setInputCol("review_text")\
    .setOutputCol("document")
    
embeddingsSentence = UniversalSentenceEncoder.load('tfhub_use_en_2.4.0_2.4_1587136330099') \
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

classifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("prediction")\
  .setLabelColumn("helpful")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

use_clf_pipeline = Pipeline(
    stages = [
        document,
        embeddingsSentence,
        classifierdl
    ])

In [20]:
train, test = df.randomSplit([0.8, 0.2], seed=12345)

In [21]:
%%time
pipeline2 = use_clf_pipeline.fit(train)

Wall time: 3min 52s


In [22]:
%%time
final = pipeline2.transform(test)

Wall time: 93.8 ms


In [23]:
metrics = final.select('helpful','product_id',"prediction.result")

## USE Results

In [24]:
%%time
from sklearn.metrics import classification_report, accuracy_score
metrics_final = metrics.toPandas()
metrics_final['result'] = metrics_final['result'].apply(lambda x: x[0])
metrics_final['result'] = metrics_final['result'].astype('int')

print(classification_report(metrics_final.helpful, metrics_final.result))
print(accuracy_score(metrics_final.helpful, metrics_final.result))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     22655
           1       0.72      1.00      0.84     58711

    accuracy                           0.72     81366
   macro avg       0.36      0.50      0.42     81366
weighted avg       0.52      0.72      0.60     81366

0.7215667477816287
Wall time: 20.4 s


  _warn_prf(average, modifier, msg_start, len(result))


## USE Inference

In [25]:
light_model = LightPipeline(pipeline2)
#Using a review that was stated Helpful on Amazon
text="The images are beautiful, though I wish there was a little bit of gentle camera movement. A slooow zoom in, out, or pan would be nice. The music was the best for meditation or relaxation out of all of the videos I tried here, so far."
light_model.annotate(text)['prediction'][0]

'1'

In [26]:
#Using a review that has not beed stated Helpful on Amazon YET
text="Was good"
light_model.annotate(text)['prediction'][0]

'1'

In [27]:
#Using a review that has not beed stated Helpful on Amazon YET
text="No complaints, was good, could be better, not gonna buy again"
light_model.annotate(text)['prediction'][0]

'1'

## BERT_uncased Embedding Pipeline

In [28]:
#BERT Sentence Embedding
document = DocumentAssembler()\
    .setInputCol("review_text")\
    .setOutputCol("document")
    
bert_cmlm = BertSentenceEmbeddings.load('sent_bert_base_uncased_en_2.6.0_2.4_1598346203624')\
.setInputCols(["document"])\
.setOutputCol("sentence_embeddings")

classifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("prediction")\
  .setLabelColumn("helpful")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

use_clf_pipeline = Pipeline(
    stages = [
        document,
        embeddingsSentence,
        classifierdl
    ])

In [29]:
%%time
pipeline2 = use_clf_pipeline.fit(train)

Wall time: 3min 52s


In [30]:
%%time
final = pipeline2.transform(test)

Wall time: 64.1 ms


In [31]:
metrics = final.select('helpful','product_id',"prediction.result")

## BERT_uncased Embedding Results

In [32]:
%%time
from sklearn.metrics import classification_report, accuracy_score
metrics_final = metrics.toPandas()
metrics_final['result'] = metrics_final['result'].apply(lambda x: x[0])
metrics_final['result'] = metrics_final['result'].astype('int')

print(classification_report(metrics_final.helpful, metrics_final.result))
print(accuracy_score(metrics_final.helpful, metrics_final.result))

              precision    recall  f1-score   support

           0       0.73      0.69      0.71     22655
           1       0.88      0.90      0.89     58711

    accuracy                           0.84     81366
   macro avg       0.81      0.80      0.80     81366
weighted avg       0.84      0.84      0.84     81366

0.8442592729149768
Wall time: 18.6 s


## BERT Inference

In [33]:
light_model = LightPipeline(pipeline2)
#Using a review that was stated Helpful on Amazon
text="The images are beautiful, though I wish there was a little bit of gentle camera movement. A slooow zoom in, out, or pan would be nice. The music was the best for meditation or relaxation out of all of the videos I tried here, so far."
light_model.annotate(text)['prediction'][0]

'1'

In [34]:
#Using a review that has not beed stated Helpful on Amazon YET
text="Was good"
light_model.annotate(text)['prediction'][0]

'0'

In [35]:
#Using a review that has not beed stated Helpful on Amazon YET
text="No complaints, was good, could be better, not gonna buy again"
light_model.annotate(text)['prediction'][0]

'0'