# Review NLP Helpfulness Prediction NER (Digital Video)

## Creating Spark Session & Importing All Necessary Libraries

In [2]:
# NOTE(review): presumably this must run before the pyspark / sparknlp imports
# in the next cell so that the local Spark installation is importable — confirm
# for your environment.
import findspark
findspark.init()

In [3]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql import functions as F

import sparknlp

# Start a SparkSession with Spark NLP.
# The start() function has 5 parameters: gpu, spark23, spark24, spark32, and memory.
#   sparknlp.start(gpu=True)      -> start the session with GPU support
#   sparknlp.start(spark23=True)  -> use when Apache Spark 2.3.x is installed
#   sparknlp.start(spark24=True)  -> use when Apache Spark 2.4.x is installed
#   sparknlp.start(spark32=True)  -> use when Apache Spark 3.2.x is installed
#   sparknlp.start(memory="16G")  -> change the default driver memory in the SparkSession
spark = sparknlp.start(gpu=True, spark32=True)

### Reading in Data

In [4]:
import os

# Location of the Amazon "Digital Video Download" reviews TSV.
# Set the AMAZON_REVIEWS_TSV environment variable to run this notebook on
# another machine; the original author's local path is kept as the fallback so
# existing runs behave exactly as before.
REVIEWS_TSV_PATH = os.environ.get(
    "AMAZON_REVIEWS_TSV",
    r"C:\Users\kenne\OneDrive\Desktop\UChicago\Python\Final Project Big Data Amazon Review\Amazon Review Kaggle Data\amazon_reviews_us_Digital_Video_Download_v1_00.tsv",
)

# Read the tab-separated file with a header row and inferred column types.
# Both the quote and escape characters are set to a double quote.
df = (
    spark.read
    .option("quote", "\"")
    .option("escape", "\"")
    .option("ignoreLeadingWhiteSpace", True)
    .csv(REVIEWS_TSV_PATH, inferSchema=True, header=True, sep='\t')
)

## Data Cleaning

In [5]:
# Peek at one raw record, one field per line, without truncating long values.
df.show(n=1, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------
 marketplace       | US                                                                                       
 customer_id       | 12190288                                                                                 
 review_id         | R3FU16928EP5TC                                                                           
 product_id        | B00AYB1482                                                                               
 product_parent    | 668895143                                                                                
 product_title     | Enlightened: Season 1                                                                    
 product_category  | Digital_Video_Download                                                                   
 star_rating       | 5                                                                                        
 

In [6]:
# Keep only the columns used in the rest of the notebook.
columns_to_keep = [
    'product_id',
    'star_rating',
    'product_category',
    'review_headline',
    'review_body',
    'helpful_votes',
    'total_votes',
]
df = df.select(columns_to_keep)

In [7]:
# Confirm the column selection by displaying one untruncated record.
df.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------
 product_id       | B00AYB1482                                                                               
 star_rating      | 5                                                                                        
 product_category | Digital_Video_Download                                                                   
 review_headline  | I loved it and I wish there was a season 3                                               
 review_body      | I loved it and I wish there was a season 3... I watched season 2 and loved that as well! 
 helpful_votes    | 0                                                                                        
 total_votes      | 0                                                                                        
only showing top 1 row



In [8]:
# Number of rows that have no null in any of the selected columns.
df.na.drop().count()

4056163

In [9]:
from pyspark.sql.functions import col, isnan, when, count

# One aggregate per column: how many values are NaN or null.
null_or_nan_counts = [
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in df.columns
]
df.select(null_or_nan_counts).show()

+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|product_id|star_rating|product_category|review_headline|review_body|helpful_votes|total_votes|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|         0|          0|               0|            369|        616|            0|          0|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+



In [10]:
# Per-column count of values that are missing or look like placeholders:
# null, NaN, the empty string, or text containing 'None' / 'NULL'.
# `contains` is a substring match, so a legitimate value that merely includes
# 'None' or 'NULL' is counted as well.
df2 = df.select([
    count(
        when(
            col(c).contains('None')
            | col(c).contains('NULL')
            | (col(c) == '')
            | col(c).isNull()
            | isnan(c),
            c,
        )
    ).alias(c)
    for c in df.columns
])
# The former bare `df2.count()` was removed: its result was discarded, and it
# launched an extra Spark job just to count the single aggregate row.
df2.show(5, vertical=True)

-RECORD 0----------------
 product_id       | 19   
 star_rating      | 0    
 product_category | 0    
 review_headline  | 601  
 review_body      | 4460 
 helpful_votes    | 0    
 total_votes      | 0    



In [11]:
# Keep only reviews that received more than 10 votes in total, then report how
# many rows remain.
df = df.where(F.col('total_votes') > 10)
df.count()

62710

In [12]:
# Replace null headlines and bodies with the empty string.
df = df.fillna("", subset=["review_body", "review_headline"])

In [13]:
# Build the model input: headline and body joined by a single space.
review_text_expr = F.concat(
    F.col('review_headline'),
    F.lit(" "),
    F.col('review_body'),
)
df = df.withColumn('review_text', review_text_expr)
df.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B01489L5LQ                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [14]:
# Sanity check: show any row whose combined text is null or empty.
df.where("review_text IS NULL OR review_text = ''").show(n=1, truncate=False, vertical=True)

(0 rows)



In [15]:
# Per-column count of values that are empty strings, null, or NaN after the
# vote filter and text fill.
blank_counts = [
    count(
        when((col(c) == '') | col(c).isNull() | isnan(c), c)
    ).alias(c)
    for c in df.columns
]
df.select(blank_counts).show(n=5, vertical=True)

-RECORD 0---------------
 product_id       | 0   
 star_rating      | 0   
 product_category | 0   
 review_headline  | 1   
 review_body      | 23  
 helpful_votes    | 0   
 total_votes      | 0   
 review_text      | 0   



In [16]:
# Share of all votes that were "helpful" votes.
ratio_expr = df['helpful_votes'] / df['total_votes']
df = df.withColumn('helpful_ratio', ratio_expr)
df.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B01489L5LQ                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [17]:
# Sanity check: count rows with a negative helpfulness ratio.
df.where("helpful_ratio < 0").count()

0

In [18]:
# Binary target: 0 when the helpful ratio is below 0.5, otherwise 1.
helpful_label = F.when(F.col("helpful_ratio") < 0.5, 0).otherwise(1)
df = df.withColumn('helpful', helpful_label)

In [19]:
# Inspect the first five records with the new ratio and label columns.
df.show(n=5, vertical=True)

-RECORD 0--------------------------------
 product_id       | B01489L5LQ           
 star_rating      | 4                    
 product_category | Digital_Video_Dow... 
 review_headline  | Charming movie       
 review_body      | This movie isn't ... 
 helpful_votes    | 17                   
 total_votes      | 18                   
 review_text      | Charming movie Th... 
 helpful_ratio    | 0.9444444444444444   
 helpful          | 1                    
-RECORD 1--------------------------------
 product_id       | B00SZT6I3G           
 star_rating      | 1                    
 product_category | Digital_Video_Dow... 
 review_headline  | If it can't be br... 
 review_body      | If it can't be br... 
 helpful_votes    | 11                   
 total_votes      | 18                   
 review_text      | If it can't be br... 
 helpful_ratio    | 0.6111111111111112   
 helpful          | 1                    
-RECORD 2--------------------------------
 product_id       | B00VO8D13K    

## NER Pipeline

In [26]:
# NER pipeline: BERT word embeddings feeding the ner_dl_bert tagger.
# Stages: document -> sentence -> token -> embeddings -> ner -> ner_chunk.
documentAssembler = DocumentAssembler() \
    .setInputCol("review_text") \
    .setOutputCol("document")

sentence_detector = SentenceDetector() \
    .setInputCols(['document']) \
    .setOutputCol('sentence')

tokenizer = Tokenizer() \
    .setInputCols(['sentence']) \
    .setOutputCol('token')

# BERT embeddings.
# The embeddings read 'sentence' (not 'document') so that they use the same
# sentence segmentation as the tokenizer above and the NER model below.
embeddings = BertEmbeddings.load('bert_base_cased_en_2.6.0_2.4_1598340336670') \
    .setInputCols(['sentence', 'token']) \
    .setOutputCol('embeddings')

# NER BERT English Base
# ner_dl_bert is a Named Entity Recognition (NER) model: it annotates text to
# find features like the names of people, places, and organizations.
# It was trained on the CoNLL 2003 text corpus.
# The model does not read words directly; it reads word embeddings, which
# represent words as points such that semantically similar words are closer
# together.
# ner_dl_bert was trained with bert_base_cased word embeddings, so the same
# embeddings must be used in this pipeline.
ner_model = NerDLModel.load('ner_dl_bert_en_2.6.0_2.4_1599550979101') \
    .setInputCols(['sentence', 'token', 'embeddings']) \
    .setOutputCol('ner')

# Merge token-level NER tags into entity chunks.
ner_converter = NerConverter() \
    .setInputCols(['sentence', 'token', 'ner']) \
    .setOutputCol('ner_chunk')

nlp_pipeline = Pipeline(stages=[
    documentAssembler,
    sentence_detector,
    tokenizer,
    embeddings,
    ner_model,
    ner_converter
])

In [27]:
%%time
# Fit the NER pipeline on the cleaned reviews.
pipeline2 = nlp_pipeline.fit(dataset=df)

Wall time: 40 ms


In [28]:
%%time
# Apply the fitted pipeline to the reviews.
# NOTE(review): the recorded wall time is only 172 ms, so presumably no
# annotation work happens here and the cost is paid when an action runs on
# final_df — confirm.
final_df = pipeline2.transform(dataset=df)

Wall time: 172 ms


## Visualize NER Results

In [30]:
# One-time setup, kept commented out so a full re-run does not reinstall it.
# The captured output below shows spark-nlp-display 1.9.1 (with svgwrite 1.4)
# was installed; prefer `%pip` over `!pip` so the install targets the kernel's
# environment:
# %pip install spark-nlp-display==1.9.1

Collecting spark-nlp-display
  Downloading spark_nlp_display-1.9.1-py3-none-any.whl (95 kB)
Collecting svgwrite==1.4
  Downloading svgwrite-1.4-py3-none-any.whl (66 kB)
Installing collected packages: svgwrite, spark-nlp-display
Successfully installed spark-nlp-display-1.9.1 svgwrite-1.4


In [None]:
from sparknlp_display import NerVisualizer

# Visualize the entities of a single annotated review.
# first() fetches only one row; the previous collect()[0] pulled every
# annotated review onto the driver just to display the first one.
sample_row = final_df.first()

# NerVisualizer.display() names its first parameter `result`; the previous
# `final_df=` keyword is not a parameter of display() and raised a TypeError.
NerVisualizer().display(
    result=sample_row,
    label_col='ner_chunk',
    document_col='document'
)

## USE Results

In [46]:
%%time
from sklearn.metrics import classification_report, accuracy_score

# NOTE(review): `metrics` is not defined in any cell visible in this notebook;
# this cell only runs if an earlier (missing) cell created it — restore that
# cell before relying on Restart & Run All.
metrics_final = metrics.toPandas()

# Take the first element of each `result` entry and cast it to int so it can
# be compared with the `helpful` label.
metrics_final['result'] = (
    metrics_final['result']
    .apply(lambda labels: labels[0])
    .astype('int')
)

y_true = metrics_final['helpful']
y_pred = metrics_final['result']
print(classification_report(y_true, y_pred))
print(accuracy_score(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.88      0.83     33775
           1       0.84      0.73      0.78     28935

    accuracy                           0.81     62710
   macro avg       0.81      0.80      0.81     62710
weighted avg       0.81      0.81      0.81     62710

0.8102854409185138
Wall time: 10.5 s
