# Review NLP Helpfulness Prediction T5 (Digital Software)

## Creating Spark Session & Importing All Necessary Libraries

In [1]:
# Initialise findspark before any pyspark / sparknlp import below
# (presumably so the local Spark installation is discoverable - see the findspark docs).
import findspark
findspark.init()

In [2]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql import functions as F

import sparknlp

# Start a SparkSession with Spark NLP.
# sparknlp.start() accepts five parameters: gpu, spark23, spark24, spark32 and memory.
#   gpu=True      -> start the session with GPU support
#   spark23=True  -> use when Apache Spark 2.3.x is installed
#   spark24=True  -> use when Apache Spark 2.4.x is installed
#   spark32=True  -> use when Apache Spark 3.2.x is installed
#   memory="16G"  -> change the default driver memory of the SparkSession
spark = sparknlp.start(gpu=True, spark32=True)

In [3]:
from pyspark.sql.functions import col, concat_ws

### Reading in Data

In [4]:
# Location of the Amazon "Digital Software" reviews TSV.
# The AMAZON_REVIEWS_TSV environment variable overrides the original author's
# machine-specific default, so the notebook can run elsewhere without editing this cell.
import os

reviews_tsv_path = os.environ.get(
    "AMAZON_REVIEWS_TSV",
    r"C:\Users\kenne\OneDrive\Desktop\UChicago\Python\Final Project Big Data Amazon Review\Amazon Review Kaggle Data\amazon_reviews_us_Digital_Software_v1_00.tsv",
)

# Tab-separated file with a header row; a double quote serves both as the quote
# character and as the escape character, and leading whitespace is ignored.
df = (
    spark.read
    .option("quote", "\"")
    .option("escape", "\"")
    .option("ignoreLeadingWhiteSpace", True)
    .csv(reviews_tsv_path, inferSchema=True, header=True, sep='\t')
)

## Data Cleaning

In [5]:
# Peek at the first raw record, one field per line, without truncating values
df.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------
 marketplace       | US                       
 customer_id       | 17747349                 
 review_id         | R2EI7QLPK4LF7U           
 product_id        | B00U7LCE6A               
 product_parent    | 106182406                
 product_title     | CCleaner Free [Download] 
 product_category  | Digital_Software         
 star_rating       | 4                        
 helpful_votes     | 0                        
 total_votes       | 0                        
 vine              | N                        
 verified_purchase | Y                        
 review_headline   | Four Stars               
 review_body       | So far so good           
 review_date       | 2015-08-31               
only showing top 1 row



In [6]:
# Keep only the product, rating, review-text and vote columns
df = df.select(
    'product_id',
    'star_rating',
    'product_category',
    'review_headline',
    'review_body',
    'helpful_votes',
    'total_votes',
)

In [7]:
# Confirm the reduced column set on the first record
df.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------
 product_id       | B00U7LCE6A       
 star_rating      | 4                
 product_category | Digital_Software 
 review_headline  | Four Stars       
 review_body      | So far so good   
 helpful_votes    | 0                
 total_votes      | 0                
only showing top 1 row



In [8]:
# Number of rows that have no null in any column
df.na.drop().count()

102078

In [9]:
from pyspark.sql.functions import col,isnan, when, count

# Per-column count of values that are null or NaN
df.select(
    [count(when(col(c).isNull() | isnan(c), c)).alias(c) for c in df.columns]
).show()

+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|product_id|star_rating|product_category|review_headline|review_body|helpful_votes|total_votes|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|         0|          0|               0|              0|          6|            0|          0|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+



In [10]:
# Per-column count of values that look missing: the value contains the text
# 'None' or 'NULL', is an empty string, is null, or is NaN.
# This aggregate always has exactly one row, so the former `df2.count()` call only
# launched an extra Spark job whose result was discarded; it has been removed.
df2 = df.select([
    count(
        when(
            col(c).contains('None')
            | col(c).contains('NULL')
            | (col(c) == '')
            | col(c).isNull()
            | isnan(c),
            c,
        )
    ).alias(c)
    for c in df.columns
])
df2.show(5, vertical = True)

-RECORD 0---------------
 product_id       | 0   
 star_rating      | 0   
 product_category | 0   
 review_headline  | 15  
 review_body      | 196 
 helpful_votes    | 0   
 total_votes      | 0   



In [11]:
# Keep only reviews that received more than 10 votes in total, then count them
df = df.where(col('total_votes') > 10)
df.count()

5214

In [12]:
# Replace null review bodies and headlines with empty strings
df = df.fillna("", subset=["review_body", "review_headline"])

In [13]:
# review_text = headline + single space + body
df = df.withColumn(
    'review_text',
    F.concat(F.col('review_headline'), F.lit(" "), F.col('review_body')),
)
df.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Sanity check: show any row whose combined review text is null or empty
df.where(
    col("review_text").isNull() | (col("review_text") == "")
).show(1, vertical = True, truncate = False)

(0 rows)



In [15]:
# Per-column count of empty-string, null or NaN values after the clean-up
df.select([
    count(when((col(c) == '') | col(c).isNull() | isnan(c), c)).alias(c)
    for c in df.columns
]).show(5, vertical = True)

-RECORD 0---------------
 product_id       | 0   
 star_rating      | 0   
 product_category | 0   
 review_headline  | 0   
 review_body      | 1   
 helpful_votes    | 0   
 total_votes      | 0   
 review_text      | 0   



In [16]:
# Share of votes on each review that were "helpful" votes
df = df.withColumn('helpful_ratio', df['helpful_votes'] / df['total_votes'])
df.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [17]:
# Sanity check: number of rows with a negative helpfulness ratio
df.where("helpful_ratio < 0").count()

0

In [18]:
# Binary label: 0 when the helpfulness ratio is below 0.5, otherwise 1
df = df.withColumn(
    'helpful',
    when(col("helpful_ratio") < 0.5, 0).otherwise(1),
)

In [19]:
# Inspect the first five labelled records, one field per line
df.show(n=5, vertical=True)

-RECORD 0--------------------------------
 product_id       | B00KNDCCE6           
 star_rating      | 2                    
 product_category | Digital_Software     
 review_headline  | Buy this directly... 
 review_body      | This is a great p... 
 helpful_votes    | 20                   
 total_votes      | 20                   
 review_text      | Buy this directly... 
 helpful_ratio    | 1.0                  
 helpful          | 1                    
-RECORD 1--------------------------------
 product_id       | B004KPKSRQ           
 star_rating      | 1                    
 product_category | Digital_Software     
 review_headline  | One Star             
 review_body      | Did not work for me  
 helpful_votes    | 4                    
 total_votes      | 14                   
 review_text      | One Star Did not ... 
 helpful_ratio    | 0.2857142857142857   
 helpful          | 0                    
-RECORD 2--------------------------------
 product_id       | B00CS75YKE    

In [20]:
# Subset of reviews labelled helpful (helpful == 1)
df_helpful = df.where(col('helpful') == 1)

## T5 Pipeline

In [21]:
%%time
#T5 Transformer
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document") 

#Trying Document, but can also take in sentence
t5 = T5Transformer.load('t5_base_en_2.7.1_2.4_1610133506835')\
    .setInputCols('document')\
    .setOutputCol("T5")\
    .setMaxOutputLength(400)

nlp_pipeline = Pipeline(stages=[
    documentAssembler, 
    t5
])

Wall time: 30 s


##### Setting T5 Task to Answer Questions

In [22]:
# Configure the T5 annotator with the 'question' task for the prompts built in the next cells
t5.setTask('question')

T5TRANSFORMER_98cb3158fd7c

#### Open Book Question

What makes this review helpful?

In [23]:
# Build the T5 prompt: fixed question followed by the review text as context
df_t5_1 = df_helpful.withColumn(
    'text',
    F.concat_ws(
        ' ',
        F.lit("question: What makes this review helpful? context:"),
        col('review_text'),
    ),
)

In [24]:
# Only the prompt column is needed as pipeline input
df_t5_1 = df_t5_1.select(col('text'))

In [25]:
%%time
#Predict on text data with T5
model = nlp_pipeline.fit(df_t5_1)
annotated_df = model.transform(df_t5_1)
annotated_df.select(['text','t5.result']).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
# Keep the prompt and the T5 result field only
annotated_df_select = annotated_df.select('text', 't5.result')

In [27]:
# Join the elements of the "result" column into one space-separated string
annotated_df_select = annotated_df_select.withColumn(
    "result",
    F.concat_ws(" ", F.col("result")),
)

In [28]:
# Preview the first prompt / answer pair
annotated_df_select.show(n=1)

+--------------------+--------------------+
|                text|              result|
+--------------------+--------------------+
|question: What ma...|serious technical...|
+--------------------+--------------------+
only showing top 1 row



Is this review helpful? (This did not work: T5 does not produce a binary yes/no answer to this question.)

In [29]:
#df_t5_2 = df.withColumn('text', F.concat_ws(' ',F.lit("question: Is this review helpful? context:"), col('review_text')))

In [30]:
#df_t5_2 = df_t5_2.select('text','helpful')

In [31]:
#%%time
#Predict on text data with T5
#model = nlp_pipeline.fit(df_t5_2)
#annotated_df = model.transform(df_t5_2)
#annotated_df.select(['helpful','text','t5.result']).show(truncate=False)

## Wordcloud from T5 Result

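Pass the T5 results through HashingTF-IDF to better rank word importance related to "helpfulness". (Note: the cells below currently only tokenize the results and remove stop words; no TF-IDF stage is applied yet.)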

In [32]:
from pyspark.sql.functions import lower, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql import DataFrame
from pyspark.ml.feature import HashingTF, IDF, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier

import nltk
import matplotlib.pyplot as plt
%matplotlib inline

#nltk.download('wordnet')

In [33]:
# Download NLTK's stop-word corpus and keep the English stop-word list
# for the Spark NLP StopWordsCleaner configured below.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
eng_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kenne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
#Document & Tokenize
document_assembler = DocumentAssembler().setInputCol("result").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("review_words")
 
#Cleaning Tokens
remover           = StopWordsCleaner().setInputCols("review_words").setOutputCol("review_words_stop").setCaseSensitive(False).setStopWords(eng_stopwords)
finisher          = Finisher().setInputCols(["review_words_stop"]).setOutputCols(["token_features"]).setOutputAsArray(True).setCleanAnnotations(False)

pipeline = Pipeline(stages=[document_assembler,tokenizer,remover,finisher])

In [35]:
%%time
wordcloud_df = pipeline.fit(annotated_df_select).transform(annotated_df_select)

Wall time: 137 ms


In [None]:
#! pip install wordcloud

In [36]:
from pyspark.sql.functions import explode

# One row per token; the exploded column gets explode()'s default name, "col"
dfwc = wordcloud_df.select(explode('token_features'))

In [None]:
%%time
dfwc = dfwc.groupBy("col").count()
dfwc.show()

In [None]:
%%time
# Collect the per-token counts to the driver as a pandas DataFrame
pdwc = dfwc.toPandas()

In [None]:
# `pdwc` comes from explode + groupBy("col").count(), so it holds one row per
# distinct token in the columns "col" and "count"; it has no "token_features" column.
# The original line also had an unbalanced bracket and could not be parsed.
# Bag_Of_Words expects an iterable of token lists, so build one list per token
# and repeat the token `count` times to preserve its frequency.
wordlist = [[token] * int(n) for token, n in zip(pdwc['col'], pdwc['count'])]

In [None]:
from nltk import FreqDist
def Bag_Of_Words(wordlist):
    """Return a FreqDist of lower-cased words.

    wordlist is an iterable of word sequences (for example one token list per
    review); every word of every sequence is lower-cased and counted.
    """
    lowered = (word.lower() for tokens in wordlist for word in tokens)
    return FreqDist(lowered)

In [None]:
# Build the word-frequency distribution from the prepared word lists
all_words = Bag_Of_Words(wordlist)

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word frequencies of the T5 answers. The original referenced `ListWords`, which is
# never defined; the token lists live in `wordlist`.
all_words = Bag_Of_Words(wordlist)

# `amazon_mask` is not defined in any visible cell of this notebook. Use it when an
# earlier cell has loaded a mask image; otherwise fall back to WordCloud's default
# of no mask instead of failing with a NameError.
mask_image = globals().get('amazon_mask')

fig = plt.figure(figsize=(15, 10))

# Size each word by how often it occurs. Joining the distinct keys into one string
# made every word appear exactly once, so the cloud carried no frequency information.
wordcloud = WordCloud(
    background_color='white',
    max_font_size=40,
    mask=mask_image,
    contour_width=0.5,
    contour_color='orange',
).generate_from_frequencies(all_words)

# Display the generated image the matplotlib way
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()