# Review NLP Helpfulness Prediction TFIDF (Digital Video)

## Creating Spark Session & Importing All Necessary Libraries

In [1]:
import findspark
# Runs before the pyspark imports in the next cell.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sparknlp.base import *
from sparknlp.annotator import *

import sparknlp

# Start a SparkSession with Spark NLP.
# The start() function has 5 parameters: gpu, spark23, spark24, spark32, and memory:
#   sparknlp.start(gpu=True)       starts the session with GPU support
#   sparknlp.start(spark23=True)   for an Apache Spark 2.3.x installation
#   sparknlp.start(spark24=True)   for an Apache Spark 2.4.x installation
#   sparknlp.start(spark32=True)   for an Apache Spark 3.2.x installation
#   sparknlp.start(memory="16G")   changes the default driver memory of the SparkSession
spark = sparknlp.start(gpu = True)

In [3]:
from pyspark.sql.functions import lower, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql import DataFrame
from pyspark.ml.feature import HashingTF, IDF, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier, NaiveBayes
import numpy as np

import nltk
import matplotlib.pyplot as plt
%matplotlib inline

#nltk.download('wordnet')

In [4]:
import nltk
# Fetch the NLTK stop-word corpus before reading from it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop-word list; passed to the StopWordsCleaner stage of the NLP pipeline.
eng_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kenne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Reading in Data

In [5]:
# Location of the Amazon Digital Video Download reviews TSV.
# Set the AMAZON_REVIEWS_TSV environment variable to run the notebook on
# another machine; the default is the original author's local path.
import os

DATA_PATH = os.environ.get(
    "AMAZON_REVIEWS_TSV",
    r"C:\Users\kenne\OneDrive\Desktop\UChicago\Python\Final Project Big Data Amazon Review\Amazon Review Kaggle Data\amazon_reviews_us_Digital_Video_Download_v1_00.tsv",
)

# Tab-separated file with a header row; column types are inferred by Spark.
df = (
    spark.read
    .option("quote", "\"")
    .option("escape", "\"")
    .option("ignoreLeadingWhiteSpace", True)
    .csv(DATA_PATH, inferSchema=True, header=True, sep='\t')
)

## Data Cleaning

In [6]:
# Peek at one raw record, one field per line and without truncation.
df.show(n=1, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------
 marketplace       | US                                                                                       
 customer_id       | 12190288                                                                                 
 review_id         | R3FU16928EP5TC                                                                           
 product_id        | B00AYB1482                                                                               
 product_parent    | 668895143                                                                                
 product_title     | Enlightened: Season 1                                                                    
 product_category  | Digital_Video_Download                                                                   
 star_rating       | 5                                                                                        
 

In [7]:
# Keep only the product, rating, category, text and vote columns.
selected_columns = [
    'product_id', 'star_rating', 'product_category',
    'review_headline', 'review_body',
    'helpful_votes', 'total_votes',
]
df = df.select(*selected_columns)

In [8]:
# Same one-record preview, now limited to the selected columns.
df.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------
 product_id       | B00AYB1482                                                                               
 star_rating      | 5                                                                                        
 product_category | Digital_Video_Download                                                                   
 review_headline  | I loved it and I wish there was a season 3                                               
 review_body      | I loved it and I wish there was a season 3... I watched season 2 and loved that as well! 
 helpful_votes    | 0                                                                                        
 total_votes      | 0                                                                                        
only showing top 1 row



In [9]:
# Count the rows that remain after dropping any row containing a null.
df.na.drop().count()

4056163

In [10]:
from pyspark.sql.functions import col,isnan, when, count

# One aggregate per column: how many cells are NaN or null.
null_or_nan_counts = [
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in df.columns
]
df.select(null_or_nan_counts).show()

+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|product_id|star_rating|product_category|review_headline|review_body|helpful_votes|total_votes|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|         0|          0|               0|            369|        616|            0|          0|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+



In [11]:
# Per-column tally of "missing-like" cells: values containing the text 'None'
# or 'NULL', empty strings, nulls and NaN.
missing_like_counts = [
    count(
        when(
            col(c).contains('None')
            | col(c).contains('NULL')
            | (col(c) == '')
            | col(c).isNull()
            | isnan(c),
            c,
        )
    ).alias(c)
    for c in df.columns
]
df2 = df.select(missing_like_counts)
# The former bare df2.count() call is removed: its result was discarded (it was
# not the cell's last expression) and it only triggered an extra Spark job.
df2.show(5, vertical = True)

-RECORD 0----------------
 product_id       | 19   
 star_rating      | 0    
 product_category | 0    
 review_headline  | 601  
 review_body      | 4460 
 helpful_votes    | 0    
 total_votes      | 0    



In [12]:
# Keep only reviews with more than 10 total votes, then report how many remain.
has_enough_votes = col('total_votes') > 10
df = df.where(has_enough_votes)
df.count()

62710

In [13]:
# Replace missing body/headline text with empty strings.
df = df.fillna("", subset=["review_body", "review_headline"])

In [14]:
# Combine headline and body into one text field, separated by a single space.
headline_and_body = F.concat(F.col('review_headline'), F.lit(" "), F.col('review_body'))
df = df.withColumn('review_text', headline_and_body)
df.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B01489L5LQ                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [15]:
# Sanity check: look for rows whose combined text is null or empty.
is_blank_text = col("review_text").isNull() | (col("review_text") == "")
df.filter(is_blank_text).show(n=1, truncate=False, vertical=True)

(0 rows)



In [16]:
# Per-column count of empty strings, nulls and NaN after the fill and concat steps.
empty_or_missing = [
    count(
        when((col(c) == '') | col(c).isNull() | isnan(c), c)
    ).alias(c)
    for c in df.columns
]
df.select(empty_or_missing).show(n=5, vertical=True)

-RECORD 0---------------
 product_id       | 0   
 star_rating      | 0   
 product_category | 0   
 review_headline  | 1   
 review_body      | 23  
 helpful_votes    | 0   
 total_votes      | 0   
 review_text      | 0   



In [17]:
# Share of a review's votes that were "helpful" votes.
vote_ratio = F.col('helpful_votes') / F.col('total_votes')
df = df.withColumn('helpful_ratio', vote_ratio)
df.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B01489L5LQ                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [18]:
# Sanity check: count rows with a negative helpful ratio.
df.where(col('helpful_ratio') < 0).count()

0

In [19]:
# Binary label: 0 when the helpful ratio is below 0.5, otherwise 1.
below_half = col("helpful_ratio") < 0.5
helpful_flag = when(below_half, 0).otherwise(1)
df = df.withColumn('helpful', helpful_flag)

In [20]:
# Preview the first labelled rows (long values truncated).
df.show(n=5, vertical=True)

-RECORD 0--------------------------------
 product_id       | B01489L5LQ           
 star_rating      | 4                    
 product_category | Digital_Video_Dow... 
 review_headline  | Charming movie       
 review_body      | This movie isn't ... 
 helpful_votes    | 17                   
 total_votes      | 18                   
 review_text      | Charming movie Th... 
 helpful_ratio    | 0.9444444444444444   
 helpful          | 1                    
-RECORD 1--------------------------------
 product_id       | B00SZT6I3G           
 star_rating      | 1                    
 product_category | Digital_Video_Dow... 
 review_headline  | If it can't be br... 
 review_body      | If it can't be br... 
 helpful_votes    | 11                   
 total_votes      | 18                   
 review_text      | If it can't be br... 
 helpful_ratio    | 0.6111111111111112   
 helpful          | 1                    
-RECORD 2--------------------------------
 product_id       | B00VO8D13K    

### Cleaning the Review_Text

In [21]:
# Text cleaning continues under a new name; this binds the same DataFrame, it does not copy it.
df_clean = df

In [22]:
# Print the column names, types and nullability before cleaning the text.
df_clean.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- product_category: string (nullable = true)
 |-- review_headline: string (nullable = false)
 |-- review_body: string (nullable = false)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- review_text: string (nullable = false)
 |-- helpful_ratio: double (nullable = true)
 |-- helpful: integer (nullable = false)



In [23]:
# List the distinct values of the label column.
df_clean.select("helpful").distinct().show()

+-------+
|helpful|
+-------+
|      1|
|      0|
+-------+



In [24]:
# Lower-case the combined review text into a separate working column.
lowered_text = F.lower(F.col('review_text'))
df_clean = df_clean.withColumn('review_text_l', lowered_text)

In [25]:
# Preview the lower-cased text. The column is selected by name rather than by
# position (columns[10]) so the cell stays correct if columns are added,
# removed or reordered upstream.
df_clean.select('review_text_l').show(5, vertical = True)

-RECORD 0-----------------------------
 review_text_l | charming movie th... 
-RECORD 1-----------------------------
 review_text_l | if it can't be br... 
-RECORD 2-----------------------------
 review_text_l | unnecessarily slo... 
-RECORD 3-----------------------------
 review_text_l | underwhelming rea... 
-RECORD 4-----------------------------
 review_text_l | one star bizarre ... 
only showing top 5 rows



In [26]:
# Strip punctuation from the lower-cased text in three passes:
#   1. delete apostrophes outright, so contractions stay one token (isn't -> isnt);
#   2. replace newlines and every other non-word character with a space;
#   3. collapse runs of whitespace into a single space.
# The patterns are raw strings so the regex backslashes reach Spark unchanged
# and Python does not warn about the invalid escape sequences '\w' and '\s'.
df_clean = (
    df_clean
    .withColumn('review_text_l', F.regexp_replace('review_text_l', "'", ''))
    .withColumn('review_text_l', F.regexp_replace('review_text_l', r'\n|[^\w]', ' '))
    .withColumn('review_text_l', F.regexp_replace('review_text_l', r'\s+', ' '))
)

In [27]:
# Inspect the cleaned text in full.
df_clean.select('review_text_l').show(n=5, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 review_text_l | charming movie this movie isnt perfect but it gets a lot of things right yes the librarian character played by marcia gay harden is stereotypical and played a bit heavy handed but the universal nature of the story the beautiful setting and the likability of the characters overcome this flaw the quote at the end brought tears to my eyes if you want to take a break from hollywoods standard fare of dark violent or stupid movies then give this a try it is is entertaining and t

In [28]:
# Count reviews whose cleaned text is 3 characters or fewer. F.length measures
# characters, not words. These are exactly the rows the next cell drops, since
# it keeps only rows with length > 3, so the comparison here is <= 3 (not < 3).
df_clean.filter(F.length(df_clean.review_text_l) <= 3).count()

0

In [29]:
# Keep only rows whose cleaned text is longer than 3 characters
# (F.length counts characters, not words).
text_length = F.length(F.col('review_text_l'))
df_clean = df_clean.where(text_length > 3)

In [30]:
# Row count after the length filter.
df_clean.count()

62710

## NLP Pipeline

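This pipeline consists of the following stages: document assembler, tokenizer, stop-word remover, stemmer, and finisher. The TF-IDF features (HashingTF followed by IDF) are computed in separate cells after the pipeline has run.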

In [31]:
# Wrap the cleaned text as a Spark NLP document and split it into tokens.
document_assembler = (
    DocumentAssembler()
    .setInputCol("review_text_l")
    .setOutputCol("document")
)
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("review_words")
)

# Remove English stop words (case-insensitive), then stem the remaining tokens.
remover = (
    StopWordsCleaner()
    .setInputCols("review_words")
    .setOutputCol("review_words_stop")
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)
stemmer = (
    Stemmer()
    .setInputCols(["review_words_stop"])
    .setOutputCol("review_words_lemstem")
)

# Turn the stemmed annotations into a plain array column, keeping the
# intermediate annotation columns in the output.
finisher = (
    Finisher()
    .setInputCols(["review_words_lemstem"])
    .setOutputCols(["token_features"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

# HashingTF and IDF are not stages of this pipeline; they are applied in later cells.
pipeline_stem = Pipeline(stages=[
    document_assembler,
    tokenizer,
    remover,
    stemmer,
    finisher,
])

##### Running NLP pipeline

In [32]:
%%time
df_clean_nlp = pipeline_stem.fit(df_clean).transform(df_clean)

Wall time: 290 ms


In [33]:
# Preview one row with the pipeline's added columns.
df_clean_nlp.show(n=1)

+----------+-----------+--------------------+---------------+--------------------+-------------+-----------+--------------------+------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|product_id|star_rating|    product_category|review_headline|         review_body|helpful_votes|total_votes|         review_text|     helpful_ratio|helpful|       review_text_l|            document|        review_words|   review_words_stop|review_words_lemstem|      token_features|
+----------+-----------+--------------------+---------------+--------------------+-------------+-----------+--------------------+------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|B01489L5LQ|          4|Digital_Video_Dow...| Charming movie|This movie isn't ...|           17|         18|Charming movie Th...|0.9444444444444444|   

##### Hashing TF

In [34]:
# Hash the token arrays into term-frequency vectors with 10,000 features.
hashingTF = HashingTF(
    numFeatures=10000,
    inputCol="token_features",
    outputCol="rawFeatures",
)
df_featurizedData = hashingTF.transform(df_clean_nlp)
df_featurizedData.show(n=1)

+----------+-----------+--------------------+---------------+--------------------+-------------+-----------+--------------------+------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|product_id|star_rating|    product_category|review_headline|         review_body|helpful_votes|total_votes|         review_text|     helpful_ratio|helpful|       review_text_l|            document|        review_words|   review_words_stop|review_words_lemstem|      token_features|         rawFeatures|
+----------+-----------+--------------------+---------------+--------------------+-------------+-----------+--------------------+------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|B01489L5LQ|          4|Digital_Video_Dow...| Charming movie|This movie isn't ...|      

##### IDF

In [35]:
%%time
idf = IDF(inputCol="rawFeatures", outputCol="features")
df_nlp = idf.fit(df_featurizedData).transform(df_featurizedData)
df_nlp.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [36]:
# Show one fully expanded row of the TF-IDF frame.
df_nlp.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [37]:
# Keep only the label and the feature vector for modelling.
model_columns = ['helpful', 'features']
nlpdf_model = df_nlp.select(*model_columns)

In [38]:
from pyspark.sql.types import IntegerType

# Cast the label column to an integer type.
helpful_as_int = col('helpful').cast(IntegerType())
nlpdf_model = nlpdf_model.withColumn('helpful', helpful_as_int)

##### Train & Test Split

In [39]:
# 80/20 random split with a fixed seed.
train, test = nlpdf_model.randomSplit(weights=[0.8, 0.2], seed=12345)

## NLP Modeling

#### Logistic Regression (1 = Helpful, 0 = Not Helpful)

In [40]:
%%time
lr = LogisticRegression(featuresCol = 'features', labelCol='helpful')

paramGrid = (ParamGridBuilder().addGrid(lr.regParam, [0.3, 0.0]).addGrid(lr.elasticNetParam, [0.0]).addGrid(lr.maxIter, [20, 100]).build())

#Evaluator
evaluator = MulticlassClassificationEvaluator(labelCol='helpful', predictionCol="prediction")
    
# Create 3-fold CrossValidator
cv = CrossValidator(estimator=lr,estimatorParamMaps=paramGrid,evaluator=evaluator,numFolds=3)

cvModel = cv.fit(train)

predictions = cvModel.transform(train)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.7792051866970825
0.7718823505867335
0.799494368346066
0.7792051866970825
Wall time: 4min 16s


In [41]:
# Print the parameter map with the highest average cross-validation metric.
best_index = np.argmax(cvModel.avgMetrics)
print(cvModel.getEstimatorParamMaps()[best_index])

{Param(parent='LogisticRegression_d598d4c4daf2', name='regParam', doc='regularization parameter (>= 0).'): 0.3, Param(parent='LogisticRegression_d598d4c4daf2', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LogisticRegression_d598d4c4daf2', name='maxIter', doc='max number of iterations (>= 0).'): 20}


In [42]:
%%time
predictions = cvModel.transform(test)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.7288787688442211
0.7189128175970296
0.7481372200242972
0.7288787688442211
Wall time: 1min


#### Naive Bayes Classifier (1 = Helpful, 0 = Not Helpful)

In [43]:
%%time
nb = NaiveBayes(featuresCol='features', labelCol='helpful')

paramGrid = (ParamGridBuilder().addGrid(nb.smoothing, [0.1, 0.5, 1.0]).build())

#Evaluator
evaluator = MulticlassClassificationEvaluator(labelCol='helpful', predictionCol="prediction")
    
# Create 3-fold CrossValidator
cv = CrossValidator(estimator=nb,estimatorParamMaps=paramGrid,evaluator=evaluator,numFolds=3)

cvModel = cv.fit(train)

predictions = cvModel.transform(train)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.7585144275023012
0.7542496802270101
0.7650004125607055
0.7585144275023012
Wall time: 2min 50s


In [44]:
# Print the parameter map with the highest average cross-validation metric.
best_index = np.argmax(cvModel.avgMetrics)
print(cvModel.getEstimatorParamMaps()[best_index])

{Param(parent='NaiveBayes_78ed73862df7', name='smoothing', doc='The smoothing parameter, should be >= 0, default is 1.0'): 1.0}


In [45]:
%%time
predictions = cvModel.transform(test)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.7230684673366834
0.718757752569916
0.7276615444234262
0.7230684673366834
Wall time: 1min


#### Random Forest Classifier (1 = Helpful, 0 = Not Helpful)

In [46]:
%%time
rfc = RandomForestClassifier(impurity="gini", featuresCol='features', labelCol="helpful")

paramGrid = (ParamGridBuilder().addGrid(rfc.impurity, ['gini', 'entropy']).addGrid(rfc.maxBins, [32, 100]).build())

#Evaluator
evaluator = MulticlassClassificationEvaluator(labelCol='helpful', predictionCol="prediction")
    
# Create 3-fold CrossValidator
cv = CrossValidator(estimator=rfc,estimatorParamMaps=paramGrid,evaluator=evaluator,numFolds=3)

cvModel = cv.fit(train)

predictions = cvModel.transform(train)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.6609436907191739
0.6330995433558185
0.6938038426615719
0.6609436907191739
Wall time: 4min 28s


In [47]:
# Print the parameter map with the highest average cross-validation metric.
best_index = np.argmax(cvModel.avgMetrics)
print(cvModel.getEstimatorParamMaps()[best_index])

{Param(parent='RandomForestClassifier_80628e154332', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini', Param(parent='RandomForestClassifier_80628e154332', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 32}


In [48]:
%%time
predictions = cvModel.transform(test)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.6560144472361809
0.6284315214057037
0.6902293146475933
0.6560144472361809
Wall time: 1min
