# Review NLP Helpfulness Prediction TFIDF (Music)

## Creating Spark Session & Importing All Necessary Libraries

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sparknlp.base import *
from sparknlp.annotator import *

import sparknlp

# Start SparkSession with Spark NLP
# start() functions has 5 parameters: gpu, spark23, spark24, spark32, and memory
# sparknlp.start(gpu=True) will start the session with GPU support
# sparknlp.start(spark23=True) is when you have Apache Spark 2.3.x installed
# sparknlp.start(spark24=True) is when you have Apache Spark 2.4.x installed
# sparknlp.start(spark32=True) is when you have Apache Spark 3.2.x installed
# sparknlp.start(memory="16G") to change the default driver memory in SparkSession
spark = sparknlp.start(gpu = True)

In [3]:
from pyspark.sql.functions import lower, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql import DataFrame
from pyspark.ml.feature import HashingTF, IDF, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier, NaiveBayes
import numpy as np

import nltk
import matplotlib.pyplot as plt
%matplotlib inline

#nltk.download('wordnet')

In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
eng_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kenne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Reading in Data

In [5]:
df = spark.read \
    .option("quote", "\"")  \
    .option("escape", "\"") \
    .option("ignoreLeadingWhiteSpace",True) \
    .csv(r"C:\Users\kenne\OneDrive\Desktop\UChicago\Python\Final Project Big Data Amazon Review\Amazon Review Kaggle Data\amazon_reviews_us_Music_v1_00.tsv",inferSchema=True,header=True, sep='\t' )

## Code Cleaning

In [6]:
df.show(1, vertical = True, truncate = False)

-RECORD 0--------------------------------------------------------------------
 marketplace       | US                                                      
 customer_id       | 10140119                                                
 review_id         | R3LI5TRP3YIDQL                                          
 product_id        | B00TXH4OLC                                              
 product_parent    | 384427924                                               
 product_title     | Whatever's for Us: Remastered                           
 product_category  | Music                                                   
 star_rating       | 5                                                       
 helpful_votes     | 0                                                       
 total_votes       | 0                                                       
 vine              | N                                                       
 verified_purchase | Y                                          

In [7]:
df = df.select('product_id', 'star_rating', 'product_category', 'review_headline', 'review_body', 'helpful_votes', 'total_votes')

In [8]:
df.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------
 product_id       | B00TXH4OLC                                              
 star_rating      | 5                                                       
 product_category | Music                                                   
 review_headline  | Five Stars                                              
 review_body      | Love this CD along with other CDs by the same musician. 
 helpful_votes    | 0                                                       
 total_votes      | 0                                                       
only showing top 1 row



In [9]:
df.dropna().count()

4751006

In [10]:
from pyspark.sql.functions import col,isnan, when, count
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]
   ).show()

+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|product_id|star_rating|product_category|review_headline|review_body|helpful_votes|total_votes|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|         0|         37|              35|            219|        389|           37|         37|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+



In [11]:
df2 = df.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns])
df2.count()
df2.show(5, vertical = True)

-RECORD 0-----------------
 product_id       | 198   
 star_rating      | 37    
 product_category | 35    
 review_headline  | 950   
 review_body      | 20498 
 helpful_votes    | 37    
 total_votes      | 37    



In [12]:
df = df.filter(col('total_votes') > 10)
df.count()

406095

In [13]:
df = df.fillna("", "review_body")
df = df.fillna("", "review_headline")

In [14]:
df = df.withColumn('review_text', F.concat('review_headline', F.lit(" "), 'review_body'))
df.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
df.filter((col("review_text").isNull()) | ( col("review_text") == "")).show(1, vertical = True, truncate = False)

(0 rows)



In [16]:
df.select([count(when((col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns]).show(5, vertical = True)

-RECORD 0---------------
 product_id       | 0   
 star_rating      | 0   
 product_category | 0   
 review_headline  | 9   
 review_body      | 7   
 helpful_votes    | 0   
 total_votes      | 0   
 review_text      | 0   



In [17]:
df = df.withColumn('helpful_ratio', F.col('helpful_votes') / F.col('total_votes'))
df.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [18]:
df.filter(col('helpful_ratio') < 0).count()

0

In [19]:
df = df.withColumn('helpful', when(col("helpful_ratio") < 0.5, 0).otherwise(1))

In [20]:
df.show(5, vertical = True)

-RECORD 0--------------------------------
 product_id       | B010FP0WRU           
 star_rating      | 3                    
 product_category | Music                
 review_headline  | Up it by a half-s... 
 review_body      | A solid collectio... 
 helpful_votes    | 25                   
 total_votes      | 26                   
 review_text      | Up it by a half-s... 
 helpful_ratio    | 0.9615384615384616   
 helpful          | 1                    
-RECORD 1--------------------------------
 product_id       | B00ZGJ85Y8           
 star_rating      | 5                    
 product_category | Music                
 review_headline  | Awesome Soundtrack   
 review_body      | Yes, I love every... 
 helpful_votes    | 14                   
 total_votes      | 15                   
 review_text      | Awesome Soundtrac... 
 helpful_ratio    | 0.9333333333333333   
 helpful          | 1                    
-RECORD 2--------------------------------
 product_id       | B00ZYBH6M0    

### Cleaning the Review_Text

In [21]:
df_clean = df

In [22]:
df_clean.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- product_category: string (nullable = true)
 |-- review_headline: string (nullable = false)
 |-- review_body: string (nullable = false)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- review_text: string (nullable = false)
 |-- helpful_ratio: double (nullable = true)
 |-- helpful: integer (nullable = false)



In [23]:
df_clean.select("helpful").distinct().show()

+-------+
|helpful|
+-------+
|      1|
|      0|
+-------+



In [24]:
#Making the review text to all lower case
df_clean=df_clean.withColumn('review_text_l', F.lower(F.col('review_text')))

In [25]:
df_clean.select(df_clean.columns[10]).show(5, vertical = True)

-RECORD 0-----------------------------
 review_text_l | up it by a half-s... 
-RECORD 1-----------------------------
 review_text_l | awesome soundtrac... 
-RECORD 2-----------------------------
 review_text_l | miles live in tok... 
-RECORD 3-----------------------------
 review_text_l | the last title in... 
-RECORD 4-----------------------------
 review_text_l | one star didn't l... 
only showing top 5 rows



In [26]:
#Removing all of the additional punctuations
df_clean=df_clean.withColumn('review_text_l', F.regexp_replace('review_text_l', '\'', ''))
                                                                 
df_clean=df_clean.withColumn('review_text_l', F.regexp_replace('review_text_l', '\\n|[^\w]', ' ')).withColumn('review_text_l', F.regexp_replace('review_text_l', '\s+', ' '))

In [27]:
df_clean.select('review_text_l').show(5,vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [28]:
#Removing reviews that have 3 or less words
df_clean.filter(F.length(df_clean.review_text_l) < 3).count()

1

In [29]:
df_clean = df_clean.filter(F.length(df_clean.review_text_l) > 3)

In [30]:
df_clean.count()

406093

## NLP Pipeline

This pipeline consists of the following: Tokenizer, Stop Words Remover, Stemming, & TFIDF

In [31]:
#Document & Tokenize
document_assembler = DocumentAssembler().setInputCol("review_text_l").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("review_words")
 
#Cleaning Tokens
remover           = StopWordsCleaner().setInputCols("review_words").setOutputCol("review_words_stop").setCaseSensitive(False).setStopWords(eng_stopwords)
#lemmatizer        = Lemmatizer().setInputCols(["review_words_stop"]).setOutputCol("review_words_lemstem")
stemmer           = Stemmer().setInputCols(["review_words_stop"]).setOutputCol("review_words_lemstem")
finisher          = Finisher().setInputCols(["review_words_lemstem"]).setOutputCols(["token_features"]).setOutputAsArray(True).setCleanAnnotations(False)
#hashingTF         = HashingTF(inputCol="token_features", outputCol="rawFeatures")
#idf               = IDF(inputCol="rawFeatures", outputCol="features")

pipeline_stem = Pipeline(stages=[document_assembler,tokenizer,remover,stemmer,finisher])#,hashingTF,idf])   

##### Running NLP pipeline

In [32]:
%%time
df_clean_nlp = pipeline_stem.fit(df_clean).transform(df_clean)

Wall time: 298 ms


In [33]:
df_clean_nlp.show(1)

+----------+-----------+----------------+--------------------+--------------------+-------------+-----------+--------------------+------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|product_id|star_rating|product_category|     review_headline|         review_body|helpful_votes|total_votes|         review_text|     helpful_ratio|helpful|       review_text_l|            document|        review_words|   review_words_stop|review_words_lemstem|      token_features|
+----------+-----------+----------------+--------------------+--------------------+-------------+-----------+--------------------+------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|B010FP0WRU|          3|           Music|Up it by a half-s...|A solid collectio...|           25|         26|Up it by a half-s...|0.9615384615384616

##### Hashing TF

In [34]:
hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures", numFeatures = 10000)
df_featurizedData = hashingTF.transform(df_clean_nlp)
df_featurizedData.show(1)

+----------+-----------+----------------+--------------------+--------------------+-------------+-----------+--------------------+------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|product_id|star_rating|product_category|     review_headline|         review_body|helpful_votes|total_votes|         review_text|     helpful_ratio|helpful|       review_text_l|            document|        review_words|   review_words_stop|review_words_lemstem|      token_features|         rawFeatures|
+----------+-----------+----------------+--------------------+--------------------+-------------+-----------+--------------------+------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|B010FP0WRU|          3|           Music|Up it by a half-s...|A solid collectio...|  

##### IDF

In [35]:
%%time
idf = IDF(inputCol="rawFeatures", outputCol="features")
df_nlp = idf.fit(df_featurizedData).transform(df_featurizedData)
df_nlp.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [36]:
df_nlp.show(1, vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [37]:
nlpdf_model = df_nlp.select('helpful','features')

In [38]:
from pyspark.sql.types import IntegerType
nlpdf_model = nlpdf_model.withColumn('helpful',col('helpful').cast(IntegerType()))

##### Train & Test Split

In [39]:
train, test = nlpdf_model.randomSplit([0.8, 0.2], seed=12345)

## NLP Modeling

#### Logistic Regression (1 = Helpful, 0 = Not Helpful)

In [40]:
%%time
lr = LogisticRegression(featuresCol = 'features', labelCol='helpful')

paramGrid = (ParamGridBuilder().addGrid(lr.regParam, [0.3, 0.0]).addGrid(lr.elasticNetParam, [0.0]).addGrid(lr.maxIter, [20, 100]).build())

#Evaluator
evaluator = MulticlassClassificationEvaluator(labelCol='helpful', predictionCol="prediction")
    
# Create 3-fold CrossValidator
cv = CrossValidator(estimator=lr,estimatorParamMaps=paramGrid,evaluator=evaluator,numFolds=3)

cvModel = cv.fit(train)

predictions = cvModel.transform(train)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.8468436563636531
0.8450910560452777
0.8440757545560762
0.8468436563636532
Wall time: 25min


In [41]:
print(cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics)])

{Param(parent='LogisticRegression_3b89995a1e7e', name='regParam', doc='regularization parameter (>= 0).'): 0.0, Param(parent='LogisticRegression_3b89995a1e7e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LogisticRegression_3b89995a1e7e', name='maxIter', doc='max number of iterations (>= 0).'): 20}


In [42]:
%%time
predictions = cvModel.transform(test)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.8309490450556744
0.8290135497771305
0.8278014197917307
0.8309490450556745
Wall time: 7min 52s


#### Naive Bayes Classifier (1 = Helpful, 0 = Not Helpful)

In [43]:
%%time
nb = NaiveBayes(featuresCol='features', labelCol='helpful')

paramGrid = (ParamGridBuilder().addGrid(nb.smoothing, [0.1, 0.5, 1.0]).build())

#Evaluator
evaluator = MulticlassClassificationEvaluator(labelCol='helpful', predictionCol="prediction")
    
# Create 3-fold CrossValidator
cv = CrossValidator(estimator=nb,estimatorParamMaps=paramGrid,evaluator=evaluator,numFolds=3)

cvModel = cv.fit(train)

predictions = cvModel.transform(train)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.743988026865644
0.7560685349033475
0.7961956853217957
0.7439880268656441
Wall time: 21min 52s


In [44]:
print(cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics)])

{Param(parent='NaiveBayes_922a178d7842', name='smoothing', doc='The smoothing parameter, should be >= 0, default is 1.0'): 1.0}


In [45]:
%%time
predictions = cvModel.transform(test)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.7405796032740948
0.7527183700900599
0.7928201954801537
0.7405796032740948
Wall time: 7min 54s


#### Random Forest Classifier (1 = Helpful, 0 = Not Helpful)

In [46]:
%%time
rfc = RandomForestClassifier(impurity="gini", featuresCol='features', labelCol="helpful")

paramGrid = (ParamGridBuilder().addGrid(rfc.impurity, ['gini', 'entropy']).addGrid(rfc.maxBins, [32, 100]).build())

#Evaluator
evaluator = MulticlassClassificationEvaluator(labelCol='helpful', predictionCol="prediction")
    
# Create 3-fold CrossValidator
cv = CrossValidator(estimator=rfc,estimatorParamMaps=paramGrid,evaluator=evaluator,numFolds=3)

cvModel = cv.fit(train)

predictions = cvModel.transform(train)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.7230935524301952
0.6068901886710546
0.5228642855661195
0.7230935524301952
Wall time: 30min 39s


In [47]:
print(cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics)])

{Param(parent='RandomForestClassifier_2cf0db45086a', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini', Param(parent='RandomForestClassifier_2cf0db45086a', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 32}


In [48]:
%%time
predictions = cvModel.transform(test)

print(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))

0.7219600324459848
0.6053872083302927
0.5212262884494074
0.7219600324459848
Wall time: 7min 54s
