### Code Cleaning

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
df = spark.read \
    .option("quote", "\"")  \
    .option("escape", "\"") \
    .option("ignoreLeadingWhiteSpace",True) \
    .csv("/user/klaurens/project/project/amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv",inferSchema=True,header=True, sep='\t' )

In [3]:
df.show(1, vertical = True, truncate = False)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------
 marketplace       | US                                                                                                                               
 customer_id       | 33605939                                                                                                                         
 review_id         | RGYFDX8QXKEIR                                                                                                                    
 product_id        | B007KO2MLO                                                                                                                       
 product_parent    | 328837464                                                                                                                        
 product_title     | Big Maria                                                                

In [4]:
df = df.select('product_id', 'star_rating', 'product_category', 'review_headline', 'review_body', 'helpful_votes', 'total_votes')

In [5]:
df.show(1, vertical = True, truncate = False)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B007KO2MLO                                                                                                                       
 star_rating      | 4                                                                                                                                
 product_category | Digital_Ebook_Purchase                                                                                                           
 review_headline  | Quirky                                                                                                                           
 review_body      | Elmore Leonard meets the cast of Sierra Madre. Just a quirky read that will make you want to keep trying no matter what happens. 
 helpful_votes    | 0                                                                               

In [6]:
df.dropna().count()

5101525

In [7]:
from pyspark.sql.functions import col,isnan, when, count
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]
   ).show()

+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|product_id|star_rating|product_category|review_headline|review_body|helpful_votes|total_votes|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+
|         0|         17|              12|             60|        125|           17|         17|
+----------+-----------+----------------+---------------+-----------+-------------+-----------+



In [8]:
df2 = df.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns])
df2.count()
df2.show(5, vertical = True)

-RECORD 0----------------
 product_id       | 2    
 star_rating      | 17   
 product_category | 12   
 review_headline  | 5038 
 review_body      | 9632 
 helpful_votes    | 17   
 total_votes      | 17   



In [9]:
df = df.filter(col('total_votes') > 10)
df.count()

196219

In [10]:
df = df.fillna("", "review_body")
df = df.fillna("", "review_headline")

In [11]:
df = df.withColumn('review_text', F.concat('review_headline', F.lit(" "), 'review_body'))
df.show(1, vertical = True, truncate = False)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B006NG0GXQ                                                                                                                                                

In [12]:
df.filter((col("review_text").isNull()) | ( col("review_text") == "")).show(1, vertical = True, truncate = False)

(0 rows)



In [13]:
df.select([count(when((col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns]).show(5, vertical = True)

-RECORD 0---------------
 product_id       | 0   
 star_rating      | 0   
 product_category | 0   
 review_headline  | 1   
 review_body      | 4   
 helpful_votes    | 0   
 total_votes      | 0   
 review_text      | 0   



In [14]:
df = df.withColumn('helpful_ratio', F.col('helpful_votes') / F.col('total_votes'))
df.show(1, vertical = True, truncate = False)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 product_id       | B006NG0GXQ                                                                                                                                                

In [15]:
df.filter(col('helpful_ratio') < 0).count()

0

In [16]:
df = df.withColumn('helpful', when(col("helpful_ratio") < 0.5, 0).otherwise(1))

In [17]:
df.show(5, vertical = True)

-RECORD 0--------------------------------
 product_id       | B006NG0GXQ           
 star_rating      | 1                    
 product_category | Digital_Ebook_Pur... 
 review_headline  | Completely absurd    
 review_body      | From the time tha... 
 helpful_votes    | 36                   
 total_votes      | 47                   
 review_text      | Completely absurd... 
 helpful_ratio    | 0.7659574468085106   
 helpful          | 1                    
-RECORD 1--------------------------------
 product_id       | B0078T6YHY           
 star_rating      | 1                    
 product_category | Digital_Ebook_Pur... 
 review_headline  | No thanks            
 review_body      | This work should ... 
 helpful_votes    | 16                   
 total_votes      | 26                   
 review_text      | No thanks This wo... 
 helpful_ratio    | 0.6153846153846154   
 helpful          | 1                    
-RECORD 2--------------------------------
 product_id       | B00CKXCNH8    

### NLP Modeling

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import lower, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from pyspark.ml.feature import StopWordsRemover

import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
df_clean = df

In [20]:
df_clean.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- product_category: string (nullable = true)
 |-- review_headline: string (nullable = false)
 |-- review_body: string (nullable = false)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- review_text: string (nullable = false)
 |-- helpful_ratio: double (nullable = true)
 |-- helpful: integer (nullable = false)



In [21]:
df_clean.select("helpful").distinct().show()

+-------+
|helpful|
+-------+
|      1|
|      0|
+-------+



### Cleaning the Review_Text

In [22]:
#Making the review text to all lower case
df_clean=df_clean.withColumn('review_text_l', F.lower(F.col('review_text')))

In [23]:
df_clean.select(df_clean.columns[10]).show(5, vertical = True)

-RECORD 0-----------------------------
 review_text_l | completely absurd... 
-RECORD 1-----------------------------
 review_text_l | no thanks this wo... 
-RECORD 2-----------------------------
 review_text_l | fell in love with... 
-RECORD 3-----------------------------
 review_text_l | a big disappointm... 
-RECORD 4-----------------------------
 review_text_l | holy hotness, bla... 
only showing top 5 rows



In [24]:
#Removing all of the additional punctuations
df_clean=df_clean.withColumn('review_text_l', F.regexp_replace('review_text_l', '\'', ''))
                                                          
df_clean=df_clean.withColumn('review_text_l', F.regexp_replace('review_text_l', '\\n|[^\w]', ' ')).withColumn('review_text_l', F.regexp_replace('review_text_l', '\s+', ' '))

In [25]:
df_clean.select('review_text_l').show(5,vertical = True, truncate = False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
#Removing reviews that have 3 or less words
df_clean.filter(F.length(df_clean.review_text_l) < 3).count()

0

In [27]:
df_clean = df_clean.filter(F.length(df_clean.review_text_l) > 3)

In [28]:
df_clean.count()

196219

In [29]:
# from typing import Iterable
from pyspark.sql import DataFrame
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import HashingTF, Tokenizer, CountVectorizer, IDF, StopWordsRemover

class Lemmatizer(Transformer):
    def __init__(self):
        super(Lemmatizer, self).__init__()

    def _transform(self, df: DataFrame) -> DataFrame:
        lemmatizer = WordNetLemmatizer()
        lemmatizer_udf = F.udf(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
                               , ArrayType(StringType()))
        df = df.withColumn("review_words_lemstem", lemmatizer_udf("review_words_stop"))

        return df
    
class Stemmer(Transformer):
    def __init__(self):
        super(Stemmer, self).__init__()

    def _transform(self, df: DataFrame) -> DataFrame:
        stemmer = SnowballStemmer(language='english')
        stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
        df = df.withColumn("review_words_lemstem", stemmer_udf("review_words_stop"))

        return df

    
tokenizer = Tokenizer(inputCol="review_text_l", outputCol="review_words")
remover = StopWordsRemover(inputCol="review_words", outputCol="review_words_stop")
lemmatizer = Lemmatizer()
stemmer = Stemmer()
hashingTF = HashingTF(inputCol="review_words_lemstem", outputCol="rawFeatures", numFeatures = 10000)
CountVectorizer(inputCol="review_words_lem", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

pipeline_lem = Pipeline(stages=[tokenizer,remover,lemmatizer,hashingTF,idf])
pipeline_stem = Pipeline(stages=[tokenizer,remover,stemmer,hashingTF,idf])    

In [30]:
def run_classification(model, train , test, lemstem = 'lem', score = True):
    if lemstem == 'lem':
        run_df = pipeline_lem.fit(train).transform(train)
    elif lemstem == 'stem':
        run_df = pipeline_stem.fit(train).transform(train)
        
    prediction = model.fit(run_df).transform(run_df)
    evaluator = MulticlassClassificationEvaluator(labelCol="helpful", predictionCol="prediction")
    scores = {}
    
    if score:
        scores['accuracy'] = evaluator.evaluate(prediction, {evaluator.metricName: "accuracy"})
        scores['f1'] = evaluator.evaluate(prediction, {evaluator.metricName: "f1"})
        scores['weightedPrecision'] = evaluator.evaluate(prediction, {evaluator.metricName: "weightedPrecision"})
        scores['weightedRecall'] = evaluator.evaluate(prediction, {evaluator.metricName: "weightedRecall"})
        print(model.__class__.__name__, lemstem)
        print(scores)

    return model, prediction, scores

In [31]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier

In [32]:
train, test = df_clean.randomSplit([0.8, 0.2], seed=12345)

In [37]:
train.rdd.getNumPartitions()

25

In [38]:
train = train.repartition(50)

In [33]:
lr = LogisticRegression(featuresCol = 'features', labelCol='helpful')#,regParam=0.09,elasticNetParam=0.1 default better
lr_lem, lr_lem_pred, lr_lem_score = run_classification(lr, train, test, 'lem')
lr_stem, lr_stem_pred, lr_stem_score =run_classification(lr, train, test, 'stem')

LogisticRegression lem
0.826419137388953
0.8116197224412114
0.8117126997351846
0.826419137388953

LogisticRegression stem
0.8277711523376466
0.812780203718012
0.813318841006139
0.8277711523376466



(LogisticRegression_2bf978b051cc,
 DataFrame[product_id: string, star_rating: int, product_category: string, review_headline: string, review_body: string, helpful_votes: int, total_votes: int, review_text: string, helpful_ratio: double, helpful: int, review_text_l: string, review_words: array<string>, review_words_stop: array<string>, review_words_lemstem: array<string>, rawFeatures: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double])

In [39]:
dt = DecisionTreeClassifier(featuresCol="features",labelCol ="helpful")
dt_lem, dt_lem_pred, dt_lem_score =run_classification(dt, train, test, 'lem')
dt_stem, dt_stem_pred, dt_stem_score =run_classification(dt, train, test, 'stem')

DecisionTreeClassifier lem
0.7791623884747103
0.6827552223168624
0.7738509816182938
0.7791623884747103

DecisionTreeClassifier stem
0.7784077501482809
0.6839843421162546
0.7642590527858948
0.7785554329171425



(DecisionTreeClassifier_70a01553f121,
 DataFrame[product_id: string, star_rating: int, product_category: string, review_headline: string, review_body: string, helpful_votes: int, total_votes: int, review_text: string, helpful_ratio: double, helpful: int, review_text_l: string, review_words: array<string>, review_words_stop: array<string>, review_words_lemstem: array<string>, rawFeatures: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double])

In [40]:
rfc = RandomForestClassifier(impurity="gini", labelCol="helpful")#maxDepth=5, numTrees=15, 
rfc_lem, rfc_lem_pred, rfc_lem_score =run_classification(rfc, train, test, 'lem')
rfc_stem, rfc_stem_pred, rfc_stem_score =run_classification(rfc, train, test, 'stem')

RandomForestClassifier lem
0.7783581756373215
0.6813491881232981
0.6058414495814595
0.7783581756373215

RandomForestClassifier stem
0.7783581756373215
0.6813491881232981
0.6058414495814595
0.7783581756373215



(RandomForestClassifier_a3773b6b8d3b,
 DataFrame[product_id: string, star_rating: int, product_category: string, review_headline: string, review_body: string, helpful_votes: int, total_votes: int, review_text: string, helpful_ratio: double, helpful: int, review_text_l: string, review_words: array<string>, review_words_stop: array<string>, review_words_lemstem: array<string>, rawFeatures: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double])

In [41]:
# km = KMeans(k=5, featuresCol="features",predictionCol ="helpful_clust")
# out_model, out_pred = run_classification(km, train, test, 'lem', False)

In [42]:
# silhouette

In [44]:
from  pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
silhouettes = {}
evaluator = ClusteringEvaluator(predictionCol ="helpful_clust")
for i in range(2,15):
    km = KMeans(k=i, featuresCol="features",predictionCol ="helpful_clust")
    out_model, out_pred = run_classification(km, train, test, 'lem', False)
    silhouette = evaluator.evaluate(out_pred.select('features', 'helpful_clust'))
    silhouettes[i] = silhouette
silhouettes

{2: 0.7874030284527945,
 3: 0.6437947051225711,
 4: 0.7833947897632287,
 5: 0.6203944455055674,
 6: 0.6290921118636491,
 7: 0.6828893346666126,
 8: 0.657032231602476,
 9: 0.5948228773884128,
 10: 0.384805459579299,
 11: 0.2894022380064678,
 12: 0.3292708725494614,
 13: 0.5347871276945738,
 14: 0.3753177774226027}

In [45]:
km = KMeans(k=4, featuresCol="features",predictionCol ="helpful_clust")
out_model, out_pred = run_classification(km, train, test, 'lem', False)
evaluator.evaluate(out_pred.select('features', 'helpful_clust'))

0.6037633180872019

In [47]:
counts = out_pred.groupby(['helpful_clust', 'helpful']).count().alias('counts')
result = (counts
          .groupBy('helpful_clust')
          .agg(F.max(F.struct(F.col('count'),
                              F.col('helpful'))).alias('max'))
          .select(F.col('helpful_clust'), F.col('max.helpful'))
         )
result.show()

+-------------+-------+
|helpful_clust|helpful|
+-------------+-------+
|            1|      1|
|            3|      1|
|            0|      1|
+-------------+-------+



In [48]:
clusters = out_pred.select('helpful','helpful_clust').toPandas()

In [52]:
import pandas as pd

In [53]:
clusters.groupby('helpful_clust').agg({'helpful':pd.Series.mode})

Unnamed: 0_level_0,helpful
helpful_clust,Unnamed: 1_level_1
0,1
1,1
3,1


In [54]:
clusters['pred'] = clusters['helpful_clust'].apply(lambda x : 1 if x in [0,1,3] else 0)

In [60]:
(clusters['helpful'] - clusters['pred']).sum() / clusters['helpful'].count()

-0.22099704725037148