In [1]:
#spark sql imports
import sparknlp

spark = sparknlp.start()

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

Spark NLP version 3.4.2
Apache Spark version: 3.1.2


In [2]:
#spark imports
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import countDistinct
from pyspark.sql import Window
from pyspark.sql.functions import udf
from pyspark.sql.types import *

import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter)
%matplotlib inline
import seaborn as sns
pd.set_option('display.max_rows', None)

from sklearn.metrics import confusion_matrix, classification_report

#spark ML imports
from pyspark.ml import Pipeline
from pyspark.ml.feature import NGram,CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer
from pyspark.ml.regression import LinearRegression,DecisionTreeRegressor,RandomForestRegressor,GBTRegressor
from pyspark.ml.classification import LinearSVC,LogisticRegression,DecisionTreeClassifier,RandomForestClassifier,GBTClassifier

from pyspark.ml.evaluation import RegressionEvaluator,MulticlassClassificationEvaluator,BinaryClassificationEvaluator
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

#spark NLP imports
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline

In [3]:
spark = SparkSession.builder.enableHiveSupport().appName('AmazonData').getOrCreate()
sc = spark.sparkContext

spark.conf.set('spark.sql.caseSensitive', True) #Using a SparkSession object named spark

# Read in All Data

In [4]:
AF= 'gs://classificationdata112/classificationdata1122/AMAZON_FASHION.json'
AB= 'gs://classificationdata112/classificationdata1122/All_Beauty.json'
CSJ= 'gs://classificationdata112/classificationdata1122/Clothing_Shoes_and_Jewelry.json'
LB= 'gs://classificationdata112/classificationdata1122/Luxury_Beauty.json'

mAF= 'gs://classificationdata112/classificationdata1122/meta_AMAZON_FASHION.json'
mAB= 'gs://classificationdata112/classificationdata1122/meta_ALL_Beauty.json'
mCSJ= 'gs://classificationdata112/classificationdata1122/meta_Clothing_Shoes_and_Jewelry.json'
mLB= 'gs://classificationdata112/classificationdata1122/meta_Luxury_Beauty.json'



## Load actual data

In [5]:
df_AF = spark.read.json(AF)
df_AF = df_AF.withColumn("Category", F.lit("Amazon Fashion"))
df_AF = df_AF.drop("reviewerName","style","image")

df_AB = spark.read.json(AB)
df_AB = df_AB.withColumn("Category", F.lit("All Beauty"))
df_AB = df_AB.drop("reviewerName","style","image")

df_CSJ = spark.read.json(CSJ)
df_CSJ = df_CSJ.withColumn("Category", F.lit("Clothing, Shoes and Jewelery"))
df_CSJ = df_CSJ.drop("reviewerName","style","image")

df_LB= spark.read.json(LB)
df_LB = df_LB.withColumn("Category", F.lit("Luxury Beauty"))
df_LB = df_LB.drop("reviewerName","style","image")

22/03/13 20:14:35 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [6]:
whole = df_AF.union(df_AB)
whole= whole.union(df_CSJ)
whole=whole.union(df_LB)
whole.printSchema()

root
 |-- asin: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- vote: string (nullable = true)
 |-- Category: string (nullable = false)



In [7]:
whole.rdd.getNumPartitions()

114

In [9]:
whole.count()

                                                                                

34121708

## Load Meta data

In [7]:
mdf_AF = spark.read.json(mAF)
mdf_AF = mdf_AF.drop('also_buy','also_view', 'description', 'details', 'feature', 'fit',
                    'imageURL','imageURLHighRes', 'similar_item','tech1')

mdf_AB = spark.read.json(mAB)
mdf_AB = mdf_AB.drop('also_buy','also_view','category', 'description', 'details', 'feature', 'fit',
                    'imageURL','imageURLHighRes', 'main_cat','similar_item','tech1', 'tech2')

mdf_CSJ = spark.read.json(mCSJ)
mdf_CSJ = mdf_CSJ.drop('also_buy','also_view','category', 'description', 'details', 'feature', 'fit',
                    'imageURL','imageURLHighRes', 'main_cat','similar_item','tech1', 'tech2')

mdf_LB= spark.read.json(mLB)
mdf_LB = mdf_LB.drop('also_buy','also_view','category', 'description', 'details', 'feature', 'fit',
                    'imageURL','imageURLHighRes', 'main_cat','similar_item','tech1', 'tech2')

                                                                                

In [8]:
m_whole= mdf_AF.union(mdf_AB)
m_whole= m_whole.union(mdf_CSJ)
m_whole=m_whole.union(mdf_LB)

In [10]:
m_whole.printSchema()

root
 |-- asin: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- date: string (nullable = true)
 |-- price: string (nullable = true)
 |-- rank: string (nullable = true)
 |-- title: string (nullable = true)



In [13]:
m_whole.count()

                                                                                

2916887

## Joining Metadata and Actual Data

In [9]:
full = whole.join(m_whole,"asin","left")

In [9]:
full.limit(2).toPandas()

                                                                                

Unnamed: 0,asin,overall,reviewText,reviewTime,reviewerID,summary,unixReviewTime,verified,vote,Category,brand,date,price,rank,title
0,6030555170,4.0,I love it,"05 5, 2018",AOFKGSCY2NUHR,Four Stars,1525478400,True,,"Clothing, Shoes and Jewelery",Lady olga,5 star,$26.80 - $26.94,"266,796inClothing,ShoesJewelry(",Lady olga Women's Fleece Bed Jacket On Collar ...
1,6030555170,5.0,This bed jacket works well for my mother. It w...,"04 22, 2018",A223V2HN62ZKKZ,It washes easily and doesn't shed,1524355200,True,,"Clothing, Shoes and Jewelery",Lady olga,5 star,$26.80 - $26.94,"266,796inClothing,ShoesJewelry(",Lady olga Women's Fleece Bed Jacket On Collar ...


In [16]:
print(f"Before cleaning approximate count:{full.count()}")



Before cleaning approximate count:34479823


                                                                                

## Data Processing
- drop duplicates:according to 3 columns: reviewerID, asin, unixReviewTime
- drop missing values in reviewText:41579 reviews are missing
- keep products with at least 10 reviews(so that model can learn better)

In [10]:
# drop duplicates
full = full.dropDuplicates(subset=["asin","reviewerID","unixReviewTime"])
# drop null
full = full.dropna(subset="reviewText")
# keep 不那么小众的products
winSpecAgg  = Window.partitionBy("asin")
finaldf = full.withColumn("num",F.count(F.col("reviewerID")).over(winSpecAgg))\
.where(F.col("num") >= 10) #at least 10 reviews,even if from same author

print(f"After cleaning approximate count:{finaldf.count()}") #29 million



After cleaning approximate count:29814163


                                                                                

In [11]:
# append year + month + timestamp columns--dataframe name:df1
df1 = finaldf.withColumn("timestamp",F.from_unixtime("unixReviewTime"))
df1 = df1.withColumn("year",F.year("timestamp"))
df1 = df1.withColumn("month",F.month("timestamp"))

In [12]:
df2 = df1.select("asin","overall","reviewText",
                     "verified","Category","brand","year","month")

In [15]:
# User Defined Function convert rating into binary label
@udf(returnType=IntegerType())
def convert(num): #overall column cannot be null therefore not need to address this issue in funct
    if num > 3:
        return 0 #positive--class 0
    else: #num < 3
        return 1 #negative--class 1 since we care MORE about neg reviews

In [None]:
# or using "when" function
# finaldf.withColumn("label",
#                    F.when(finaldf["overall"] > 3,1.0).otherwise(0.0)) # after filtering out 3!

In [17]:
finaldf.groupBy("overall").count().show()



+-------+--------+
|overall|   count|
+-------+--------+
|    1.0| 1925720|
|    4.0| 4870378|
|    3.0| 2522919|
|    2.0| 1531057|
|    5.0|16710571|
+-------+--------+



                                                                                

In [18]:
2522919/(1925720+4870378+2522919+1531057+16710571) #9% percent of rating 3

0.09154063702065028

In [16]:
# only look at rating = 1,2,4,5
df3 = df2.filter(finaldf.overall != 3)

# add label column
df3 = df3.withColumn("label",convert("overall"))

In [27]:
df3.columns

['asin',
 'overall',
 'reviewText',
 'verified',
 'Category',
 'brand',
 'year',
 'month',
 'label']

### Text cleaning 1: filter out reviews that are not in English


- PIPELINE: `LanguageDetectorDL`
- parameters:
    - `threshold`:The minimum threshold for the final result--default:0.5;o.w if less than threshold:either neutral or value set in "thresholdLabel"
    - set lower thresh since `LanguageDetectorDL` works best with text longer than 140 characters.
    - but some review is pretty short eg: one word
    - `coalesceSentences`:output of all sentences will be averaged to one output instead of one output per sentence

In [28]:
documentAssembler = DocumentAssembler()\
.setInputCol("reviewText")\
.setOutputCol("document")

language_detector = LanguageDetectorDL.pretrained("ld_wiki_tatoeba_cnn_21")\
.setInputCols(["document"])\
.setOutputCol("lang")\
.setThreshold(0.3)\
.setCoalesceSentences(True)

languagePipeline = Pipeline(stages=[
 documentAssembler, 
 language_detector
])

ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[ | ]ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
Download done! Loading the resource.


                                                                                

[ / ]



[ — ]



[ \ ]



[ | ]



[ / ]

                                                                                

[ — ]

2022-03-13 19:30:26.255266: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-13 19:30:26.348886: I external/org_tensorflow/tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2299995000 Hz


[OK!]


In [29]:
empty_df = spark.createDataFrame([['']]).toDF("reviewText")
langresults = languagePipeline.fit(empty_df).transform(df3)

In [30]:
langresults = langresults.withColumn("language",F.explode(F.col("lang.result")))

In [31]:
langresults.filter(langresults.language != "en").select("reviewText","language").show(5)

[Stage 124:>                                                        (0 + 1) / 1]

+------------------+--------+
|        reviewText|language|
+------------------+--------+
|          Garbage!|      de|
| Love Jane Iredale|      it|
|         Excellent|      fr|
|             Love!|      it|
|Excellent product.|      fr|
+------------------+--------+
only showing top 5 rows



                                                                                

We dont use language detection pipeline to eliminate non-English review since the result is very BAD!

In [16]:
# langresults = langresults.withColumn("language",F.explode(F.col("lang.result")))
# finalv2 = langresults.filter(langresults.language == "en").drop("language","document","lang") #only English


In [None]:
# previous:27560645
# print(f"After filtering out non-English reviews,{finalv2.rdd().countApprox()} reviews left")

### Text cleaning 2: expand contraction

In [108]:
# check whether contraction is included in stopwords
# NOT all included! so still need
print(stopwords_cleaner.getStopWords())

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [17]:
# expand contraction
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [18]:
@udf(returnType=StringType())
def expand(text):
    text = text.lower()
    words = text.split()
    new_text = []
    for w in words:
        if w in contractions:
            new_text.append(contractions[w])
        else:
            new_text.append(w)
    text = " ".join(new_text)
    return text

In [19]:
df4 = df3.withColumn("text",expand("reviewText"))
df4 = df4.drop("reviewText")

Apply stratified sampling according to 0&1 percentage to each category: 
- used for model selection, vectorizer selection, hyperparameter tuning
- 5 Model candidates: `LogisticRegression,LinearSVC,DecisionTreeClassifier,RandomForestClassifier,GBTClassifier`
- 2 Vectorizer candidates: Since want to explain the meaning of each columns, only `CountVectorizer` and `TF-IDF with CountVectorizer` can be used
    - `HashingTF`: cannot be reversed, infer back from column index to words
    - `word2vec`: each column has no meaning

## Loop over models & vectorizers

In [20]:
# candidate vectorizers

## BEST!!
document_assembler = DocumentAssembler() \
      .setInputCol("text") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")

normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")\
      .setLowercase(True) \
      .setCleanupPatterns(["""[^A-Za-z]"""]) #only keep alphabet letters #find all numeric useless

lemmatizer = LemmatizerModel.pretrained('lemma_antbnc',"en")\
    .setInputCols(["normalized"])\
    .setOutputCol("lemma")

# remove stemmer:因为会导致结果不是完整的词
# stemmer = Stemmer() \
#       .setInputCols(["lemma"]) \
#       .setOutputCol("stem")

# 调整顺序
stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("lemma")\
      .setOutputCol("cleaned")\
      .setCaseSensitive(False)

# 后加 因为发现很多no good的评论被分错了 
ngrams_cum = NGramGenerator() \
            .setInputCols(["cleaned"]) \
            .setOutputCol("ngrams") \
            .setN(3) \
            .setEnableCumulative(True)\
            .setDelimiter(" ") # Default is space

finisher = Finisher() \
      .setInputCols(["ngrams"]) \
      .setOutputCols(["token_features"]) \
      .setOutputAsArray(True) \
      .setCleanAnnotations(False)

# CountVectorizer
countVectors = CountVectorizer(inputCol="token_features", outputCol="features", 
                               vocabSize=10000, #max size of the vocabulary
                               minDF=10) #一个term至少要在这么多不同的docs中出现才会被include

pipe_countvec = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            lemmatizer,
            stopwords_cleaner,
            ngrams_cum,
            finisher,
            countVectors
       ])

# TF-IDF with CountVectorizer
#countVectors = CountVectorizer(inputCol="token_features", outputCol="rawFeatures", 
                               vocabSize=10000, #max size of the vocabulary
                               minDF=10) #一个term至少要在这么多不同的docs中出现才会被include

#idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=10) #minDocFreq: remove sparse terms
# pipe_countidf = Pipeline(
#     stages=[document_assembler, 
#             tokenizer,
#             normalizer,
#             lemmatizer,
#             stopwords_cleaner,
#             ngrams_cum,
#             finisher,
#             countVectors,
#             idf
#        ])


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.


                                                                                

[ / ]

22/03/13 20:19:33 WARN org.apache.hadoop.util.concurrent.ExecutorHelper: Thread (Thread[GetFileInfo #1,5,main]) interrupted: 
java.lang.InterruptedException
	at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:510)
	at com.google.common.util.concurrent.FluentFuture$TrustedFuture.get(FluentFuture.java:88)
	at org.apache.hadoop.util.concurrent.ExecutorHelper.logThrowableFromAfterExecute(ExecutorHelper.java:48)
	at org.apache.hadoop.util.concurrent.HadoopThreadPoolExecutor.afterExecute(HadoopThreadPoolExecutor.java:90)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1157)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

[ — ]

                                                                                

[OK!]


Separate out the data into each small category

In [22]:
LB = df4.filter(df4.Category == "Luxury Beauty")
B = df4.filter(df4.Category == "All Beauty")
FA = df4.filter(df4.Category == "Amazon Fashion")
C = df4.filter(df4.Category == "Clothing, Shoes and Jewelery")

Stratified Sampling

In [23]:
#only take 1% from first three category,0.01% from Clothing category 
LBsm = LB.sampleBy("label",fractions = {0:0.01,1:0.01},seed=42)
Bsm = B.sampleBy("label",fractions = {0:0.01,1:0.01},seed=42)
FAsm = FA.sampleBy("label",fractions = {0:0.01,1:0.01},seed=42)
Csm = C.sampleBy("label",fractions = {0:0.0001,1:0.0001},seed=42)

For text preprocessing illustrative purpose:
- transform 100 rows only and show the result

In [None]:
nlp_fulltfidf = pipe_countidf.fit(df4)

In [None]:
illu_df = nlp_tfidf_full.transform(df4.limit(2))

illu_df.select("text","token.result","lemma.result","cleaned.result","ngrams.result","token_features",
              "features")\
                .show(2,truncate=300,vertical=True)

In [22]:
print(Csm.rdd.countApprox(timeout=1000))



2264


In [24]:
print(LBsm.rdd.countApprox(timeout=1000))



4892


In [26]:
# train test split to small dataset
LB_trainsm = LBsm.sampleBy("label",fractions = {0:0.8,1:0.8},seed=42) #80% training data
LB_testsm = LBsm.subtract(LB_trainsm)

B_trainsm = Bsm.sampleBy("label",fractions = {0:0.8,1:0.8},seed=42) #80% training data
B_testsm = Bsm.subtract(B_trainsm)

F_trainsm = FAsm.sampleBy("label",fractions = {0:0.8,1:0.8},seed=42) #80% training data
F_testsm = FAsm.subtract(F_trainsm)

C_trainsm = Csm.sampleBy("label",fractions = {0:0.8,1:0.8},seed=42) #80% training data
C_testsm = FAsm.subtract(C_trainsm)

Luxury Beauty

In [82]:
# candidate models
classifiers = list()
classifiers.append(["Logistic Classifier",LogisticRegression()])
classifiers.append(["Linear SVM",LinearSVC()])
classifiers.append(["DecisionTree Classifier",DecisionTreeClassifier()])
classifiers.append(["RandomForest Classifier",RandomForestClassifier(numTrees = 10)])
classifiers.append(["GradientBoost Classifier",GBTClassifier()])

In [None]:
%%time

# loop over models & vectorizers:
result = pd.DataFrame()
for name,clf in classifiers:
    print(f"{name} is training...")
    # vectorizer
    for vect,pipe in zip(["TF-IDF_CountVectorizer","CountVectorizer"],\
                         [pipe_countidf,pipe_countvec]):
        print(f"{vect} is running...")
    
        t0 = time.time()
        nlp_model = pipe.fit(LB_trainsm)
        processed_train = nlp_model.transform(LB_trainsm)
        processed_test = nlp_model.transform(LB_testsm)

        # fit model
        clfModel = clf.fit(processed_train)

        # predict
        predictions = clfModel.transform(processed_test)

        # Select example rows to display.
        predictions.select("prediction", "label", "features").show(5,truncate=30)

        # Evaluate
        bevaluator = BinaryClassificationEvaluator(labelCol="label",rawPredictionCol="rawPrediction")
        mevaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label')

        # AUC
        auc = bevaluator.evaluate(predictions, {bevaluator.metricName: "areaUnderROC"})

        # Accuracy
        acc = mevaluator.evaluate(predictions, {mevaluator.metricName: "accuracy"})

        # F1
        f1 = mevaluator.evaluate(predictions, {mevaluator.metricName: "f1"})

        # recall/True Positive Rate
        # metricLabel:The class whose metric will be computed in truePositiveRateByLabel|falsePositiveRateByLabel..default=0
        # ONLY care about minority class(class=1) recall rate
        recall = mevaluator.evaluate(predictions, {mevaluator.metricName: "recallByLabel",mevaluator.metricLabel: 1.0})
        elapsed = time.time() - t0
        # save result
        res = [name,vect,auc,acc,f1,recall,elapsed]
        result = result.append([res],ignore_index=True)
result.columns = ["Model","Vectorizer","AUC","Accuracy","F1","Recall","Time"]

Logistic Classifier is training...
TF-IDF_CountVectorizer is running...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(1353,[4,5,10,12,20,51,60,6...|
|       1.0|    0|(1353,[39,40,77,92,201,211,...|
|       0.0|    0|(1353,[39,166,748,767],[2.7...|
|       0.0|    0|(1353,[1,5,10,12,124,132,35...|
|       0.0|    0|(1353,[53,83,156,282],[3.42...|
+----------+-----+------------------------------+
only showing top 5 rows



                                                                                

CountVectorizer is running...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(1353,[28,53,69,83,142,246,...|
|       1.0|    0|(1353,[38,40,78,92,202,210,...|
|       0.0|    0|(1353,[0,1,2,5,7,8,9,12,13,...|
|       0.0|    0|            (1353,[224],[1.0])|
|       0.0|    0| (1353,[1,5,88],[1.0,1.0,1.0])|
+----------+-----+------------------------------+
only showing top 5 rows



                                                                                

Linear SVM is training...
TF-IDF_CountVectorizer is running...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(1353,[0,1,2,5,7,8,9,12,13,...|
|       0.0|    0|(1353,[3,16,17,18,24,33,34,...|
|       0.0|    0|(1353,[18,142,165,192,313,1...|
|       0.0|    0|(1353,[5],[1.54785248222501...|
|       1.0|    0|(1353,[4,5,8,13,14,15,17,20...|
+----------+-----+------------------------------+
only showing top 5 rows



                                                                                

CountVectorizer is running...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(1353,[28,53,69,83,142,247,...|
|       0.0|    0|(1353,[11,22,38,118,151,166...|
|       0.0|    0|(1353,[0,1,2,6,7,9,11,12,15...|
|       0.0|    0|(1353,[3,16,18,104,495,614,...|
|       0.0|    0|(1353,[0,4,6,11,25,69,70,10...|
+----------+-----+------------------------------+
only showing top 5 rows



                                                                                

DecisionTree Classifier is training...
TF-IDF_CountVectorizer is running...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(1353,[3,16,17,18,24,33,34,...|
|       0.0|    0|(1353,[223],[4.082293067135...|
|       0.0|    0|(1353,[1,166,688],[1.135390...|
|       0.0|    1|(1353,[8,60,94,178,235,635,...|
|       0.0|    0|(1353,[1,5,87,169,513],[1.1...|
+----------+-----+------------------------------+
only showing top 5 rows



                                                                                

CountVectorizer is running...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(1353,[28,53,69,83,143,244,...|
|       0.0|    0|(1353,[53,83,156,281],[1.0,...|
|       0.0|    0|(1353,[0,4,6,11,25,69,70,10...|
|       0.0|    0|(1353,[0,4,7,16,29,30,35,49...|
|       0.0|    0|(1353,[0,3,4,16,19,23,37,43...|
+----------+-----+------------------------------+
only showing top 5 rows



                                                                                

RandomForest Classifier is training...
TF-IDF_CountVectorizer is running...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(1353,[1,5,88],[1.135390021...|
|       0.0|    0|(1353,[0,4,5,12,14,26,365,7...|
|       0.0|    0|(1353,[225],[4.082293067135...|
|       0.0|    0|(1353,[11,22,39,118,151,166...|
|       0.0|    0|(1353,[0,1,2,6,7,9,11,12,15...|
+----------+-----+------------------------------+
only showing top 5 rows



                                                                                

CountVectorizer is running...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(1353,[2,5,6,20,25,34,45,93...|
|       0.0|    0|(1353,[3,158,455],[1.0,1.0,...|
|       0.0|    0|(1353,[1,5,10,12,124,131,35...|
|       0.0|    0|(1353,[1,166,684],[1.0,1.0,...|
|       0.0|    1|(1353,[0,1,2,5,7,8,9,19,21,...|
+----------+-----+------------------------------+
only showing top 5 rows



                                                                                

GradientBoost Classifier is training...
TF-IDF_CountVectorizer is running...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(1353,[39,40,78,92,201,209,...|
|       0.0|    0|(1353,[13,17,23,25,99,259,4...|
|       0.0|    1|(1353,[0,1,3,5,9,15,17,67,7...|
|       0.0|    0|(1353,[18,151,158,196,312,1...|
|       0.0|    0|(1353,[5],[1.54785248222501...|
+----------+-----+------------------------------+
only showing top 5 rows



                                                                                

CountVectorizer is running...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(1353,[39,164,760,777],[1.0...|
|       0.0|    0|(1353,[2,5,6,21,25,35,44,96...|
|       1.0|    1|(1353,[6,18,45,113,198,211,...|
|       1.0|    1|(1353,[8,60,94,178,238,642,...|
|       0.0|    0|(1353,[1,5,88,169,533],[1.0...|
+----------+-----+------------------------------+
only showing top 5 rows



                                                                                

CPU times: user 19.3 s, sys: 7.85 s, total: 27.2 s
Wall time: 1h 42min 56s


In [73]:
result.sort_values(by=["AUC","F1"],ascending=[False,False])

Unnamed: 0,Model,Vectorizer,AUC,F1,Recall,Time
3,Linear SVM,CountVectorizer,0.856274,0.877418,0.613497,571.239151
2,Linear SVM,TF-IDF_CountVectorizer,0.853202,0.872321,0.6,651.45959
1,Logistic Classifier,CountVectorizer,0.849145,0.859052,0.675325,580.651742
0,Logistic Classifier,TF-IDF_CountVectorizer,0.835339,0.863136,0.662338,840.606115
9,GradientBoost Classifier,CountVectorizer,0.794078,0.83409,0.246835,562.866477
8,GradientBoost Classifier,TF-IDF_CountVectorizer,0.780117,0.828776,0.212903,580.609896
7,RandomForest Classifier,CountVectorizer,0.766871,0.773877,0.0,597.816423
6,RandomForest Classifier,TF-IDF_CountVectorizer,0.749489,0.772141,0.012987,659.699081
5,DecisionTree Classifier,CountVectorizer,0.502504,0.822543,0.180645,550.868778
4,DecisionTree Classifier,TF-IDF_CountVectorizer,0.502504,0.820026,0.183544,581.151411


- The result suggest we should use Linear SVM + CountVectorizer for Luxury Beauty dataset
- Then cross validation to choose best parameters:

In [27]:
nlp_modelsm = pipe_countvec.fit(LB_trainsm)
LB_ptrainsm = nlp_modelsm.transform(LB_trainsm)
LB_ptestsm = nlp_modelsm.transform(LB_testsm)

                                                                                

`LinerSVC` parameters
- `regParam`: regularization parameter(how large the penalty is) default=0
- `threshold `:The threshold in **binary classification applied to the linear model prediction**. This threshold can be any real number, where Inf will make all predictions 0.0 and -Inf will make all predictions 1.0. default = 0.0

### Try to do hyperparamter tuning
- But it is too slow
- Give up

In [64]:
# Hyperparameter tuning
linsvc = LinearSVC(maxIter=300) #default=100

paramGrid = ParamGridBuilder() \
    .addGrid(linsvc.regParam, [0.1, 0.4,0.01]) \
    .addGrid(linsvc.threshold,[0.4,0.5,0.6])\
    .build()

# F1.2 for class1 the higher the better
# stress more on recall
# metricLabel:t
crossval = CrossValidator(estimator=linsvc,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                                      metricName = "fMeasureByLabel",
                                                                     metricLabel = 1,beta = 1.2),numFolds=2)

In [None]:
%%time
cvModel_svc = crossval.fit(LB_ptrainsm) #after choosing best model,use that to predict testing data
# print("Start testing...")
# predictions = cvModel_log.transform(LB_ptestsm)

In [None]:
print(max(cvModel_svc.avgMetrics)) #best training F1.2 score

In [None]:
bestModel = cvModel_svc.bestModel

print('Best regParam: ', bestModel._java_obj.getRegParam())
print('Best threshold: ', bestModel._java_obj.getThreshold())

Eventually, train model on full dataset with tuuning

In [None]:
# train test split
LB_train = LB.sampleBy("label",fractions = {0:0.8,1:0.8},seed=42) #80% training data
LB_test = LB.subtract(LB_train)

In [67]:
nlp_model = pipe_countvec.fit(LB_train)
LB_ptrain = nlp_model.transform(LB_train)
LB_ptest = nlp_model.transform(LB_test)

In [95]:
%%time
# fit model
t0 = time.time()
SVCbest = LinearSVC(regParam = 0.01,
                   threshold = 0.5,
                   maxIter=200) 
bestModel = SVCbest.fit(LB_ptrain)

# predict
predictions = bestModel.transform(LB_ptest)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5,truncate=30)

# Evaluate
bevaluator = BinaryClassificationEvaluator(labelCol="label",rawPredictionCol="rawPrediction")
mevaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label')

# AUC
auc = bevaluator.evaluate(predictions, {bevaluator.metricName: "areaUnderROC"})

# Accuracy
acc = mevaluator.evaluate(predictions, {mevaluator.metricName: "accuracy"})

# F1
f1 = mevaluator.evaluate(predictions, {mevaluator.metricName: "f1"})

# recall/True Positive Rate
# metricLabel:The class whose metric will be computed in truePositiveRateByLabel|falsePositiveRateByLabel..default=0
# ONLY care about minority class(class=1) recall rate
recall = mevaluator.evaluate(predictions, {mevaluator.metricName: "recallByLabel",mevaluator.metricLabel: 1.0})

# fit + predict + eval time
elapsed = time.time() - t0

print(f"F1:{f1}\nAccuracy:{acc}\nAUC score:{auc}\nRecall:{recall}\nTime:{elapsed}")

22/03/12 20:32:01 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 9585 events from executorManagement since Sat Mar 12 19:55:56 UTC 2022.
22/03/12 20:33:01 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 48673 events from executorManagement since Sat Mar 12 20:32:01 UTC 2022.
                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(10000,[3,37,934,2750],[1.0...|
|       0.0|    0|(10000,[0,1,4,5,6,7,8,9,15,...|
|       0.0|    0|(10000,[1,6,7,10,14,21,27,2...|
|       1.0|    0|(10000,[28,55,110,268,550,4...|
|       0.0|    1|(10000,[4,9,29,82,118,140,5...|
+----------+-----+------------------------------+
only showing top 5 rows





F1:0.9072989616109896
Accuracy:0.916710052270976
AUC score:0.9450277328450177
Recall:0.5194568301580867
Time:682.4030590057373
CPU times: user 2.72 s, sys: 1.8 s, total: 4.52 s
Wall time: 11min 22s


                                                                                

Coefficient Analysis
- SVM coefficient:direction means predicted class
    - if **weight > 0**: contribute to pos class--in our case:class=1--**Negative review!!**
    - if weight < 0: contribute to neg class--class=0--Positive review
    - So **OPPOSITE**

In [75]:
count_model = nlp_model.stages[-1]
len(count_model.vocabulary) #these number of words/ngram words satisfy minDF requirement

10000

In [99]:
# intercept
print("Intercept: " + str(bestModel.intercept))

# when review contains no word
# since intercept is negative:more likely to be negative review

Intercept: -0.7405971897374637


In [100]:
coeffs = pd.DataFrame(bestModel.coefficients.toArray())
coeffs = coeffs.rename(columns = {0:"coeff"})
coeffs["importance"] = np.abs(coeffs["coeff"])
coeffs["vocabulary"] = count_model.vocabulary

coeffs.sort_values(by="importance",axis=0,ascending=False,inplace=True)
coeffs = coeffs.reset_index(drop=True)

In [102]:
# coeff>0:contribute to pos class--negative reviews
neg = coeffs[coeffs["coeff"]>0].sort_values(by="importance",ascending=False)
neg.head(100)

Unnamed: 0,coeff,importance,vocabulary
0,3.205668,3.205668,two star
2,2.268331,2.268331,want love
5,2.049794,2.049794,zero star
6,1.885735,1.885735,disappointing
7,1.834778,1.834778,high hope
11,1.740188,1.740188,meh
15,1.689804,1.689804,useless
17,1.65498,1.65498,worthless
18,1.646752,1.646752,ineffective
19,1.645215,1.645215,give one star


Beauty
- no more looping through vectorizer:since `CountVectorizer` is proven to be better than `TF-IDF` for Luxury Beauty data on all models.


In [None]:
%%time

# loop over models
result = pd.DataFrame()
for name,clf in classifiers:
    print(f"{name} is training...")
    
    # text processing
    t0 = time.time()
    nlp_model = pipe_countvec.fit(B_trainsm)
    processed_train = nlp_model.transform(B_trainsm)
    processed_test = nlp_model.transform(B_testsm)

    # fit model
    print(f"{name} is fitting...")
    clfModel = clf.fit(processed_train)

    # predict
    print(f"{name} is predicting...")
    predictions = clfModel.transform(processed_test)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5,truncate=30)

    # Evaluate
    print(f"{name} is evaluating...")
    bevaluator = BinaryClassificationEvaluator(labelCol="label",rawPredictionCol="rawPrediction")
    mevaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label')

    # AUC
    auc = bevaluator.evaluate(predictions, {bevaluator.metricName: "areaUnderROC"})

    # Accuracy
    acc = mevaluator.evaluate(predictions, {mevaluator.metricName: "accuracy"})

    # F1
    f1 = mevaluator.evaluate(predictions, {mevaluator.metricName: "f1"})

    # recall/True Positive Rate
    # metricLabel:The class whose metric will be computed in truePositiveRateByLabel|falsePositiveRateByLabel..default=0
    # ONLY care about minority class(class=1) recall rate
    recall = mevaluator.evaluate(predictions, {mevaluator.metricName: "recallByLabel",mevaluator.metricLabel: 1.0})
    elapsed = time.time() - t0 # fit+predict+eval
    
    # save result
    res = [name,auc,acc,f1,recall,elapsed]
    result = result.append([res],ignore_index=True)
    
result.columns = ["Model","AUC","Accuracy","F1","Recall","Time"]

Logistic Classifier is training...


                                                                                

Logistic Classifier is fitting...


22/03/12 18:41:20 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 36377 events from executorManagement since Sat Mar 12 18:33:38 UTC 2022.

Logistic Classifier is predicting...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0| (830,[1,2,100],[1.0,1.0,1.0])|
|       0.0|    0|(830,[1,2,4,6,8,11,14,16,19...|
|       0.0|    0|               (830,[5],[1.0])|
|       0.0|    0|(830,[0,7,17,50,59],[1.0,1....|
|       0.0|    0|(830,[0,1,2,3,6,16,24,28,29...|
+----------+-----+------------------------------+
only showing top 5 rows

Logistic Classifier is evaluating...


                                                                                

Linear SVM is training...


                                                                                

Linear SVM is fitting...


22/03/12 18:51:11 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 9501 events from executorManagement since Sat Mar 12 18:41:20 UTC 2022.

Linear SVM is predicting...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|               (830,[5],[1.0])|
|       0.0|    0|(830,[3,10,30,44,45,54,100,...|
|       0.0|    0|(830,[3,39,201],[1.0,1.0,1.0])|
|       0.0|    0|(830,[6,20,29,39,45,172,205...|
|       0.0|    0|(830,[5,56,117],[1.0,1.0,1.0])|
+----------+-----+------------------------------+
only showing top 5 rows

Linear SVM is evaluating...


                                                                                

DecisionTree Classifier is training...


                                                                                

DecisionTree Classifier is fitting...


                                                                                

DecisionTree Classifier is predicting...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0| (830,[1,2,100],[1.0,1.0,1.0])|
|       0.0|    0|(830,[3,10,30,44,46,53,99,1...|
|       0.0|    0|(830,[1,2,4,6,8,11,14,16,19...|
|       0.0|    0|(830,[1,2,3,6,7,8,9,13,15,4...|
|       0.0|    0|(830,[3,39,202],[1.0,1.0,1.0])|
+----------+-----+------------------------------+
only showing top 5 rows

DecisionTree Classifier is evaluating...


                                                                                

RandomForest Classifier is training...


                                                                                

RandomForest Classifier is fitting...




RandomForest Classifier is predicting...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(830,[1,2,4,6,8,11,14,16,19...|
|       0.0|    0|               (830,[5],[1.0])|
|       0.0|    0|(830,[0,5,12,13,18,25,36,37...|
|       0.0|    0|(830,[3,10,30,44,45,53,101,...|
|       0.0|    0|(830,[6,20,29,39,45,172,204...|
+----------+-----+------------------------------+
only showing top 5 rows

RandomForest Classifier is evaluating...


                                                                                

GradientBoost Classifier is training...


                                                                                

GradientBoost Classifier is fitting...


22/03/12 19:19:32 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 26312 events from executorManagement since Sat Mar 12 18:51:11 UTC 2022.


GradientBoost Classifier is predicting...


                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(830,[5,56,117],[1.0,1.0,1.0])|
|       0.0|    0| (830,[1,2,101],[1.0,1.0,1.0])|
|       0.0|    0|(830,[1,2,4,6,8,11,14,16,19...|
|       0.0|    1|(830,[0,1,9,15,22,37,38,45,...|
|       0.0|    0|               (830,[5],[1.0])|
+----------+-----+------------------------------+
only showing top 5 rows

GradientBoost Classifier is evaluating...


                                                                                

CPU times: user 8.54 s, sys: 3.75 s, total: 12.3 s
Wall time: 47min 15s


In [85]:
result.sort_values(by=["AUC","F1"],ascending=[False,False])

# linear SVM recall slightly lower than logistic regression(lower in AUC,ACC,F1)

Unnamed: 0,Model,AUC,Accuracy,F1,Recall,Time
1,Linear SVM,0.832525,0.875614,0.873719,0.523256,569.634468
4,GradientBoost Classifier,0.791008,0.870704,0.845017,0.232558,508.87413
0,Logistic Classifier,0.784817,0.842881,0.849538,0.55814,625.177967
3,RandomForest Classifier,0.741384,0.859247,0.794198,0.0,596.295657
2,DecisionTree Classifier,0.540653,0.860884,0.828266,0.162791,535.738722


- Linear SVM is the best model for Beauty category
- retrain on full dataset

In [86]:
# train test split
B_train = B.sampleBy("label",fractions = {0:0.8,1:0.8},seed=42) #80% training data
B_test = B.subtract(B_train)

In [87]:
nlp_model = pipe_countvec.fit(B_train)
B_ptrain = nlp_model.transform(B_train)
B_ptest = nlp_model.transform(B_test)

                                                                                

Retrain model on full Beauty dataset

In [109]:
%%time
# fit model
t0 = time.time()
SVCbest = LinearSVC(regParam = 0.01,
                   threshold = 0.5,
                   maxIter=200) 
bestModel = SVCbest.fit(B_ptrain)

# predict
predictions = bestModel.transform(B_ptest)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5,truncate=30)

# Evaluate
bevaluator = BinaryClassificationEvaluator(labelCol="label",rawPredictionCol="rawPrediction")
mevaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label')

# AUC
auc = bevaluator.evaluate(predictions, {bevaluator.metricName: "areaUnderROC"})

# Accuracy
acc = mevaluator.evaluate(predictions, {mevaluator.metricName: "accuracy"})

# F1
f1 = mevaluator.evaluate(predictions, {mevaluator.metricName: "f1"})

# recall/True Positive Rate
# metricLabel:The class whose metric will be computed in truePositiveRateByLabel|falsePositiveRateByLabel..default=0
# ONLY care about minority class(class=1) recall rate
recall = mevaluator.evaluate(predictions, {mevaluator.metricName: "recallByLabel",mevaluator.metricLabel: 1.0})

# fit + predict + eval time
elapsed = time.time() - t0

print(f"F1:{f1}\nAccuracy:{acc}\nAUC score:{auc}\nRecall:{recall}\nTime:{elapsed}")

22/03/12 21:02:46 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 10601 events from executorManagement since Sat Mar 12 20:33:01 UTC 2022.
22/03/12 21:03:46 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 53060 events from executorManagement since Sat Mar 12 21:02:46 UTC 2022.
                                                                                

+----------+-----+------------------------------+
|prediction|label|                      features|
+----------+-----+------------------------------+
|       0.0|    0|(10000,[0,9,14,28,31,40,44,...|
|       0.0|    0|(10000,[0,3,4,6,8,9,12,25,2...|
|       0.0|    0|(10000,[2,26,55,340,564,110...|
|       0.0|    0|(10000,[3,7,11,19,32,48,53,...|
|       0.0|    1|(10000,[70,288,2042],[1.0,1...|
+----------+-----+------------------------------+
only showing top 5 rows





F1:0.8918806837169349
Accuracy:0.9027089159189555
AUC score:0.9381077272401492
Recall:0.5148956817079088
Time:779.1942629814148
CPU times: user 2.65 s, sys: 1.81 s, total: 4.46 s
Wall time: 12min 59s


                                                                                

Coefficients Analysis

In [111]:
# intercept
print("Intercept: " + str(bestModel.intercept))

# when review contains no word

Intercept: -0.6260157480633176


In [112]:
coeffs = pd.DataFrame(bestModel.coefficients.toArray())
coeffs = coeffs.rename(columns = {0:"coeff"})
coeffs["importance"] = np.abs(coeffs["coeff"])
coeffs["vocabulary"] = count_model.vocabulary

coeffs.sort_values(by="importance",axis=0,ascending=False,inplace=True)
coeffs = coeffs.reset_index(drop=True)

In [113]:
# coeff>0:contribute to pos class--negative reviews
neg = coeffs[coeffs["coeff"]>0].sort_values(by="importance",ascending=False)
neg.head(100)

Unnamed: 0,coeff,importance,vocabulary
0,3.383875,3.383875,lacquer
5,2.17409,2.17409,couple week
9,1.942986,1.942986,look pretty
13,1.841766,1.841766,surely
14,1.827211,1.827211,love always
19,1.74392,1.74392,good facial
21,1.688967,1.688967,dryness
22,1.686986,1.686986,curl last day
24,1.661698,1.661698,hair heavy
26,1.634447,1.634447,pevonia


## Specific Product analysis

Specific brands: B00005J55C

In [13]:
specific = df2.filter(df2.asin == "B00005JS5C")

In [14]:
before_2012 = specific.filter(specific.year < 2012)
after_2012 = specific.filter(specific.year >= 2012)

In [18]:
before_2012.select("reviewText","overall","year","month").show(2,vertical=True,truncate=200)



-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 reviewText | THank YOu! I had been wanting something like this forever, I thought it only existed in my dreams!!!!!                                                                                                   
 overall    | 5.0                                                                                                                                                                                                      
 year       | 2005                                                                                                                                                                                                     
 month      | 8                                                                                                                         

                                                                                

In [19]:
after_2012.select("reviewText","overall","year","month").show(2,vertical=True,truncate=200)



-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------
 reviewText | It works.                                                                                                                                                
 overall    | 5.0                                                                                                                                                      
 year       | 2018                                                                                                                                                     
 month      | 1                                                                                                                                                        
-RECORD 1-------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [15]:
before_2012.count()

                                                                                

96

In [16]:
after_2012.count()

                                                                                

2153

In [20]:
# YAKE pipeline
stopwords = StopWordsCleaner().getStopWords()
document = DocumentAssembler() \
    .setInputCol("reviewText") \
    .setOutputCol("document")

sentenceDetector = SentenceDetector() \
    .setInputCols("document") \
    .setOutputCol("sentence")

token = Tokenizer() \
    .setInputCols("sentence") \
    .setOutputCol("token") 

lemmatizer = LemmatizerModel.pretrained('lemma_antbnc',"en")\
    .setInputCols(["token"])\
    .setOutputCol("lemma")

keywords = YakeKeywordExtraction() \
    .setInputCols("lemma") \
    .setOutputCol("keywords") \
    .setMinNGrams(1) \
    .setMaxNGrams(3)\
    .setNKeywords(20)\
    .setStopWords(stopwords)

yake_pipeline = Pipeline(stages=[document, sentenceDetector, token,lemmatizer, keywords])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.


[Stage 77:>                                                         (0 + 1) / 1]

[ / ]



[ — ]

                                                                                

[OK!]


In [21]:
empty_df = spark.createDataFrame([['']]).toDF("reviewText")

yake_Model = yake_pipeline.fit(empty_df)
before_kws = yake_Model.transform(before_2012)
after_kws = yake_Model.transform(after_2012)

In [22]:
before_kws.select("reviewText","keywords.result").show(2,vertical=True,truncate=200)



-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 reviewText | THank YOu! I had been wanting something like this forever, I thought it only existed in my dreams!!!!!                                                                                                   
 result     | [thank, want, something, like, forever, think, exist, dream, want something, something like, want something like, like this forever]                                                                     
-RECORD 1--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 reviewText | I bought this for my wife because she complained of using a razor.  6 months later the thing just decides to stop working 

                                                                                

In [23]:
after_kws.select("reviewText","keywords.result").show(2,vertical=True,truncate=200)



-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 reviewText | It works.                                                                                                                                                                                    
 result     | [work]                                                                                                                                                                                       
-RECORD 1--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 reviewText | It does the job but I wish it could shave closer. So, I want to try maybe a men's electric razor so maybe I could skip the shaving part when I use this.                  

                                                                                

In [24]:
before_pdf = before_kws.select("keywords.result","brand").toPandas()
before_pdf.to_csv("gs://classificationdata112/spec_before.csv",index=False)

after_pdf = after_kws.select("keywords.result","brand").toPandas()
after_pdf.to_csv("gs://classificationdata112/spec_after.csv",index=False)

                                                                                

## Digress to brands analysis
- TOP brands: Lots of review & high star rating
    - which we can learn from
- LOW brands: Lots of review BUT low star rating
    - how to help them improve

### Beauty brands

In [23]:
B.columns

['asin', 'overall', 'verified', 'Category', 'brand', 'label', 'text']

In [39]:
# TOP brands
top_brands = B.groupBy("brand")\
            .agg(F.count("asin").alias("cnt"),\
                 F.avg("overall").alias("avg_rating"))\
            .orderBy(F.col("cnt").desc(),F.col("avg_rating").desc())

In [47]:
# LOW brands
# Add a threshold to num of counts
# <500 reviews are not informative
bot_brands = B.groupBy("brand")\
            .agg(F.count("asin").alias("cnt"),\
                 F.avg("overall").alias("avg_rating"))\
            .where(F.col("cnt")>500).where(~F.col("brand").isNull())\
            .orderBy(F.col("avg_rating").asc(),F.col("cnt").desc()) #rating small->big & review cnt big->small

In [42]:
top_brands.show(7)



+---------------+-----+-----------------+
|          brand|  cnt|       avg_rating|
+---------------+-----+-----------------+
|               |50069|3.907407777267371|
|       Waterpik|15950|4.494796238244514|
|Philips Norelco|11243|4.140087165347327|
|           null| 4822| 4.81625881377022|
|          Astra| 4356| 4.63475665748393|
|Pre de Provence| 3102|4.545454545454546|
|       Aquaphor| 2772|4.695887445887446|
+---------------+-----+-----------------+
only showing top 7 rows



                                                                                

In [46]:
top5 = top_brands.select("brand").take(7) #list of rows
top5

                                                                                

[Row(brand=''),
 Row(brand='Waterpik'),
 Row(brand='Philips Norelco'),
 Row(brand=None),
 Row(brand='Astra'),
 Row(brand='Pre de Provence'),
 Row(brand='Aquaphor')]

In [49]:
top5_brands = [r["brand"] for r in top5]
top5_brands.pop(0) #pop null value
top5_brands.pop(2) #pop None
print(top5_brands)

top5_text = B.filter(B.brand.isin(top5_brands)).select("brand","text","overall")

['Waterpik', 'Philips Norelco', 'Astra', 'Pre de Provence', 'Aquaphor']


In [51]:
bot_brands.show(7)



+-----------+----+------------------+
|      brand| cnt|        avg_rating|
+-----------+----+------------------+
|    Keyzone| 538|3.3717472118959106|
|    General| 955|3.5518324607329843|
|      Tojwi| 582|3.6443298969072164|
|  Remington|1030| 3.681553398058252|
|      Crest|1532|3.6906005221932117|
|    Hittime| 645| 3.710077519379845|
|ArtNaturals| 697| 3.781922525107604|
+-----------+----+------------------+
only showing top 7 rows



                                                                                

In [66]:
# manually select brands based on both rating & count
bot5_brands = ["Remington","Crest","General","ArtNaturals","Hittime"]
bot5_text = B.filter(B.brand.isin(bot5_brands)).select("brand","text","overall")

Extract keywords using "YAKE"--Unsupervised, Corpus-Independent, Domain and Language-Independent and Single-Document keyword extraction algorithm.
- **not rely on dictionaries nor thesauri, neither is trained against any corpora**. Instead, it follows an unsupervised approach which **builds upon features extracted from the text**
- thus applicable to documents written in different languages without the need for further knowledge.
- Lower the score better the keyword

In [50]:
# YAKE pipeline
stopwords = StopWordsCleaner().getStopWords()
document = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

sentenceDetector = SentenceDetector() \
    .setInputCols("document") \
    .setOutputCol("sentence")

token = Tokenizer() \
    .setInputCols("sentence") \
    .setOutputCol("token") 

lemmatizer = LemmatizerModel.pretrained('lemma_antbnc',"en")\
    .setInputCols(["token"])\
    .setOutputCol("lemma")

keywords = YakeKeywordExtraction() \
    .setInputCols("lemma") \
    .setOutputCol("keywords") \
    .setMinNGrams(1) \
    .setMaxNGrams(3)\
    .setNKeywords(20)\
    .setStopWords(stopwords)

yake_pipeline = Pipeline(stages=[document, sentenceDetector, token,lemmatizer, keywords])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [53]:
empty_df = spark.createDataFrame([['']]).toDF("text")

yake_Model = yake_pipeline.fit(empty_df)
top5_kws = yake_Model.transform(top5_text)
bot5_kws = yake_Model.transform(bot5_text)

In [54]:
top5_kws.withColumn("tmp",
                    F.explode("keywords"))\
                    .select("tmp.*").select("result","metadata.score","metadata.sentence").show(20,truncate = False)

[Stage 132:>                                                        (0 + 1) / 1]

+--------+-------------------+--------+
|result  |score              |sentence|
+--------+-------------------+--------+
|replace |0.33314794283653154|0       |
|old     |0.32121341354607774|0       |
|norelco |0.3434944862586114 |0       |
|battery |0.7400971684685155 |0       |
|die     |0.7400971684685155 |0       |
|replace |0.33314794283653154|0       |
|make    |0.7822897391776581 |1       |
|sure    |0.7822897391776581 |1       |
|pick    |0.7822897391776581 |1       |
|model   |0.7822897391776581 |1       |
|use     |0.35793488361448167|1       |
|either  |0.7822897391776581 |1       |
|recharge|0.7822897391776581 |1       |
|plug    |0.7822897391776581 |1       |
|directly|0.7822897391776581 |1       |
|nice    |0.23550875425103696|2       |
|turbo   |0.8066379851555865 |2       |
|nice    |0.23550875425103696|2       |
|get     |0.8066379851555865 |2       |
|cut     |0.8066379851555865 |2       |
+--------+-------------------+--------+
only showing top 20 rows



                                                                                

In [56]:
top5_kws.select("text").show(1,truncate = False)

[Stage 137:>                                                        (0 + 1) / 1]

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                                                                                                                                                                                  |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [57]:
# save to GCS
top5b_pdf = top5_kws.select("keywords.result","brand").toPandas()
top5b_pdf.to_csv("gs://classificationdata112/brand_top5.csv",index=False)

                                                                                

In [60]:
bot5_kws.withColumn("tmp",
                    F.explode("keywords"))\
                    .select("tmp.*").select("result","metadata.score","metadata.sentence").show(20,truncate = False)



+-----------+------------------+--------+
|result     |score             |sentence|
+-----------+------------------+--------+
|shaver     |0.744138854039555 |0       |
|many       |0.3707058744476707|0       |
|year       |0.744138854039555 |0       |
|work       |0.3707058744476707|0       |
|wonderfully|0.744138854039555 |0       |
|finally    |0.7858651376729936|1       |
|die        |0.7858651376729936|1       |
|find       |0.3707058744476707|1       |
|another    |0.7858651376729936|1       |
|one        |0.7858651376729936|1       |
|well       |0.3402730661923907|1       |
|try        |0.8099106823316435|2       |
|many       |0.3707058744476707|2       |
|different  |0.8099106823316435|2       |
|style      |0.8099106823316435|2       |
|find       |0.3707058744476707|2       |
|happy      |0.8258854983232315|3       |
|work       |0.3707058744476707|3       |
|well       |0.3402730661923907|3       |
|many year  |1.2307152451676047|0       |
+-----------+------------------+--

                                                                                

In [61]:
bot5_kws.select("text").show(1,truncate = False)



+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|i had this shaver for many years and it worked wonderfully. when it finally "died" i could not find another one that did as well. so after trying many different styles, i found this. i am so happy and it works so well.|
+-------------------------------------------------------------------------------------------------------------------

                                                                                

In [68]:
# save to GCS
bot5b_pdf = bot5_kws.select("keywords.result","brand").toPandas()
bot5b_pdf.to_csv("gs://classificationdata112/brand_bot5.csv",index=False)

                                                                                

### Luxury Beauty brands

In [81]:
top = FA.groupBy("brand")\
            .agg(F.count("asin").alias("cnt"),\
                 F.avg("overall").alias("avg_rating"))\
            .where(~F.col("brand").isNull())\
            .orderBy(F.col("cnt").desc(),F.col("avg_rating").desc()) #rating small->big & review cnt big->small

In [87]:
top.take(15)

                                                                                

[Row(brand='Allegra K', cnt=11805, avg_rating=3.7612875900042355),
 Row(brand='i play.', cnt=9283, avg_rating=4.543789723149843),
 Row(brand='MJ Metals Jewelry', cnt=4890, avg_rating=4.828220858895706),
 Row(brand='Pierced Owl', cnt=3341, avg_rating=3.454354983537863),
 Row(brand='Scarleton', cnt=2665, avg_rating=4.300562851782364),
 Row(brand='Ninimour', cnt=2107, avg_rating=3.3663977218794496),
 Row(brand='BRYK', cnt=1854, avg_rating=4.347357065803668),
 Row(brand='LaSuiveur', cnt=1814, avg_rating=3.8092613009922824),
 Row(brand='Vans', cnt=1714, avg_rating=4.477829638273046),
 Row(brand='BodyJ4You', cnt=1691, avg_rating=4.377291543465405),
 Row(brand='Amazon Collection', cnt=1652, avg_rating=4.4291767554479415),
 Row(brand='Zmart', cnt=1490, avg_rating=3.92751677852349),
 Row(brand='HDE', cnt=1403, avg_rating=3.736992159657876),
 Row(brand='Fawziya', cnt=1375, avg_rating=4.637090909090909),
 Row(brand='WearMe Pro', cnt=1345, avg_rating=4.2698884758364315)]

In [88]:
# TOP brands
top5_brands = ["MJ Metals Jewelry","i play.","Fawziya","Vans","Amazon Collection"] #Fashion的评分普遍偏低--打分人数多不代表rating高
top5_text = FA.filter(FA.brand.isin(top5_brands)).select("brand","text","overall")

In [89]:
# transform dataframe
top5_kws_fa = yake_Model.transform(top5_text)

In [93]:
# save to GCS
top5_fa = top5_kws_fa.select("keywords.result","brand").toPandas()
top5_fa.to_csv("gs://classificationdata112/brand_top5_FA.csv",index=False)

                                                                                

In [91]:
# Low brands
bot5_brands = ["Allegra K","LaSuiveur","Ninimour","Pierced Owl","HDE"] #4分以下的
bot5_text = FA.filter(FA.brand.isin(bot5_brands)).select("brand","text","overall")

# transform dataframe
bot5_kws_fa = yake_Model.transform(bot5_text)

In [94]:
# save to GCS
bot5_fa = bot5_kws_fa.select("keywords.result","brand").toPandas()
bot5_fa.to_csv("gs://classificationdata112/brand_bot5_FA.csv",index=False)

                                                                                