### Start instance, read in libraries, and change configuration settings

In [1]:
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.types import *
from pyspark.sql.functions import col, monotonically_increasing_id, lower

from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import LDA

import matplotlib.pyplot as plt
%matplotlib inline

#create Spark session
spark = SparkSession.builder.appName('BDLDA').getOrCreate()

# Change configuration settings on Spark 
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), \
                                        ('spark.executor.memoryOverhead', '8g'), \
                                        ('spark.app.name', 'Spark Updated Conf'), \
                                        ('spark.executor.cores', '8'), \
                                        ('spark.cores.max', '4'), \
                                        ('spark.driver.memory','4g')])

### Data cleaning

In [2]:
# Reading in the data
df_lda = spark.read.json("gs://bdprojectfinal/df_full_feature_data")

In [3]:
# Drop unneeded columns
df_lda = df_lda.drop("Year", "also_buy", "also_view", "category", "description", "details", "fit", \
                     "main_cat", "rank", "reviewTime", "reviewerID", "reviewerName", \
                     "summary", "title", "unixReviewTime", "verified", "vote")

In [4]:
df_lda.printSchema()

root
 |-- asin: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- feature: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overall: double (nullable = true)
 |-- price: string (nullable = true)
 |-- reviewText: string (nullable = true)



In [5]:
# Drop rows with nulls
df_lda = df_lda.where(col("reviewText").isNotNull())

In [7]:
# Set reviewText to lowercase
df_lda = df_lda.withColumn('reviewText', lower(col('reviewText')))

# Separate based on rating
df_lda1 = df_lda.filter(df_lda.overall.isin(1))
df_lda2 = df_lda.filter(df_lda.overall.isin(2))
df_lda3 = df_lda.filter(df_lda.overall.isin(3))
df_lda4 = df_lda.filter(df_lda.overall.isin(4))
df_lda5 = df_lda.filter(df_lda.overall.isin(5))

### Define tokenizer, remover, CV, and assembler

In [8]:
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
cv = CountVectorizer(inputCol = "filtered", outputCol = "features")
assembler = VectorAssembler(inputCols = ["features"], outputCol = "vectors")

### Transform the subsets of data

In [9]:
df_lda1 = tokenizer.transform(df_lda1) #tokenize
df_lda1 = remover.transform(df_lda1) #remove stopwords
df_lda1 = df_lda1.drop("words") #drop redundant source column
df_lda1 = df_lda1.drop("overall") #drop rating column
cv1 = cv.fit(df_lda1)
df_lda1 = cv1.transform(df_lda1) #CV
df_lda1 = df_lda1.where(col("features").isNotNull()) #drop any rows with no features

In [10]:
df_lda2 = tokenizer.transform(df_lda2) #tokenize
df_lda2 = remover.transform(df_lda2) #remove stopwords
df_lda2 = df_lda2.drop("words") #drop redundant source column
df_lda2 = df_lda2.drop("overall") #drop rating column
cv2 = cv.fit(df_lda2)
df_lda2 = cv2.transform(df_lda2) #CV
df_lda2 = df_lda2.where(col("features").isNotNull()) #drop any rows with no features

In [11]:
df_lda3 = tokenizer.transform(df_lda3) #tokenize
df_lda3 = remover.transform(df_lda3) #remove stopwords
df_lda3 = df_lda3.drop("words") #drop redundant source column
df_lda3 = df_lda3.drop("overall") #drop rating column
cv3 = cv.fit(df_lda3)
df_lda3 = cv3.transform(df_lda3) #CV
df_lda3 = df_lda3.where(col("features").isNotNull()) #drop any rows with no features

In [12]:
df_lda4 = tokenizer.transform(df_lda4) #tokenize
df_lda4 = remover.transform(df_lda4) #remove stopwords
df_lda4 = df_lda4.drop("words") #drop redundant source column
df_lda4 = df_lda4.drop("overall") #drop rating column
cv4 = cv.fit(df_lda4)
df_lda4 = cv4.transform(df_lda4) #CV
df_lda4 = df_lda4.where(col("features").isNotNull()) #drop any rows with no features

In [13]:
df_lda5 = tokenizer.transform(df_lda5) #tokenize
df_lda5 = remover.transform(df_lda5) #remove stopwords
df_lda5 = df_lda5.drop("words") #drop redundant source column
df_lda5 = df_lda5.drop("overall") #drop rating column
cv5 = cv.fit(df_lda5)
df_lda5 = cv5.transform(df_lda5) #CV
df_lda5 = df_lda5.where(col("features").isNotNull()) #drop any rows with no features

### Encode the data

In [14]:
encodedDataCV1 = assembler.transform(df_lda1)
encodedDataCV2 = assembler.transform(df_lda2)
encodedDataCV3 = assembler.transform(df_lda3)
encodedDataCV4 = assembler.transform(df_lda4)
encodedDataCV5 = assembler.transform(df_lda5)

### Create dictionaries

In [15]:
vocab1 = cv1.vocabulary
vocab2 = cv2.vocabulary
vocab3 = cv3.vocabulary
vocab4 = cv4.vocabulary
vocab5 = cv5.vocabulary

### Train the LDA model

https://spark.apache.org/docs/latest/ml-clustering.html#latent-dirichlet-allocation-lda

In [16]:
# Define settings for LDA
num_topics = 5
max_iterations = 10 #default 20
seed_num = 47

In [17]:
# Train the LDA model
lda1 = LDA(k = num_topics, maxIter = max_iterations)
model1 = lda1.fit(encodedDataCV1).setSeed(seed_num)

In [18]:
# Train the LDA model
lda2 = LDA(k = num_topics, maxIter = max_iterations)
model2 = lda2.fit(encodedDataCV2).setSeed(seed_num)

In [19]:
# Train the LDA model
lda3 = LDA(k = num_topics, maxIter = max_iterations)
model3 = lda3.fit(encodedDataCV3).setSeed(seed_num)

In [20]:
# Train the LDA model
lda4 = LDA(k = num_topics, maxIter = max_iterations)
model4 = lda4.fit(encodedDataCV4).setSeed(seed_num)

In [22]:
# Train the LDA model
lda5 = LDA(k = num_topics, maxIter = max_iterations)
model5 = lda5.fit(encodedDataCV5).setSeed(seed_num)

### Look at results of LDA

In [23]:
# Describe topics.
# Returns DF that has mapping of term weights to term indices for each topic: https://stackoverflow.com/questions/49613772/get-automatic-topic-labels-from-lda-topic-model-in-apache-spark

#model1.describeTopics(maxTermsPerTopic = 15).show(truncate = False)

topics1 = model1.describeTopics()
print("The topics described by their top-weighted terms:")
topics1.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|termIndices                          |termWeights                                                                                                                                                                                                                      |
+-----+-------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[3, 2, 7, 1, 4, 5, 8, 16, 18, 9]     |[0.010383823694035992, 0.009801416884523788, 0.008655418909221531, 0.008527397447768273, 

In [24]:
# Describe topics.
topics2 = model2.describeTopics()
print("The topics described by their top-weighted terms:")
topics2.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|termIndices                        |termWeights                                                                                                                                                                                                                         |
+-----+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[3, 1, 4, 2, 9, 19, 18, 16, 37, 61]|[0.003808405425328728, 0.003475444028127159, 0.002547181772838547, 0.002350110708034457,

In [25]:
# Describe topics.
topics3 = model3.describeTopics()
print("The topics described by their top-weighted terms:")
topics3.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|termIndices                        |termWeights                                                                                                                                                                                                                         |
+-----+-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[1, 2, 3, 4, 0, 7, 5, 10, 9, 18]   |[0.014428795081538624, 0.012758352646239231, 0.011672549287216193, 0.008290459628116817,

In [26]:
# Describe topics.
topics4 = model4.describeTopics()
print("The topics described by their top-weighted terms:")
topics4.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|termIndices                        |termWeights                                                                                                                                                                                                                      |
+-----+-----------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[2, 4, 3, 1, 9, 6, 7, 0, 5, 8]     |[0.01430790902945976, 0.013335805892846641, 0.0120635379700435, 0.009412660684668459, 0.008648110

In [27]:
# Describe topics.
topics5 = model5.describeTopics()
print("The topics described by their top-weighted terms:")
topics5.show(truncate=False)

The topics described by their top-weighted terms:
+-----+------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|termIndices                         |termWeights                                                                                                                                                                                                                      |
+-----+------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[2, 19, 4, 3, 27, 18, 12, 1, 67, 98]|[0.04057304126496658, 0.02271974158678895, 0.021291487298591562, 0.015264697175333177, 0.0131

### Look up values in dictionary

#### 1 star reviews

In [28]:
# Show only the indices
t1indices = topics1.select('termIndices')
t1indices.show(truncate = False)

+-------------------------------------+
|termIndices                          |
+-------------------------------------+
|[3, 2, 7, 1, 4, 5, 8, 16, 18, 9]     |
|[0, 2, 1, 13, 4, 5, 6, 11, 9, 12]    |
|[0, 1, 31, 14, 19, 3, 6, 52, 5, 10]  |
|[0, 1, 3, 14, 25, 7, 10, 44, 42, 66] |
|[0, 1, 14, 11, 32, 6, 50, 75, 20, 47]|
+-------------------------------------+



In [29]:
# Save indices as separate vectors
t1indices0 = [3, 2, 7, 1, 4, 5, 8, 16, 18, 9]
t1indices1 = [0, 2, 1, 13, 4, 5, 6, 11, 9, 12]
t1indices2 = [0, 1, 31, 14, 19, 3, 6, 52, 5, 10]
t1indices3 = [0, 1, 3, 14, 25, 7, 10, 44, 42, 66]
t1indices4 = [0, 1, 14, 11, 32, 6, 50, 75, 20, 47]

In [30]:
for i in t1indices0:
    print(vocab1[i])

one
size
small
like
fit
ordered
way
first
time
wear


In [31]:
for i in t1indices1:
    print(vocab1[i])


size
like
return
fit
ordered
even
shoes
wear
get


In [32]:
for i in t1indices2:
    print(vocab1[i])


like
product
quality
cheap
one
even
see
ordered
bought


In [33]:
for i in t1indices3:
    print(vocab1[i])


like
one
quality
received
small
bought
returned
big
fell


In [34]:
for i in t1indices4:
    print(vocab1[i])


like
quality
shoes
looks
even
poor
quality.
buy
waste


#### 2 star reviews

In [35]:
# Show only the indices
t2indices = topics2.select('termIndices')
t2indices.show(truncate = False)

+-----------------------------------+
|termIndices                        |
+-----------------------------------+
|[3, 1, 4, 2, 9, 19, 18, 16, 37, 61]|
|[0, 1, 2, 3, 4, 6, 5, 7, 8, 9]     |
|[0, 1, 3, 2, 22, 8, 1656, 20, 5, 4]|
|[0, 1, 7, 56, 8, 36, 2, 14, 3, 17] |
|[0, 1, 2, 9, 11, 5, 6, 24, 4, 3]   |
+-----------------------------------+



In [36]:
# Save indices as separate vectors
t2indices0 = [3, 1, 4, 2, 9, 19, 18, 16, 37, 61]
t2indices1 = [0, 1, 2, 3, 4, 6, 5, 7, 8, 9]
t2indices2 = [0, 1, 3, 2, 22, 8, 1656, 20, 5, 4]
t2indices3 = [0, 1, 7, 56, 8, 36, 2, 14, 3, 17]
t2indices4 = [0, 1, 2, 9, 11, 5, 6, 24, 4, 3]

In [37]:
for i in t2indices0:
    print(vocab2[i])

fit
like
one
size
really
shoe
material
look
big
dress


In [38]:
for i in t2indices1:
    print(vocab2[i])


like
size
fit
one
small
wear
way
ordered
really


In [39]:
for i in t2indices2:
    print(vocab2[i])


like
fit
size
much
ordered
la
small.
wear
one


In [40]:
for i in t2indices3:
    print(vocab2[i])


like
way
-
ordered
little
size
quality
fit
it.


In [41]:
for i in t2indices4:
    print(vocab2[i])


like
size
really
shoes
wear
small
return
one
fit


#### 3 star reviews

In [42]:
# Show only the indices
t3indices = topics3.select('termIndices')
t3indices.show(truncate = False)

+-----------------------------------+
|termIndices                        |
+-----------------------------------+
|[1, 2, 3, 4, 0, 7, 5, 10, 9, 18]   |
|[0, 8, 6, 12, 14, 1, 22, 54, 2, 13]|
|[88, 6, 0, 2, 5, 1, 8, 7, 25, 32]  |
|[0, 1, 3, 2, 8, 5, 20, 48, 17, 6]  |
|[0, 11, 2, 5, 27, 6, 9, 7, 16, 17] |
+-----------------------------------+



In [43]:
# Save indices as separate vectors
t3indices0 = [1, 2, 3, 4, 0, 7, 5, 10, 9, 18]
t3indices1 = [0, 8, 6, 12, 14, 1, 22, 54, 2, 13]
t3indices2 = [88, 6, 0, 2, 5, 1, 8, 7, 25, 32]
t3indices3 = [0, 1, 3, 2, 8, 5, 20, 48, 17, 6]
t3indices4 = [0, 11, 2, 5, 27, 6, 9, 7, 16, 17]

In [44]:
for i in t3indices0:
    print(vocab3[i])

size
like
fit
wear

small
little
ordered
really
way


In [45]:
for i in t3indices1:
    print(vocab3[i])


one
good
get
quality
size
it.
first
like
great


In [46]:
for i in t3indices2:
    print(vocab3[i])

ok
good

like
little
size
one
small
color
looks


In [47]:
for i in t3indices3:
    print(vocab3[i])


size
fit
like
one
little
love
shirt
material
good


In [48]:
for i in t3indices4:
    print(vocab3[i])


nice
like
little
much
good
really
small
look
material


#### 4 star reviews

In [49]:
# Show only the indices
t4indices = topics4.select('termIndices')
t4indices.show(truncate = False)

+-----------------------------------+
|termIndices                        |
+-----------------------------------+
|[2, 4, 3, 1, 9, 6, 7, 0, 5, 8]     |
|[0, 6, 7, 139, 2, 4, 5, 30, 19, 12]|
|[0, 5, 1, 6, 3, 8, 7, 23, 16, 11]  |
|[1, 0, 6, 7, 5, 3, 9, 4, 13, 17]   |
|[2, 0, 9, 3, 8, 1, 6, 14, 10, 15]  |
+-----------------------------------+



In [50]:
# Save indices as separate vectors
t4indices0 = [2, 4, 3, 1, 9, 6, 7, 0, 5, 8]
t4indices1 = [0, 6, 7, 139, 2, 4, 5, 30, 19, 12]
t4indices2 = [0, 5, 1, 6, 3, 8, 7, 23, 16, 11]
t4indices3 = [1, 0, 6, 7, 5, 3, 9, 4, 13, 17]
t4indices4 = [2, 0, 9, 3, 8, 1, 6, 14, 10, 15]

In [51]:
for i in t4indices0:
    print(vocab4[i])

size
little
fit
like
love
great
nice

good
wear


In [52]:
for i in t4indices1:
    print(vocab4[i])


great
nice
watch
size
little
good
big
quality
one


In [53]:
for i in t4indices2:
    print(vocab4[i])


good
like
great
fit
wear
nice
shoes
well
really


In [54]:
for i in t4indices3:
    print(vocab4[i])

like

great
nice
good
fit
love
little
fits
small


In [55]:
for i in t4indices4:
    print(vocab4[i])

size

love
fit
wear
like
great
ordered
bit
comfortable


#### 5 star reviews

In [56]:
# Show only the indices
t5indices = topics5.select('termIndices')
t5indices.show(truncate = False)

+------------------------------------+
|termIndices                         |
+------------------------------------+
|[2, 19, 4, 3, 27, 18, 12, 1, 67, 98]|
|[0, 1, 2, 5, 7, 6, 3, 12, 16, 11]   |
|[0, 3, 10, 1, 8, 2, 4, 14, 9, 11]   |
|[2, 14, 27, 522, 8, 4, 0, 3, 5, 45] |
|[0, 5, 9, 1, 86, 19, 77, 10, 2, 3]  |
+------------------------------------+



In [57]:
# Save indices as separate vectors
t5indices0 = [2, 19, 4, 3, 27, 18, 12, 1, 67, 98]
t5indices1 = [0, 1, 2, 5, 7, 6, 3, 12, 16, 11]
t5indices2 = [0, 3, 10, 1, 8, 2, 4, 14, 9, 11]
t5indices3 = [2, 14, 27, 522, 8, 4, 0, 3, 5, 45]
t5indices4 = [0, 5, 9, 1, 86, 19, 77, 10, 2, 3]

In [58]:
for i in t5indices0:
    print(vocab5[i])

great
loves
good
fit
loved
it.
bought
love
daughter
son


In [59]:
for i in t5indices1:
    print(vocab5[i])


love
great
like
wear
size
fit
bought
one
fits


In [60]:
for i in t5indices2:
    print(vocab5[i])


fit
nice
love
perfect
great
good
quality
comfortable
fits


In [61]:
for i in t5indices3:
    print(vocab5[i])

great
quality
loved
described
perfect
good

fit
like
exactly


In [62]:
for i in t5indices4:
    print(vocab5[i])


like
comfortable
love
excellent
loves
husband
nice
great
fit


### Transformed results

In [63]:
# Shows the result
# Shows the weight for each topic
transformed1 = model1.transform(encodedDataCV1)
transformed1.show(5)

+----------+-----------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|      brand|             feature|           price|          reviewText|            filtered|            features|             vectors|   topicDistribution|
+----------+-----------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|B0001YRW3I|       null|[100% cotton, Pac...|            null|   did not like them|              [like]|  (124872,[1],[1.0])|  (124872,[1],[1.0])|[0.10992527110721...|
|B0002M9DO0|STABILicers|[MAXX: Winter can...|$34.99 - $101.79|size is way off t...|   [size, way, fit!]|(124872,[2,8,1992...|(124872,[2,8,1992...|[0.81431960889390...|
|B0003DKV80|      Perry|[Elastic, Made in...| $11.29 - $16.76|not tall. tried t...|[tall., tried, tw...|(124872,[6,12,17,...|(124872,[6,12,17,...|[0.93861272013

In [64]:
transformed2 = model2.transform(encodedDataCV2)
transformed2.show(5)

+----------+-----------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|      brand|             feature|           price|          reviewText|            filtered|            features|             vectors|   topicDistribution|
+----------+-----------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|6030555170|  Lady olga|[100% Polyester, ...| $26.80 - $26.94|definitely too sm...|[definitely, smal...|(99842,[3,6,24,85...|(99842,[3,6,24,85...|[0.00686013754254...|
|B0000WL750|       null|[100% Cotton, Imp...|$54.99 - $104.67|after a few washi...|[washings, leg, s...|(99842,[76,120,15...|(99842,[76,120,15...|[0.00960849801255...|
|B0002M9DO0|STABILicers|[MAXX: Winter can...|$34.99 - $101.79|work really well....|[work, really, we...|(99842,[9,23,27,6...|(99842,[9,23,27,6...|[0.00752931301

In [65]:
transformed3 = model3.transform(encodedDataCV3)
transformed3.show(5)

+----------+-----+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|brand|             feature|           price|          reviewText|            filtered|            features|             vectors|   topicDistribution|
+----------+-----+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|B0000WL750| null|[100% Cotton, Imp...|$54.99 - $104.67|order 2 sizes lar...|[order, 2, sizes,...|(123526,[4,49,73,...|(123526,[4,49,73,...|[0.88002741566708...|
|B0000WL750| null|[100% Cotton, Imp...|$54.99 - $104.67|carhartt has alwa...|[carhartt, always...|(123526,[3,6,13,2...|(123526,[3,6,13,2...|[0.98062815965508...|
|B0000WL750| null|[100% Cotton, Imp...|$54.99 - $104.67|fits well as long...|[fits, well, long...|(123526,[0,2,16,3...|(123526,[0,2,16,3...|[0.60707282594841...|
|B0002M4VBA| null|[100% Cott

In [66]:
transformed4 = model4.transform(encodedDataCV4)
transformed4.show(5)

+----------+---------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|    brand|             feature|           price|          reviewText|            filtered|            features|             vectors|   topicDistribution|
+----------+---------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|6030555170|Lady olga|[100% Polyester, ...| $26.80 - $26.94|           i love it|              [love]|  (167604,[9],[1.0])|  (167604,[9],[1.0])|[0.62690480734490...|
|B00001OGXK|     null|[Glass, This Aust...|            null|did the job. won ...|[job., won, first...|(167604,[71,501,5...|(167604,[71,501,5...|[0.89570390469220...|
|B0000WL750|     null|[100% Cotton, Imp...|$54.99 - $104.67|great product. i ...|[great, product.,...|(167604,[3,6,8,18...|(167604,[3,6,8,18...|[0.95971429254605...|
|B00

In [67]:
transformed5 = model5.transform(encodedDataCV5)
transformed5.show(5)

+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      asin|    brand|             feature|            price|          reviewText|            filtered|            features|             vectors|   topicDistribution|
+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|6030555170|Lady olga|[100% Polyester, ...|  $26.80 - $26.94|this bed jacket w...|[bed, jacket, wor...|(262144,[13,148,3...|(262144,[13,148,3...|[0.02099007203407...|
|6030555170|Lady olga|[100% Polyester, ...|  $26.80 - $26.94|       mom loved it.|   [mom, loved, it.]|(262144,[18,27,51...|(262144,[18,27,51...|[0.80887541889690...|
|B0000E02V7|     null|[100% Leather, Im...|$107.94 - $188.00|nice boots as alw...|[nice, boots, alw...|(262144,[10,92,10...|(262144,[10,92,10...|[0.04748302397271...