### Import data

In [1]:
%load_ext autotime

time: 0 ns (started: 2021-05-03 13:08:47 -04:00)


In [2]:
import sparknlp
# sparknlp.start() returns the Spark session; every later read in this notebook goes through `spark`.
spark = sparknlp.start() 

time: 15.4 s (started: 2021-05-03 13:08:47 -04:00)


In [4]:
# Read the Yelp review dump and keep only the three columns used downstream.
reviews = (
    spark.read
    .json('../yelp_academic_dataset_review.json')
    .select('business_id', 'text', 'stars')
)
# Number of reviews in the full dataset.
reviews.count()

8021122

time: 21.6 s (started: 2021-05-03 13:15:21 -04:00)


The dataset is 5.89 GB, while the project requires a dataset of at least 500 MB. The dataset has ~8,000,000 rows. By the calculation below, a dataset ~12x smaller, i.e. one with ~680,000 rows, will satisfy the 500 MB requirement.

In [5]:
# Solve 5.89 GB / x = 0.5 GB for the shrink factor x.
full_size_gb, target_size_gb = 5.89, 0.5
x = full_size_gb / target_size_gb
print(x)
# Number of rows (out of 8,021,122) that corresponds to a ~500 MB slice.
print(int(8021122 / x))

11.78
680910
time: 0 ns (started: 2021-05-03 13:15:43 -04:00)


Convert the stars column to a string so that the logistic regression below treats one- vs. five-star ratings as a classification label rather than as a numeric regression target.

In [6]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType

# Re-type the rating as text so later stages handle it as a categorical label.
stars_as_text = col('stars').cast(StringType())
reviews = reviews.withColumn('stars', stars_as_text)

time: 47 ms (started: 2021-05-03 13:15:43 -04:00)


### Subset data for Databricks

In [7]:
# x = (1/10)
# subset_df, large_df = reviews.randomSplit([x, 1 - x])

time: 0 ns (started: 2021-05-03 13:15:43 -04:00)


In [8]:
# subset_df.coalesce(1).write.format('json').save('reviews_1-10.json')

time: 0 ns (started: 2021-05-03 13:15:43 -04:00)


### Back to Jupyter
Merge the businesses and reviews datasets, keep only reviews of restaurants, and filter to one- or five-star reviews (balanced to equal class sizes). This leaves us with ~1,250,000 rows.

In [9]:
# Read the Yelp business dump; only the id (join key) and the category string are needed.
businesses = (
    spark.read
    .json('../yelp_academic_dataset_business.json')
    .select('business_id', 'categories')
)

time: 1.69 s (started: 2021-05-03 13:15:43 -04:00)


In [10]:
# Keep businesses whose category string mentions 'Restaurants'.
is_restaurant = businesses.categories.contains('Restaurants')
restaurants = businesses.filter(is_restaurant)
# Attach reviews to those businesses, then keep only the review text and its rating.
restaurant_reviews = (
    reviews
    .join(restaurants, "business_id", "inner")
    .select('text', 'stars')
)
restaurant_reviews.count()

5055992

time: 11 s (started: 2021-05-03 13:15:44 -04:00)


In [11]:
# Split out the two extreme ratings; the `stars` column is compared against numeric literals.
stars_col = restaurant_reviews.stars
one_stars = restaurant_reviews.filter(stars_col == 1.0)
five_stars = restaurant_reviews.filter(stars_col == 5.0)
# Cap the five-star set at the size of the one-star set so both classes are equally represented.
num_one_stars = one_stars.count()
five_stars = five_stars.limit(num_one_stars)
one_or_five_stars = five_stars.union(one_stars)
# Size of the combined set (assumes there are at least as many five-star as one-star reviews).
num_one_stars * 2

1256088

time: 12.7 s (started: 2021-05-03 13:15:56 -04:00)


Convert "1" and "5"-star reviews to 0 and 1 for classification.

In [12]:
from pyspark.ml.feature import StringIndexer

# Encode the rating string as a numeric label in `categoryIndex`.
# The default ordering is by label frequency; because the two classes were balanced to
# equal counts, that ordering depends on tie-breaking. Alphabetical ordering pins the
# mapping explicitly: the one-star label ('1.0') -> 0.0 and the five-star label ('5.0') -> 1.0.
indexer = StringIndexer(inputCol = 'stars', outputCol = 'categoryIndex',
                        stringOrderType = 'alphabetAsc')
indexed = indexer.fit(one_or_five_stars).transform(one_or_five_stars)

time: 28.8 s (started: 2021-05-03 13:16:08 -04:00)


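By the calculation below, this one- or five-star dataset would need to be cut to about half (~54%) to meet the 500 MB requirement. Note that the split cell that follows actually keeps a fraction of 0.25, which yields ~315,000 rows.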

In [13]:
# ~680,910 rows correspond to ~500 MB, so solve 1,256,088 * x = 680,910 for the fraction x.
print(680_910 / 1_256_088)

0.5420878155033724
time: 0 ns (started: 2021-05-03 13:16:37 -04:00)


In [14]:
# Fraction of the balanced, indexed dataset kept for modelling.
# NOTE(review): the ratio computed just above is ~0.54 for the 500 MB target, while 0.25
# is used here — confirm which fraction is intended.
x = 0.25
# A fixed seed makes the sample (and every number derived from it) reproducible across kernel restarts.
subset_df, large_df = indexed.randomSplit([x, 1 - x], seed = 42)
subset_count = subset_df.count()
print(subset_count)

314693
time: 38.8 s (started: 2021-05-03 13:16:37 -04:00)


An example review from the final subsetted dataset.

In [15]:
# Pull a single row to the driver to eyeball the text, the star label and its numeric index.
subset_df.take(1)

[Row(text='! The food is very tasty!  Every dish we have savored with my family has been simply spectacular!  Totally delighted with this place!\n Oysters and aguachiles !!  Omg a delight !!!', stars='5.0', categoryIndex=1.0)]

time: 22 s (started: 2021-05-03 13:17:16 -04:00)


### Spark NLP

In [16]:
# Build NLP preprocessing pipeline
from nltk.corpus import stopwords
from sparknlp.annotator import (
    LemmatizerModel,
    NGramGenerator,
    Normalizer,
    StopWordsCleaner,
    Tokenizer,
)
from sparknlp.base import DocumentAssembler, Finisher

# Wrap the raw 'text' column as a Spark NLP document annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Tokenize; parentheses are configured as context characters and '-' as a split character.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('tokenized')
    .setContextChars(['(', ')'])
    .setSplitChars(['-'])
)

# Lowercase and drop every character that is not an ASCII letter.
normalizer = (
    Normalizer()
    .setInputCols(['tokenized'])
    .setOutputCol('normalized')
    .setLowercase(True)
    .setCleanupPatterns(['[^A-Za-z]'])
)

# Pretrained lemmatizer (fetched on first use).
lemmatizer = (
    LemmatizerModel
    .pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemmatized')
)

# Remove NLTK's English stop words from the lemmas; what remains are the unigrams.
nltk_stopwords = stopwords.words('english')
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemmatized'])
    .setOutputCol('unigrams')
    .setStopWords(nltk_stopwords)
)

# Cumulative n-grams up to n=2 (unigrams plus bigrams), bigram parts joined with '_'.
ngrammer = (
    NGramGenerator()
    .setInputCols(['unigrams'])
    .setOutputCol('ngrams')
    .setN(2)
    .setEnableCumulative(True)
    .setDelimiter('_')
)

# Convert the annotation columns back into plain columns for Spark ML.
finisher = (
    Finisher()
    .setInputCols(['unigrams', 'ngrams'])
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
time: 12.6 s (started: 2021-05-03 13:17:38 -04:00)


In [17]:
# Assemble pipeline
from pyspark.ml import Pipeline

# Stages run in list order: each stage consumes the output column of an earlier one.
nlp_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    ngrammer,
    finisher,
]
pipeline = Pipeline(stages = nlp_stages)

time: 0 ns (started: 2021-05-03 13:17:50 -04:00)


In [18]:
# Fit pipeline
nlp_model = pipeline.fit(subset_df)
# Apply the fitted stages to the same subset to obtain the finished unigram / n-gram columns.
processed_reviews = nlp_model.transform(subset_df)

time: 594 ms (started: 2021-05-03 13:17:50 -04:00)


In [19]:
# Examine one processed review: the original columns plus the finished unigram and n-gram token lists.
processed_reviews.take(1)

[Row(text='! The food is very tasty!  Every dish we have savored with my family has been simply spectacular!  Totally delighted with this place!\n Oysters and aguachiles !!  Omg a delight !!!', stars='5.0', categoryIndex=1.0, finished_unigrams=['food', 'tasty', 'every', 'dish', 'savored', 'family', 'simply', 'spectacular', 'totally', 'delight', 'place', 'oyster', 'aguachiles', 'omg', 'delight'], finished_ngrams=['food', 'tasty', 'every', 'dish', 'savored', 'family', 'simply', 'spectacular', 'totally', 'delight', 'place', 'oyster', 'aguachiles', 'omg', 'delight', 'food_tasty', 'tasty_every', 'every_dish', 'dish_savored', 'savored_family', 'family_simply', 'simply_spectacular', 'spectacular_totally', 'totally_delight', 'delight_place', 'place_oyster', 'oyster_aguachiles', 'aguachiles_omg', 'omg_delight'])]

time: 26.1 s (started: 2021-05-03 13:17:51 -04:00)


In [20]:
# Train test split (80% / 20%).
# A fixed seed keeps the split, and therefore the reported accuracies, reproducible across runs.
(trainingData, testData) = processed_reviews.randomSplit([0.8, 0.2], seed = 42)

time: 0 ns (started: 2021-05-03 13:18:17 -04:00)


In [21]:
# Estimated size of the training split: 80% of the subset's row count.
# This is an approximation (a float), not an actual count of `trainingData`.
trainingData_count = subset_count * 0.8
print(trainingData_count)

251754.40000000002
time: 0 ns (started: 2021-05-03 13:18:17 -04:00)


In [22]:
# Count vectorization with minDF and maxDF parameters
from pyspark.ml.feature import CountVectorizer, IDF

# Term counts over the combined unigram + bigram column; the vocabulary size is capped
# at half of the estimated number of training rows.
tfizer = CountVectorizer(
    inputCol = 'finished_ngrams',
    outputCol = 'tf_features',
    minDF = 0.01,
    maxDF = 0.1,
    vocabSize = int(trainingData_count / 2),
)
idfizer = IDF(inputCol = 'tf_features', outputCol = 'tfidf_features')

# Fit the vocabulary on the training split only, then vectorize both splits with it.
tf_model = tfizer.fit(trainingData)
tf_result_training = tf_model.transform(trainingData)
tf_result_test = tf_model.transform(testData)

# Same discipline for IDF: fit on training term counts, apply to both splits.
idf_model = idfizer.fit(tf_result_training)
tfidf_result_training = idf_model.transform(tf_result_training)
tfidf_result_test = idf_model.transform(tf_result_test)

time: 24min 23s (started: 2021-05-03 13:18:17 -04:00)


In [56]:
# Persist the fitted vectorizer and IDF models so later sessions can reload them.
# write().overwrite() replaces an existing directory; a plain save() raises when the
# path already exists, which breaks re-running the notebook.
tf_model.write().overwrite().save("tfModel.model")
idf_model.write().overwrite().save("idfModel.model")

time: 797 ms (started: 2021-05-03 16:55:40 -04:00)


In [69]:
from pyspark.ml.feature import CountVectorizerModel
# Reload the fitted count vectorizer from the directory written by the save cell.
tf_model = CountVectorizerModel.load("tfModel.model")

time: 203 ms (started: 2021-05-03 17:11:55 -04:00)


In [70]:
from pyspark.ml.feature import IDFModel
# Reload the fitted IDF model from the directory written by the save cell.
idf_model = IDFModel.load("idfModel.model")

time: 172 ms (started: 2021-05-03 17:11:56 -04:00)


In [71]:
# Print vocabulary length (i.e. the number of feature columns)
print(len(tf_model.vocabulary))

787
time: 0 ns (started: 2021-05-03 17:11:57 -04:00)


In [72]:
# Display the full fitted vocabulary; list positions are the feature indices used by the models below.
tf_model.vocabulary

['pizza',
 'server',
 'fry',
 'meal',
 'manager',
 'way',
 'sauce',
 'sit',
 'friend',
 'call',
 'find',
 'bar',
 'night',
 'two',
 'thing',
 'hour',
 'much',
 'location',
 'serve',
 'salad',
 'day',
 'fresh',
 'walk',
 'cheese',
 'everything',
 'right',
 'dish',
 'still',
 'burger',
 'long',
 'work',
 'another',
 'pay',
 'dinner',
 'feel',
 'seat',
 'cook',
 'review',
 'check',
 'need',
 'little',
 'bring',
 'waitress',
 'come_back',
 'new',
 'visit',
 'around',
 'sure',
 'last',
 'star',
 'lunch',
 'cant',
 'every',
 'sandwich',
 'use',
 'go_back',
 'side',
 'meat',
 'wasnt',
 'next',
 'since',
 'open',
 'vegas',
 'small',
 'year',
 'something',
 'favorite',
 'many',
 'top',
 'close',
 'quality',
 'late',
 'lot',
 'flavor',
 'enjoy',
 'rice',
 'start',
 'away',
 'sushi',
 'worth',
 'room',
 'hot',
 'seem',
 'pretty',
 'cold',
 'decide',
 'roll',
 'taco',
 'customer_service',
 'super',
 'plate',
 'area',
 'put',
 'return',
 'family',
 'owner',
 'waiter',
 'excellent',
 'awesome',
 'mo

time: 15 ms (started: 2021-05-03 17:11:58 -04:00)


### Exploratory topic modeling

In [27]:
from pyspark.ml.clustering import LDA

# Exploratory settings: number of topics and number of optimizer iterations.
num_topics = 10
max_iter = 10

# Fit LDA on the TF-IDF vectors of the training split.
lda = LDA(featuresCol = 'tfidf_features', k = num_topics, maxIter = max_iter)
ldaModel = lda.fit(tfidf_result_training)

time: 12min 32s (started: 2021-05-03 14:05:09 -04:00)


In [29]:
from pyspark.sql import types as T
from pyspark.sql import functions as F

# Vocabulary of the fitted count vectorizer; list position == term index.
vocab = tf_model.vocabulary


def get_words(token_list):
    """Translate a list of vocabulary indices into the corresponding vocabulary terms."""
    words = []
    for token_id in token_list:
        words.append(vocab[token_id])
    return words


# Spark UDF wrapper returning an array of strings.
udf_to_words = F.udf(get_words, returnType = T.ArrayType(T.StringType()))

time: 0 ns (started: 2021-05-03 14:20:19 -04:00)


In [30]:
# Number of highest-weighted terms to list per topic.
num_top_words = 10
# The fitted LDA model is bound to `ldaModel`; the name `lda_model` used here before
# is never defined, so the cell raised NameError on a fresh kernel.
topics = (
    ldaModel
    .describeTopics(num_top_words)
    # Map each topic's term indices back to vocabulary words.
    .withColumn('topicWords', udf_to_words(F.col('termIndices')))
)
topics.select('topic', 'topicWords').show(truncate = 100)

+-----+----------------------------------------------------------------------------------------------------+
|topic|                                                                                          topicWords|
+-----+----------------------------------------------------------------------------------------------------+
|    0|                               [business, rude, sandwich, owner, work, walk, server, bar, seem, sit]|
|    1|                         [taco, burrito, dog, salsa, chip, mexican, waste, bean, burger, happy_hour]|
|    2|                           [call, manager, waitress, pizza, charge, pay, another, phone, hour, bill]|
|    3|                [drive, every, every_time, thru, thai, drive_thru, location, burger, curry, time_go]|
|    4|[awesome, highly, highly_recommend, perfect, tea, favorite, atmosphere, everything, great_service...|
|    5|                                       [beer, bar, sit, night, last, game, busy, around, hair, walk]|
|    6|            

### Logistic regression

In [31]:
# Define logistic regression with ridge
from pyspark.ml.classification import LogisticRegression

# elasticNetParam = 0 selects a pure L2 (ridge) penalty; regParam sets its strength.
lr = LogisticRegression(
    featuresCol = 'tfidf_features',
    labelCol = 'categoryIndex',
    family = 'binomial',
    elasticNetParam = 0,
    regParam = 0.1,
)

time: 62 ms (started: 2021-05-03 14:20:44 -04:00)


In [32]:
# Print all parameters
# Map each parameter's name to its current value (defaults included).
{lr_param.name: value for lr_param, value in lr.extractParamMap().items()}

{'threshold': 0.5,
 'aggregationDepth': 2,
 'standardization': True,
 'fitIntercept': True,
 'elasticNetParam': 0.0,
 'predictionCol': 'prediction',
 'featuresCol': 'tfidf_features',
 'labelCol': 'categoryIndex',
 'rawPredictionCol': 'rawPrediction',
 'probabilityCol': 'probability',
 'maxIter': 100,
 'regParam': 0.1,
 'tol': 1e-06,
 'family': 'binomial'}

time: 0 ns (started: 2021-05-03 14:20:44 -04:00)


In [33]:
# Fit LR model
# The TF-IDF training frame is named `tfidf_result_training`; the previous
# `tfidfResult_training` is never defined and raised NameError on a fresh kernel.
lrModel = lr.fit(tfidf_result_training)

time: 11min 52s (started: 2021-05-03 14:20:44 -04:00)


In [54]:
# Persist the fitted logistic regression. write().overwrite() replaces an existing
# directory; a plain save() raises when the path already exists, breaking re-runs.
lrModel.write().overwrite().save("lrModel.model")

time: 438 ms (started: 2021-05-03 16:55:23 -04:00)


In [63]:
from pyspark.ml.classification import LogisticRegressionModel
# Reload the fitted logistic regression from the directory written by the save cell.
lrModel = LogisticRegressionModel.load("lrModel.model")

time: 250 ms (started: 2021-05-03 16:59:58 -04:00)


In [34]:
# Score both splits with the fitted logistic regression.
lrPredictions_training, lrPredictions_test = (
    lrModel.transform(tfidf_result_training),
    lrModel.transform(tfidf_result_test),
)

time: 63 ms (started: 2021-05-03 14:32:36 -04:00)


In [35]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
# One evaluator shared by both models; the metric is chosen per call through a param override.
evaluator = MulticlassClassificationEvaluator(labelCol = 'categoryIndex', predictionCol = 'prediction')

time: 32 ms (started: 2021-05-03 14:32:37 -04:00)


In [36]:
# Accuracy of the logistic regression on each split; the metric is set per call.
accuracy_override = {evaluator.metricName: "accuracy"}
acc_training_lr = evaluator.evaluate(lrPredictions_training, accuracy_override)
acc_test_lr = evaluator.evaluate(lrPredictions_test, accuracy_override)
# Other metrics, currently disabled:
# f1 = evaluator.evaluate(lrPredictions_training, {evaluator.metricName: "f1"})
# weightedPrecision = evaluator.evaluate(lrPredictions_training, {evaluator.metricName: "weightedPrecision"})
# weightedRecall = evaluator.evaluate(lrPredictions_training, {evaluator.metricName: "weightedRecall"})

time: 23min 46s (started: 2021-05-03 14:32:37 -04:00)


In [37]:
# Report logistic-regression accuracy on both splits.
print(f'Training accuracy: {acc_training_lr}')
print(f'Test accuracy: {acc_test_lr}')

Training accuracy: 0.9409918392969241
Test accuracy: 0.9402371466213749
time: 0 ns (started: 2021-05-03 14:56:23 -04:00)


In [38]:
# Fitted coefficients as a nested Python list (rows of the coefficient matrix).
# NOTE(review): for this binomial model the matrix presumably has a single row with one
# entry per vocabulary term — the transposes in the following cells rely on that.
coef_matrix = lrModel.coefficientMatrix
coef_list = coef_matrix.toArray().tolist()

time: 16 ms (started: 2021-05-03 14:56:23 -04:00)


In [39]:
import pandas as pd

# One row per feature index, ordered from the most negative to the most positive coefficient.
pd.DataFrame(coef_list).transpose().sort_values(by = 0)

Unnamed: 0,0
101,-0.219863
128,-0.215739
476,-0.196160
105,-0.193927
370,-0.182215
...,...
66,0.193411
244,0.193799
321,0.198794
97,0.224351


time: 859 ms (started: 2021-05-03 14:56:23 -04:00)


In [40]:
# Sort coefficients ascending so the most negative terms come first.
coef_df = pd.DataFrame(coef_list).T.sort_values(0, ascending = True)
# Print the 20 terms that push hardest towards label 0.
vocabulary = tf_model.vocabulary
for rank in range(20):
    feature_index = coef_df.index[rank]
    print(vocabulary[feature_index])

horrible
terrible
disappointing
rude
bland
mediocre
bad_service
awful
disgusting
overpriced
never_go
poor
suck
never_come
one_star
tasteless
gross
disappointment
waste
slow
time: 219 ms (started: 2021-05-03 14:56:24 -04:00)


In [41]:
# Sort coefficients descending so the most positive terms come first.
coef_df = pd.DataFrame(coef_list).T.sort_values(0, ascending = False)
# Print the 20 terms that push hardest towards label 1.
vocabulary = tf_model.vocabulary
for rank in range(20):
    feature_index = coef_df.index[rank]
    print(vocabulary[feature_index])

awesome
excellent
love_place
fantastic
favorite
definitely_back
food_amazing
cant_wait
perfect
food_delicious
definitely_come
highly_recommend
great_food
one_good
great_service
gem
friendly_staff
wonderful
attentive
yummy
time: 172 ms (started: 2021-05-03 14:56:24 -04:00)


### Random forest

In [42]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the same TF-IDF features and label; all other parameters stay at their defaults.
rf = RandomForestClassifier(
    featuresCol = 'tfidf_features',
    labelCol = 'categoryIndex',
)

time: 62 ms (started: 2021-05-03 15:37:25 -04:00)


In [43]:
# Map each random-forest parameter's name to its current value (defaults included).
{rf_param.name: value for rf_param, value in rf.extractParamMap().items()}

{'seed': 5867653639892386317,
 'maxDepth': 5,
 'maxBins': 32,
 'minInstancesPerNode': 1,
 'minInfoGain': 0.0,
 'maxMemoryInMB': 256,
 'cacheNodeIds': False,
 'checkpointInterval': 10,
 'impurity': 'gini',
 'numTrees': 20,
 'featureSubsetStrategy': 'auto',
 'subsamplingRate': 1.0,
 'leafCol': '',
 'minWeightFractionPerNode': 0.0,
 'bootstrap': True,
 'predictionCol': 'prediction',
 'featuresCol': 'tfidf_features',
 'labelCol': 'categoryIndex',
 'rawPredictionCol': 'rawPrediction',
 'probabilityCol': 'probability'}

time: 15 ms (started: 2021-05-03 15:37:25 -04:00)


In [44]:
# Fit the random forest on the TF-IDF training features (about 44 minutes in the recorded run).
rfModel = rf.fit(tfidf_result_training)

time: 43min 39s (started: 2021-05-03 15:37:25 -04:00)


In [45]:
# Score both splits with the fitted random forest.
rfPredictions_training, rfPredictions_test = (
    rfModel.transform(tfidf_result_training),
    rfModel.transform(tfidf_result_test),
)

time: 94 ms (started: 2021-05-03 16:21:04 -04:00)


In [46]:
# Accuracy of the random forest on each split; the metric is set per call.
accuracy_override = {evaluator.metricName: "accuracy"}
acc_training_rf = evaluator.evaluate(rfPredictions_training, accuracy_override)
acc_test_rf = evaluator.evaluate(rfPredictions_test, accuracy_override)
# Other metrics, currently disabled:
# f1 = evaluator.evaluate(rfPredictions_training, {evaluator.metricName: "f1"})
# weightedPrecision = evaluator.evaluate(rfPredictions_training, {evaluator.metricName: "weightedPrecision"})
# weightedRecall = evaluator.evaluate(rfPredictions_training, {evaluator.metricName: "weightedRecall"})

time: 28min 45s (started: 2021-05-03 16:21:05 -04:00)


In [47]:
# Report random-forest accuracy on both splits.
print(f'Training accuracy: {acc_training_rf}')
print(f'Test accuracy: {acc_test_rf}')

Training accuracy: 0.8126494870755759
Test accuracy: 0.8103779425070239
time: 15 ms (started: 2021-05-03 16:49:51 -04:00)


In [48]:
# Despite the `coef_*` names (reused from the logistic-regression cells), these are the
# random forest's feature importances, not coefficients: they say nothing about which
# class a term favours.
coef_matrix = rfModel.featureImportances
coef_list = coef_matrix.toArray().tolist()

time: 78 ms (started: 2021-05-03 16:49:51 -04:00)


In [49]:
import pandas as pd

# Rank features by importance, largest first.
coef_df = pd.DataFrame(coef_list).sort_values(0, ascending = False)
# Print the 40 most important terms according to the random forest.
vocabulary = tf_model.vocabulary
for rank in range(40):
    feature_index = coef_df.index[rank]
    print(vocabulary[feature_index])

rude
perfect
excellent
horrible
call
pay
awesome
another
terrible
someone
waste
cold
fresh
waitress
highly_recommend
charge
fantastic
attentive
bland
min
money
attitude
nothing
great_food
bill
favorite
ok
never_come
perfectly
wrong
atmosphere
ignore
arrive
taste_like
disappointing
receive
seem
disgusting
yummy
highly
time: 375 ms (started: 2021-05-03 16:49:51 -04:00)
