### Import data

In [1]:
%load_ext autotime

time: 0 ns (started: 2021-04-12 13:39:16 -04:00)


In [2]:
import sparknlp
# sparknlp.start() returns the session object that every later cell uses as
# `spark` (e.g. for spark.read.json).
spark = sparknlp.start() 

time: 10.3 s (started: 2021-04-12 13:39:16 -04:00)


In [3]:
# Load the Yelp review dump, keeping only the columns the rest of the notebook uses.
review_columns = ['business_id', 'text', 'stars']
reviews = spark.read.json('yelp_academic_dataset_review.json').select(review_columns)
# Total number of reviews (this triggers a Spark job).
reviews.count()

8021122

time: 30.5 s (started: 2021-04-12 13:39:26 -04:00)


In [4]:
# Back-of-the-envelope sizing: solve  source_size / x = target_size  for x,
# then divide the total row count by x to get the row budget of the smaller set.
# NOTE(review): 5.89 and 0.5 look like data sizes in GB (a later cell states
# "680910 rows = ~500MB") -- confirm.
source_size = 5.89
target_size = 0.5
x = source_size / target_size
total_review_rows = 8021122  # value shown by reviews.count() above
total_review_rows / x

680910.1867572156

time: 0 ns (started: 2021-04-12 13:39:56 -04:00)


In [5]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType

# Store the star rating as a string; it is treated as a categorical label
# further down (it is the StringIndexer input column).
stars_as_string = col('stars').cast(StringType())
reviews = reviews.withColumn('stars', stars_as_string)

time: 63 ms (started: 2021-04-12 13:39:56 -04:00)


### Subset data for Databricks

In [6]:
# x = (1/10)
# subset_df, large_df = reviews.randomSplit([x, 1 - x])

time: 0 ns (started: 2021-04-12 13:39:56 -04:00)


In [7]:
# subset_df.coalesce(1).write.format('json').save('reviews_1-10.json')

time: 0 ns (started: 2021-04-12 13:39:57 -04:00)


### Back to Jupyter

In [8]:
# Load business metadata; only the id (join key) and the category string are needed.
businesses = (
    spark.read
    .json('yelp_academic_dataset_business.json')
    .select('business_id', 'categories')
)

time: 1.84 s (started: 2021-04-12 13:39:57 -04:00)


In [9]:
# Keep only businesses whose category string contains "Restaurants", then
# attach their reviews through an inner join on business_id.
is_restaurant = businesses['categories'].contains('Restaurants')
restaurants = businesses.where(is_restaurant)
restaurant_reviews = (
    reviews
    .join(restaurants, on = 'business_id', how = 'inner')
    .select('text', 'stars')
)
# Number of restaurant reviews (this triggers a Spark job).
restaurant_reviews.count()

5055992

time: 9.62 s (started: 2021-04-12 13:39:58 -04:00)


In [10]:
# Build a balanced two-class set: every 1-star review plus an equal number of
# 5-star reviews.
# NOTE(review): `stars` was cast to a string column earlier, yet it is compared
# with numeric literals here; this presumably relies on Spark's implicit cast -- confirm.
# NOTE(review): limit() keeps the first N rows it is handed, not a random sample.
is_one_star = restaurant_reviews['stars'] == 1.0
is_five_star = restaurant_reviews['stars'] == 5.0
one_stars = restaurant_reviews.where(is_one_star)
num_one_stars = one_stars.count()
five_stars = restaurant_reviews.where(is_five_star).limit(num_one_stars)
one_or_five_stars = five_stars.union(one_stars)
# Intended size of the balanced set.
num_one_stars * 2

1256088

time: 10.1 s (started: 2021-04-12 13:40:08 -04:00)


In [11]:
from pyspark.ml.feature import StringIndexer

# Encode the star rating as the numeric label column `categoryIndex`.
# The default ordering (frequencyDesc) assigns indices by label frequency, but
# the two classes were deliberately balanced to the same size, so the mapping
# would depend on how ties are broken. Alphabetical ordering pins it: the lower
# rating always becomes 0.0 and the higher rating 1.0, which is what the
# coefficient interpretation later in the notebook relies on.
indexer = StringIndexer(inputCol = 'stars', outputCol = 'categoryIndex',
                        stringOrderType = 'alphabetAsc')
indexed = indexer.fit(one_or_five_stars).transform(one_or_five_stars)

time: 20.7 s (started: 2021-04-12 13:40:18 -04:00)


In [12]:
# Fraction of the balanced 1-star/5-star set that would hit the row budget
# computed earlier (680910 is the result of the sizing cell above, 1256088 is
# num_one_stars * 2).
# 680910 rows = ~500MB
# 1256088 * x = 680910
680910 / 1256088

0.5420878155033724

time: 0 ns (started: 2021-04-12 13:40:39 -04:00)


In [13]:
# Draw a ~10% working sample of the indexed reviews.
# A fixed seed makes the split repeatable for the same input, instead of
# drawing a fresh random seed on every run.
split_seed = 42
x = 0.1
subset_df, large_df = indexed.randomSplit([x, 1 - x], seed = split_seed)
# Every later action (take, pipeline fit, vectorizer fit, ...) would otherwise
# re-derive the sample from the source files. The upstream plan includes a
# limit(), whose chosen rows are not guaranteed to be the same on each
# recomputation, so cache the sample once; the count() below materializes it.
subset_df = subset_df.cache()
subset_count = subset_df.count()
print(subset_count)

125587
time: 30.6 s (started: 2021-04-12 13:40:39 -04:00)


In [14]:
# Peek at one sampled row to sanity-check its contents.
subset_df.take(1)

[Row(text="!!AMAZING RESTAURANT!!\n\nI have got to buy a fondue set now lol.\n\nThe creamy spinach artichoke cheese dip, minced walnut melted milk chocolate, and mixture of marinated meats were incredibly delicious but what earns this place without any doubt 5 Stars instead 3 or 4, is their incredible service.\n\nRichard our server was sociable and almost too considerate lol (we stayed to 1:20am even though this place closed at 11pm!!!).\n\nSteve the manager treated my girlfriend and I with a card signed by most of the restaurant staff and some treats to make the anniversary of our 1st date really special.\n\nThis place isn't for the cheap at heart or those with quality taste but don't have a lot of spending cash on them.\n\nDefinitely GO!! but I would suggest you go if you:\n1) are in a group bigger than 4\n2) just don't mind spending more than $20-40 on an entree\n3) want to try something different/new\n4) are looking to make a romantic occasion really special (let the restaurant kno

time: 18.5 s (started: 2021-04-12 13:41:10 -04:00)


### Spark NLP

In [15]:
# Build NLP preprocessing pipeline
from nltk.corpus import stopwords
from sparknlp.annotator import (
    LemmatizerModel,
    NGramGenerator,
    Normalizer,
    StopWordsCleaner,
    Tokenizer,
)
from sparknlp.base import DocumentAssembler, Finisher

# Stage 1: wrap the raw `text` column as a Spark NLP document.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Stage 2: tokenize, with parentheses as context characters and '-' as a split character.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('tokenized')
    .setContextChars(['(', ')'])
    .setSplitChars(['-'])
)

# Stage 3: lowercase and apply the cleanup pattern, which targets every
# character that is not an ASCII letter.
normalizer = (
    Normalizer()
    .setInputCols(['tokenized'])
    .setOutputCol('normalized')
    .setLowercase(True)
    .setCleanupPatterns(['[^A-Za-z]'])
)

# Stage 4: lemmatize with the default pretrained model (downloaded on first use).
lemmatizer = (
    LemmatizerModel
    .pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemmatized')
)

# Stage 5: drop NLTK's English stop words.
# NOTE(review): the normalizer removes apostrophes, so any stop word spelled
# with one (e.g. "wasn't") presumably never matches a normalized token --
# 'wasnt' does show up in the fitted vocabulary. Confirm whether that is intended.
nltk_stopwords = stopwords.words('english')
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemmatized'])
    .setOutputCol('unigrams')
    .setStopWords(nltk_stopwords)
)

# Stage 6: n-grams up to length 2, joined with '_'; cumulative mode keeps the
# single tokens alongside the bigrams.
ngrammer = (
    NGramGenerator()
    .setInputCols(['unigrams'])
    .setOutputCol('ngrams')
    .setN(2)
    .setEnableCumulative(True)
    .setDelimiter('_')
)

# Stage 7: finish the two annotation columns for use outside Spark NLP.
finisher = (
    Finisher()
    .setInputCols(['unigrams', 'ngrams'])
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
time: 10.5 s (started: 2021-04-12 13:41:28 -04:00)


In [16]:
# Assemble pipeline
from pyspark.ml import Pipeline

# Stages run in list order; each one consumes the output column of an earlier stage.
nlp_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    ngrammer,
    finisher,
]
pipeline = Pipeline(stages = nlp_stages)

time: 0 ns (started: 2021-04-12 13:41:39 -04:00)


In [17]:
# Fit pipeline
# Fit the preprocessing stages on the sample, then apply them to that same sample.
fitted_pipeline = pipeline.fit(subset_df)
processed_reviews = fitted_pipeline.transform(subset_df)

time: 656 ms (started: 2021-04-12 13:41:39 -04:00)


In [18]:
# Examine one processed review
# (take(1) returns a list holding a single Row with the original and the added columns)
processed_reviews.take(1)

[Row(text="!!AMAZING RESTAURANT!!\n\nI have got to buy a fondue set now lol.\n\nThe creamy spinach artichoke cheese dip, minced walnut melted milk chocolate, and mixture of marinated meats were incredibly delicious but what earns this place without any doubt 5 Stars instead 3 or 4, is their incredible service.\n\nRichard our server was sociable and almost too considerate lol (we stayed to 1:20am even though this place closed at 11pm!!!).\n\nSteve the manager treated my girlfriend and I with a card signed by most of the restaurant staff and some treats to make the anniversary of our 1st date really special.\n\nThis place isn't for the cheap at heart or those with quality taste but don't have a lot of spending cash on them.\n\nDefinitely GO!! but I would suggest you go if you:\n1) are in a group bigger than 4\n2) just don't mind spending more than $20-40 on an entree\n3) want to try something different/new\n4) are looking to make a romantic occasion really special (let the restaurant kno

time: 19.2 s (started: 2021-04-12 13:41:39 -04:00)


In [19]:
# Train test split
# 80% training / 20% test. The fixed seed makes the partition repeatable for
# the same input, so reported accuracies can be compared across runs instead
# of varying with a freshly drawn random seed.
(trainingData, testData) = processed_reviews.randomSplit([0.8, 0.2], seed = 42)

time: 16 ms (started: 2021-04-12 13:41:58 -04:00)


In [20]:
# Estimate the training-set size from the split weight rather than counting
# the rows; the result is a float and is only used to size the vocabulary.
train_fraction = 0.8
trainingData_count = subset_count * train_fraction
print(trainingData_count)

100469.6
time: 0 ns (started: 2021-04-12 13:41:58 -04:00)


In [21]:
# Count vectorization with minDF and maxDF parameters
from pyspark.ml.feature import CountVectorizer, IDF

# Term counts over the finished n-gram column.
# minDF / maxDF are given as fractions here, bounding how rare / how common a
# term may be across documents; the vocabulary is capped at half the
# estimated number of training rows.
# NOTE(review): 'finished_ngrams' is presumably the Finisher's output name for
# its 'ngrams' input -- confirm.
tfizer = CountVectorizer(
    inputCol = 'finished_ngrams',
    outputCol = 'tf_features',
    minDF = 0.01,
    maxDF = 0.1,
    vocabSize = int(trainingData_count / 2),
)

# The vocabulary is learned from the training split only and then applied to both splits.
tf_model = tfizer.fit(trainingData)
tf_result_training, tf_result_test = (
    tf_model.transform(split) for split in (trainingData, testData)
)

# Same pattern for IDF: fit the weights on the training counts, apply to both splits.
idfizer = IDF(inputCol = 'tf_features', outputCol = 'tfidf_features')
idf_model = idfizer.fit(tf_result_training)
tfidf_result_training, tfidf_result_test = (
    idf_model.transform(counts) for counts in (tf_result_training, tf_result_test)
)

time: 10min 8s (started: 2021-04-12 13:41:58 -04:00)


In [22]:
# Print vocabulary length (i.e. # of columns)
print(len(tf_model.vocabulary))

785
time: 16 ms (started: 2021-04-12 13:52:07 -04:00)


In [23]:
# Display the fitted vocabulary; a term's position in this list is the feature
# index used to look terms up from coefficient indices later on.
tf_model.vocabulary

['pizza',
 'server',
 'fry',
 'meal',
 'manager',
 'sit',
 'sauce',
 'way',
 'bar',
 'friend',
 'night',
 'call',
 'hour',
 'find',
 'two',
 'thing',
 'much',
 'day',
 'salad',
 'serve',
 'location',
 'cheese',
 'fresh',
 'walk',
 'everything',
 'dish',
 'right',
 'long',
 'another',
 'still',
 'burger',
 'feel',
 'seat',
 'pay',
 'review',
 'work',
 'little',
 'cook',
 'need',
 'check',
 'dinner',
 'bring',
 'waitress',
 'around',
 'visit',
 'star',
 'new',
 'last',
 'come_back',
 'sure',
 'lunch',
 'sandwich',
 'cant',
 'use',
 'every',
 'next',
 'meat',
 'go_back',
 'side',
 'wasnt',
 'since',
 'favorite',
 'year',
 'small',
 'vegas',
 'open',
 'many',
 'something',
 'top',
 'close',
 'lot',
 'quality',
 'start',
 'sushi',
 'away',
 'late',
 'hot',
 'enjoy',
 'flavor',
 'seem',
 'rice',
 'roll',
 'room',
 'decide',
 'worth',
 'pretty',
 'cold',
 'taco',
 'plate',
 'family',
 'put',
 'customer_service',
 'area',
 'waiter',
 'excellent',
 'owner',
 'horrible',
 'let',
 'super',
 'mone

time: 47 ms (started: 2021-04-12 13:52:07 -04:00)


### Logistic regression

In [24]:
# Define logistic regression with ridge
from pyspark.ml.classification import LogisticRegression

# elasticNetParam = 0 selects a pure L2 (ridge) penalty; regParam is its strength.
# The label is the indexed star rating, not the raw `stars` column.
lr_settings = {
    'featuresCol': 'tfidf_features',
    'labelCol': 'categoryIndex',
    'family': 'binomial',
    'elasticNetParam': 0,
    'regParam': 0.1,
}
lr = LogisticRegression(**lr_settings)

time: 62 ms (started: 2021-04-12 13:52:07 -04:00)


In [25]:
# Print all parameters
# (maps each parameter's name to its current value, defaults included)
{param.name: value for param, value in lr.extractParamMap().items()}

{'threshold': 0.5,
 'aggregationDepth': 2,
 'standardization': True,
 'fitIntercept': True,
 'elasticNetParam': 0.0,
 'predictionCol': 'prediction',
 'featuresCol': 'tfidf_features',
 'labelCol': 'categoryIndex',
 'rawPredictionCol': 'rawPrediction',
 'probabilityCol': 'probability',
 'maxIter': 100,
 'regParam': 0.1,
 'tol': 1e-06,
 'family': 'binomial'}

time: 0 ns (started: 2021-04-12 13:52:07 -04:00)


In [26]:
# Fit LR model
# (trained on the TF-IDF features of the training split only)
lrModel = lr.fit(tfidf_result_training)

time: 5min 8s (started: 2021-04-12 13:52:07 -04:00)


In [27]:
# Score both splits with the fitted model.
lrPredictions_training, lrPredictions_test = (
    lrModel.transform(features)
    for features in (tfidf_result_training, tfidf_result_test)
)

time: 78 ms (started: 2021-04-12 13:57:15 -04:00)


In [28]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Shared evaluator comparing the `prediction` column with the indexed label;
# the metric itself is chosen per evaluate() call.
evaluator = MulticlassClassificationEvaluator(
    predictionCol = 'prediction',
    labelCol = 'categoryIndex',
)

time: 15 ms (started: 2021-04-12 13:57:15 -04:00)


In [29]:
# Accuracy on each split, overriding the evaluator's metric for these calls only.
accuracy_metric = {evaluator.metricName: "accuracy"}
acc_training_lr = evaluator.evaluate(lrPredictions_training, accuracy_metric)
acc_test_lr = evaluator.evaluate(lrPredictions_test, accuracy_metric)
# Other metric names that were tried on the training predictions and left disabled:
# "f1", "weightedPrecision", "weightedRecall".

time: 9min 56s (started: 2021-04-12 13:57:16 -04:00)


In [30]:
# Report both accuracies side by side so train/test can be compared at a glance.
print(f'Training accuracy: {acc_training_lr}')
print(f'Test accuracy: {acc_test_lr}')

Training accuracy: 0.9404485000298205
Test accuracy: 0.9425255153091855
time: 0 ns (started: 2021-04-12 14:07:12 -04:00)


In [31]:
# Convert the fitted coefficients into a plain nested Python list so pandas can take them.
coef_matrix = lrModel.coefficientMatrix
coef_array = coef_matrix.toArray()
coef_list = coef_array.tolist()

time: 16 ms (started: 2021-04-12 14:07:12 -04:00)


In [32]:
import pandas as pd

# Transpose so each feature index gets its own row, then order the rows from
# the most negative coefficient to the most positive.
coef_by_feature = pd.DataFrame(coef_list).transpose()
coef_by_feature.sort_values(by = 0)

Unnamed: 0,0
124,-0.225919
96,-0.220192
561,-0.195700
113,-0.192852
485,-0.190591
...,...
61,0.191573
325,0.194716
709,0.199487
94,0.221117


time: 657 ms (started: 2021-04-12 14:07:12 -04:00)


In [33]:
# Terms with the most negative coefficients (ascending sort puts them first).
# NOTE(review): with this label encoding these presumably point toward the
# 1-star class -- confirm against the StringIndexer mapping.
top_n = 20
# Read the vocabulary once instead of on every loop iteration.
vocabulary = tf_model.vocabulary
coef_df = pd.DataFrame(coef_list).T.sort_values(0, ascending = True)
# Slicing the index (rather than indexing positions 0..19) also works when the
# model has fewer than `top_n` features, instead of raising IndexError.
for feature_index in coef_df.index[:top_n]:
    print(vocabulary[feature_index])

terrible
horrible
mediocre
rude
disappointing
overpriced
awful
bland
bad_service
disgusting
never_go
poor
waste
tasteless
one_star
suck
slow
never_come
gross
dry
time: 171 ms (started: 2021-04-12 14:07:13 -04:00)


In [34]:
# Terms with the most positive coefficients (descending sort puts them first).
# NOTE(review): with this label encoding these presumably point toward the
# 5-star class -- confirm against the StringIndexer mapping.
top_n = 20
# Read the vocabulary once instead of on every loop iteration.
vocabulary = tf_model.vocabulary
coef_df = pd.DataFrame(coef_list).T.sort_values(0, ascending = False)
# Slicing the index (rather than indexing positions 0..19) also works when the
# model has fewer than `top_n` features, instead of raising IndexError.
for feature_index in coef_df.index[:top_n]:
    print(vocabulary[feature_index])

awesome
excellent
food_amazing
love_place
favorite
fantastic
definitely_back
cant_wait
perfect
food_delicious
great_service
great_food
one_good
gem
yummy
friendly_staff
definitely_come
highly_recommend
outstanding
wonderful
time: 156 ms (started: 2021-04-12 14:07:13 -04:00)


### Random forest

In [46]:
# NOTE(review): every random-forest cell in this section is commented out, yet
# the outputs saved under them (parameter map, accuracies, top features) are
# still displayed. Those outputs are not regenerated by running the notebook as
# it stands; uncomment the cells together to reproduce them.
# from pyspark.ml.classification import RandomForestClassifier
# rf = RandomForestClassifier(featuresCol = 'tfidf_features', labelCol = 'categoryIndex')

time: 47 ms (started: 2021-04-12 14:31:42 -04:00)


In [47]:
# {param[0].name: param[1] for param in rf.extractParamMap().items()}

{'seed': -7873517711665355170,
 'maxDepth': 5,
 'maxBins': 32,
 'minInstancesPerNode': 1,
 'minInfoGain': 0.0,
 'maxMemoryInMB': 256,
 'cacheNodeIds': False,
 'checkpointInterval': 10,
 'impurity': 'gini',
 'numTrees': 20,
 'featureSubsetStrategy': 'auto',
 'subsamplingRate': 1.0,
 'leafCol': '',
 'minWeightFractionPerNode': 0.0,
 'bootstrap': True,
 'predictionCol': 'prediction',
 'featuresCol': 'tfidf_features',
 'labelCol': 'categoryIndex',
 'rawPredictionCol': 'rawPrediction',
 'probabilityCol': 'probability'}

time: 0 ns (started: 2021-04-12 14:31:43 -04:00)


In [48]:
# rfModel = rf.fit(tfidf_result_training)

time: 19min 43s (started: 2021-04-12 14:31:43 -04:00)


In [49]:
# rfPredictions_training = rfModel.transform(tfidf_result_training)
# rfPredictions_test = rfModel.transform(tfidf_result_test)

time: 94 ms (started: 2021-04-12 14:51:26 -04:00)


In [50]:
# acc_training_rf = evaluator.evaluate(rfPredictions_training, {evaluator.metricName: "accuracy"})
# acc_test_rf = evaluator.evaluate(rfPredictions_test, {evaluator.metricName: "accuracy"})
# # f1 = evaluator.evaluate(rfPredictions_training, {evaluator.metricName: "f1"})
# # weightedPrecision = evaluator.evaluate(rfPredictions_training, {evaluator.metricName: "weightedPrecision"})
# # weightedRecall = evaluator.evaluate(rfPredictions_training, {evaluator.metricName: "weightedRecall"})

time: 11min 7s (started: 2021-04-12 14:51:26 -04:00)


In [51]:
# print('Training accuracy: ' + str(acc_training_rf))
# print('Test accuracy: ' + str(acc_test_rf))

Training accuracy: 0.8163555396512991
Test accuracy: 0.8170502301380829
time: 0 ns (started: 2021-04-12 15:02:34 -04:00)


In [52]:
# coef_matrix = rfModel.featureImportances
# coef_list = coef_matrix.toArray().tolist()

time: 47 ms (started: 2021-04-12 15:02:34 -04:00)


In [53]:
# import pandas as pd
# coef_df = pd.DataFrame(coef_list).sort_values(0, ascending = False)
# for i in range(0, 40):
#     print(tf_model.vocabulary[coef_df.index[i]])

terrible
rude
horrible
money
awful
cold
awesome
highly_recommend
charge
another
pay
favorite
wasnt
fantastic
tasty
attitude
excellent
perfect
disgusting
min
wonderful
seem
never_go
wont
dirty
wait_minute
bill
arrive
atmosphere
sorry
perfectly
walk
employee
waste
put
receive
nothing
dry
slow
finally
time: 390 ms (started: 2021-04-12 15:02:34 -04:00)


### Spark LDA topic modeling

In [43]:
# NOTE(review): disabled LDA experiment. It fits on `tfidf_result`, a name that
# is not defined anywhere in the visible notebook (only tfidf_result_training
# and tfidf_result_test are), so this cell needs updating before it is re-enabled.
# from pyspark.ml.clustering import LDA
# num_topics = 10
# max_iter = 10
# lda = LDA(k = num_topics, 
#           maxIter = max_iter, 
#           featuresCol = 'tfidf_features')
# lda_model = lda.fit(tfidf_result)

time: 0 ns (started: 2021-04-12 14:07:14 -04:00)


In [44]:
# NOTE(review): disabled helper that maps term indices back to vocabulary
# words. It uses `F`, which is never imported in the visible notebook
# (presumably `from pyspark.sql import functions as F`) -- add that import
# before re-enabling.
# from pyspark.sql import types as T
# vocab = tf_model.vocabulary
# def get_words(token_list):
#     return [vocab[token_id] for token_id in token_list]
# udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

time: 0 ns (started: 2021-04-12 14:07:14 -04:00)


In [45]:
# num_top_words = 10
# topics = lda_model \
# .describeTopics(num_top_words) \
# .withColumn('topicWords', udf_to_words(F.col('termIndices')))
# topics.select('topic', 'topicWords').show(truncate = 100)

time: 0 ns (started: 2021-04-12 14:07:14 -04:00)
