### Import data

In [1]:
%load_ext autotime

time: 0 ns (started: 2021-04-11 11:36:39 -04:00)


In [45]:
import sparknlp
spark = sparknlp.start() 

time: 0 ns (started: 2021-04-11 12:14:40 -04:00)


In [46]:
reviews = spark.read.json('yelp_academic_dataset_review.json')
reviews = reviews.select(['business_id', 'text', 'stars'])
reviews.count()

8021122

time: 25 s (started: 2021-04-11 12:14:40 -04:00)


In [47]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType
reviews = reviews.withColumn('stars', col('stars').cast(StringType()))

time: 0 ns (started: 2021-04-11 12:15:05 -04:00)


### Subset data for Databricks

In [37]:
# x = (1/10)
# subset_df, large_df = reviews.randomSplit([x, 1 - x])

time: 0 ns (started: 2021-04-11 12:06:07 -04:00)


In [38]:
# subset_df.coalesce(1).write.format('json').save('reviews_1-10.json')

time: 0 ns (started: 2021-04-11 12:06:07 -04:00)


### Back to Jupyter

In [50]:
businesses = spark.read.json('yelp_academic_dataset_business.json')
businesses = businesses.select(['business_id', 'categories'])

time: 719 ms (started: 2021-04-11 12:17:11 -04:00)


In [51]:
restaurants = businesses.filter(businesses.categories.contains('Restaurants'))
restaurant_reviews = reviews.join(restaurants, "business_id", "inner")
restaurant_reviews = restaurant_reviews.select(['text', 'stars'])
restaurant_reviews.count()

5055992

time: 13 s (started: 2021-04-11 12:17:12 -04:00)


In [52]:
five_stars = restaurant_reviews.filter(restaurant_reviews.stars == 5.0)
one_stars = restaurant_reviews.filter(restaurant_reviews.stars == 1.0)
num_one_stars = one_stars.count()
five_stars = five_stars.limit(num_one_stars)
one_or_five_stars = five_stars.union(one_stars)
num_one_stars * 2

1256088

time: 13 s (started: 2021-04-11 12:17:25 -04:00)


In [60]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol = 'stars', outputCol = 'categoryIndex')
indexed = indexer.fit(one_or_five_stars).transform(one_or_five_stars)

[Row(text='Threw away food I Chick-fil-A trash can. Straight garbage. Was accused of  "stealing" even thought I paid for it.', stars='1.0', categoryIndex=0.0),
 Row(text="Never in my life, of 62 years, have I ever tasted fried or grilled-chicken, this terrible. \nWhat a waste, of a seemingly attractive establishment. \nThere's nothing else I can possibly add. \nEven their punch is watered down. \nI must say, that the plantains were alright.\nDoes the management or owner, ever taste the food they're selling?", stars='1.0', categoryIndex=0.0),
 Row(text="Horrible customer service and service in general! Called ahead and even placed reservation via yelp as they requested. We still ended up waiting 50 minutes for a table. No one apologized or even attempted to accommodate us. I'm very disappointed in their service here his evening. Worse birthday dinner ever!", stars='1.0', categoryIndex=0.0),
 Row(text="You know the food there is and have bad. The DJ killed it. I was there after the footb

time: 46.5 s (started: 2021-04-11 12:30:20 -04:00)


In [63]:
x = 0.001
subset_df, large_df = indexed.randomSplit([x, 1 - x])
subset_df.count()

1269

time: 33.5 s (started: 2021-04-11 12:31:55 -04:00)


In [64]:
subset_df.take(1)

[Row(text='"Dem potatoes doe." Imagine cajun spices grinded into a dust then is sprinkled on crispy bitesized potato cubes. \nThe french toast is fluffy crispy sweet & eggy.\n\nDon\'t leave without the banana nut muffin. Its moist sweet and super mouth watering!!!', stars='5.0', categoryIndex=1.0)]

time: 19.8 s (started: 2021-04-11 12:32:29 -04:00)


In [65]:
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')
# sql_stopwords = pd.read_csv('MySQL_stopwords.csv', header = None)
# sql_stopwords = list(sql_stopwords[0].values)
# eng_stopwords = nltk_stopwords + sql_stopwords
# eng_stopwords = list(set(eng_stopwords))

time: 15 ms (started: 2021-04-11 12:34:46 -04:00)


In [66]:
# Build NLP preprocessing pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer
from sparknlp.annotator import Normalizer
from sparknlp.annotator import LemmatizerModel
from sparknlp.annotator import StopWordsCleaner
from sparknlp.annotator import NGramGenerator
from sparknlp.base import Finisher

document_assembler = DocumentAssembler() \
.setInputCol('text') \
.setOutputCol('document')

tokenizer = Tokenizer() \
.setInputCols(['document']) \
.setOutputCol('tokenized') \
.setContextChars(['(', ')']) \
.setSplitChars(['-'])

normalizer = Normalizer() \
.setInputCols(['tokenized']) \
.setOutputCol('normalized') \
.setLowercase(True) \
.setCleanupPatterns(['[^A-Za-z]'])

lemmatizer = LemmatizerModel \
.pretrained() \
.setInputCols(['normalized']) \
.setOutputCol('lemmatized')

stopwords_cleaner = StopWordsCleaner() \
.setInputCols(['lemmatized']) \
.setOutputCol('unigrams') \
.setStopWords(nltk_stopwords)

ngrammer = NGramGenerator() \
    .setInputCols(['unigrams']) \
    .setOutputCol('ngrams') \
    .setN(2) \
    .setEnableCumulative(True) \
    .setDelimiter('_')

finisher = Finisher() \
.setInputCols(['ngrams'])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
time: 2.89 s (started: 2021-04-11 12:34:47 -04:00)


In [67]:
# Assemble pipeline
from pyspark.ml import Pipeline
nlp_pipeline = Pipeline() \
.setStages([document_assembler,
            tokenizer,
            normalizer,
            lemmatizer,
            stopwords_cleaner,
            ngrammer,
            finisher])

time: 0 ns (started: 2021-04-11 12:34:49 -04:00)


In [68]:
# Fit pipeline
preprocessed = nlp_pipeline.fit(subset_df).transform(subset_df)

time: 281 ms (started: 2021-04-11 12:34:49 -04:00)


In [69]:
preprocessed.take(1)

[Row(text='"Dem potatoes doe." Imagine cajun spices grinded into a dust then is sprinkled on crispy bitesized potato cubes. \nThe french toast is fluffy crispy sweet & eggy.\n\nDon\'t leave without the banana nut muffin. Its moist sweet and super mouth watering!!!', stars='5.0', categoryIndex=1.0, finished_ngrams=['dem', 'potato', 'imagine', 'cajun', 'spice', 'grinded', 'dust', 'sprinkle', 'crispy', 'bitesized', 'potato', 'cube', 'french', 'toast', 'fluffy', 'crispy', 'sweet', 'eggy', 'dont', 'leave', 'without', 'banana', 'nut', 'muffin', 'moist', 'sweet', 'super', 'mouth', 'water', 'dem_potato', 'potato_imagine', 'imagine_cajun', 'cajun_spice', 'spice_grinded', 'grinded_dust', 'dust_sprinkle', 'sprinkle_crispy', 'crispy_bitesized', 'bitesized_potato', 'potato_cube', 'cube_french', 'french_toast', 'toast_fluffy', 'fluffy_crispy', 'crispy_sweet', 'sweet_eggy', 'eggy_dont', 'dont_leave', 'leave_without', 'without_banana', 'banana_nut', 'nut_muffin', 'muffin_moist', 'moist_sweet', 'sweet_

time: 18.5 s (started: 2021-04-11 12:34:50 -04:00)


In [70]:
# Train test split
(trainingData, testData) = preprocessed.randomSplit([0.8, 0.2])
trainingData_count = trainingData.count()
print(trainingData_count)

1022
time: 33.3 s (started: 2021-04-11 12:35:08 -04:00)


In [71]:
# ML pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.classification import LogisticRegression

tfizer = CountVectorizer(inputCol = "finished_ngrams", outputCol = "tf_features")
idfizer = IDF(inputCol = "tf_features", outputCol = "idf_features")
lr = LogisticRegression(featuresCol = 'idf_features', labelCol = 'categoryIndex', family = 'binomial')

time: 16 ms (started: 2021-04-11 12:35:42 -04:00)


In [72]:
{param[0].name: param[1] for param in lr.extractParamMap().items()}

{'threshold': 0.5,
 'aggregationDepth': 2,
 'standardization': True,
 'fitIntercept': True,
 'elasticNetParam': 0.0,
 'predictionCol': 'prediction',
 'featuresCol': 'idf_features',
 'labelCol': 'categoryIndex',
 'rawPredictionCol': 'rawPrediction',
 'probabilityCol': 'probability',
 'maxIter': 100,
 'regParam': 0.0,
 'tol': 1e-06,
 'family': 'auto'}

time: 0 ns (started: 2021-04-11 12:35:42 -04:00)


In [73]:
ml_pipeline = Pipeline() \
.setStages([tfizer,
            idfizer,
            lr])

time: 0 ns (started: 2021-04-11 12:35:42 -04:00)


In [74]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

minDF = 0.01
maxDF = 0.1
vocabSize = int(trainingData_count / 2)

paramGrid = ParamGridBuilder() \
    .addGrid(tfizer.minDF, [minDF]) \
    .addGrid(tfizer.maxDF, [maxDF]) \
    .addGrid(tfizer.vocabSize, [vocabSize]) \
    .addGrid(lr.regParam, [0.1, 1]) \
    .addGrid(lr.elasticNetParam, [0]) \
    .build()

time: 0 ns (started: 2021-04-11 12:35:42 -04:00)


In [75]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
crossval = CrossValidator(estimator = ml_pipeline,
                          estimatorParamMaps = paramGrid,
                          evaluator = BinaryClassificationEvaluator(labelCol = "categoryIndex", rawPredictionCol = "prediction"),
                          numFolds = 2) 

time: 16 ms (started: 2021-04-11 12:35:42 -04:00)


In [76]:
cvModel = crossval.fit(trainingData)

time: 4min 56s (started: 2021-04-11 12:35:42 -04:00)


In [77]:
import numpy as np
cvModel.getEstimatorParamMaps()[ np.argmax(cvModel.avgMetrics) ]

{Param(parent='CountVectorizer_073e0d64aa46', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 0.01,
 Param(parent='CountVectorizer_073e0d64aa46', name='maxDF', doc='Specifies the maximum number of different documents a term could appear in to be included in the vocabulary. A term that appears more than the threshold will be ignored. If this is an integer >= 1, this specifies the maximum number of documents the term could appear in; if this is a double in [0,1), then this specifies the maximum fraction of documents the term could appear in. Default (2^63) - 1'): 0.1,
 Param(parent='CountVectorizer_073e0d64aa46', name='vocabSize', doc='max size of the vocabulary. Default 1 << 18.'): 511,
 Param(parent='LogisticRegression_1c757974

time: 0 ns (started: 2021-04-11 12:41:03 -04:00)


In [78]:
tfizer = CountVectorizer(inputCol = "finished_ngrams", outputCol = "tf_features", 
                         vocabSize = vocabSize, minDF = minDF, maxDF = maxDF)
tf_model = tfizer.fit(trainingData)

time: 44.2 s (started: 2021-04-11 12:41:06 -04:00)


In [79]:
len(tf_model.vocabulary)

511

time: 15 ms (started: 2021-04-11 12:42:12 -04:00)


In [80]:
tf_model.vocabulary

['pizza',
 'people',
 'burger',
 'way',
 'two',
 'cheese',
 'bar',
 'walk',
 'always',
 'location',
 'night',
 'see',
 'fry',
 'manager',
 'meal',
 'everything',
 'call',
 'little',
 'need',
 'sit',
 'sauce',
 'salad',
 'check',
 'find',
 'still',
 'pay',
 'another',
 'fresh',
 'new',
 'much',
 'seat',
 'serve',
 'dinner',
 'friend',
 'long',
 'right',
 'cook',
 'quality',
 'come_back',
 'many',
 'meat',
 'around',
 'bring',
 'work',
 'last',
 'dish',
 'year',
 'hour',
 'beer',
 'review',
 'small',
 'feel',
 'waitress',
 'since',
 'day',
 'wasnt',
 'vegas',
 'top',
 'flavor',
 'sure',
 'visit',
 'start',
 'every',
 'away',
 'use',
 'nothing',
 'big',
 'roll',
 'late',
 'room',
 'favorite',
 'next',
 'stop',
 'hot',
 'steak',
 'sandwich',
 'busy',
 'star',
 'excellent',
 'cold',
 'side',
 'lot',
 'awesome',
 'bread',
 'enjoy',
 'area',
 'horrible',
 'else',
 'old',
 'seem',
 'business',
 'money',
 'lunch',
 'high',
 'line',
 'half',
 'let',
 'end',
 'super',
 'open',
 'worth',
 'custome

time: 15 ms (started: 2021-04-11 12:42:23 -04:00)


In [81]:
predictions = cvModel.transform(testData)

time: 47 ms (started: 2021-04-11 12:42:34 -04:00)


In [97]:
selected = predictions.select('text', 'categoryIndex', 'prediction')
for row in selected.collect():
    print(row)

Row(text="'Eat like Jefe' is hands down my favourite and probably the best option. It allows you to try what I believe are the signature dishes at Baro. Great for a group of 4. Add the empanadas as a shared appetizer. You will be stuffed after all this amazing food.", categoryIndex=1.0, prediction=1.0)
Row(text="A Hidden Gem at the corner of Tropicana and Pecos. I would have never expected this place to even exist. I walked in and both the sushi chef and front of house staff screamed something at me in Japanese, I'm assuming welcome. It made the experience fun right away. I ordered two sushi rolls, a ginger side salad, and a miso soup. Everything is delicious. I would definitely come back!", categoryIndex=1.0, prediction=1.0)
Row(text="Adorable hole in the wall in China town. There's a lot of dessert places in China town but I feel like they go out of their way to serve quality food. The presentation is amazing and the price isn't bad at all. This place is easily my go-to place for des

In [82]:
evaluator = MulticlassClassificationEvaluator(labelCol = "categoryIndex", predictionCol = "prediction")
acc_test_lr = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})

time: 40.7 s (started: 2021-04-11 12:42:37 -04:00)


In [83]:
print('Test accuracy: ' + str(acc_test_lr))

Test accuracy: 0.8947368421052632
time: 0 ns (started: 2021-04-11 12:43:18 -04:00)


In [84]:
coef_matrix = cvModel.bestModel.stages[-1].coefficientMatrix
coef_list = coef_matrix.toArray().tolist()

time: 31 ms (started: 2021-04-11 12:44:23 -04:00)


In [91]:
import pandas as pd
pd.DataFrame(coef_list).T.sort_values(0)

Unnamed: 0,0
85,-0.240770
132,-0.237014
156,-0.207880
484,-0.204485
473,-0.203521
...,...
423,0.238032
277,0.238672
344,0.274846
78,0.294194


time: 31 ms (started: 2021-04-11 12:45:32 -04:00)


In [98]:
coef_df = pd.DataFrame(coef_list).T.sort_values(0, ascending = False)
for i in range(0, 50):
    print(tf_model.vocabulary[coef_df.index[i]])

enjoy
excellent
might
miss
stick
read
drive
especially
wonderful
late
ago
fantastic
job
yet
glass
bread
almost
selection
yummy
ill
another
toppings
least
watch
everything
huge
french
mind
three
suppose
though
explain
less
girl
offer
pack
bathroom
change
horrible
quick
dollar
become
beer
little
stand
walk
extra
return
whole
la_vegas
time: 297 ms (started: 2021-04-11 12:47:28 -04:00)


In [99]:
coef_df = pd.DataFrame(coef_list).T.sort_values(0, ascending = True)
for i in range(0, 50):
    print(tf_model.vocabulary[coef_df.index[i]])

area
plate
charge
accommodate
expensive
even_though
take_order
real
stuff
tasty
bottle
saturday
chip
barely
impressed
soggy
fill
turn
old
wont
plus
last
anything
seem
hotel
sandwich
worth
broth
piece
sad
ok
flavour
employee
name
chef
great_food
party
care
chance
couldnt
cold
pay
pork
fine
year
money
id
stay
entire
anyway
time: 281 ms (started: 2021-04-11 12:47:42 -04:00)


### Random forest

In [None]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'tfidf_features', labelCol = 'categoryIndex')

In [None]:
{param[0].name: param[1] for param in rf.extractParamMap().items()}

In [None]:
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M")
print("Model fitting started at: ", current_time)
rfModel = rf.fit(trainingData)

In [None]:
rfPredictions_training = rfModel.transform(trainingData)
rfPredictions_test = rfModel.transform(testData)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol = 'categoryIndex', predictionCol = "prediction")

In [None]:
acc_training_rf = evaluator.evaluate(rfPredictions_training, {evaluator.metricName: "accuracy"})
acc_test_rf = evaluator.evaluate(rfPredictions_test, {evaluator.metricName: "accuracy"})
# f1 = evaluator.evaluate(rfPredictions_training, {evaluator.metricName: "f1"})
# weightedPrecision = evaluator.evaluate(rfPredictions_training, {evaluator.metricName: "weightedPrecision"})
# weightedRecall = evaluator.evaluate(rfPredictions_training, {evaluator.metricName: "weightedRecall"})

In [None]:
print('Training accuracy: ' + str(acc_training_rf))
print('Test accuracy: ' + str(acc_test_rf))

In [None]:
coef_matrix = rfModel.featureImportances
coef_list = coef_matrix.toArray().tolist()

In [None]:
import pandas as pd
coef_df = pd.DataFrame(coef_list).sort_values(0, ascending = False)
for i in range(0, 50):
    print(tf_model.vocabulary[coef_df.index[i]])