In [0]:
import pandas as pd
import numpy as np
import nltk

from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [0]:
# start spark session
spark = SparkSession.builder.appName('nltk_test').getOrCreate()

In [0]:
# File location and type
file_location = "/FileStore/tables/restaurantrev1.csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# read file into dataframe
df = spark.read \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .option("multiLine", "true") \
  .csv(file_location)


In [0]:
# check dataframe
df.show(n=20, truncate=50)

+----------------------+----------------------+----------------------+-----+--------------------------------------------------+-------------------+
|             review_id|               user_id|           business_id|stars|                                              text|               date|
+----------------------+----------------------+----------------------+-----+--------------------------------------------------+-------------------+
|KU_O5udG6zpxOg-VcAEodg|mh_-eMZ6K5RLWhZyISBhwA|XQfwVwDr-v0ZS3_CbbE5Xw|  3.0|If you decide to eat here, just be aware it is ...|2018-07-07 22:09:11|
|saUsX_uimxRlCVr67Z4Jig|8g_iMtfSiwikVnbP2etR0A|YjUWPpI6HXG530lwP-fb2A|  3.0|Family diner. Had the buffet. Eclectic assortme...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_auESxiA|_7bHUi9Uuf5__HHc_Q8guQ|kxX2SOes4o-D3ZQBkiMRfA|  5.0|Wow!  Yummy, different,  delicious.   Our favor...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0pcmoA|bcjbaE6dDog4jkNY91ncLQ|e4Vwtrqf-wpJfwesgvdgxQ|  4.0|Cute interior and owner (?) gave us 

In [0]:
df.dtypes

Out[22]: [('review_id', 'string'),
 ('user_id', 'string'),
 ('business_id', 'string'),
 ('stars', 'string'),
 ('text', 'string'),
 ('date', 'string')]

In [0]:
# create small dataset for testing
small = spark.createDataFrame(df.head(5), df.schema)
small.show(truncate=50)

+----------------------+----------------------+----------------------+-----+--------------------------------------------------+-------------------+
|             review_id|               user_id|           business_id|stars|                                              text|               date|
+----------------------+----------------------+----------------------+-----+--------------------------------------------------+-------------------+
|KU_O5udG6zpxOg-VcAEodg|mh_-eMZ6K5RLWhZyISBhwA|XQfwVwDr-v0ZS3_CbbE5Xw|  3.0|If you decide to eat here, just be aware it is ...|2018-07-07 22:09:11|
|saUsX_uimxRlCVr67Z4Jig|8g_iMtfSiwikVnbP2etR0A|YjUWPpI6HXG530lwP-fb2A|  3.0|Family diner. Had the buffet. Eclectic assortme...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_auESxiA|_7bHUi9Uuf5__HHc_Q8guQ|kxX2SOes4o-D3ZQBkiMRfA|  5.0|Wow!  Yummy, different,  delicious.   Our favor...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0pcmoA|bcjbaE6dDog4jkNY91ncLQ|e4Vwtrqf-wpJfwesgvdgxQ|  4.0|Cute interior and owner (?) gave us 

In [0]:
# pandas user defined function to sentence tokenize review text

@F.pandas_udf(ArrayType(StringType()))
def sent_tok(textcol: pd.Series) -> pd.Series:
    return textcol.map(lambda x: sent_tokenize(str(x)))


In [0]:
small.select('text').show(truncate=150)

+------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                                                  text|
+------------------------------------------------------------------------------------------------------------------------------------------------------+
|If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want t...|
|Family diner. Had the buffet. Eclectic assortment: a large chicken leg, fried jalapeño, tamale, two rolled grape leaves, fresh melon. All good. Lot...|
|Wow!  Yummy, different,  delicious.   Our favorite is the lamb curry and korma.  With 10 different kinds of naan!!!  Don't let the outside deter yo...|
|Cute interior and owner (?) gave us tour of upcoming patio/rooftop area which wil

In [0]:
# check output of sent_tok
sentsmalldf = small.select(sent_tok('text').alias('sent_tokenized'))
sentsmalldf.show(truncate=150)

+------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                                        sent_tokenized|
+------------------------------------------------------------------------------------------------------------------------------------------------------+
|[If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end., We have tried it multiple times, because I want...|
|[Family diner., Had the buffet., Eclectic assortment: a large chicken leg, fried jalapeño, tamale, two rolled grape leaves, fresh melon., All good....|
|[Wow!, Yummy, different,  delicious., Our favorite is the lamb curry and korma., With 10 different kinds of naan!!!, Don't let the outside deter yo...|
|[Cute interior and owner (?), gave us tour of upcoming patio/rooftop area which w

In [0]:
sentsmalldf.take(2)

Out[27]: [Row(sent_tokenized=['If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end.', 'We have tried it multiple times, because I want to like it!', "I have been to it's other locations in NJ and never had a bad experience.", 'The food is good, but it takes a very long time to come out.', 'The waitstaff is very young, but usually pleasant.', 'We have just had too many experiences where we spent way too long waiting.', 'We usually opt for another diner or restaurant on the weekends, in order to be done quicker.']),
 Row(sent_tokenized=['Family diner.', 'Had the buffet.', 'Eclectic assortment: a large chicken leg, fried jalapeño, tamale, two rolled grape leaves, fresh melon.', 'All good.', 'Lots of Mexican choices there.', 'Also has a menu with breakfast served all day long.', 'Friendly, attentive staff.', 'Good place for a casual relaxed meal with no expectations.', 'Next to the Clarion Hotel.'])]

In [0]:
# python udf to get sentiment for each sentence in review
# outputs dict of VADER sentiment (positive, negative, neutral, compound), averaged over sentences in review

outputdict = MapType(StringType(), FloatType())

@F.udf(returnType=outputdict)
def vader_analysis(revtext):
    # process review as a dataframe of sentences
    sentdf = pd.DataFrame({'sentences': revtext})

    analyzer = SentimentIntensityAnalyzer()
    sentdf['sentiment'] = sentdf['sentences'].apply(analyzer.polarity_scores)

    # split sentiment dicts into positive, neutral, negative, and compound columns
    sentimentdf = pd.DataFrame(sentdf['sentiment'].apply(pd.Series))
    
    # calculate mean of each category
    avgneg = sentimentdf['neg'].mean().round(3)
    avgneu = sentimentdf['neu'].mean().round(3)
    avgpos = sentimentdf['pos'].mean().round(3)
    avgcomp = sentimentdf['compound'].mean().round(4)

    return {'pos': float(avgpos), 'neu': float(avgneu), 'neg': float(avgneg), 'compound': float(avgcomp)}


In [0]:
# check output of vader_analysis
sentsmalldf = small.select('*', sent_tok('text').alias('sent_tokenized'))
testsmalldf = sentsmalldf.select('stars', 
                                 'text', 
                                 F.size('sent_tokenized').alias('num_sent'),
                                 F.size(F.split('text', ' ')).alias('num_word'),
                                 vader_analysis('sent_tokenized').alias('vader_analysis'))
testsmalldf.show(truncate=75)

+-----+---------------------------------------------------------------------------+--------+--------+--------------------------------------------------------------+
|stars|                                                                       text|num_sent|num_word|                                                vader_analysis|
+-----+---------------------------------------------------------------------------+--------+--------+--------------------------------------------------------------+
|  3.0|If you decide to eat here, just be aware it is going to take about 2 hou...|       7|     101|  {neg -> 0.0, pos -> 0.148, compound -> 0.2585, neu -> 0.852}|
|  3.0|Family diner. Had the buffet. Eclectic assortment: a large chicken leg, ...|       9|      55|{neg -> 0.016, pos -> 0.226, compound -> 0.2159, neu -> 0.758}|
|  5.0|Wow!  Yummy, different,  delicious.   Our favorite is the lamb curry and...|       6|      48|  {neg -> 0.0, pos -> 0.438, compound -> 0.3981, neu -> 0.562}|
|  4.0|Cut

In [0]:
# split vader_analysis column into multiple columns, one for each key

# get keys of vader_analysis column
vaderkeys = testsmalldf.select(F.map_keys('vader_analysis').alias('keys')).first().keys
vaderkeys

# make new columns
testsmalldf = testsmalldf.select(['*'] + 
                                 [F.col('vader_analysis').getItem(key).alias(key) for key in vaderkeys])
testsmalldf.drop('vader_analysis').show(truncate=100)

+-----+----------------------------------------------------------------------------------------------------+--------+--------+-----+-----+--------+-----+
|stars|                                                                                                text|num_sent|num_word|  neg|  pos|compound|  neu|
+-----+----------------------------------------------------------------------------------------------------+--------+--------+-----+-----+--------+-----+
|  3.0|If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end....|       7|     101|  0.0|0.148|  0.2585|0.852|
|  3.0|Family diner. Had the buffet. Eclectic assortment: a large chicken leg, fried jalapeño, tamale, t...|       9|      55|0.016|0.226|  0.2159|0.758|
|  5.0|Wow!  Yummy, different,  delicious.   Our favorite is the lamb curry and korma.  With 10 differen...|       6|      48|  0.0|0.438|  0.3981|0.562|
|  4.0|Cute interior and owner (?) gave us tour of upcoming patio/rooftop ar

In [0]:
# process full dataset

# sentence tokenize
tokenized_df = df.select('*', sent_tok('text').alias('sent_tokenized'))

# VADER analysis of each review
vader_df = tokenized_df.select('stars',
                               'text',
                               F.size('sent_tokenized').alias('num_sent'),
                               F.size(F.split('text', ' ')).alias('num_word'),
                               vader_analysis('sent_tokenized').alias('vader_analysis'))

# get keys of vader_analysis column
vaderkeys = vader_df.select(F.map_keys('vader_analysis').alias('keys')).first().keys
vaderkeys

# make new columns
vadersplit_df = vader_df.select(['*'] + 
                                [F.col('vader_analysis').getItem(key).alias(key) for key in vaderkeys])

# cleaned up dataframe
vadersplit_df.drop('vader_analysis').show(truncate=100)


+-----+----------------------------------------------------------------------------------------------------+--------+--------+-----+-----+--------+-----+
|stars|                                                                                                text|num_sent|num_word|  neg|  pos|compound|  neu|
+-----+----------------------------------------------------------------------------------------------------+--------+--------+-----+-----+--------+-----+
|  3.0|If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end....|       7|     101|  0.0|0.148|  0.2585|0.852|
|  3.0|Family diner. Had the buffet. Eclectic assortment: a large chicken leg, fried jalapeño, tamale, t...|       9|      55|0.016|0.226|  0.2159|0.758|
|  5.0|Wow!  Yummy, different,  delicious.   Our favorite is the lamb curry and korma.  With 10 differen...|       6|      48|  0.0|0.438|  0.3981|0.562|
|  4.0|Cute interior and owner (?) gave us tour of upcoming patio/rooftop ar