In [32]:
from pyspark import SparkContext
# Reuse the SparkContext that is already running, or start one if none exists.
sc = SparkContext.getOrCreate()

In [33]:
# Bare expression: the notebook renders the SparkContext as this cell's output.
sc

In [None]:
import re
import string

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

In [34]:
# MongoDB collection holding the raw reviews (database "project", collection "review").
MONGO_REVIEW_URI = "mongodb://ec2-34-216-224-27.us-west-2.compute.amazonaws.com/project.review"

# `spark` was never created in this notebook, so the cell only worked when the
# kernel happened to inject it. getOrCreate() returns the active session if there
# is one and otherwise builds a session for the current Spark application.
spark = SparkSession.builder.getOrCreate()

# Load the whole review collection as a DataFrame through the MongoDB Spark connector.
review_raw = (
    spark.read.format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", MONGO_REVIEW_URI)
    .load()
)

In [35]:
# Keep only the review body and its star rating.
review = review_raw.select('text', 'stars')

# Work on a 5% sample drawn without replacement; the fixed seed keeps it repeatable.
review_sub = review.sample(withReplacement=False, fraction=0.05, seed=1)

# Ratings below 3 are treated as negative and ratings of 4 or more as positive;
# rows with exactly 3 stars end up in neither set.
review_neg = review_sub.where("stars<3").select("text")
review_pos = review_sub.where("stars>=4").select("text")

In [13]:
# Matches any single punctuation character, digit, carriage return, tab or newline.
# Built once here rather than on every call, since the function runs once per review.
_NUM_PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def remove_num_punct(text):
    """Lower-case ``text`` and blank out punctuation, digits and line breaks.

    Every matched character is replaced by a single space (rather than being
    deleted) so that neighbouring words do not get glued together. Hyphens are
    part of ``string.punctuation`` and are therefore covered by the same pass.

    Args:
        text: The raw review text, or ``None`` for a missing value.

    Returns:
        The cleaned, lower-cased string. ``None`` input yields ``""`` instead of
        raising ``AttributeError``, so a single null row cannot abort the job.
    """
    if text is None:
        return ""
    return _NUM_PUNCT_RE.sub(" ", text.lower())

# Column-level wrapper so remove_num_punct can be applied inside DataFrame.select().
udf_num_punct = udf(lambda raw_text: remove_num_punct(raw_text))

In [25]:
def bi_gram(words):
    """Build the bigrams of a token list.

    Empty tokens are dropped first, then each pair of adjacent remaining tokens
    is joined with a single space.

    Args:
        words: Sequence of string tokens (may contain empty strings).

    Returns:
        List of "w1 w2" strings; empty when fewer than two tokens remain.
    """
    tokens = [tok for tok in words if len(tok) > 0]
    # Pairing the list with itself shifted by one yields every adjacent pair.
    return [" ".join(pair) for pair in zip(tokens, tokens[1:])]

In [40]:
def tri_gram(words):
    """Build the trigrams of a token list.

    Empty tokens are dropped first, then each run of three adjacent remaining
    tokens is joined with single spaces.

    Args:
        words: Sequence of string tokens (may contain empty strings).

    Returns:
        List of "w1 w2 w3" strings; empty when fewer than three tokens remain.
    """
    tokens = [tok for tok in words if len(tok) > 0]
    # Zipping the list with its one- and two-step shifts yields every adjacent triple.
    return [" ".join(triple) for triple in zip(tokens, tokens[1:], tokens[2:])]

In [17]:
# Stage 1: split the cleaned "text" column into a "words" array.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

# Stage 2: drop stop words (case-insensitively) from "words" into "filtered".
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    caseSensitive=False,
)

# The same two-stage pipeline is applied to both the positive and negative reviews.
pipeline = Pipeline(stages=[tokenizer, remover])

## Positive Review

In [21]:
# Strip digits/punctuation from the positive reviews, keeping the column name "text".
review_pos = review_pos.select(udf_num_punct('text').alias('text'))

# Tokenize and remove stop words; only the resulting "filtered" column is kept.
pos_words = (
    pipeline.fit(review_pos)
    .transform(review_pos)
    .select("filtered")
)

# Unwrap each single-column Row into its token list and cache it, because the
# unigram, bigram and trigram counts below all start from this RDD.
pos_rdd = pos_words.rdd.map(lambda row: row[0]).cache()

+--------------------+
|            filtered|
+--------------------+
|[location, everyt...|
|[, st, , place, c...|
|[went, dinner, pe...|
|[love, food, cont...|
|[service, great, ...|
|[downside, place,...|
|[hear, couple, ti...|
|[one, great, las,...|
|[first, visit, ev...|
|[great, scenery, ...|
|[today, another, ...|
|[yum, , good, foo...|
|[really, good, sa...|
|[small, restauran...|
|[clean, friendly,...|
|[walked, scrubs, ...|
|[craving, authent...|
|[mi, pueblo, best...|
|[food, tasty, , g...|
|[place, great, , ...|
+--------------------+
only showing top 20 rows



In [None]:
# Flatten the positive reviews into a stream of non-empty tokens.
pos_uni = pos_rdd.flatMap(lambda words: [w for w in words if len(w) > 0])

# Count each token and order the (token, count) pairs by descending count.
top_pos_uni = (
    pos_uni.map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# take(20) returns the same first 20 rows as collect()[0:20] but does not ship
# the entire frequency table to the driver just to display its head.
top_pos_uni.take(20)

In [30]:
# Expand every positive review into its bigrams.
pos_bi = pos_rdd.flatMap(bi_gram)

# Count each bigram and order the (bigram, count) pairs by descending count.
top_pos_bi = (
    pos_bi.map(lambda gram: (gram, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# take(20) returns the same first 20 rows as collect()[0:20] but does not ship
# the entire frequency table to the driver just to display its head.
top_pos_bi.take(20)

[(u'highly_recommend', 7745),
 (u'first_time', 6614),
 (u'customer_service', 5770),
 (u'really_good', 5609),
 (u'come_back', 5473),
 (u'go_back', 5378),
 (u'great_place', 4864),
 (u'las_vegas', 4806),
 (u'great_service', 4780),
 (u'love_place', 4540),
 (u'next_time', 4426),
 (u've_ever', 4367),
 (u'great_food', 4345),
 (u'ice_cream', 4315),
 (u'every_time', 3878),
 (u'make_sure', 3837),
 (u'one_best', 3792),
 (u'staff_friendly', 3720),
 (u'food_great', 3713),
 (u'service_great', 3660),
 (u'happy_hour', 3613),
 (u'pretty_good', 3598),
 (u'food_good', 3429),
 (u'good_food', 3245),
 (u'coming_back', 2893),
 (u'feel_like', 2884),
 (u'even_though', 2857),
 (u'definitely_back', 2765),
 (u'definitely_recommend', 2634),
 (u'friendly_staff', 2518)]

# Expand every positive review into its trigrams. This previously called
# bi_gram, so the "trigram" table was just a second copy of the bigram counts.
pos_tri = pos_rdd.flatMap(tri_gram)

# Count each trigram and order the (trigram, count) pairs by descending count.
top_pos_tri = (
    pos_tri.map(lambda gram: (gram, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# take(20) returns the same first 20 rows as collect()[0:20] but does not ship
# the entire frequency table to the driver just to display its head.
top_pos_tri.take(20)

## Negative Review

In [None]:
# Strip digits/punctuation from the negative reviews, keeping the column name "text".
review_neg = review_neg.select(udf_num_punct('text').alias('text'))

# Tokenize and remove stop words; only the resulting "filtered" column is kept.
neg_words = (
    pipeline.fit(review_neg)
    .transform(review_neg)
    .select("filtered")
)

# Unwrap each single-column Row into its token list and cache it, because the
# unigram, bigram and trigram counts below all start from this RDD.
neg_rdd = neg_words.rdd.map(lambda row: row[0]).cache()

In [None]:
# Flatten the negative reviews into a stream of non-empty tokens.
neg_uni = neg_rdd.flatMap(lambda words: [w for w in words if len(w) > 0])

# Count each token and order the (token, count) pairs by descending count.
top_neg_uni = (
    neg_uni.map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# take(20) returns the same first 20 rows as collect()[0:20] but does not ship
# the entire frequency table to the driver just to display its head.
top_neg_uni.take(20)

In [None]:
# Expand every negative review into its bigrams.
neg_bi = neg_rdd.flatMap(bi_gram)

# Count each bigram and order the (bigram, count) pairs by descending count.
top_neg_bi = (
    neg_bi.map(lambda gram: (gram, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# take(20) returns the same first 20 rows as collect()[0:20] but does not ship
# the entire frequency table to the driver just to display its head.
top_neg_bi.take(20)

In [41]:
# Expand every negative review into its trigrams.
neg_tri = neg_rdd.flatMap(tri_gram)

# Count each trigram and order the (trigram, count) pairs by descending count.
top_neg_tri = (
    neg_tri.map(lambda gram: (gram, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# take(20) returns the same first 20 rows as collect()[0:20] but does not ship
# the entire frequency table to the driver just to display its head.
top_neg_tri.take(20)

[(u'never_go_back', 770),
 (u'go_somewhere_else', 404),
 (u'worst_customer_service', 377),
 (u'never_come_back', 330),
 (u'poor_customer_service', 323),
 (u'horrible_customer_service', 322),
 (u'won_going_back', 313),
 (u'waste_time_money', 256),
 (u'long_story_short', 245),
 (u'never_going_back', 229),
 (u'terrible_customer_service', 219),
 (u'worst_service_ever', 213),
 (u'worst_experience_ever', 194),
 (u'give_zero_stars', 192),
 (u'customer_service_ever', 190),
 (u'won_coming_back', 189),
 (u'really_wanted_like', 186),
 (u'wanted_like_place', 183),
 (u'bad_customer_service', 174),
 (u'never_coming_back', 174),
 (u'recommend_place_anyone', 170),
 (u'took_minutes_get', 168),
 (u'won_go_back', 167),
 (u'm_pretty_sure', 166),
 (u'good_customer_service', 163),
 (u'nothing_write_home', 162),
 (u'food_good_service', 161),
 (u'asked_speak_manager', 153),
 (u'didn_even_get', 150),
 (u'customer_service_skills', 148)]