In [1]:
import re
import nltk
import string

from pyspark import SparkContext

from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import udf

from pyspark.ml.feature import CountVectorizer
from pyspark.ml.clustering import LDA
from pyspark.ml.linalg import Vectors, VectorUDT

# Reuse the SparkContext that is already running, or start one if there is none.
sc = SparkContext.getOrCreate()
# SQL entry point bound to that context.
sqlContext = SQLContext(sc)

# Bare expression as the last line of the cell so the notebook displays the context.
sc

In [16]:
# Connection settings for the MongoDB instance that holds the Yelp collections.
ip = "54.186.189.193"
city = "Las Vegas"
format_string = "com.mongodb.spark.sql.DefaultSource"

# Read through the `sqlContext` created in the first cell. The previous version used a
# `spark` variable that this notebook never defines, so the cell only worked when the
# kernel launcher happened to inject one.
business = (sqlContext.read.format(format_string)
            .option("uri", "mongodb://{}/yelp.business".format(ip))
            .load())
review = (sqlContext.read.format(format_string)
          .option("uri", "mongodb://{}/yelp.review".format(ip))
          .load())

# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("ip = %s" % ip)
print("")
review.printSchema()

ip = 54.186.189.193

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- cool: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: integer (nullable = true)
 |-- user_id: string (nullable = true)



In [17]:
# Number of businesses per city, largest first; display the first 10 rows.
(business
    .groupby('city')
    .count()
    .orderBy('count', ascending=False)
    .show(10))

+----------+-----+
|      city|count|
+----------+-----+
| Las Vegas|24768|
|   Phoenix|15656|
|   Toronto|15483|
| Charlotte| 7557|
|Scottsdale| 7510|
|Pittsburgh| 5688|
|  Montréal| 5175|
|      Mesa| 5146|
| Henderson| 4130|
|     Tempe| 3949|
+----------+-----+
only showing top 10 rows



In [19]:
# Business ids located in `city`, and the two review columns needed downstream.
ids = business.where(business['city'] == city).select('business_id')
review = review.select('business_id', 'text')

# Inner join keeps only the reviews written about businesses in `city`.
joined = ids.join(review, on='business_id', how='inner')

# Joined rows are (business_id, text); keep just the review text and cache it for reuse.
train_raw = joined.rdd.map(lambda row: row['text']).cache()
train_raw.take(1)

[u"What can I say.. Wowzers! Probably one of the best steak houses I've been too. Service was absolutely flawless and dinner was excellent . Ordered seafood tower, wedge, wagyu filet, chateaubriand, bacon grits and saut\xe9ed  mushrooms Will definitely be back!"]

In [41]:
# Disabled check: counts the elements of train_raw. Left commented out, so it does not run.
# NOTE(review): the output recorded under this cell presumably comes from an earlier run
# with the line enabled - confirm, or clear the stale output.
# train_raw.count()

108512

In [22]:
# Load the stop-word file (one entry per line) on the driver. The context manager closes
# the file handle, and a plain comprehension replaces the former Spark
# parallelize/collect round trip, which only stripped whitespace from lines that were
# already in driver memory. The resulting list is the same.
with open("./stop_words", "r") as stop_words_file:
    stop_words = [line.strip() for line in stop_words_file]

# Frequent words that tokenize() drops in addition to the stop words.
common_words = ['place','time','times','food','things','customer','service','people','staff','area','order',
                'year','years','day','days','minutes','company','city','customers','price','prices','money',
                'location','hour','hours',"don't","doesn't","didn't","won't"]

# Part-of-speech tags that tokenize() keeps (noun tags); the adjective tags are disabled.
accepted_pos = ['NN','NNP','NNS']#,'JJ','JJR','JJS']

def tokenize(text):
    """Turn one review into a list of lower-cased noun tokens.

    Steps: lower-case the text, replace punctuation, digits and \\r/\\t/\\n with spaces,
    split it with nltk.word_tokenize, drop tokens of length <= 2 and tokens listed in
    `stop_words` or `common_words`, then keep only tokens whose POS tag is in
    `accepted_pos`.

    Args:
        text: The review text. May be None or empty.

    Returns:
        A list of tokens; empty when `text` is None or empty.
    """
    # The review schema marks `text` as nullable; a single null value would otherwise
    # raise on `.lower()` and fail the whole Spark job.
    if not text:
        return []

    # The apostrophe is absent from the punctuation class, so it is left in the text.
    regex = re.compile('[' + re.escape('!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~') + '0-9\\r\\t\\n]')
    words = nltk.word_tokenize(regex.sub(" ", text.lower()))

    # One set per call makes each membership test O(1) instead of scanning both lists
    # for every token.
    excluded = set(stop_words).union(common_words)
    words = [w for w in words if len(w) > 2 and w not in excluded]

    return [word for word, pos in nltk.pos_tag(words) if pos in accepted_pos]

In [23]:
# Tokenize every review and wrap the token list in a one-element list so that toDF()
# yields a single array column, auto-named "_1".
word_occurence = train_raw.map(lambda review_text: [tokenize(review_text)]).toDF()

# Bag-of-words features with the vocabulary capped at 2000 terms.
cv = CountVectorizer(vocabSize=2000, inputCol='_1', outputCol='features')
cv_model = cv.fit(word_occurence)
train = cv_model.transform(word_occurence).cache()

train.show(5)

+--------------------+--------------------+
|                  _1|            features|
+--------------------+--------------------+
|[wowzers, steak, ...|(2000,[12,65,155,...|
|[hopes, delmonico...|(2000,[3,4,12,20,...|
|[guys, weekend, a...|(2000,[65,97,122,...|
|[stars, bump, thi...|(2000,[3,7,14,16,...|
|[herb, butter, st...|(2000,[2,35,109,1...|
+--------------------+--------------------+
only showing top 5 rows



In [28]:
# Fit an LDA model with 15 topics and at most 10 iterations on the count-vector features.
lda_model = LDA(
    maxIter=10,
    k=15,
).fit(train)

In [29]:
def indices_to_terms(vocabulary):
    """Build a Spark UDF that translates term indices into vocabulary words.

    Args:
        vocabulary: Sequence of terms, indexable by integer position.

    Returns:
        A UDF (array-of-strings return type) that maps each index in its input to
        `vocabulary[int(index)]`.
    """
    def lookup_terms(term_indices):
        terms = []
        for idx in term_indices:
            terms.append(vocabulary[int(idx)])
        return terms

    return udf(lookup_terms, ArrayType(StringType()))

# Describe each topic with up to 10 terms, then translate the term indices into words
# using the CountVectorizer vocabulary and print them untruncated.
topics = lda_model.describeTopics(10)
(topics
    .withColumn("topics_words", indices_to_terms(cv_model.vocabulary)("termIndices"))
    .select('topics_words')
    .show(truncate=False))

+-----------------------------------------------------------------------------------+
|topics_words                                                                       |
+-----------------------------------------------------------------------------------+
|[buffet, lunch, quality, dinner, crab, restaurant, meat, chef, rib, eat]           |
|[cream, ice, vegas, chocolate, tea, love, bar, buffet, flavors, experience]        |
|[sushi, pork, rolls, rice, spicy, roll, vegas, soup, beef, dishes]                 |
|[breakfast, eggs, massage, vegas, coffee, pancakes, toast, ramen, menu, eat]       |
|[room, hotel, rooms, pool, vegas, night, stay, casino, strip, check]               |
|[restaurant, steak, waiter, experience, meal, vegas, dinner, night, lobster, filet]|
|[fries, burger, burgers, sandwich, beer, coffee, sandwiches, bar, meat, beef]      |
|[car, work, job, experience, phone, office, business, store, call, manager]        |
|[hair, vegas, las, hotel, night, cut, salon, cake, ex