In [1]:
import re
import nltk

from pyspark import SparkContext
from user_definition import *

from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import udf

from pyspark.ml.feature import CountVectorizer
from pyspark.ml.clustering import LDA
from pyspark.ml.linalg import Vectors, VectorUDT

# Reuse the SparkContext that is already running, or start one if none exists.
sc = SparkContext.getOrCreate()
# SQL entry point bound to the same context.
sqlContext = SQLContext(sc)

# Last expression of the cell: show the context object as the cell output.
sc

In [2]:
# Data source class of the MongoDB Spark connector.
format_string = "com.mongodb.spark.sql.DefaultSource"


def _load_yelp_collection(collection):
    """Load one collection of the `yelp` Mongo database as a DataFrame.

    `ip` is not defined in this notebook; presumably it comes from the
    `user_definition` star import -- confirm.
    """
    uri = "mongodb://{}/yelp.{}".format(ip, collection)
    # Use the sqlContext created in the first cell rather than an implicit
    # `spark` global, which this notebook never defines.
    return sqlContext.read.format(format_string).option("uri", uri).load()


business = _load_yelp_collection("business")
# `review` is required by the following cells (select + join), so it has to
# be loaded here for a fresh "Restart & Run All" to succeed.
review = _load_yelp_collection("review")

# Parenthesised single-argument form prints the same on Python 2 and 3.
print("ip = %s" % ip)
print("")
business.printSchema()

ip = 54.186.189.193

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: boolean (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: struct (nullable = true)
 |    |    |-- casual: boolean (nullable = true)
 |    |    |-- classy: boolean (nullable = true)
 |    |    |-- divey: boolean (nullable = true)
 |    |    |-- hipster: boolean (nullable = true)
 |    |    |-- intimate: boolean (nullable = true)
 |    |    |-- romantic: boolean (nullable = true)
 |    |    |-- touristy: boolean (nullable = true)
 |    |    |-- trendy: boolean (nullable = true)
 |    |    |-- upscale: boolean (nullable = true)
 |    |-- BYOB: boolean (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: struct (nullable = true)
 |    |    |-- monday: boolean (nullable =

In [3]:
# Keep only the join key and the review body; the other review fields are
# not used by the rest of the notebook.
review_columns = ['business_id', 'text']
review = review.select(*review_columns)
review.show(5)

+--------------------+--------------------+
|         business_id|                text|
+--------------------+--------------------+
|uYHaNptLzDLoV_JZ_...|If you need an in...|
|uYHaNptLzDLoV_JZ_...|Mittlerweile gibt...|
|uYHaNptLzDLoV_JZ_...|Location is every...|
|uYHaNptLzDLoV_JZ_...|My girlfriend and...|
|uYHaNptLzDLoV_JZ_...|gute lage im stad...|
+--------------------+--------------------+
only showing top 5 rows



In [4]:
# Business ids located in `city` (`city` is not defined in this notebook;
# presumably it comes from the user_definition star import -- confirm).
ids = business.where(business['city'] == city).select('business_id')
# Attach every review written for those businesses.
joined = ids.join(review, on='business_id', how='inner')
# Each joined row is (business_id, text); keep position 1, the review text,
# and cache the RDD because it is reused by later cells.
train_raw = joined.rdd.map(lambda row: row[1]).cache()
train_raw.take(5)

[u"This is one star for service from Jennifer. I had set up time to see her at 9am when they opened. At 9am I was still waiting outside in line behind another gentleman, who happened to be in a wheelchair. At 9:04, Jennifer walked to the front door and unlocked it. I found it rude for her not to open the door for the wheelchair client. She hesitated to speak to me, assuming I was there with the other gentleman. If I'm going to sit in a chair with a stylist for hours, I don't care how good she/he is, I will not to pay my hard earned $ to someone who doesn't treat others kindly. I pay well for good service, so due to Jennifer, I will not only never return but I will not recommend Yelpers/friends/family to see Jennifer at Supercuts",
 u"Nope. Wont happen again. I promise! In a hurry after moving to town without my favorite hair stylist. Decided to gamble. Brought in a pic and got the same, usual cut, just like last time (elsewhere). A 10 minute cut is not the same as a 30-45 minute cut. M

In [7]:
# Number of review texts kept for the selected city.
train_raw.count()

114829

In [100]:
# Stop words, one entry per line of the local ./stop_words file.
stop_words = sc.textFile("./stop_words").collect()
# Extra words dropped by tokenize() in addition to the stop words.
common_words = ['place','time','times','food','things','customer','service','people','staff','area','order',\
                'year','years','day','days','minutes','company','city','customers','price','prices','money',\
                'location','hour','hours']
# Part-of-speech tags kept by tokenize(); the adjective tags are disabled.
accepted_pos = ['NN','NNP','NNS']#,'JJ','JJR','JJS']

# Built once per cell run instead of once per tokenize() call. Re-run this
# cell after changing any of the lists above so these lookups are refreshed.
_excluded_words = frozenset(stop_words).union(common_words)
_accepted_pos = frozenset(accepted_pos)
# Matches the listed punctuation characters (the apostrophe is not among
# them), digits, and CR / TAB / LF.
_strip_regex = re.compile('[' + re.escape("!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~") + '0-9\\r\\t\\n]')


def tokenize(text):
    """Return the noun unigrams of a review text.

    The text is lower-cased, every character matched by `_strip_regex` is
    replaced by a space, and the result is tokenized with
    `nltk.word_tokenize`. Tokens of length <= 2 and tokens found in
    `stop_words` or `common_words` are dropped. The remaining tokens are
    tagged with `nltk.pos_tag` and only those whose tag is in `accepted_pos`
    are returned.

    :param text: review text (string)
    :return: list of tokens, in order of appearance
    """
    cleaned = _strip_regex.sub(" ", text.lower())
    words = nltk.word_tokenize(cleaned, 'english', False)
    # Set membership: constant-time per token instead of scanning two lists.
    words = [w for w in words if len(w) > 2 and w not in _excluded_words]
    # Tagging happens after filtering, exactly as before.
    return [word for word, pos in nltk.pos_tag(words) if pos in _accepted_pos]

In [101]:
# UDF that turns an array column into a dense ML vector.
list_to_vector_udf = udf(lambda values: Vectors.dense(values), VectorUDT())
# One row per review with a single column (auto-named `_1`) holding its tokens.
word_occurence = train_raw.map(lambda review_text: [tokenize(review_text)]).toDF()

# Bag-of-words counts over a vocabulary capped at 2000 terms.
cv = CountVectorizer(vocabSize=2000, inputCol='_1', outputCol='features')
cv_model = cv.fit(word_occurence)
train = cv_model.transform(word_occurence)
# cache() marks the DataFrame as persisted in place and returns the same object.
train.cache()

train.show(5)

+--------------------+--------------------+
|                  _1|            features|
+--------------------+--------------------+
|[star, jennifer, ...|(2000,[6,40,67,75...|
|[nope, promise, h...|(2000,[72,92,169,...|
|[review, lot, mor...|(2000,[2,6,14,34,...|
|[haircut, experie...|(2000,[0,1,5,9,12...|
|[absolute, haircu...|(2000,[72,169,408...|
+--------------------+--------------------+
only showing top 5 rows



In [137]:
# Fit a 30-topic LDA model on the count vectors, limited to 10 iterations.
lda_estimator = LDA(maxIter=10, k=30)
lda_model = lda_estimator.fit(train)

In [138]:
def indices_to_terms(vocabulary):
    """Build a UDF that maps an array of term indices to vocabulary terms.

    :param vocabulary: indexable collection of terms; position i holds the
        term for index i
    :return: a Spark UDF returning an array of strings
    """
    # The inner function keeps the outer name so the UDF's registered name
    # stays the same.
    def indices_to_terms(xs):
        terms = []
        for index in xs:
            # Indices are cast to int before being used as a position.
            terms.append(vocabulary[int(index)])
        return terms

    term_array_type = ArrayType(StringType())
    return udf(indices_to_terms, term_array_type)

# Describe each topic by its 5 highest-weighted terms.
topics = lda_model.describeTopics(5)
# UDF translating term indices into words from the CountVectorizer vocabulary.
terms_for_indices = indices_to_terms(cv_model.vocabulary)
(topics
 .withColumn("topics_words", terms_for_indices("termIndices"))
 .select('topics_words')
 .show(truncate=False))

+------------------------------------------------+
|topics_words                                    |
+------------------------------------------------+
|[sushi, bar, night, bread, roll]                |
|[car, work, oil, shop, change]                  |
|[work, call, business, phone, job]              |
|[office, manager, care, experience, water]      |
|[dress, dresses, work, sum, dim]                |
|[chicken, rice, restaurant, thai, soup]         |
|[drive, stars, pho, joe, hour]                  |
|[party, barber, birthday, lee, tenders]         |
|[location, quality, job, shop, breakfast]       |
|[store, prices, selection, items, shop]         |
|[room, hotel, desk, night, stay]                |
|[hair, love, cut, salon, appointment]           |
|[fries, cream, store, ice, kids]                |
|[pizza, wings, crust, bar, location]            |
|[unit, job, gas, guy, business]                 |
|[massage, apartment, management, room, pedicure]|
|[beef, lunch, tacos, sauce, ch

In [134]:
# Ten cities with the most businesses, largest first.
city_counts = business.groupBy('city').count()
city_counts.sort('count', ascending=False).show(10)

+----------+-----+
|      city|count|
+----------+-----+
| Las Vegas|24768|
|   Phoenix|15656|
|   Toronto|15483|
| Charlotte| 7557|
|Scottsdale| 7510|
|Pittsburgh| 5688|
|  Montréal| 5175|
|      Mesa| 5146|
| Henderson| 4130|
|     Tempe| 3949|
+----------+-----+
only showing top 10 rows



In [1]:
import string

In [2]:
# Show the standard library's punctuation characters.
# NOTE(review): presumably a reference for the hand-written punctuation
# string used in tokenize() -- confirm, or drop this scratch cell.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'