In [14]:
from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *

# Reuse the running SparkContext if the kernel already has one, otherwise start one.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
# `spark` is used further down to read the review data. Define it explicitly so the
# notebook does not depend on a kernel that pre-injects the session; getOrCreate()
# returns the already-active session when there is one.
spark = SparkSession.builder.getOrCreate()

In [21]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import NGram
from pyspark.sql.functions import udf
from pyspark.ml.feature import StopWordsRemover

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

from pyspark.ml import Pipeline, PipelineModel


In [3]:
# Load the reviews from a JSON file (path is relative to the working directory)
# into a Spark DataFrame.
# NOTE(review): relies on a `spark` session object existing in the kernel — confirm
# it is defined before this cell runs.
review_data = spark.read.json("review.json")

In [122]:
# Preview a single row to see which columns were loaded.
review_data.show(1)

+--------------------+----+----------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|      date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+----------+-----+--------------------+-----+--------------------+------+--------------------+
|uYHaNptLzDLoV_JZ_...|   0|2016-07-12|    0|VfBHSwC5Vz_pbFluy...|    5|My girlfriend and...|     0|cjpdDjZyprfyDG3Rl...|
+--------------------+----+----------+-----+--------------------+-----+--------------------+------+--------------------+
only showing top 1 row



In [None]:
## Business average rating

### Check the distribution of star ratings

In [3]:
# Count the reviews for each star rating. show() prints the table itself and
# returns None, so it must not be wrapped in print (that only adds a stray "None").
review_data.groupBy(review_data["stars"]).count().show()

+-----+-------+
|stars|  count|
+-----+-------+
|    5|1988003|
|    1| 639849|
|    3| 570819|
|    2| 402396|
|    4|1135830|
+-----+-------+

None


Exclude neutral reviews

In [6]:
def pos_neg(star):
    """Map a star rating to a sentiment class.

    Args:
        star: Numeric star rating, or None when the rating is missing.

    Returns:
        0 when star < 3 (negative), 1 when star > 3 (positive), 2 otherwise
        (neutral). None is returned for a missing rating, so the row ends up
        with a null label instead of being compared against a number.
    """
    if star is None:
        # Comparing None with an int raises on Python 3 and silently sorts as
        # "less than" on Python 2; treat a missing rating as unlabeled instead.
        return None
    if star < 3:
        return 0  # negative
    if star > 3:
        return 1  # positive
    return 2  # neutral
    
# Declare the UDF's return type: without it udf() defaults to a string column, and
# the numeric comparison in the filter below only works through implicit casting.
star_to_senti = udf(pos_neg, IntegerType())
train_test_DF_raw = (
    review_data
    .select('text', star_to_senti('stars').alias('label'))
    .filter("label != 2")  # exclude neutral reviews
)

In [7]:
from pyspark.sql.types import *
# Replace the label column with a double-typed version of itself.
train_test_DF = train_test_DF_raw.withColumn(
    "label",
    train_test_DF_raw.label.cast("double"),
)

In [8]:
# Confirm the column types after the cast (label should now be a double).
train_test_DF.printSchema()

root
 |-- text: string (nullable = true)
 |-- label: double (nullable = true)



In [6]:
# Count the reviews in each sentiment class. show() prints the table itself and
# returns None, so it must not be wrapped in print (that only adds a stray "None").
train_test_DF.groupBy(train_test_DF["label"]).count().show()

+-----+-------+
|label|  count|
+-----+-------+
|    0|1042245|
|    1|3123833|
+-----+-------+

None


In [6]:
# Preview one labeled row.
train_test_DF.show(1)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|My girlfriend and...|    1|
+--------------------+-----+
only showing top 1 row



### Create TFIDF features

In [9]:
#remove punctuation
import re
import string

# Matches any single punctuation character, digit, carriage return, tab or newline.
# Compiled once here rather than on every call, since the function runs per row.
_NUM_PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def remove_num_punct(text):
    """Strip punctuation and digits from a piece of text and normalize whitespace.

    Args:
        text: The raw text, or None when the value is missing.

    Returns:
        The text with every punctuation character (hyphens included), digit and
        CR/tab/newline replaced by a space, then collapsed so that words are
        separated by exactly one space with no leading or trailing whitespace.
        A missing (None) input yields an empty string instead of raising.
    """
    if text is None:
        return ""
    # Substitute a space (rather than deleting) so neighbouring words do not
    # clump together, e.g. "foo-bar" -> "foo bar".
    spaced = _NUM_PUNCT_RE.sub(" ", text)
    # split() with no argument drops empty pieces and surrounding whitespace,
    # so re-joining collapses every run of whitespace to one space.
    return ' '.join(spaced.split())

# Wrap the text cleaner as a string-returning Spark UDF, apply it to the review
# text, and print one fully expanded row as a sanity check.
udf_num_punct = udf(remove_num_punct, StringType())
cleaned_text = udf_num_punct('text').alias('text')
review_rmsw = train_test_DF.select(cleaned_text, 'label')
review_rmsw.show(1, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
# Size of the hashed feature vector; passed to HashingTF.setNumFeatures in the
# feature-stage cells below.
# NOTE(review): the saved prediction outputs further down show 20-dimensional
# vectors, so they appear to come from a run with a smaller value — re-run to refresh.
n_features = 1000

### Unigram tfidf

In [11]:
# Unigram feature stages: text -> words -> filtered (stop words removed)
# -> rawFeatures (hashed term frequencies) -> features (TF-IDF).
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered", caseSensitive=False)
hashingTF = HashingTF(numFeatures=n_features, inputCol="filtered", outputCol="rawFeatures")
idf = IDF(minDocFreq=0, inputCol="rawFeatures", outputCol="features")

### Split train and test data

In [12]:
# Fix the seed so the 80/20 split is the same on every run; without it each model
# below would be trained and scored on a different random partition after a restart.
SPLIT_SEED = 1234
train_set, test_set = review_rmsw.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
# Both sets are reused by every model below, so keep them cached.
train_set = train_set.cache()
test_set = test_set.cache()

### Define evaluation metrics

In [19]:
# compute accuracy on the test set 
def evaluate_metric(predictions):
    """Print area under ROC, F1 score and accuracy for a DataFrame of predictions.

    Args:
        predictions: DataFrame returned by a fitted pipeline's transform(). It must
            contain "label" and "prediction" columns. Area under ROC is computed
            from the "rawPrediction" column and is skipped (with a message) when
            that column is absent, so the F1 and accuracy lines are still printed.

    Returns:
        None. The three metrics are printed.
    """
    if "rawPrediction" in predictions.columns:
        auc_evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
        # Single pre-formatted argument so the output is the same whether print is
        # the Python 2 statement or the Python 3 function.
        print("Area under ROC curve: %s" % auc_evaluator.evaluate(predictions))
    else:
        print("Area under ROC curve: n/a (predictions have no rawPrediction column)")

    f1_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1")
    f1 = f1_evaluator.evaluate(predictions)
    print("F1_score = %0.4f" % (f1))

    accuracy_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = accuracy_evaluator.evaluate(predictions)
    print("Accuracy = %0.4f" % (accuracy))
    

### Model 1: Logistic regression

In [16]:
%%time
lr =  LogisticRegression(maxIter=100, regParam=0.01, elasticNetParam=0.8)
pipeline=Pipeline(stages=[tokenizer,remover,hashingTF,idf, lr])
logreg_model=pipeline.fit(train_set)
predictions = logreg_model.transform(test_set)
#print evaluation metrics
evaluate_metric(predictions)

CPU times: user 3.31 ms, sys: 3.09 ms, total: 6.4 ms
Wall time: 8.25 ms


In [None]:
# Inspect the first few logistic-regression predictions.
predictions.show(5)

In [None]:
# print "Logistic regression features column=",lr.getFeaturesCol()
# print "logistic regression label column=",lr.getLabelCol()
# print "Logistic regression threshold=",lr.getThreshold()

In [None]:
# print "Tokenizer:"
# print tokenizer.explainParams()
# print "***************************"
# print "Remover:"
# print remover.explainParams()
# print "***************************"
# print "HashingTF:"
# print hashingTF.explainParams()
# print "***************************"
# print "IDF:"
# print idf.explainParams()
# print "***************************"
# print "LogisticRegression:"
# print lr.explainParams()
# print "***************************"
# print "Pipeline:"
# print pipeline.explainParams()

### Cross-validation to find the best parameters

In [None]:
# paramGrid = ParamGridBuilder()\
#     .addGrid(hashingTF.numFeatures,[100,1000,10000])\
#     .addGrid(idf.minDocFreq,[0,10,100])\
#     .build()

In [None]:
# evaluator = BinaryClassificationEvaluator().setMetricName("areaUnderROC")
# cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(5)

In [None]:
# %%time
# cvModel = cv.fit(train_set)
# print "Area under the ROC curve for best fitted model =",evaluator.evaluate(cvModel.transform(test_set))

In [None]:
# print "Area under ROC curve for non-tuned model:",evaluator.evaluate(predictions)
# print "Area under ROC curve for fitted model:",evaluator.evaluate(cvModel.transform(test_set))

### Model 2: Unigram Naive Bayes

In [None]:
%%time
nb = NaiveBayes(smoothing = 1.0, modelType = "multinomial")
pipeline=Pipeline(stages=[tokenizer,remover,hashingTF,idf, nb])
nb_model=pipeline.fit(train_set)
nb_prediction = nb_model.transform(test_set)
#print evaluation metrics
evaluate_metric(nb_prediction)

In [17]:
# Inspect the first few unigram Naive Bayes predictions.
nb_prediction.show(5)

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                text|label|               words|            filtered|         rawFeatures|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|A Brazilian culin...|  1.0|[a, brazilian, cu...|[brazilian, culin...|(20,[2,5,6,7,8,9,...|(20,[2,5,6,7,8,9,...|[-23.283075401701...|[0.24375668975552...|       1.0|
|A C wasn t workin...|  1.0|[a, c, wasn, t, w...|[c, wasn, working...|(20,[0,2,3,6,7,8,...|(20,[0,2,3,6,7,8,...|[-24.680653558029...|[0.22196539433659...|       1.0|
|A Cut above the r...|  1.0|[a, cut, above, t...|[cut, rest, impre...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|[-153.87506614939...|[0.21145996055961...|       1.0|
|A F

### Model 3: Bigram Naive Bayes

#### Bigram tfidf

In [23]:
#tokenizer = Tokenizer().setInputCol("text").setOutputCol("words")
#remover= StopWordsRemover().setInputCol("words").setOutputCol("filtered").setCaseSensitive(False)
# Bigram feature stages: filtered words -> bigrams -> hashed term frequencies -> TF-IDF.
bigram = NGram().setN(2).setInputCol("filtered").setOutputCol("bigrams")
hashingTF_bigram = HashingTF(numFeatures=n_features, inputCol="bigrams", outputCol="rawFeatures")
idf_bigram = IDF(minDocFreq=0, inputCol="rawFeatures", outputCol="features")

In [24]:
%%time
nb = NaiveBayes(smoothing = 1.0, modelType = "multinomial")
pipeline=Pipeline(stages=[tokenizer,remover,bigram,hashingTF_bigram,idf_bigram, nb])
nb_model_bigram=pipeline.fit(train_set)
nb_prediction_bigram = nb_model_bigram.transform(test_set)

#print evaluation metrics
evaluate_metric(nb_prediction_bigram)

In [None]:
# Inspect the first few bigram Naive Bayes predictions.
nb_prediction_bigram.show(5)

### Model 4: Trigram Naive Bayes

#### Trigram tfidf

In [None]:
#tokenizer = Tokenizer().setInputCol("text").setOutputCol("words")
#remover= StopWordsRemover().setInputCol("words").setOutputCol("filtered").setCaseSensitive(False)
# Trigram feature stages: filtered words -> trigrams -> hashed term frequencies -> TF-IDF.
trigram = NGram().setN(3).setInputCol("filtered").setOutputCol("trigrams")
hashingTF_trigram = HashingTF(numFeatures=n_features, inputCol="trigrams", outputCol="rawFeatures")
idf_trigram = IDF(minDocFreq=0, inputCol="rawFeatures", outputCol="features")

In [None]:
%%time
nb = NaiveBayes(smoothing = 1.0, modelType = "multinomial")
pipeline=Pipeline(stages=[tokenizer,remover,trigram,hashingTF_trigram,idf_trigram, nb])
nb_model_trigram=pipeline.fit(train_set)
nb_prediction_trigram = nb_model_trigram.transform(test_set)
#print evaluation metrics
evaluate_metric(nb_prediction_bigram)

In [None]:
# Inspect the first few trigram Naive Bayes predictions.
nb_prediction_trigram.show(5)

### Model 5: Random Forest

In [None]:
from pyspark.ml.classification import RandomForestClassifier
%%time
rf = RandomForestClassifier(maxDepth=20)

pipeline=Pipeline(stages=[tokenizer,remover,hashingTF,idf, nb])
rf_model = pipeline.fit(train_set)
rf_prediction = rf_model.transform(test_set)
#print evaluation metrics
evaluate_metric(rf_prediction)


### Model 6: Multilayer perceptron classifier 

In [28]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [None]:
# Specify the layer sizes for the neural network:
# an input layer of size n_features, one hidden layer of size 5,
# and an output layer of size 2.
# NOTE(review): the input size presumably has to match the length of the
# "features" vector produced by hashingTF/idf — confirm if n_features changes.

#%%time

layers = [n_features, 5 , 2] 
# Create the trainer and set its parameters (fixed seed).
trainer = MultilayerPerceptronClassifier(maxIter=10, layers=layers, blockSize=128, seed=1234)
pipeline=Pipeline(stages=[tokenizer,remover,hashingTF,idf, trainer])
# Fit the full pipeline on the training split.
nn_model = pipeline.fit(train_set)

# Predict on the held-out test split.
nn_prediction = nn_model.transform(test_set)

In [34]:
# Inspect the first few multilayer-perceptron predictions.
nn_prediction.show(5)

 +--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|                text|label|               words|            filtered|         rawFeatures|            features|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|A Brazilian culin...|  1.0|[a, brazilian, cu...|[brazilian, culin...|(20,[2,5,6,7,8,9,...|(20,[2,5,6,7,8,9,...|       1.0|
|A C wasn t workin...|  1.0|[a, c, wasn, t, w...|[c, wasn, working...|(20,[0,2,3,6,7,8,...|(20,[0,2,3,6,7,8,...|       1.0|
|A Cut above the r...|  1.0|[a, cut, above, t...|[cut, rest, impre...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|       1.0|
|A First class and...|  1.0|[a, first, class,...|[first, class, hi...|(20,[0,1,3,4,5,7,...|(20,[0,1,3,4,5,7,...|       1.0|
|A Foodie Delight ...|  1.0|[a, foodie, delig...|[foodie, delight,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|       1.0|
+------

In [43]:
# print nn_prediction.groupBy(nn_prediction["label"], nn_prediction["prediction"]).count().show()

In [41]:
   # evaluator = BinaryClassificationEvaluator().setMetricName("areaUnderROC")
   # print "Area under ROC curve:",evaluator.evaluate(nn_prediction)


In [37]:
    # F1 score of the multilayer-perceptron predictions.
    evaluator = MulticlassClassificationEvaluator(
        metricName="f1",
        labelCol="label",
        predictionCol="prediction",
    )
    f1 = evaluator.evaluate(nn_prediction)
    print("F1_score = %0.4f" % f1)

 F1_score = 0.6418


In [39]:
    # Accuracy of the multilayer-perceptron predictions.
    evaluator = MulticlassClassificationEvaluator(
        metricName="accuracy",
        labelCol="label",
        predictionCol="prediction",
    )
    accuracy = evaluator.evaluate(nn_prediction)
    print("Accuracy = %0.4f" % accuracy)
    

Accuracy = 0.7492


In [44]:
#print evaluation metrics
#evaluate_metric(nn_prediction)