In [8]:
import os
import pyspark as ps
import numpy as np
from pyspark.sql.functions import count, rand

In [9]:
sc = ps.SparkContext('local[4]')
sqlContext = ps.SQLContext(sc)

In [10]:
ACCESS_KEY = os.environ['AWS_ACCESS_KEY']
SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
DSI_CSTON_BUCKET = "galvanize.dsi.capstone.alex"
MOUNT_PROJECT = "dsi_capstone_s3"

In [11]:
link = 's3a://{}:{}@{}/dsi_data/reviews_Musical_Instruments_5.json.gz'.format(ACCESS_KEY, SECRET_KEY, DSI_CSTON_BUCKET)
amazon_df = sqlContext.read.json(link)

In [12]:
amazon_df.printSchema()
print amazon_df.count()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)

10261


In [13]:
df = amazon_df.select('reviewText', 'overall')
df.printSchema()

root
 |-- reviewText: string (nullable = true)
 |-- overall: double (nullable = true)



In [14]:
rest_count = df.groupby("overall").agg(count("overall"))

In [15]:
classes_count = dict(rest_count.collect())

In [16]:
rating_1 = df.filter(df["overall"] <= 1.0).orderBy(rand()).limit(classes_count[1.0])
rating_5 = df.filter(df["overall"] >= 5.0).orderBy(rand()).limit(classes_count[1.0])

In [17]:
df_total = rating_1.union(rating_5)

In [21]:
df_total = df_total.withColumn("label", (df_total['overall']-1.0)/4.0)
df_total.printSchema()

root
 |-- reviewText: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- label: double (nullable = true)



In [19]:
df_raw = df_total

In [22]:
df_raw['overall']-1.0

Column<(overall - 1.0)>

In [13]:
import string
import nltk
nltk.data.path.append("/mnt1/nltk_data/")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def preprocess_raw_text(text):
    stopwords_ = set(stopwords.words('english'))
    stemmer_ = PorterStemmer()
    
    if (text == None):
        return []
    
    if (len(text) < 1):
        return []
    
    if (type(text) == unicode):
        text = text.encode('utf-8')
        
    unpunctuated_text = text.translate(None, string.punctuation)

    tokens = word_tokenize(unpunctuated_text)

    lowercased_tokens = [t.lower() for t in tokens]
    
    filtered_tokens = [w for w in lowercased_tokens if not w in stopwords_]
    
    stemmed = [stemmer_.stem(w) for w in filtered_tokens]
    stemmed = [stemmer_.stem(w) for w in lowercased_tokens]

    
    return(stemmed)

In [14]:
t_paragraph = "7\. Implement `NaiveBayes` specifying the columns for features (`featuresCol`), labels (`labelCol`) and prediction (`predictionCol`). Then `.fit()` to obtain a model, and apply this model on the testing test."
#preprocess_raw_text(t_paragraph)

In [15]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

In [18]:
tokenizer_udf = udf(lambda x: preprocess_raw_text(x), ArrayType(StringType()))
df_tokens = df_raw.withColumn("tokens", tokenizer_udf(df_raw.reviewText))

In [19]:
df_tokens.printSchema()

root
 |-- reviewText: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- label: double (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [20]:
df_tokens.count()

434

In [17]:
nltk.data.path.append("/mnt1/nltk_data/")

In [21]:
df_tokens.first()

Row(reviewText=u'A great little package for the price, but unfortunately, the zipper failed the first time I tried zipping it up. I have a standard size acoustic guitar, but be aware, the fit was snug, but not tight.', overall=1.0, label=0.0, tokens=[u'a', u'great', u'littl', u'packag', u'for', u'the', u'price', u'but', u'unfortun', u'the', u'zipper', u'fail', u'the', u'first', u'time', u'i', u'tri', u'zip', u'it', u'up', u'i', u'have', u'a', u'standard', u'size', u'acoust', u'guitar', u'but', u'be', u'awar', u'the', u'fit', u'wa', u'snug', u'but', u'not', u'tight'])

In [22]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/hadoop/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
from pyspark.ml.feature import CountVectorizer

In [24]:
cv = CountVectorizer(inputCol="tokens", outputCol="features_tf", vocabSize=5000, minDF=10.0)
cv_model = cv.fit(df_tokens)

In [25]:
df_features_tf = cv_model.transform(df_tokens)
df_features_tf.printSchema()

root
 |-- reviewText: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- label: double (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features_tf: vector (nullable = true)



In [26]:
df_features_tf.first()

Row(reviewText=u'A great little package for the price, but unfortunately, the zipper failed the first time I tried zipping it up. I have a standard size acoustic guitar, but be aware, the fit was snug, but not tight.', overall=1.0, label=0.0, tokens=[u'a', u'great', u'littl', u'packag', u'for', u'the', u'price', u'but', u'unfortun', u'the', u'zipper', u'fail', u'the', u'first', u'time', u'i', u'tri', u'zip', u'it', u'up', u'i', u'have', u'a', u'standard', u'size', u'acoust', u'guitar', u'but', u'be', u'awar', u'the', u'fit', u'wa', u'snug', u'but', u'not', u'tight'], features_tf=SparseVector(478, {0: 4.0, 1: 2.0, 3: 2.0, 4: 1.0, 9: 1.0, 15: 3.0, 16: 1.0, 18: 1.0, 20: 1.0, 21: 1.0, 31: 1.0, 50: 1.0, 53: 1.0, 56: 1.0, 70: 1.0, 95: 1.0, 135: 1.0, 136: 1.0, 144: 1.0, 158: 1.0, 238: 1.0, 288: 1.0, 300: 1.0, 408: 1.0}))

In [27]:
from pyspark.ml.feature import IDF

In [28]:
idf = IDF(inputCol="features_tf", outputCol="features")
idfModel = idf.fit(df_features_tf)

In [29]:
df_features = idfModel.transform(df_features_tf)

In [30]:
df_features.show(5)

+--------------------+-------+-----+--------------------+--------------------+--------------------+
|          reviewText|overall|label|              tokens|         features_tf|            features|
+--------------------+-------+-----+--------------------+--------------------+--------------------+
|A great little pa...|    1.0|  0.0|[a, great, littl,...|(478,[0,1,3,4,9,1...|(478,[0,1,3,4,9,1...|
|I thought somethi...|    1.0|  0.0|[i, thought, some...|(478,[1,2,4,11,12...|(478,[1,2,4,11,12...|
|kripes man..the o...|    1.0|  0.0|[kripe, manth, on...|(478,[0,1,2,3,4,5...|(478,[0,1,2,3,4,5...|
|At the time I bou...|    1.0|  0.0|[at, the, time, i...|(478,[0,1,2,3,4,5...|(478,[0,1,2,3,4,5...|
|One of the ball b...|    1.0|  0.0|[one, of, the, ba...|(478,[0,2,5,8,12,...|(478,[0,2,5,8,12,...|
+--------------------+-------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [31]:
splits = df_features.randomSplit([0.7, 0.3])

In [33]:
df_train = splits[0]
df_test = splits[1]
df_train.persist()

DataFrame[reviewText: string, overall: double, label: double, tokens: array<string>, features_tf: vector, features: vector]

In [34]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [35]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(df_train)

In [36]:
result = model.transform(df_test)

In [37]:
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [38]:
print "Accuracy: {}".format(str(evaluator.evaluate(predictionAndLabels)))

Accuracy: 0.822222222222


In [39]:
thetaarray = model.theta.toArray().T

In [41]:
#thetaarray

In [40]:
vocab = cv_model.vocabulary

In [45]:
vocab_size = len(cv_model.vocabulary)

In [47]:
dtype = [('label', 'S10'), ('neg', float), ('pos', float)]
prob_values = [(vocab[i], np.exp(thetaarray[i,0]), np.exp(thetaarray[i,1])) for i in range(vocab_size)]

In [49]:
a = np.array(prob_values, dtype = dtype)

In [55]:
np.sort(a, order='pos')[::-1][:50]
np.sort(a, order='neg')[::-1][:50]

array([('the', 0.007975022396569017, 0.006937359971266315),
       ('it', 0.007756336958215204, 0.005942090073255785),
       ('you', 0.007297737665878048, 0.007290713975877631),
       ('of', 0.0072614551712374365, 0.006674905604580032),
       ('to', 0.006806027764126175, 0.006558189769655631),
       ('on', 0.006661994749412663, 0.004973544624367517),
       ('wa', 0.00651907637427299, 0.004438657075080826),
       ('not', 0.006494504104117895, 0.0042864391631455435),
       ('that', 0.006481331889211326, 0.006969833094877404),
       ('as', 0.006459947723715366, 0.00565548839521435),
       ('have', 0.006407384585883609, 0.005363177847669031),
       ('i', 0.00631703200479609, 0.006171931374015065),
       ('is', 0.006225190328295895, 0.007790816834200771),
       ('one', 0.005903452498732205, 0.005037272747794601),
       ('in', 0.005748869482649859, 0.006315929800839576),
       ('thi', 0.005653630837498155, 0.005977627142031766),
       ('a', 0.005542095248807324, 0.006753880074