In [1]:
# init SparkSession
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) the session through the builder so that this cell can be
# re-run in a live kernel: constructing SparkContext(...) directly raises
# "Cannot run multiple SparkContexts at once" on the second execution,
# whereas getOrCreate() returns the already-running session.
spark = (
    SparkSession.builder
    .master('local[4]')      # same master as before: local mode, 4 worker threads
    .appName('Pipeline')
    .getOrCreate()
)

# Keep `sc` available for any code that expects the raw SparkContext.
sc = spark.sparkContext

In [None]:
# Peek at the beginning of the raw reviews file (shell command run through
# IPython's `!` escape) to see the record layout before loading it with Spark.
!head ../data/reviews_sample.json

In [2]:
# Load the reviews from the JSON file, keeping only the two columns used
# below: the rating ('overall') and the review body ('reviewText').
data_frame = (
    spark.read
    .json('../data/reviews_sample.json')
    .select('overall', 'reviewText')
)

# Sanity check: report the row count and preview the first few rows.
print('Number of reviews: %d' % data_frame.count())
data_frame.show(n=5, truncate=True)

Number of reviews: 5000
+-------+--------------------+
|overall|          reviewText|
+-------+--------------------+
|    4.0|It fits and repla...|
|    5.0|I own several Ank...|
|    5.0|Love these kind o...|
|    5.0|This is the best ...|
|    5.0|Great collection ...|
+-------+--------------------+
only showing top 5 rows



In [3]:
# TODO: convert lines to lower case using data_frame.rdd.map transformation
from pyspark.sql import Row


def _to_lower_row(row):
    """Return a new Row with `overall` reduced by 1 and `reviewText` lower-cased."""
    return Row(overall=row.overall - 1,
               reviewText=row.reviewText.lower())


# Apply the per-row conversion on the underlying RDD, then turn the result
# back into a DataFrame.
lower_rdd = data_frame.rdd.map(_to_lower_row)
data_frame_lower = spark.createDataFrame(lower_rdd)

data_frame_lower.show(5)

+-------+--------------------+
|overall|          reviewText|
+-------+--------------------+
|    3.0|it fits and repla...|
|    4.0|i own several ank...|
|    4.0|love these kind o...|
|    4.0|this is the best ...|
|    4.0|great collection ...|
+-------+--------------------+
only showing top 5 rows



In [13]:
from pyspark.sql.functions import count

# Class balance: how many reviews fall under each value of 'overall'.
rating_counts = (
    data_frame_lower
    .groupBy('overall')
    .agg(count('overall'))
)
rating_counts.show(10)

+-------+--------------+
|overall|count(overall)|
+-------+--------------+
|    0.0|           356|
|    1.0|           292|
|    4.0|          2815|
|    3.0|          1003|
|    2.0|           534|
+-------+--------------+



In [14]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import NGram
from pyspark.ml.feature import HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Stage 1: tokenize the 'reviewText' column into 'tokenized'.
tokenizer = Tokenizer(outputCol='tokenized',
                      inputCol='reviewText')

# Stop-word list: one entry per line of the file, surrounding whitespace removed.
with open('../data/stopwords.txt') as stopwords_file:
    stopwords_list = [line.strip() for line in stopwords_file]

# Stage 2: drop the stop words from the token column.
stopwords = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol='stopwords',
    stopWords=stopwords_list)

# Stage 3: build 2-grams from the remaining tokens.
ngram = NGram(inputCol=stopwords.getOutputCol(),
              outputCol='ngram',
              n=2)

# Stage 4: hash the 2-grams into a 1024-dimensional binary feature vector.
hashing = HashingTF(inputCol=ngram.getOutputCol(),
                    outputCol='hashing',
                    numFeatures=1024,
                    binary=True)

# Stage 5: multinomial logistic regression on the hashed features,
# with 'overall' as the label.
logreg = LogisticRegression(labelCol='overall',
                            featuresCol=hashing.getOutputCol(),
                            regParam=1e-3,
                            family='multinomial')

# Chain the stages in processing order and fit them on the lower-cased reviews.
pipeline_stages = [tokenizer, stopwords, ngram, hashing, logreg]
pipeline = Pipeline(stages=pipeline_stages)

model = pipeline.fit(data_frame_lower)

In [15]:
# TODO: get pipeline prediction using transform()
# Run the fitted pipeline over the reviews to obtain predictions.
# NOTE(review): this transforms `data_frame_lower`, the same DataFrame the
# pipeline was fitted on, so `pred_df` holds predictions for training data.
pred_df = model.transform(data_frame_lower)

In [16]:
# Eyeball a random ~25% sample of (label, prediction, text) rows.
# The seed is fixed so that re-running the notebook on the same data shows
# the same rows; without it every execution displays a different sample.
(
    pred_df
    .select(['overall', 'prediction', 'reviewText'])
    .sample(withReplacement=False, fraction=.25, seed=42)
    .show(20, truncate=True)
)

+-------+----------+--------------------+
|overall|prediction|          reviewText|
+-------+----------+--------------------+
|    3.0|       4.0|it fits and repla...|
|    4.0|       2.0|love these kind o...|
|    3.0|       3.0|i recently receiv...|
|    4.0|       4.0|fits just as well...|
|    4.0|       4.0|this little guy i...|
|    4.0|       4.0|slim, light, grea...|
|    4.0|       4.0|this replaced my ...|
|    0.0|       4.0|i wouldn't recomm...|
|    3.0|       4.0|the three items h...|
|    3.0|       4.0|you can always us...|
|    4.0|       4.0|i read other revi...|
|    2.0|       2.0|i purchased the h...|
|    3.0|       3.0|as others have sa...|
|    4.0|       4.0|this case is beau...|
|    4.0|       4.0|this product hold...|
|    2.0|       2.0|but i passed this...|
|    1.0|       1.0|overall seems a g...|
|    4.0|       4.0|these work great ...|
|    3.0|       3.0|i received my blu...|
|    2.0|       4.0|since you have th...|
+-------+----------+--------------------+

In [17]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# TODO: estimate quality

# NOTE(review): `pred_df` was produced by transforming the same DataFrame the
# pipeline was fitted on, so the numbers below are training-set metrics.

# Collect label and prediction together in ONE Spark job (two separate
# collect() calls would run the whole pipeline transform twice), then split
# them into flat 1-D arrays, the shape scikit-learn metrics expect.
# reshape(-1, 2) keeps the column indexing valid even for an empty result.
label_and_prediction = np.array(
    pred_df.select('overall', 'prediction').collect(),
    dtype=float,
).reshape(-1, 2)

target = label_and_prediction[:, 0]
predicted = label_and_prediction[:, 1]

acc = accuracy_score(target, predicted)
report = classification_report(target, predicted)

# Report the overall accuracy alongside the per-class breakdown.
print('Accuracy: %.4f' % acc)
print(report)

             precision    recall  f1-score   support

        0.0       0.90      0.71      0.79       356
        1.0       0.93      0.80      0.86       292
        2.0       0.74      0.49      0.59       534
        3.0       0.69      0.47      0.56      1003
        4.0       0.74      0.91      0.82      2815

avg / total       0.76      0.75      0.74      5000



In [20]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# TODO: build param grid
# TODO: find best param match using accuracy as a target metric

# Grid over the hashed feature-space size and the regularisation strength:
# 3 x 2 = 6 parameter combinations in total.
params = (
    ParamGridBuilder()
    .addGrid(hashing.numFeatures, [128, 512, 1024])
    .addGrid(logreg.regParam, [1e-1, 1e-3])
    .build()
)

# Score every combination by multiclass accuracy against the 'overall' label.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='overall')

# numFolds is spelled out (3 is the library default) and the seed is fixed so
# that the fold assignment, and therefore the reported metrics, are
# reproducible across notebook runs.
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=3,
                    seed=42)

crossval_model = cv.fit(data_frame_lower)

In [22]:
# output average metric for each param set
# avgMetrics is paired with the estimator param maps by position.
# The loop variables are deliberately NOT called `params`: reusing that name
# would overwrite the global parameter grid built in the cross-validation
# cell and leave stale state behind for any later cell that reads it.
for avg_accuracy, param_map in zip(crossval_model.avgMetrics,
                                   cv.getEstimatorParamMaps()):
    named_params = [(param.name, value) for param, value in param_map.items()]
    print(named_params, avg_accuracy)

[('numFeatures', 128), ('regParam', 0.1)] 0.5572597844490846
[('numFeatures', 128), ('regParam', 0.001)] 0.5409111962743384
[('numFeatures', 512), ('regParam', 0.1)] 0.5355030436548964
[('numFeatures', 512), ('regParam', 0.001)] 0.4266091768797028
[('numFeatures', 1024), ('regParam', 0.1)] 0.5186645530270984
[('numFeatures', 1024), ('regParam', 0.001)] 0.36458632906686267
