In [62]:
from threading import Thread

class StreamingThread(Thread):
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    def run(self):
        ssc.start()
        ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [63]:
sc

# 1. Data loading and exploration

In [64]:
# start with easy implemetation: only consider the content of the 2 fields review_title and review_text
# concantenate them in one new field "review_concat"from pyspark.sql import SQLContext
from pyspark.sql import functions as fn
from pyspark.sql.types import IntegerType
import pandas as pd

filepath = 'data_processed/ExctractedData.json'
# load JSON file
s_df = spark.read.json(filepath)
s_df.count()
s_df = s_df.drop_duplicates(subset=['review_id'])
pd_df = s_df.groupBy('review_id').count().toPandas().set_index("count").sort_index(ascending=False)

In [65]:
# control no duplicate
pd_df.head()

Unnamed: 0_level_0,review_id
count,Unnamed: 1_level_1
1,R15DG6BI3K1I78
1,R1UU50BM0S4LPY
1,R27KEMBTEQ4MHI
1,R1HMP34XP1V9BE
1,R22I2JYOOXA3PP


In [66]:
# concatenate review text and title in one field
s_df = s_df.withColumn('review_concat',fn.concat(fn.col('review_title'),fn.lit(' '), fn.col('review_text')))
# review_score is of type String ==> cast it from String to Integer
s_df = s_df.withColumn("review_score", s_df["review_score"].cast(IntegerType()))
#s_df = s_df.withColumn("book_id", s_df["book_id"].cast(IntegerType()))
s_df.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_score: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- review_concat: string (nullable = true)



In [67]:
print('Total # of rows: ' + str(s_df.count()))
print('# of rows per class:')
s_df.groupBy("review_user") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()

Total # of rows: 11573
# of rows per class:
+--------------------+-----+
|         review_user|count|
+--------------------+-----+
|        D.P. McHenry|  505|
|     Amazon Customer|  410|
|The Guide To Roma...|  251|
|Beckyrae99 (Becky...|  189|
|              Liz R.|  169|
|         Keith Hauge|  156|
|     Kindle Customer|  150|
|              Sheela|  117|
|            MaureenB|  115|
|         JB VonShirl|  108|
|            Daniotra|  106|
|           Mary Lins|  102|
|         Nolia Nessa|  101|
|     Paul A. Johnson|   97|
|           R. Zocher|   95|
|Jessica Sotelo (A...|   89|
|         KindleReads|   82|
|     Keith A. Comess|   75|
|             Eric M.|   73|
|        Lisa Kersten|   72|
+--------------------+-----+
only showing top 20 rows



In [68]:
# check how many review per user
s_df.groupBy("review_user") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show(20, False)

+-----------------------------------------------+-----+
|review_user                                    |count|
+-----------------------------------------------+-----+
|D.P. McHenry                                   |505  |
|Amazon Customer                                |410  |
|The Guide To Romance Novels                    |251  |
|Beckyrae99 (Becky Wise)                        |189  |
|Liz R.                                         |169  |
|Keith Hauge                                    |156  |
|Kindle Customer                                |150  |
|Sheela                                         |117  |
|MaureenB                                       |115  |
|JB VonShirl                                    |108  |
|Daniotra                                       |106  |
|Mary Lins                                      |102  |
|Nolia Nessa                                    |101  |
|Paul A. Johnson                                |97   |
|R. Zocher                                      

In [9]:
# check min / max / avg rating for top reviewer ==> we see they all give 5 star reviews
print('Min / max / avg review score for top reviewer (100 or more reviews)')
s_df.select("review_user", "review_score") \
    .groupBy("review_user").agg( s_df.review_user, fn.min("review_score"), fn.max("review_score"), fn.avg("review_score") ) \
    .where( (fn.col('review_user') == 'D.P. McHenry') | (fn.col('review_user') == 'Amazon Customer') \
           | (fn.col('review_user') == 'The Guide To Romance Novels') | (fn.col('review_user') == 'Beckyrae99 (Becky Wise)')  \
           | (fn.col('review_user') == 'Liz R.') | (fn.col('review_user') == 'Keith Hauge')  \
           | (fn.col('review_user') == 'Kindle Customer') | (fn.col('review_user') == 'Sheela')  \
           | (fn.col('review_user') == 'MaureenB') | (fn.col('review_user') == 'JB VonShirl')  \
           | (fn.col('review_user') == 'Daniotra') | (fn.col('review_user') == 'Mary Lins') | (fn.col('review_user') == 'Nolia Nessa'))  \
    .show()

Min / max / avg review score for top reviewer (100 or more reviews)
+--------------------+--------------------+-----------------+-----------------+------------------+
|         review_user|         review_user|min(review_score)|max(review_score)| avg(review_score)|
+--------------------+--------------------+-----------------+-----------------+------------------+
|         Nolia Nessa|         Nolia Nessa|                5|                5|               5.0|
|Beckyrae99 (Becky...|Beckyrae99 (Becky...|                4|                4|               4.0|
|           Mary Lins|           Mary Lins|                5|                5|               5.0|
|              Liz R.|              Liz R.|                5|                5|               5.0|
|         Keith Hauge|         Keith Hauge|                5|                5|               5.0|
|            MaureenB|            MaureenB|                5|                5|               5.0|
|              Sheela|              Sheel

In [10]:
# check how many review per book
s_df.groupBy("book_id") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show(25, False)

+----------+-----+
|book_id   |count|
+----------+-----+
|0143110438|2129 |
|0062678426|1071 |
|198211598X|579  |
|62678426  |506  |
|1984898329|251  |
|0800736524|242  |
|0525538194|242  |
|0525572643|231  |
|1542046513|189  |
|62319795  |176  |
|0553448234|169  |
|1400209609|158  |
|0525536582|156  |
|1400208017|134  |
|162860378X|117  |
|1644450003|117  |
|0310353629|115  |
|194883605X|108  |
|0393239861|106  |
|0525436146|102  |
|0316414212|101  |
|1982111003|97   |
|1607749580|97   |
|0316316121|95   |
|B071SBMK94|89   |
+----------+-----+
only showing top 25 rows



In [11]:
# check min rating for top reviewer ==> we see they all give 5 star reviews
#    .groupBy("book_id").agg( s_df.book_id, f.min("review_score"), f.max("review_score"), f.avg("review_score") ) \
print('Min / max / avg review score for top books')
s_df.select("book_id", "review_score") \
    .groupBy("book_id").agg( s_df.book_id, fn.min("review_score"), fn.max("review_score"), fn.avg("review_score") ) \
    .where( (fn.col('book_id') == '0143110438') | (fn.col('book_id') == '0062678426') \
           | (fn.col('book_id') == '198211598X') | (fn.col('book_id') == '62678426')  \
           | (fn.col('book_id') == '1984898329') | (fn.col('book_id') == '0525538194')  \
           | (fn.col('book_id') == '0800736524') | (fn.col('book_id') == '0525572643')  \
           | (fn.col('book_id') == '1542046513') | (fn.col('book_id') == '62319795')  \
           | (fn.col('book_id') == '0553448234') | (fn.col('book_id') == '1400209609') | (fn.col('book_id') == '0525536582'))  \
    .show()

Min / max / avg review score for top books
+----------+----------+-----------------+-----------------+------------------+
|   book_id|   book_id|min(review_score)|max(review_score)| avg(review_score)|
+----------+----------+-----------------+-----------------+------------------+
|0553448234|0553448234|                5|                5|               5.0|
|  62678426|  62678426|                1|                5|4.2272727272727275|
|0525538194|0525538194|                1|                5| 4.743801652892562|
|198211598X|198211598X|                1|                5|4.6217616580310885|
|0525536582|0525536582|                5|                5|               5.0|
|1400209609|1400209609|                1|                5| 4.234177215189874|
|0143110438|0143110438|                1|                5| 4.789572569281352|
|1542046513|1542046513|                4|                4|               4.0|
|  62319795|  62319795|                1|                5| 4.482954545454546|
|00626784

In [12]:
# look at first 5 star review
s_df.where(fn.col('review_score') == 5).first()

Row(book_id='0062678426', book_title='The Woman in the Window: A Novel', review_id='R15DG6BI3K1I78', review_score=5, review_text="Extraordinary on any & every level. Astonishing that it' s a debut novel. Transfixing.", review_title='Although reviews are universally stellar, highly recommend one avoids reading them & any synopsis preplunging in.', review_user='Perel Soreh', timestamp=1556661613, review_concat="Although reviews are universally stellar, highly recommend one avoids reading them & any synopsis preplunging in. Extraordinary on any & every level. Astonishing that it' s a debut novel. Transfixing.")

In [13]:
# look at 1 very bad review
s_df.where(fn.col('review_score') == 1).first()

Row(book_id='0062824619', book_title='Cemetery Road: A Novel', review_id='R1T4O9RXIKX7D9', review_score=1, review_text='I am a huge fan of Greg Isles, but Cemetery Road was a outline of the garbage that the publishers must insist on before they will publish your book.  Mr. Isles, you are better than this, and you disappointed us with Cemetery Road.  I am going back to your older books, which are far superior to your latest endeavor.  In closing, there are no grey areas like you are suggesting in your book.  It is either moral or immoral.  There is no in between.', review_title='Disappointed', review_user='Jeanette Grayeb-Mihal', timestamp=1554878526, review_concat='Disappointed I am a huge fan of Greg Isles, but Cemetery Road was a outline of the garbage that the publishers must insist on before they will publish your book.  Mr. Isles, you are better than this, and you disappointed us with Cemetery Road.  I am going back to your older books, which are far superior to your latest endeav

In [14]:
# Show ony review_concat field
s_df.select('review_concat').where(fn.col('review_score') == 1).first()

Row(review_concat='Disappointed I am a huge fan of Greg Isles, but Cemetery Road was a outline of the garbage that the publishers must insist on before they will publish your book.  Mr. Isles, you are better than this, and you disappointed us with Cemetery Road.  I am going back to your older books, which are far superior to your latest endeavor.  In closing, there are no grey areas like you are suggesting in your book.  It is either moral or immoral.  There is no in between.')

# 2. Define pre-processing pipeline

In [8]:
# import stop words to filter them out from the reviews
import requests
stop_words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()
stop_words[0:10]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost']

In [8]:
# define processing 4 steps and execute them with a trsnformation pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from nltk.stem.snowball import SnowballStemmer
from pyspark.ml import Pipeline

# 1. Tokenizer, .setPattern("\\p{L}+") means that it remove accent from words (check it has no impact on the smileys !!!)
tokenizer = RegexTokenizer().setGaps(False)\
  .setPattern("\\p{L}+")\
  .setInputCol("review_concat")\
  .setOutputCol("words")

# 2. filter out stop words
sw_filter = StopWordsRemover()\
  .setStopWords(stop_words)\
  .setCaseSensitive(False)\
  .setInputCol("words")\
  .setOutputCol("filtered")

# 3. TF: TF vectorization + remove words that appear in 5 docs or less
#  converts text documents to vectors of term counts
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17)\
  .setInputCol("filtered")\
  .setOutputCol("tf")

# 4. TF-IDF transform
# The IDFModel takes feature vectors (generally created from HashingTF or CountVectorizer) and scales each column. 
# Intuitively, it down-weights columns which appear frequently in a corpus.
idf = IDF().\
    setInputCol('tf').\
    setOutputCol('tfidf')

# Create a pipelined transformer and fit it with full data set
tfidf_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv, idf]).fit(s_df)

# Control execution of preprocessing pipeline by pre-processing the data
s_df_transform = tfidf_pipeline.transform(s_df)
s_df_transform.select('tfidf').where(fn.col('review_score') == 1).first()


NameError: name 'stop_words' is not defined

In [17]:
# check schema of output of preprocessing pipeline 
s_df_transform.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_score: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- review_concat: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)
 |-- tfidf: vector (nullable = true)



# 3. Multiclass Models
## 3.1. Simple logistic regression

In [18]:
# random split in train and test set with 80-20% proportions
training_df, testing_df = s_df.randomSplit([0.8, 0.2], seed=42)
[training_df.count(), testing_df.count()]

[9191, 2382]

In [19]:
# Add logistic regression to the previously defined pipeline
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression().\
    setLabelCol('review_score').\
    setFeaturesCol('tfidf').\
    setRegParam(0.0).\
    setMaxIter(100).\
    setElasticNetParam(0.)

In [96]:
# new pipeline to chain idf_pipeline with logistic regression
# fit training set on pipeline
lr_pipeline = Pipeline(stages=[tfidf_pipeline, lr]).fit(training_df)

# precict on test and calculate accuracy
lr_predictions = lr_pipeline.transform(testing_df)

In [6]:
# score above seems OK but now let's check the accuracy per class. we see it is not good for all but 5
def printClassPredictions(predictions):
    predictions.select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('Score = 1')
    predictions.filter(predictions['review_score'] == 1).\
        select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('Score = 2')
    predictions.filter(predictions['review_score'] == 2).\
        select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('Score = 3')
    predictions.filter(predictions['review_score'] == 3).\
        select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('Score = 4')
    predictions.filter(predictions['review_score'] == 4).\
        select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('Score = 5')
    predictions.filter(predictions['review_score'] == 5).\
        select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()

In [98]:
printClassPredictions(lr_predictions)

+------------------+
|      avg(correct)|
+------------------+
|0.8303946263643996|
+------------------+

Score = 1
+------------------+
|      avg(correct)|
+------------------+
|0.3448275862068966|
+------------------+

Score = 2
+-------------------+
|       avg(correct)|
+-------------------+
|0.20512820512820512|
+-------------------+

Score = 3
+-------------------+
|       avg(correct)|
+-------------------+
|0.25675675675675674|
+-------------------+

Score = 4
+-----------------+
|     avg(correct)|
+-----------------+
|0.582089552238806|
+-----------------+

Score = 5
+------------------+
|      avg(correct)|
+------------------+
|0.9165354330708662|
+------------------+



## 3.3. Logistic regression with stratified split

In [32]:
# Now make a new stratified split to make sure we have enough representative examples in the train set
training_strat_df = s_df.sampleBy("review_score", fractions={1: 0.8, 2: 0.8, 3: 0.8, 4: 0.8, 5: 0.8}, seed=42)
test_strat_df = s_df.subtract(training_strat_df)
# training set
intersect = training_strat_df.select('review_id').intersect(test_strat_df.select('review_id'))
print('size intersect: ' + str(intersect.count()))

print('# rows training set: ' + str(training_strat_df.count()))
print('# rows per class')
training_strat_df.groupBy("review_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()
# test set
print('# rows test set: ' + str(test_strat_df.count()))
print('# rows per class')
test_strat_df.groupBy("review_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()

size intersect: 0
# rows training set: 9191
# rows per class
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 7454|
|           4| 1216|
|           3|  270|
|           2|  137|
|           1|  114|
+------------+-----+

# rows test set: 2382
# rows per class
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 1929|
|           4|  313|
|           3|   76|
|           2|   33|
|           1|   31|
+------------+-----+



In [102]:
# new prevision with previously defined en_lr (elestic net logistic regression)
en_lr_strat_pipeline = Pipeline(stages=[tfidf_pipeline, en_lr]).fit(training_strat_df)
# fitting + accuracy estimation
predictions_en_lr_strat = en_lr_strat_pipeline.transform(test_strat_df)

printClassPredictions(predictions_en_lr_strat)

+------------------+
|      avg(correct)|
+------------------+
|0.8119227539882452|
+------------------+

Score = 1
+------------+
|avg(correct)|
+------------+
|         0.0|
+------------+

Score = 2
+------------+
|avg(correct)|
+------------+
|         0.0|
+------------+

Score = 3
+------------+
|avg(correct)|
+------------+
|         0.0|
+------------+

Score = 4
+-------------------+
|       avg(correct)|
+-------------------+
|0.01597444089456869|
+-------------------+

Score = 5
+------------+
|avg(correct)|
+------------+
|         1.0|
+------------+



## 3.4. Logistic regression with down sampling
Sampling down all class to the smallest 1 (= 1 star)

In [103]:
# recap data set size and distribution
print('# rows per class')
s_df.groupBy("review_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()
print('Total # rows in data set: '+ str(s_df.count()))

# rows per class
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 9383|
|           4| 1529|
|           3|  346|
|           2|  170|
|           1|  145|
+------------+-----+

Total # rows in data set: 11573


In [104]:
# downsampling to 145 rows per class (# rows for class 1)
downsampled_data = s_df.sampleBy('review_score',
    fractions={1: 1, 2: 145./170, 3: 145./346, 4: 145./1529, 5: 145./9383}) \
    .cache()

downsampled_data.groupBy("review_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()

+------------+-----+
|review_score|count|
+------------+-----+
|           3|  162|
|           2|  158|
|           4|  154|
|           5|  147|
|           1|  145|
+------------+-----+



In [105]:
# random split in train and test set with 80-20% proportions
training_down_df, testing_down_df = downsampled_data.randomSplit([0.8, 0.2], seed=42)
[training_down_df.count(), testing_down_df.count()]

[605, 161]

In [106]:
# new prevision with previously defined en_lr (elestic net logistic regression)
en_lr_down_pipeline = Pipeline(stages=[tfidf_pipeline, en_lr]).fit(training_down_df)
# fitting + accuracy estimation
predictions_en_lr_down = en_lr_down_pipeline.transform(testing_down_df)

printClassPredictions(predictions_en_lr_down)

+-------------------+
|       avg(correct)|
+-------------------+
|0.40372670807453415|
+-------------------+

Score = 1
+------------------+
|      avg(correct)|
+------------------+
|0.4857142857142857|
+------------------+

Score = 2
+------------+
|avg(correct)|
+------------+
|       0.375|
+------------+

Score = 3
+------------------+
|      avg(correct)|
+------------------+
|0.3870967741935484|
+------------------+

Score = 4
+------------------+
|      avg(correct)|
+------------------+
|0.5172413793103449|
+------------------+

Score = 5
+------------------+
|      avg(correct)|
+------------------+
|0.2647058823529412|
+------------------+



## 3.5 Logistic regression with up and down sampling

In [20]:
# Using down - up sampling to build a train set with 2000 example per class
# Test set = 20% of train set = 10000 * 0.2 = 1500 rows
# Attention: test set cannot contains exmaple fro test set and must be built before up-down sampling
#
# Get test set of 2000 rows with a factor 'prop'
prop = (11500.-2000.)/11500
training_updown_df_pre = s_df.sampleBy("review_score", fractions={1: prop, 2: prop, 3: prop, 4: prop, 5: prop}, seed=42)
test_updown_df = s_df.subtract(training_updown_df_pre)
# training set before up - down sampling
print('training set before up - down sampling: ' + str(training_updown_df_pre.count()))
training_updown_df_pre.groupBy("review_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()
# test set
print('test set: ' + str(test_updown_df.count()))
test_updown_df.groupBy("review_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()

training set before up - down sampling: 9509
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 7724|
|           4| 1254|
|           3|  274|
|           2|  138|
|           1|  119|
+------------+-----+

test set: 2064
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 1659|
|           4|  275|
|           3|   72|
|           2|   32|
|           1|   26|
+------------+-----+



In [21]:
# perform up and down sampling on the trainig set so that each class contains +- 1500 rows
df_class_1 = training_updown_df_pre[training_updown_df_pre['review_score'] == 1]
df_class_2 = training_updown_df_pre[training_updown_df_pre['review_score'] == 2]
df_class_3 = training_updown_df_pre[training_updown_df_pre['review_score'] == 3]
df_class_4 = training_updown_df_pre[training_updown_df_pre['review_score'] == 4]
df_class_5 = training_updown_df_pre[training_updown_df_pre['review_score'] == 5]

df_class_1_over = df_class_1.sample(withReplacement=True, fraction=2000./117, seed = 42)
df_class_2_over = df_class_2.sample(withReplacement=True, fraction=2000./139, seed = 42)
df_class_3_over = df_class_3.sample(withReplacement=True, fraction=2000./290, seed = 42)
df_class_4_over = df_class_4.sample(withReplacement=True, fraction=2000./1242, seed = 42)
df_class_5_under = df_class_5.sample(withReplacement=True, fraction=2000./7721, seed = 42)

import functools 

def unionAll(dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs) 

training_updown_df = unionAll([df_class_1_over, df_class_2_over, df_class_3_over, df_class_4_over, df_class_5_under])

print('# of rows in the train set')
print('Total: ' + str(training_updown_df.count()))
print('Per class:')
training_updown_df.groupBy("review_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()

# of rows in the train set
Total: 9693
Per class:
+------------+-----+
|review_score|count|
+------------+-----+
|           4| 1997|
|           1| 1993|
|           2| 1936|
|           5| 1918|
|           3| 1849|
+------------+-----+



In [26]:
# new prevision with previously defined en_lr (elestic net logistic regression)
lr_updown_pipeline = Pipeline(stages=[tfidf_pipeline, lr]).fit(training_updown_df)
# fitting + accuracy estimation
predictions_lr_updown = lr_updown_pipeline.transform(test_updown_df)

printClassPredictions(predictions_lr_updown)

+------------------+
|      avg(correct)|
+------------------+
|0.7257751937984496|
+------------------+

Score = 1
+-------------------+
|       avg(correct)|
+-------------------+
|0.11538461538461539|
+-------------------+

Score = 2
+------------+
|avg(correct)|
+------------+
|     0.34375|
+------------+

Score = 3
+------------------+
|      avg(correct)|
+------------------+
|0.2638888888888889|
+------------------+

Score = 4
+------------------+
|      avg(correct)|
+------------------+
|0.6727272727272727|
+------------------+

Score = 5
+------------------+
|      avg(correct)|
+------------------+
|0.7715491259795058|
+------------------+



## 3.6 Naive Bayes

In [61]:
from pyspark.ml.feature import StringIndexer, Tokenizer, CountVectorizer, IDF, IndexToString, HashingTF, StopWordsRemover
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline

import nltk
from nltk.corpus import stopwords 

# import nltk stop words
nltk.download('stopwords')
stop_words = list(set(stopwords.words('english')))

# 3. Filter out stop words
sw_filter = StopWordsRemover()\
    .setStopWords(stop_words)\
    .setCaseSensitive(False)\
    .setInputCol("words")\
    .setOutputCol("filtered")

s_df = s_df.withColumn('label',fn.col('review_score'))

train_nb, test_nb = s_df.randomSplit([0.8, 0.2], seed=42)

#train_nb = s_df.sampleBy("review_score", fractions={1: 0.8, 2: 0.8, 3: 0.8, 4: 0.8, 5: 0.8}, seed=42)
#test_nb = s_df.subtract(train_nb)

#categoryIndexer_nb = StringIndexer(inputCol="review_score", outputCol="label", handleInvalid = "keep")
tokenizer_nb = Tokenizer(inputCol="review_concat", outputCol="words")
hashingTF_nb = HashingTF(inputCol="words", outputCol="features", numFeatures=10000)
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
#categoryConverter_nb = IndexToString(inputCol="prediction", outputCol="predCategory")

#nb_pipeline = Pipeline(stages=[categoryIndexer_nb, tokenizer_nb, sw_filter, hashingTF_nb, nb, categoryConverter_nb])
nb_pipeline = Pipeline(stages=[tokenizer_nb, sw_filter, hashingTF_nb, nb])

nb_model = nb_pipeline.fit(train_nb)
nb_predictions = nb_model.transform(train_nb)
printClassPredictions(nb_predictions)

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
+-------------------+
|       avg(correct)|
+-------------------+
|0.04624088782504624|
+-------------------+

Score = 1
+------------+
|avg(correct)|
+------------+
|         0.0|
+------------+

Score = 2
+-------------------+
|       avg(correct)|
+-------------------+
|0.07633587786259542|
+-------------------+

Score = 3
+-------------------+
|       avg(correct)|
+-------------------+
|0.10294117647058823|
+-------------------+

Score = 4
+------------------+
|      avg(correct)|
+------------------+
|0.3241206030150754|
+------------------+

Score = 5
+------------+
|avg(correct)|
+------------+
|         0.0|
+------------+



In [59]:
nb_predictions.select('review_score', 'label', 'prediction').show()

+------------+-----+----------+
|review_score|label|prediction|
+------------+-----+----------+
|           3|    3|       4.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           5|    5|       3.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           4|    4|       3.0|
|           5|    5|       3.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           5|    5|       4.0|
|           4|    4|       3.0|
+------------+-----+----------+
only showing top 20 rows



In [20]:
nb_predictions.filter(nb_predictions['review_score'] == 5).\
        select('prediction', 'label', 'review_score', 'review_concat').show()

+----------+-----+------------+--------------------+
|prediction|label|review_score|       review_concat|
+----------+-----+------------+--------------------+
|       0.0|  0.0|           5|Go Be Kind This b...|
|       0.0|  0.0|           5|amazing book grea...|
|       0.0|  0.0|           5|Such a blessing t...|
|       0.0|  0.0|           5|Page-turner. Terr...|
|       0.0|  0.0|           5|Elegant with just...|
|       0.0|  0.0|           5|My heart is still...|
|       0.0|  0.0|           5|Hard going I gave...|
|       0.0|  0.0|           5|Hilarious and ins...|
|       0.0|  0.0|           5|757 pages of jour...|
|       0.0|  0.0|           5|Totally engrossin...|
|       0.0|  0.0|           5|The COUNT One of ...|
|       0.0|  0.0|           5|Difficult to cont...|
|       0.0|  0.0|           5|Funny, serious, i...|
|       0.0|  0.0|           5|This book has cha...|
|       0.0|  0.0|           5|Best book I've ev...|
|       0.0|  0.0|           5|Great Read Well

## 3.7 Random Forest 

In [None]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Descript","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="review_score", outputCol="indexedLabel").fit(s_df)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = s_df.randomSplit([0.8, 0.2])

# Automatically identify categorical features, and index them.
assembler = VectorAssembler(inputCols=['tfidf','sentimentIndex'],outputCol="indexedFeatures")

# Train a GBT model.
rf = RandomForestClassifier(labelCol="review_score", featuresCol="indexedFeatures", numTrees=1)

# Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages=[tfidf_pipeline, assembler, rf])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "review_score", "indexedFeatures").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="review_score", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

gbtModel = model.stages[2]
print(gbtModel)  # summary only

# 4. Binary Models
1 to 3 stars = 0 and 4 / 5 stars = 1
## 4.1 Logistic regression
with hyper parameter tuning on dedicated validation set

In [37]:
# start with easy implemetation: only consider the content of the 2 fields review_title and review_text
# concantenate them in one new field "review_concat"from pyspark.sql import SQLContext
from pyspark.sql import functions as fn
from pyspark.sql.types import IntegerType
import pandas as pd

filepath = 'data_processed/ExctractedData.json'
# load JSON file
s_df = spark.read.json(filepath)
s_df.count()
s_df = s_df.drop_duplicates(subset=['review_id'])

In [38]:
# concatenate review text and title in one field
s_df = s_df.withColumn('review_concat',fn.concat(fn.col('review_title'),fn.lit(' '), fn.col('review_text')))
# review_score is of type String ==> cast it from String to Integer
s_df = s_df.withColumn("review_score", s_df["review_score"].cast(IntegerType()))

# add new binary score (0 or 1), 
# 1 to 3 stars = 0 and 4 to 5 stars = 1
from pyspark.sql.functions import udf
def scoreToBin(value):
   if   value < 4: return 0
   else : return 1
udfScoreToBin = udf(scoreToBin, IntegerType())
s_df = s_df.withColumn("bin_score", udfScoreToBin("review_score"))
s_df.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_score: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- review_concat: string (nullable = true)
 |-- bin_score: integer (nullable = true)



In [47]:
print('Total # of rows: ' + str(s_df.count()))
print('# of rows per review score:')
s_df.groupBy("review_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()
print('# of rows per BINARY review score:')
s_df.groupBy("bin_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()

Total # of rows: 11573
# of rows per review score:
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 9383|
|           4| 1529|
|           3|  346|
|           2|  170|
|           1|  145|
+------------+-----+

# of rows per BINARY review score:
+---------+-----+
|bin_score|count|
+---------+-----+
|        1|10912|
|        0|  661|
+---------+-----+



In [48]:
# Now make a new stratified split 70-10-20% with same proportion of bin_score 0 and 1
training_strat_df = s_df.sampleBy("bin_score", fractions={0: 0.7, 1: 0.7}, seed=42)
test_valid_strat_df = s_df.subtract(training_strat_df)

valid_strat_df = test_valid_strat_df.sampleBy("bin_score", fractions={0: 0.33, 1: 0.33}, seed=42)
test_strat_df = test_valid_strat_df.subtract(valid_strat_df)

# show some stats
# training sets
print('# rows training set: ' + str(training_strat_df.count()))
print('# rows per class')
training_strat_df.groupBy("bin_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()
# validation set
print('# rows validation set: ' + str(valid_strat_df.count()))
print('# rows per class')
valid_strat_df.groupBy("bin_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()
# test set
print('# rows test set: ' + str(test_strat_df.count()))
print('# rows per class')
test_strat_df.groupBy("bin_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()

# rows training set: 8027
# rows per class
+---------+-----+
|bin_score|count|
+---------+-----+
|        1| 7579|
|        0|  448|
+---------+-----+

# rows validation set: 1105
# rows per class
+---------+-----+
|bin_score|count|
+---------+-----+
|        1| 1031|
|        0|   74|
+---------+-----+

# rows test set: 2441
# rows per class
+---------+-----+
|bin_score|count|
+---------+-----+
|        1| 2302|
|        0|  139|
+---------+-----+



In [49]:
# perform up sampling on the trainig to increase the number of reviews with bin_score = 0
# increase with a factor 5 to get above 2000 reviews with bin_score = 0
df_class_0 = training_strat_df[training_strat_df['bin_score'] == 0]
df_class_0_over = df_class_0.sample(withReplacement=True, fraction=5., seed = 42)

df_class_1 = training_strat_df[training_strat_df['bin_score'] == 1]

import functools 
def unionAll(dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs) 

training_up_df = unionAll([df_class_0_over, df_class_1])

print('# of rows in the train set')
print('Total: ' + str(training_up_df.count()))
print('Per class:')
training_up_df.groupBy("bin_score") \
    .count() \
    .orderBy(fn.col("count").desc()) \
    .show()

# of rows in the train set
Total: 9747
Per class:
+---------+-----+
|bin_score|count|
+---------+-----+
|        1| 7579|
|        0| 2168|
+---------+-----+



In [50]:
# define pre-processing and classification pipeline

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import IDF, RegexTokenizer, StringIndexer, StopWordsRemover, CountVectorizer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
import nltk
from nltk.corpus import stopwords 

# import nltk stop words
nltk.download('stopwords')
stop_words = list(set(stopwords.words('english')))

# 1. String indexer: convert book_id (string) to unique numeric undex
book_stringIdx = StringIndexer() \
    .setHandleInvalid("keep")\
    .setInputCol("book_id")\
    .setOutputCol("book_label")

# 2. Tokenizer, .setPattern("\\p{L}+") means that it remove accent from words
regex_tokenizer = RegexTokenizer()\
    .setGaps(False)\
    .setPattern("\\p{L}+")\
    .setInputCol("review_concat")\
    .setOutputCol("words")

# 3. Filter out stop words
stopword_remover = StopWordsRemover()\
    .setStopWords(stop_words)\
    .setCaseSensitive(False)\
    .setInputCol("words")\
    .setOutputCol("filtered")

# 4. TF: TF vectorization + remove words that appear in 5 docs or less
# converts text documents to vectors of term counts
count_vectorizer = CountVectorizer(minDF=5)\
    .setInputCol("filtered")\
    .setOutputCol("tf")

# 5. TF-IDF transform
# The IDFModel takes feature vectors (generally created from HashingTF or CountVectorizer) and scales each column. 
# Intuitively, it down-weights columns which appear frequently in a corpus.
idf = IDF()\
    .setInputCol("tf")\
    .setOutputCol("tfidf")

# 6. Feature assembler
# assemble tfidf tectual features with book_label
assembler = VectorAssembler(inputCols=['tfidf','book_label'],outputCol="tfidf_book")

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
# utility function to calculate and print prediction results
def printClassPredictions_bin(predictions):
    predictions.select(fn.expr('float(prediction = bin_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('bin_score = 0')
    predictions.filter(predictions['bin_score'] == 0).\
        select(fn.expr('float(prediction = bin_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('bin_score = 1')
    predictions.filter(predictions['bin_score'] == 1).\
        select(fn.expr('float(prediction = bin_score)').alias('correct')).\
        select(fn.avg('correct')).show()

### Parameter tuning on the validation set

In [52]:
lr = LogisticRegression(featuresCol=assembler.getOutputCol(), labelCol="bin_score")

pipeline = Pipeline(stages=[book_stringIdx, regex_tokenizer, stopword_remover,
    count_vectorizer, idf, assembler, lr])

model = pipeline.fit(training_up_df)
predictionsBin = model.transform(valid_strat_df)
printClassPredictions_bin(predictionsBin)

+------------------+
|      avg(correct)|
+------------------+
|0.9457013574660633|
+------------------+

bin_score = 0
+-------------------+
|       avg(correct)|
+-------------------+
|0.43243243243243246|
+-------------------+

bin_score = 1
+------------------+
|      avg(correct)|
+------------------+
|0.9825412221144519|
+------------------+



In [53]:
lr_en1 = LogisticRegression(featuresCol=assembler.getOutputCol(), labelCol="bin_score",
        regParam = 0.1, elasticNetParam = 0.8)

pipeline_en1 = Pipeline(stages=[book_stringIdx, regex_tokenizer, stopword_remover,
    count_vectorizer, idf, assembler, lr_en1])

model_en1 = pipeline_en1.fit(training_up_df)
predictionsBin_en1 = model_en1.transform(valid_strat_df)
printClassPredictions_bin(predictionsBin_en1)

+------------------+
|      avg(correct)|
+------------------+
|0.9330316742081448|
+------------------+

bin_score = 0
+------------+
|avg(correct)|
+------------+
|         0.0|
+------------+

bin_score = 1
+------------+
|avg(correct)|
+------------+
|         1.0|
+------------+



In [54]:
lr_en2 = LogisticRegression(featuresCol=assembler.getOutputCol(), labelCol="bin_score",
        regParam = 0.1, elasticNetParam = 0.5)

pipeline_en2 = Pipeline(stages=[book_stringIdx, regex_tokenizer, stopword_remover,
    count_vectorizer, idf, assembler, lr_en2])

model_en2 = pipeline_en2.fit(training_up_df)
predictionsBin_en2 = model_en2.transform(valid_strat_df)
printClassPredictions_bin(predictionsBin_en2)

+------------------+
|      avg(correct)|
+------------------+
|0.9312217194570136|
+------------------+

bin_score = 0
+------------+
|avg(correct)|
+------------+
|         0.0|
+------------+

bin_score = 1
+------------------+
|      avg(correct)|
+------------------+
|0.9980601357904947|
+------------------+



In [55]:
lr_en3 = LogisticRegression(featuresCol=assembler.getOutputCol(), labelCol="bin_score",
        regParam = 0.3, elasticNetParam = 0.8)

pipeline_en3 = Pipeline(stages=[book_stringIdx, regex_tokenizer, stopword_remover,
    count_vectorizer, idf, assembler, lr_en3])

model_en3 = pipeline_en3.fit(training_up_df)
predictionsBin_en3 = model_en3.transform(valid_strat_df)
printClassPredictions_bin(predictionsBin_en3)

+------------------+
|      avg(correct)|
+------------------+
|0.9330316742081448|
+------------------+

bin_score = 0
+------------+
|avg(correct)|
+------------+
|         0.0|
+------------+

bin_score = 1
+------------+
|avg(correct)|
+------------+
|         1.0|
+------------+



In [56]:
lr_en4 = LogisticRegression(featuresCol=assembler.getOutputCol(), labelCol="bin_score",
        regParam = 0.3, elasticNetParam = 0.5)

pipeline_en4 = Pipeline(stages=[book_stringIdx, regex_tokenizer, stopword_remover,
    count_vectorizer, idf, assembler, lr_en4])

model_en4 = pipeline_en4.fit(training_up_df)
predictionsBin_en4 = model_en4.transform(valid_strat_df)
printClassPredictions_bin(predictionsBin_en4)

+------------------+
|      avg(correct)|
+------------------+
|0.9330316742081448|
+------------------+

bin_score = 0
+------------+
|avg(correct)|
+------------+
|         0.0|
+------------+

bin_score = 1
+------------+
|avg(correct)|
+------------+
|         1.0|
+------------+



In [27]:
### Evaluating best model on the test set

In [None]:
predictionsBin_test = model.transform(test_strat_df)
printClassPredictions_binprintClassPredictions(predictionsBin_test)