In [1]:
from threading import Thread

class StreamingThread(Thread):
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    def run(self):
        ssc.start()
        ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [2]:
sc

In [3]:
# Remarks / TO DO
# 1. unbalance class distribution ==> try up / down-sampling (but I heard from another student it did not improve)
# try SMOTE
# 2. add to pre-processing stemming
# 3. also try Naive Bayes and SVM (on top of log regr). See https://towardsdatascience.com/multi-class-text-classification-with-pyspark-7d78d022ed35
# see also: http://classes.ischool.syr.edu/ist718/content/unit09/lab-sentiment_analysis/
# 4. Try ensemlble of different methods
# 5. Try external library for sentiment analysis (sentimnetvader / but I heard from another student it did not help)
# 6. VERY IMPORTANT: I think we should rather reduce the number of categories from 5 to let's say 3
# the 3 categories would be bad (0 and 1 star), middle (3 star), good (4 and 5 stars)
# this would allow to have more training instance per categories and anyway how can even a human differentiate a 1 from a 2 stars or a 4 from a 5 stars

In [4]:
# start with easy implemetation: only consider the content of the 2 fields review_title and review_text
# concantenate them in one new field "review_concat"from pyspark.sql import SQLContext
from pyspark.sql import functions as fn
filepath = '../data_processed/ExctractedData.json'
# load JSON file
s_df = spark.read.json(filepath)
# concatenate review text and title in one field
s_df = s_df.withColumn('review_concat',fn.concat(fn.col('review_title'),fn.lit(' '), fn.col('review_text')))
s_df.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_score: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- review_concat: string (nullable = true)



In [5]:
# review_score is of type String ==> cast it from String to Integer
from pyspark.sql.types import IntegerType
s_df = s_df.withColumn("review_score", s_df["review_score"].cast(IntegerType()))
s_df.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_score: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- review_concat: string (nullable = true)



In [6]:
# show score distribution, we see classes are highly unbalanced
from pyspark.sql.functions import col
s_df.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------------+-----+
|review_score|count|
+------------+-----+
|           5|10093|
|           4| 1281|
|           3|  395|
|           2|  187|
|           1|  145|
+------------+-----+



In [7]:
# check if duplicate review (normally not the case as the python script that filters the JSON took care of that)
s_df.dropDuplicates(['review_id'])
s_df.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------------+-----+
|review_score|count|
+------------+-----+
|           5|10093|
|           4| 1281|
|           3|  395|
|           2|  187|
|           1|  145|
+------------+-----+



In [52]:
# show first few reviews
s_df.head(2)

[Row(book_id='62678426', book_title='The Woman in the Window: A Novel', review_id='RZFSIAUSHZH43', review_score='5', review_text="Someone recommended this book but wouldn't say much about it. Lots of twists and turns, trying to figure out what was reality and what wasn't.Don't start this if you have to be somewhere else. It's hard to put down until you know how it ends.", review_title='Twists and turns', review_user='LindaG', timestamp=1556699124, review_concat="Twists and turns Someone recommended this book but wouldn't say much about it. Lots of twists and turns, trying to figure out what was reality and what wasn't.Don't start this if you have to be somewhere else. It's hard to put down until you know how it ends."),
 Row(book_id='1455536156', book_title='Scraps, Wilt & Weeds: Turning Wasted Food into Plenty', review_id='RSSSXHBF4BJJY', review_score='4', review_text='A well written and inspiring cook book! As a chef my self i was impressed with some of the applications for food wast

In [10]:
# look at first 5 star review
s_df.where(fn.col('review_score') == 5).first()

Row(book_id='62678426', book_title='The Woman in the Window: A Novel', review_id='RZFSIAUSHZH43', review_score=5, review_text="Someone recommended this book but wouldn't say much about it. Lots of twists and turns, trying to figure out what was reality and what wasn't.Don't start this if you have to be somewhere else. It's hard to put down until you know how it ends.", review_title='Twists and turns', review_user='LindaG', timestamp=1556699124, review_concat="Twists and turns Someone recommended this book but wouldn't say much about it. Lots of twists and turns, trying to figure out what was reality and what wasn't.Don't start this if you have to be somewhere else. It's hard to put down until you know how it ends.")

In [11]:
# look at 1 very bad review
s_df.where(fn.col('review_score') == 1).first()

Row(book_id='0143110438', book_title='A Gentleman in Moscow: A Novel', review_id='R3QEDU4XU80W5U', review_score=1, review_text="I followed the high Amazon reviews for this book and cannot believe how this book is popular.  Yes, the author is talented and his writing is very sophisticated.....but this book has NO plot, the characters are boring and the story could be told in about 10 pages.  Literally, nothing interesting happens in this book....instead, it's 460 pages of non-nonsensical rambling.", review_title='BORING RAMBLING', review_user='Amazon Customer', timestamp=1554764357, review_concat="BORING RAMBLING I followed the high Amazon reviews for this book and cannot believe how this book is popular.  Yes, the author is talented and his writing is very sophisticated.....but this book has NO plot, the characters are boring and the story could be told in about 10 pages.  Literally, nothing interesting happens in this book....instead, it's 460 pages of non-nonsensical rambling.")

In [12]:
# Show ony review_concat field
s_df.select('review_concat').where(fn.col('review_score') == 1).first()

Row(review_concat="BORING RAMBLING I followed the high Amazon reviews for this book and cannot believe how this book is popular.  Yes, the author is talented and his writing is very sophisticated.....but this book has NO plot, the characters are boring and the story could be told in about 10 pages.  Literally, nothing interesting happens in this book....instead, it's 460 pages of non-nonsensical rambling.")

In [8]:
# import stop words to filter them out from the reviews
import requests
stop_words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()
stop_words[0:10]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost']

In [9]:
# define processing 4 steps and execute them with a trsnformation pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml import Pipeline
# 1. Tokenizer, .setPattern("\\p{L}+") means that it remove accent from words (check it has no impact on the smileys !!!)
tokenizer = RegexTokenizer().setGaps(False)\
  .setPattern("\\p{L}+")\
  .setInputCol("review_concat")\
  .setOutputCol("words")
# 2. filter out stop words
sw_filter = StopWordsRemover()\
  .setStopWords(stop_words)\
  .setCaseSensitive(False)\
  .setInputCol("words")\
  .setOutputCol("filtered")
# 3. TF: TF vectorization + remove words that appear in 20 docs or less
cv = CountVectorizer(minTF=1., minDF=20., vocabSize=2**17)\
  .setInputCol("filtered")\
  .setOutputCol("tf")
# 4. TF-IDF transform
idf = IDF().\
    setInputCol('tf').\
    setOutputCol('tfidf')
# Create a pipelined transformer
tfidf_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv, idf]).fit(s_df)
# Execute transform
tfidf_pipeline.transform(s_df)
#s_df.select('tf').where(fn.col('review_score') == 1).first()
s_df.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_score: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- review_concat: string (nullable = true)



In [10]:
# split in train, valid and test sets
#training_df, validation_df, testing_df = s_df.randomSplit([0.7, 0.2, 0.1], seed=42)
#[training_df.count(), validation_df.count(), testing_df.count()]

# start w/o validation set
training_df, testing_df = s_df.randomSplit([0.8, 0.2], seed=42)
[training_df.count(), testing_df.count()]

[9718, 2383]

In [12]:
# perform simple logistic regression 
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression().\
    setLabelCol('review_score').\
    setFeaturesCol('tfidf').\
    setRegParam(0.0).\
    setMaxIter(100).\
    setElasticNetParam(0.)
# new pipeline to chain idf_pipeline with logistic regression
lr_pipeline = Pipeline(stages=[tfidf_pipeline, lr]).fit(training_df)
# fitting + accuracy estimation
predictions = lr_pipeline.transform(testing_df)
predictions.select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()

+-----------------+
|     avg(correct)|
+-----------------+
|0.892992026856903|
+-----------------+



In [13]:
# score above seems OK but now let's check the accuracy per class. we see it is not good for all but 5
print('Score = 1')
predictions.filter(predictions['review_score'] == 1).\
    select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()
print('Score = 2')
predictions.filter(predictions['review_score'] == 2).\
    select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()
print('Score = 3')
predictions.filter(predictions['review_score'] == 3).\
    select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()
print('Score = 4')
predictions.filter(predictions['review_score'] == 4).\
    select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()
print('Score = 5')
predictions.filter(predictions['review_score'] == 5).\
    select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()

Score = 1
+------------------+
|      avg(correct)|
+------------------+
|0.4827586206896552|
+------------------+

Score = 2
+-------------------+
|       avg(correct)|
+-------------------+
|0.47368421052631576|
+-------------------+

Score = 3
+------------------+
|      avg(correct)|
+------------------+
|0.5952380952380952|
+------------------+

Score = 4
+------------------+
|      avg(correct)|
+------------------+
|0.6856060606060606|
+------------------+

Score = 5
+------------------+
|      avg(correct)|
+------------------+
|0.9476626016260162|
+------------------+



In [14]:
# show some predictions
predictions.select("review_id","review_concat","review_score","prediction") \
    .show(n = 100, truncate = 70)

+--------------+----------------------------------------------------------------------+------------+----------+
|     review_id|                                                         review_concat|review_score|prediction|
+--------------+----------------------------------------------------------------------+------------+----------+
|R1CEBJPZE4H5YE|The Next Worst Thing to Being There When you move through this book...|           5|       5.0|
|R1CNZRHE674DEW|The Next Worst Thing to Being There When you move through this book...|           5|       5.0|
|R1CNZRHE674DEW|The Next Worst Thing to Being There When you move through this book...|           5|       5.0|
|R300GA3M5Q1T11|                                   Great true crime novel. Great book.|           5|       5.0|
|R3PB2ZW7LRKR3L|                                   Great true crime novel. Great book.|           5|       5.0|
|R2FYM3CL5CIQAC|One of Jacqueline Winspear's best tales I've read every Maisie Dobb...|           5|    

In [15]:
# show some predictions for which the ground truth was score = 1
predictions.filter(predictions['review_score'] == 1).\
    select("review_id","review_concat","review_score","prediction"). \
    show(n = 100, truncate = 70)

+--------------+----------------------------------------------------------------------+------------+----------+
|     review_id|                                                         review_concat|review_score|prediction|
+--------------+----------------------------------------------------------------------+------------+----------+
|R18E85EKCWU53F|A complete waste of my time Soooo tedious. The drinking, the pills,...|           1|       1.0|
|R1AN8ETB4I7P5N|Couldn't finish I think this is truly a horrible book.  I hated it....|           1|       5.0|
|R1D3A61OTEN6QX|Horrible I never write reviews but this book is so bad I have to sa...|           1|       3.0|
|R24G49195RDQSV|Waste of time Extremely disappointed in this book.  Do not understa...|           1|       1.0|
|R2FG2SAHEV8AD9|Be careful when you order! What I received was in German. Figuring ...|           1|       1.0|
|R2FNSA44HYXZTB|Deadly dull I bought this book because Amy Adams will star in the f...|           1|    

In [16]:
training_df.head(5)



In [17]:
s_df.head(5)

[Row(book_id='62678426', book_title='The Woman in the Window: A Novel', review_id='RZFSIAUSHZH43', review_score=5, review_text="Someone recommended this book but wouldn't say much about it. Lots of twists and turns, trying to figure out what was reality and what wasn't.Don't start this if you have to be somewhere else. It's hard to put down until you know how it ends.", review_title='Twists and turns', review_user='LindaG', timestamp=1556699124, review_concat="Twists and turns Someone recommended this book but wouldn't say much about it. Lots of twists and turns, trying to figure out what was reality and what wasn't.Don't start this if you have to be somewhere else. It's hard to put down until you know how it ends."),
 Row(book_id='1455536156', book_title='Scraps, Wilt & Weeds: Turning Wasted Food into Plenty', review_id='RSSSXHBF4BJJY', review_score=4, review_text='A well written and inspiring cook book! As a chef my self i was impressed with some of the applications for food waste!',

In [18]:
# Now make a new stratified split to make sure we have enough representative examples in the train set
training2_df = s_df.sampleBy("review_score", fractions={1: 0.8, 2: 0.8, 3: 0.8, 4: 0.8, 5: 0.8}, seed=42)
test2_df = s_df.subtract(training2_df)
print('data set split')
[training2_df.count(), test2_df.count()]
# training set
print('training set')
training2_df.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()
# test set
print('test set')
test2_df.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

data set split
training set
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 8141|
|           4| 1015|
|           3|  305|
|           2|  146|
|           1|  111|
+------------+-----+

test set
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 1031|
|           4|  138|
|           3|   48|
|           2|   24|
|           1|   23|
+------------+-----+



In [19]:
# new prevision
lr = LogisticRegression().\
    setLabelCol('review_score').\
    setFeaturesCol('tfidf').\
    setRegParam(0.0).\
    setMaxIter(100).\
    setElasticNetParam(0.)
# new pipeline to chain idf_pipeline with logistic regression
lr_pipeline = Pipeline(stages=[tfidf_pipeline, lr]).fit(training2_df)
# fitting + accuracy estimation
predictions = lr_pipeline.transform(test2_df)
predictions.select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()

+------------------+
|      avg(correct)|
+------------------+
|0.8212025316455697|
+------------------+



In [20]:
# score above seems OK but now let's check the accuracy per class. we see it is not good for all but 5
print('Score = 1')
predictions.filter(predictions['review_score'] == 1).\
    select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()
print('Score = 2')
predictions.filter(predictions['review_score'] == 2).\
    select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()
print('Score = 3')
predictions.filter(predictions['review_score'] == 3).\
    select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()
print('Score = 4')
predictions.filter(predictions['review_score'] == 4).\
    select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()
print('Score = 5')
predictions.filter(predictions['review_score'] == 5).\
    select(fn.expr('float(prediction = review_score)').alias('correct')).\
    select(fn.avg('correct')).show()

Score = 1
+-------------------+
|       avg(correct)|
+-------------------+
|0.21739130434782608|
+-------------------+

Score = 2
+-------------------+
|       avg(correct)|
+-------------------+
|0.16666666666666666|
+-------------------+

Score = 3
+------------+
|avg(correct)|
+------------+
|       0.375|
+------------+

Score = 4
+-----------------+
|     avg(correct)|
+-----------------+
|0.463768115942029|
+-----------------+

Score = 5
+-----------------+
|     avg(correct)|
+-----------------+
|0.918525703200776|
+-----------------+

