In [129]:
from threading import Thread

class StreamingThread(Thread):
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    def run(self):
        ssc.start()
        ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [130]:
sc

In [8]:
# Remarks / TO DO
# 2. add to pre-processing stemming
# 3. also try Naive Bayes and SVM (on top of log regr). See https://towardsdatascience.com/multi-class-text-classification-with-pyspark-7d78d022ed35
# see also: http://classes.ischool.syr.edu/ist718/content/unit09/lab-sentiment_analysis/
# 6. VERY IMPORTANT: I think we should rather reduce the number of categories from 5 to let's say 3
# the 3 categories would be bad (0 and 1 star), middle (3 star), good (4 and 5 stars)
# this would allow to have more training instance per categories and anyway how can even a human differentiate a 1 from a 2 stars or a 4 from a 5 stars

# 1. Data loading and exploration

In [185]:
# start with easy implemetation: only consider the content of the 2 fields review_title and review_text
# concantenate them in one new field "review_concat"from pyspark.sql import SQLContext
from pyspark.sql import functions as fn
from pyspark.sql.types import IntegerType

filepath = 'data_processed/ExctractedData.json'
# load JSON file
s_df = spark.read.json(filepath)
# concatenate review text and title in one field
s_df = s_df.withColumn('review_concat',fn.concat(fn.col('review_title'),fn.lit(' '), fn.col('review_text')))
# review_score is of type String ==> cast it from String to Integer
s_df = s_df.withColumn("review_score", s_df["review_score"].cast(IntegerType()))
s_df = s_df.withColumn("book_id", s_df["book_id"].cast(IntegerType()))
s_df.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_score: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- review_concat: string (nullable = true)



In [198]:
# check if duplicate review (normally not the case as the python script that filters the JSON took care of that)
s_df = s_df.dropDuplicates(['review_id'])
print('Total # of rows: ' + str(s_df.count()))
print('# of rows per class:')
s_df.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

Total # of rows: 11573
# of rows per class:
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 9383|
|           4| 1529|
|           3|  346|
|           2|  170|
|           1|  145|
+------------+-----+



In [187]:
# look at first 5 star review
s_df.where(fn.col('review_score') == 5).first()

Row(book_id=62678426, book_title='The Woman in the Window: A Novel', review_id='R15DG6BI3K1I78', review_score=5, review_text="Extraordinary on any & every level. Astonishing that it' s a debut novel. Transfixing.", review_title='Although reviews are universally stellar, highly recommend one avoids reading them & any synopsis preplunging in.', review_user='Perel Soreh', timestamp=1556661613, review_concat="Although reviews are universally stellar, highly recommend one avoids reading them & any synopsis preplunging in. Extraordinary on any & every level. Astonishing that it' s a debut novel. Transfixing.")

In [188]:
# look at 1 very bad review
s_df.where(fn.col('review_score') == 1).first()

Row(book_id=62824619, book_title='Cemetery Road: A Novel', review_id='R1T4O9RXIKX7D9', review_score=1, review_text='I am a huge fan of Greg Isles, but Cemetery Road was a outline of the garbage that the publishers must insist on before they will publish your book.  Mr. Isles, you are better than this, and you disappointed us with Cemetery Road.  I am going back to your older books, which are far superior to your latest endeavor.  In closing, there are no grey areas like you are suggesting in your book.  It is either moral or immoral.  There is no in between.', review_title='Disappointed', review_user='Jeanette Grayeb-Mihal', timestamp=1554878526, review_concat='Disappointed I am a huge fan of Greg Isles, but Cemetery Road was a outline of the garbage that the publishers must insist on before they will publish your book.  Mr. Isles, you are better than this, and you disappointed us with Cemetery Road.  I am going back to your older books, which are far superior to your latest endeavor. 

In [189]:
# Show ony review_concat field
s_df.select('review_concat').where(fn.col('review_score') == 1).first()

Row(review_concat='Disappointed I am a huge fan of Greg Isles, but Cemetery Road was a outline of the garbage that the publishers must insist on before they will publish your book.  Mr. Isles, you are better than this, and you disappointed us with Cemetery Road.  I am going back to your older books, which are far superior to your latest endeavor.  In closing, there are no grey areas like you are suggesting in your book.  It is either moral or immoral.  There is no in between.')

# 2. Define pre-processing pipeline

In [199]:
# import stop words to filter them out from the reviews
import requests
stop_words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()
stop_words[0:10]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost']

In [200]:
# define processing 4 steps and execute them with a trsnformation pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from nltk.stem.snowball import SnowballStemmer
from pyspark.ml import Pipeline

# 1. Tokenizer, .setPattern("\\p{L}+") means that it remove accent from words (check it has no impact on the smileys !!!)
tokenizer = RegexTokenizer().setGaps(False)\
  .setPattern("\\p{L}+")\
  .setInputCol("review_concat")\
  .setOutputCol("words")

# 2. filter out stop words
sw_filter = StopWordsRemover()\
  .setStopWords(stop_words)\
  .setCaseSensitive(False)\
  .setInputCol("words")\
  .setOutputCol("filtered")

# 3. TF: TF vectorization + remove words that appear in 5 docs or less
#  converts text documents to vectors of term counts
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=10000)\
  .setInputCol("filtered")\
  .setOutputCol("tf")

# 4. TF-IDF transform
# The IDFModel takes feature vectors (generally created from HashingTF or CountVectorizer) and scales each column. 
# Intuitively, it down-weights columns which appear frequently in a corpus.
idf = IDF().\
    setInputCol('tf').\
    setOutputCol('tfidf')

# Create a pipelined transformer and fit it with full data set
tfidf_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv, idf]).fit(s_df)

# Control execution of preprocessing pipeline by pre-processing the data
s_df_transform = tfidf_pipeline.transform(s_df)
s_df_transform.select('tfidf').where(fn.col('review_score') == 1).first()


Row(tfidf=SparseVector(7909, {0: 0.9577, 7: 1.7476, 14: 2.1023, 42: 2.6845, 55: 2.8888, 138: 3.4816, 165: 3.6103, 201: 7.4795, 302: 4.3067, 462: 4.3459, 602: 4.8132, 1010: 10.2448, 1149: 5.1518, 1358: 5.5723, 1500: 5.5064, 2092: 12.7216, 2476: 6.0984, 2819: 6.1785, 3718: 6.5839, 3837: 6.7175, 4781: 6.7916, 6035: 7.1593, 6709: 7.2771, 7151: 7.4106, 7244: 7.4106, 7394: 15.1295}))

In [201]:
# check schema of output of preprocessing pipeline 
s_df_transform.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_score: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- review_concat: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)
 |-- tfidf: vector (nullable = true)



# 3. Model training
## 3.1. Simple logistic regression

In [202]:
# random split in train and test set with 80-20% proportions
training_df, testing_df = s_df.randomSplit([0.8, 0.2], seed=42)
[training_df.count(), testing_df.count()]

[9191, 2382]

In [145]:
# Add logistic regression to the previously defined pipeline
lr = LogisticRegression().\
    setLabelCol('review_score').\
    setFeaturesCol('tfidf').\
    setRegParam(0.0).\
    setMaxIter(100).\
    setElasticNetParam(0.)

# new pipeline to chain idf_pipeline with logistic regression
# fit training set on pipeline
lr_pipeline = Pipeline(stages=[tfidf_pipeline, lr]).fit(training_df)

# precict on test and calculate accuracy
lr_predictions = lr_pipeline.transform(testing_df)

In [146]:
# score above seems OK but now let's check the accuracy per class. we see it is not good for all but 5
def printClassPredictions(predictions):
    predictions.select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('Score = 1')
    predictions.filter(predictions['review_score'] == 1).\
        select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('Score = 2')
    predictions.filter(predictions['review_score'] == 2).\
        select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('Score = 3')
    predictions.filter(predictions['review_score'] == 3).\
        select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('Score = 4')
    predictions.filter(predictions['review_score'] == 4).\
        select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    print('Score = 5')
    predictions.filter(predictions['review_score'] == 5).\
        select(fn.expr('float(prediction = review_score)').alias('correct')).\
        select(fn.avg('correct')).show()
    
printClassPredictions(predictions)

+------------------+
|      avg(correct)|
+------------------+
|0.8211586901763224|
+------------------+

Score = 1
+-------------------+
|       avg(correct)|
+-------------------+
|0.27586206896551724|
+-------------------+

Score = 2
+-------------------+
|       avg(correct)|
+-------------------+
|0.20512820512820512|
+-------------------+

Score = 3
+-------------------+
|       avg(correct)|
+-------------------+
|0.33783783783783783|
+-------------------+

Score = 4
+------------------+
|      avg(correct)|
+------------------+
|0.5701492537313433|
+------------------+

Score = 5
+----------------+
|    avg(correct)|
+----------------+
|0.90498687664042|
+----------------+



## 3.2. Logistic regression with elastic net regularization

In [156]:
# not add elastic net regularization (combination of L1 and L2 reg)
lambda_par = 0.1
alpha_par = 0.3
en_lr = LogisticRegression().\
        setLabelCol('review_score').\
        setFeaturesCol('tfidf').\
        setRegParam(lambda_par).\
        setMaxIter(100).\
        setElasticNetParam(alpha_par)

# new pipeline to chain idf_pipeline with logistic regression
en_lr_pipeline = Pipeline(stages=[tfidf_pipeline, en_lr]).fit(training_df)
# fitting + accuracy estimation
en_lr_predictions = lr_pipeline.transform(testing_df)

printClassPredictions(en_lr_predictions)

+------------------+
|      avg(correct)|
+------------------+
|0.8207388748950462|
+------------------+

Score = 1
+-------------------+
|       avg(correct)|
+-------------------+
|0.27586206896551724|
+-------------------+

Score = 2
+-------------------+
|       avg(correct)|
+-------------------+
|0.20512820512820512|
+-------------------+

Score = 3
+-------------------+
|       avg(correct)|
+-------------------+
|0.33783783783783783|
+-------------------+

Score = 4
+------------------+
|      avg(correct)|
+------------------+
|0.5671641791044776|
+------------------+

Score = 5
+----------------+
|    avg(correct)|
+----------------+
|0.90498687664042|
+----------------+



In [150]:
# show some predictions for which the ground truth was score = 1
predictions.filter(predictions['review_score'] == 1).\
    select("review_id","review_concat","review_score","prediction"). \
    show(n = 10, truncate = 70)

+--------------+----------------------------------------------------------------------+------------+----------+
|     review_id|                                                         review_concat|review_score|prediction|
+--------------+----------------------------------------------------------------------+------------+----------+
|R2TPIP9WFJFHBF|Blurry. Do not purchase. I just received this today. Do not be dece...|           1|       5.0|
|R24G49195RDQSV|Waste of time Extremely disappointed in this book.  Do not understa...|           1|       1.0|
|R18E85EKCWU53F|A complete waste of my time Soooo tedious. The drinking, the pills,...|           1|       2.0|
|R24O439CRE9HHV|A Weeping Liberal Who has a typical weeping liberals non grasp of i...|           1|       5.0|
|  R50JW2WMC3O4|Good idea but executed poorly Started off pretty well but then kept...|           1|       4.0|
|R1XHIL1UCA5F8Y|The agony of reading this book This is beautifully written but so, ...|           1|    

## 3.3. Logistic regression with stratified split

In [203]:
# Now make a new stratified split to make sure we have enough representative examples in the train set
training_strat_df = s_df.sampleBy("review_score", fractions={1: 0.8, 2: 0.8, 3: 0.8, 4: 0.8, 5: 0.8}, seed=42)
test_strat_df = s_df.subtract(training_strat_df)
# training set
print('# rows training set: ' + str(training_strat_df.count()))
print('# rows per class')
training_strat_df.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()
# test set
print('# rows test set: ' + str(test_strat_df.count()))
print('# rows per class')
test_strat_df.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

# rows training set: 9191
# rows per class
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 7459|
|           4| 1206|
|           3|  280|
|           2|  134|
|           1|  112|
+------------+-----+

# rows test set: 2382
# rows per class
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 1929|
|           4|  313|
|           3|   76|
|           2|   33|
|           1|   31|
+------------+-----+



In [153]:
# new prevision with previously defined en_lr (elestic net logistic regression)
en_lr_strat_pipeline = Pipeline(stages=[tfidf_pipeline, en_lr]).fit(training_strat_df)
# fitting + accuracy estimation
predictions_en_lr_strat = en_lr_strat_pipeline.transform(test_strat_df)

printClassPredictions(predictions_en_lr_strat)

+------------------+
|      avg(correct)|
+------------------+
|0.8434089000839631|
+------------------+

Score = 1
+-------------------+
|       avg(correct)|
+-------------------+
|0.03225806451612903|
+-------------------+

Score = 2
+--------------------+
|        avg(correct)|
+--------------------+
|0.030303030303030304|
+--------------------+

Score = 3
+-------------------+
|       avg(correct)|
+-------------------+
|0.05263157894736842|
+-------------------+

Score = 4
+-------------------+
|       avg(correct)|
+-------------------+
|0.24920127795527156|
+-------------------+

Score = 5
+-----------------+
|     avg(correct)|
+-----------------+
|0.997926386728875|
+-----------------+



## 3.4. Logistic regression with down sampling
Sampling down all class to the smallest 1 (= 1 star)

In [160]:
# recap data set size and distribution
print('# rows per class')
s_df.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()
print('Total # rows in data set: '+ str(s_df.count()))

# rows per class
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 9383|
|           4| 1529|
|           3|  346|
|           2|  170|
|           1|  145|
+------------+-----+

Total # rows in data set: 11573


In [155]:
# downsampling to 145 rows per class (# rows for class 1)
downsampled_data = s_df.sampleBy('review_score',
    fractions={1: 1, 2: 145./170, 3: 145./346, 4: 145./1529, 5: 145./9383}) \
    .cache()

downsampled_data.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------------+-----+
|review_score|count|
+------------+-----+
|           5|  168|
|           3|  148|
|           4|  147|
|           2|  147|
|           1|  145|
+------------+-----+



In [158]:
# random split in train and test set with 80-20% proportions
training_down_df, testing_down_df = downsampled_data.randomSplit([0.8, 0.2], seed=42)
[training_down_df.count(), testing_down_df.count()]

[595, 160]

In [159]:
# new prevision with previously defined en_lr (elestic net logistic regression)
en_lr_down_pipeline = Pipeline(stages=[tfidf_pipeline, en_lr]).fit(training_down_df)
# fitting + accuracy estimation
predictions_en_lr_down = en_lr_down_pipeline.transform(testing_down_df)

printClassPredictions(predictions_en_lr_down)

+------------+
|avg(correct)|
+------------+
|      0.4375|
+------------+

Score = 1
+------------+
|avg(correct)|
+------------+
|         0.5|
+------------+

Score = 2
+-------------------+
|       avg(correct)|
+-------------------+
|0.37037037037037035|
+-------------------+

Score = 3
+------------+
|avg(correct)|
+------------+
|         0.5|
+------------+

Score = 4
+------------------+
|      avg(correct)|
+------------------+
|0.3333333333333333|
+------------------+

Score = 5
+------------------+
|      avg(correct)|
+------------------+
|0.4782608695652174|
+------------------+



## 3.5 Logistic regression with up and down sampling

In [169]:
# Using down - up sampling to build a train set with 2000 example per class
# Test set = 20% of train set = 10000 * 0.2 = 1500 rows
# Attention: test set cannot contains exmaple fro test set and must be built before up-down sampling
#
# Get test set of 2000 rows with a factor 'prop'
prop = (11500.-2000.)/11500
training_updown_df_pre = s_df.sampleBy("review_score", fractions={1: prop, 2: prop, 3: prop, 4: prop, 5: prop}, seed=42)
test_updown_df = s_df.subtract(training_updown_df_pre)
# training set before up - down sampling
print('training set before up - down sampling: ' + str(training_updown_df_pre.count()))
training_updown_df_pre.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()
# test set
print('test set: ' + str(test_updown_df.count()))
test_updown_df.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

training set before up - down sampling: 9509
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 7721|
|           4| 1242|
|           3|  290|
|           2|  139|
|           1|  117|
+------------+-----+

test set: 2064
+------------+-----+
|review_score|count|
+------------+-----+
|           5| 1659|
|           4|  275|
|           3|   72|
|           2|   32|
|           1|   26|
+------------+-----+



In [172]:
# perform up and down sampling on the trainig set so that each class contains +- 1500 rows
df_class_1 = training_updown_df_pre[training_updown_df_pre['review_score'] == 1]
df_class_2 = training_updown_df_pre[training_updown_df_pre['review_score'] == 2]
df_class_3 = training_updown_df_pre[training_updown_df_pre['review_score'] == 3]
df_class_4 = training_updown_df_pre[training_updown_df_pre['review_score'] == 4]
df_class_5 = training_updown_df_pre[training_updown_df_pre['review_score'] == 5]

df_class_1_over = df_class_1.sample(withReplacement=True, fraction=2000./117, seed = 42)
df_class_2_over = df_class_2.sample(withReplacement=True, fraction=2000./139, seed = 42)
df_class_3_over = df_class_3.sample(withReplacement=True, fraction=2000./290, seed = 42)
df_class_4_over = df_class_4.sample(withReplacement=True, fraction=2000./1242, seed = 42)
df_class_5_under = df_class_5.sample(withReplacement=True, fraction=2000./7721, seed = 42)

import functools 

def unionAll(dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs) 

training_updown_df = unionAll([df_class_1_over, df_class_2_over, df_class_3_over, df_class_4_over, df_class_5_under])

print('# of rows in the train set')
print('Total: ' + str(training_updown_df.count()))
print('Per class:')
training_updown_df.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

# of rows in the train set
Total: 9798
Per class:
+------------+-----+
|review_score|count|
+------------+-----+
|           4| 2002|
|           3| 1976|
|           1| 1960|
|           2| 1937|
|           5| 1923|
+------------+-----+



In [173]:
# new prevision with previously defined en_lr (elestic net logistic regression)
en_lr_updown_pipeline = Pipeline(stages=[tfidf_pipeline, en_lr]).fit(training_updown_df)
# fitting + accuracy estimation
predictions_en_lr_updown = en_lr_updown_pipeline.transform(test_updown_df)

printClassPredictions(predictions_en_lr_updown)

+------------------+
|      avg(correct)|
+------------------+
|0.4874031007751938|
+------------------+

Score = 1
+------------------+
|      avg(correct)|
+------------------+
|0.6538461538461539|
+------------------+

Score = 2
+------------+
|avg(correct)|
+------------+
|        0.25|
+------------+

Score = 3
+------------------+
|      avg(correct)|
+------------------+
|0.3888888888888889|
+------------------+

Score = 4
+------------------+
|      avg(correct)|
+------------------+
|0.5454545454545454|
+------------------+

Score = 5
+-------------------+
|       avg(correct)|
+-------------------+
|0.48402652200120555|
+-------------------+



## 3.6 Naive Bayes

In [204]:
# on full data set
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1).\
        setLabelCol('review_score').\
        setFeaturesCol('tfidf')

#book_id

# new pipeline to chain idf_pipeline with logistic regression
nb_pipeline = Pipeline(stages=[tfidf_pipeline, nb]).fit(training_strat_df)
# fitting + accuracy estimation
nb_predictions = lr_pipeline.transform(test_strat_df)
printClassPredictions(nb_predictions)

+------------------+
|      avg(correct)|
+------------------+
|0.9563392107472712|
+------------------+

Score = 1
+------------------+
|      avg(correct)|
+------------------+
|0.8709677419354839|
+------------------+

Score = 2
+------------------+
|      avg(correct)|
+------------------+
|0.8484848484848485|
+------------------+

Score = 3
+------------------+
|      avg(correct)|
+------------------+
|0.8552631578947368|
+------------------+

Score = 4
+------------------+
|      avg(correct)|
+------------------+
|0.8690095846645367|
+------------------+

Score = 5
+------------------+
|      avg(correct)|
+------------------+
|0.9777086573354069|
+------------------+



In [184]:
# on full data set
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import VectorAssembler

nb = NaiveBayes(smoothing=1).\
        setLabelCol('review_score').\
        setFeaturesCol('tfidf')

assembler = VectorAssembler(inputCols=['tfidf','book_id'],outputCol="tfidf_book")

# new pipeline to chain idf_pipeline with logistic regression
nb_pipeline = Pipeline(stages=[tfidf_pipeline, assembler, nb]).fit(training_strat_df)
# fitting + accuracy estimation
nb_predictions = lr_pipeline.transform(test_strat_df)
printClassPredictions(nb_predictions)

IllegalArgumentException: 'Data type string of column book_id is not supported.'

## 3.7 Random Forest

In [None]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Descript","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

In [175]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="review_score", outputCol="indexedLabel").fit(s_df)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = s_df.randomSplit([0.8, 0.2])

# Automatically identify categorical features, and index them.
assembler = VectorAssembler(inputCols=['tfidf','sentimentIndex'],outputCol="indexedFeatures")

# Train a GBT model.
rf = RandomForestClassifier(labelCol="review_score", featuresCol="indexedFeatures", numTrees=1)

# Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages=[tfidf_pipeline, assembler, rf])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "review_score", "indexedFeatures").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="review_score", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

gbtModel = model.stages[2]
print(gbtModel)  # summary only

NameError: name 'VectorAssembler' is not defined

# 4. Model with multiples inputs
## 4.1 Logistic regression

## 4.2 Naive Bayes