In [2]:
# https://www.ateam-oracle.com/post/multiclass-text-classification-crossvalidation-with-pyspark-pipelines

from timeit import default_timer as timer
import datetime
import nltk
from nltk.corpus import stopwords

from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.classification import NaiveBayes, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import IDF
from pyspark.ml.feature import NGram
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, StringType

In [3]:
###################################################################
########## general setting to configure before executing ##########
# how many parallel threads should the crossvalidator use
parallelExec=4

# should the crossvalidator collect submodel data
trackSubModels=False

# how many folds should crossvalidator use
numberFolds=3

# Path to save the best model to
modelSavePath = "gs://cloud-project-bucket-3/modeldir2"

# data file to load
dataToLoad = "gs://cloud-project-bucket-3/Sentiment_Analysis_Dataset.csv"

conf = SparkConf().setMaster("local[*]").setAppName("multigridsearch")
###################################################################

In [4]:
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
spark = SparkSession(sc)

22/03/09 09:27:32 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/03/09 09:27:32 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/03/09 09:27:32 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/03/09 09:27:32 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [5]:
stoplist = stopwords.words('english')
# stopwords_download = nltk.download('stopwords')
# stop_words = stopwords_download.read().split('\n')

In [6]:
"""
cols = ['sentiment','id','date','query_string','user','text']
schema = StructType([
    StructField('sentiment', IntegerType(), True),
    StructField('id', IntegerType(), True),
    StructField('date', StringType(), True),
    StructField('query_string', StringType(), True),
    StructField('user', StringType(), True),
    StructField('text', StringType(), True)])
    
data = spark.read.csv(dataToLoad,header=False,schema=schema)  
"""
data = spark.read.csv(dataToLoad,header=True)
data = data.withColumn("label", data["label"].cast(IntegerType()))
data = data.drop('_c0','id', 'flag', 'user', 'date')
data = data.sample(False, .0025, 42)
data.show(5)


                                                                                

+-----+--------------------+
|label|                text|
+-----+--------------------+
|    0|i think to much o...|
|    0|Tuesday is a rain...|
|    0|Dad was admitted ...|
|    0|hoping I can fall...|
|    0|Finished the seco...|
+-----+--------------------+
only showing top 5 rows



In [7]:
# how many partitions if the data being sliced into by default
print ('data partitions #:', data.rdd.getNumPartitions())
# What is the size of our dataset
print ('data size : ', data.count())

data partitions #: 4




data size :  4001


                                                                                

In [8]:
data.dtypes

[('label', 'int'), ('text', 'string')]

In [9]:
###################################################################
### What features, algorithms and hyperparameters will be used ####

In [10]:
# tokenize our data into individual words. If you only wanted to use unigrams, you could use just this step and not need the ngram step as well. You would need to adjust column names.
# any word less than this lenth will be removed from the feature list. For example, is stop words doesn't catch "a, at, of" and min length is 3, those are gone.
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W", minTokenLength=3)
minimumWordLength = [2, 3, 4]

In [11]:
# convert our text labels to numerical values
label_stringIdx = StringIndexer(inputCol="label", outputCol="num_label", handleInvalid="keep")

In [12]:
# remove stopwords using our custom stopword list
# stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stop_words)
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stoplist)

In [13]:
# split words into groups. ngramSize defines the group size. For example, 1 is unigrams, 2 is bigrams etc...
ngramer = NGram(inputCol='filtered', outputCol='ngrams')
ngramSize = [1, 2]

In [14]:
countVectors = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize=100000, minDF=5)
vocabularySize = [10000, 50000, 100000]
minDF = [3, 5]

In [15]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
regParam = [0.01, 0.1, 0.3, 0.5]
elasticNetParam = [0, .5, 1]

In [16]:
rf = RandomForestRegressor(subsamplingRate=0.15, featuresCol='features', labelCol='num_label')
numberTrees = [10, 20, 30]
maxDepth =  [5, 10]

In [17]:
nb = NaiveBayes(smoothing=1)
nbSmoothing =  [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

In [18]:
idf = IDF(inputCol="features", outputCol="idf", minDocFreq=5)
minDocFreq=[5, 10]

In [19]:
lsvc = LinearSVC()
lvcsMaxIter = [10, 50, 100]
lvcsRegParam = [0.001, 0.01, 1.0,10.0]
ovr = OneVsRest(classifier=lsvc)
###################################################################

In [20]:
# set pipeline to en empty list of pipeline stages
pipeline = Pipeline(stages=[])

###################################################################

In [21]:
######### define our various grids we want to execute #############
cv_stages = [regexTokenizer, label_stringIdx, stopwordsRemover, ngramer, countVectors, lr]
cv_paramgrid = ParamGridBuilder().baseOn({pipeline.stages:cv_stages}) \
    .addGrid(regexTokenizer.minTokenLength, minimumWordLength) \
    .addGrid(ngramer.n, ngramSize)\
    .addGrid(countVectors.vocabSize, vocabularySize)\
    .addGrid(countVectors.minDF, minDF) \
    .addGrid(lr.regParam, regParam) \
    .addGrid(lr.elasticNetParam, elasticNetParam) \
    .build()


In [22]:
rf_stages = [regexTokenizer, label_stringIdx, stopwordsRemover, ngramer, countVectors, rf]

In [23]:
rf_paramgrid = ParamGridBuilder().baseOn({pipeline.stages:rf_stages}) \
    .addGrid(regexTokenizer.minTokenLength, minimumWordLength) \
    .addGrid(ngramer.n, ngramSize) \
    .addGrid(countVectors.vocabSize, vocabularySize) \
    .addGrid(countVectors.minDF, minDF) \
    .addGrid(rf.numTrees, numberTrees) \
    .addGrid(rf.maxDepth, maxDepth)\
    .build()

In [24]:
idf_stages = [regexTokenizer, label_stringIdx, stopwordsRemover, ngramer, countVectors, idf, lr]
idf_paramgrid = ParamGridBuilder().baseOn({pipeline.stages:idf_stages}) \
    .addGrid(regexTokenizer.minTokenLength, minimumWordLength) \
    .addGrid(ngramer.n, ngramSize) \
    .addGrid(countVectors.vocabSize, vocabularySize) \
    .addGrid(countVectors.minDF, minDF) \
    .addGrid(idf.minDocFreq, minDocFreq)\
    .build()

In [25]:
nb_stages = [regexTokenizer, label_stringIdx, stopwordsRemover, ngramer, countVectors, nb]
nb_paramgrid = ParamGridBuilder().baseOn({pipeline.stages:nb_stages}) \
    .addGrid(regexTokenizer.minTokenLength, minimumWordLength) \
    .addGrid(ngramer.n, ngramSize) \
    .addGrid(countVectors.vocabSize, vocabularySize) \
    .addGrid(countVectors.minDF, minDF) \
    .addGrid(nb.smoothing, nbSmoothing) \
    .build()

In [26]:
lsvc_stages = [regexTokenizer, label_stringIdx, stopwordsRemover, ngramer, countVectors, ovr]
lsvc_paramgrid = ParamGridBuilder().baseOn({pipeline.stages:lsvc_stages}) \
    .addGrid(regexTokenizer.minTokenLength, minimumWordLength) \
    .addGrid(ngramer.n, ngramSize) \
    .addGrid(countVectors.vocabSize, vocabularySize) \
    .addGrid(countVectors.minDF, minDF) \
    .addGrid(lsvc.maxIter, lvcsMaxIter) \
    .addGrid(lsvc.regParam, lvcsRegParam) \
    .build()

In [27]:
# gridloop = [cv_paramgrid, idf_paramgrid]
# gridloop = [cv_paramgrid]
gridloop = [cv_paramgrid, rf_paramgrid, idf_paramgrid, nb_paramgrid, lsvc_paramgrid]

###################################################################

In [28]:
# how many parameter combinations are about to be tested. Warning - this can quickly get out of hand
paramCombo = 0
for grid in gridloop:
    paramCombo = paramCombo + len(grid)
print ("Number of parameter combinations being tested ", paramCombo)

print("starting crossvalidation at ", datetime.datetime.now(), "\n")

Number of parameter combinations being tested  1368
starting crossvalidation at  2022-03-09 09:27:41.426468 



In [29]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

In [30]:
# Since a loop is used, we manually track which model between loops has the highest accuracy
bestAcc = 0

In [None]:
# using a loop instead of one big grid search to accomodate getting submodel data. With a huge grid, out of memory errors are more likely when collecting subModel data
for grid in gridloop:
    # print ("running grid ", grid, "\n")
    starttime = timer()
    crossval = CrossValidator(estimatorParamMaps=grid,
                                     estimator=pipeline,
                                     evaluator=evaluator,
                                     numFolds=numberFolds,
                                     parallelism=parallelExec,
                                     collectSubModels=trackSubModels)
    cvModel = crossval.fit(data)
    # print("Time to crossval", timer() - starttime)
    # get the accuracy metrics for the models. This is a list.
    avgMetricsGrid = cvModel.avgMetrics
    # print (avgMetricsGrid)
    # get the max accuracy metric in the list of accuracy metrics.
    modelAcc = max(avgMetricsGrid)
    print("max score for this grid ", modelAcc)
    if (modelAcc > bestAcc):
        print ("this model has greater accuracy. Old acc ", bestAcc, " new acc ", modelAcc)
        bestModel = cvModel.bestModel
        bestAcc = modelAcc
        """
        # print out the params for all the stages of this model
        for stage in bestModel.stages:
            print (stage.extractParamMap())
        """
    # if you are collecting submodel data, this will dump all the param combinations being tested. You can use this with the avgMetricsGrid above to see the accruacy of all your param combos
    """
    if (trackSubModels) :
        submods = cvModel.subModels
        for mods in submods:
            for mod in mods:
                for stage in mod.stages:
                    print(stage.extractParamMap())
    """

22/03/09 09:57:16 WARN org.apache.spark.storage.BlockManager: Asked to remove block broadcast_164320, which does not exist
                                                                                

max score for this grid  0.689248862955272
this model has greater accuracy. Old acc  0  new acc  0.689248862955272


                                                                                

max score for this grid  0.0


                                                                                

max score for this grid  0.6872384492090453


                                                                                

max score for this grid  0.34542898513832004


22/03/09 10:23:07 ERROR org.apache.spark.ml.util.Instrumentation: java.lang.IllegalArgumentException: requirement failed: LinearSVC only supports binary classification. 1 classes detected in LinearSVC_5253b7c8f35a__labelCol
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.classification.LinearSVC.$anonfun$train$1(LinearSVC.scala:212)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LinearSVC.train(LinearSVC.scala:171)
	at org.apache.spark.ml.classification.LinearSVC.train(LinearSVC.scala:76)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
	at sun.reflect.GeneratedMethodAccessor288.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:

IllegalArgumentException: requirement failed: LinearSVC only supports binary classification. 1 classes detected in LinearSVC_5253b7c8f35a__labelCol

In [None]:
# save the best model for reuse.
bestModel.save(modelSavePath)


In [None]:
spark.stop()