In [1]:
import pyspark
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StringIndexer, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Spark entry point used by every cell below (obtained via getOrCreate()).
spark = SparkSession.builder.getOrCreate()

# Directory holding the raw CSV shards, named 0.csv through 22.csv.
path = '../data/data_raw'
paths = [f"{path}/{shard}.csv" for shard in range(23)]


In [2]:
# Data cleanup: read all CSV shards (first row is the header) and drop
# rows containing nulls.
df = spark.read.csv(paths, header=True).dropna()

# Train/test split: 70% / 30%, with a fixed seed.
trainData, testData = df.randomSplit([0.7, 0.3], seed=100)

### Model choice
According to the literature, logistic regression is just as effective as naive Bayes or support vector machines for sentiment analysis \*1
- TF-IDF process:
    - CountVectorizer converts each document's words into a sparse term-count vector; `vocabSize` restricts the vocabulary to the top x most frequent words, and `minDF` (minimum document frequency) ignores words that appear in fewer than x documents (or, when given as a fraction, in fewer than x percent of the documents).
    - Inverse document frequency (IDF) discounts words that appear frequently across the documents
- execution speed dependency:
    - vocabSize/minDF: control the sparsity of the model
    - parallelization of the matrix multiplications in softmax (multinomial) logistic regression and its gradient calculations


In [3]:
# Pipeline building blocks: label indexing, tokenization, TF-IDF features,
# classifier, and the evaluator used for scoring.

# Index the "Category" column into a "label" column.
label = StringIndexer(inputCol="Category", outputCol="label")

# Tokenize the "tokens" column into "words".
tokenizer = Tokenizer(inputCol="tokens", outputCol="words")

# TF-IDF: term counts ("cv") followed by IDF re-weighting ("features").
countVectors = CountVectorizer(
    inputCol="words",
    outputCol="cv",
    vocabSize=30000,
    minDF=5,
)
idf = IDF(inputCol="cv", outputCol="features", minDocFreq=5)

# Classifier with the library's default hyperparameters.
lr = LogisticRegression()

# Stage order matters: each stage consumes the column produced before it.
pipeline = Pipeline(stages=[label, tokenizer, countVectors, idf, lr])

# Scores the "prediction" column; no metricName is chosen here.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

### Hyperparameter tuning
Due to the large number of tweets (~2 million), 3-fold CV is used to determine the best elastic net and regularization parameters - ***Warning: long runtime*** \
Accuracy, precision and recall generally decrease as more regularization strength is applied. \
Accuracy ranges from ~85% down to ~75% for regParam 0.0 - 0.1 with elasticNetParam 0, 0.5, 1 \
*Important caveat*: The data was not manually labeled; the labels were generated with NLTK VADER, so these metrics measure how closely our model's output matches the NLTK VADER labels.


In [None]:
# Grid search with 3-fold cross-validation over regParam x elasticNetParam
# (2 x 3 = 6 parameter combinations). Warning: long runtime.
# %time
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[label, tokenizer, countVectors, idf, lr])
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1])
    .addGrid(lr.elasticNetParam, [0., 0.5, 1.0])
    .build()
)
# Pin the fold-assignment seed so repeated runs cross-validate on the same
# folds; 100 matches the seed used for the train/test split.
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3,
                    seed=100)

cvModel = cv.fit(trainData)
# Score the held-out test set with the fitted cross-validator model.
predictions = cvModel.transform(testData)

# NOTE: these are test-set metrics of the selected model, not the
# per-fold cross-validation averages (those live in cvModel.avgMetrics).
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

# Print the results
print("CV Accuracy:", accuracy)
print("CV Precision:", precision)
print("CV Recall:", recall)

bestModel = cvModel.bestModel
# Display the winning parameter map: avgMetrics has one averaged score per
# entry of the grid (argmax assumes the evaluator's metric is larger-is-better).
cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics)]

In [9]:
# Final model: refit on the training split with the selected hyperparameters.
lr = LogisticRegression(
    maxIter=100,
    regParam=0.01,
    elasticNetParam=0.,
)
pipeline = Pipeline(stages=[label, tokenizer, countVectors, idf, lr])

lrModel = pipeline.fit(trainData)
predictions = lrModel.transform(testData)

# Evaluate the test-set predictions once per metric, in this order.
accuracy, precision, recall = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall")
)

# Report the test-set scores.
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

# Optional: persist the fitted pipeline (left disabled).
# lrModel.write().overwrite().save("/Model")

Accuracy: 0.7782525881551807
Precision: 0.7763460163068651
Recall: 0.7782525881551807


#### Citation
1. Samar Al-Saqqa, Ghazi Al-Naymat, Arafat Awajan,
A Large-Scale Sentiment Data Classification for Online Reviews Under Apache Spark,
Procedia Computer Science, Volume 141, 2018, Pages 183-189, ISSN 1877-0509, https://doi.org/10.1016/j.procs.2018.10.166.