In [1]:
import pyspark
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# from pyspark.sql.functions import when
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StringIndexer, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Reuse the active SparkSession if one exists; otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

# Directory containing the training shards; the file names built below are
# 0.csv through 22.csv (23 files in total).
path = './train_final'
paths = ['{}/{}.csv'.format(path, shard_id) for shard_id in range(23)]


In [2]:
# Data cleanup
# Read every shard listed in `paths` into a single DataFrame, treating the
# first row of each file as the header, then drop rows that contain nulls.
df = spark.read.csv(paths, header=True).dropna()

# Seeded 70/30 random split into a training set and a held-out test set.
trainData, testData = df.randomSplit([0.7, 0.3], seed=100)

# Unused alternative: encode Category numerically by hand. The pipeline's
# StringIndexer stage produces the label column instead.
# df = df.withColumn('Category',when(df.Category == 'Positive',1.).when(df.Category == 'Neutral',0.).when(df.Category == 'Negative',-1.))

In [3]:
# Pipeline stages: label indexing -> tokenising -> term counts -> IDF -> classifier.

# Index the string values of "Category" into the numeric "label" column.
label = StringIndexer(inputCol="Category", outputCol="label")

# Split the text in "tokens" into a list of words.
# NOTE(review): assumes the CSV files have "tokens" and "Category" columns - confirm.
tokenizer = Tokenizer(inputCol="tokens", outputCol="words")

# Term-count vectors over a vocabulary of at most 30000 terms; a term must
# appear in at least 5 documents to be included.
countVectors = CountVectorizer(
    inputCol="words",
    outputCol="cv",
    vocabSize=30000,
    minDF=5,
)

# Re-weight the counts by inverse document frequency; the result is written to
# "features".
idf = IDF(inputCol="cv", outputCol="features", minDocFreq=5)

# Logistic regression constructed with its default parameters.
lr = LogisticRegression()

# No metricName is set here; the metric is either the evaluator's default or
# whatever is passed explicitly at evaluate() time.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

pipeline = Pipeline(
    stages=[label, tokenizer, countVectors, idf, lr],
)
# pipeline = Pipeline(stages=[tokenizer, countVectors, idf, label_stringIdx,nb])

In [None]:
# grid search w/ cv 
%time
paramGrid = (ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1, 0.2,]).addGrid(lr.maxIter, [20, 50, 100]).addGrid(lr.elasticNetParam, [0., 0.5, 1.0]).build())
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

cvModel = cv.fit(trainData)
predictions = cvModel.transform(testData)

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

# Print the results
print("CV Accuracy:", accuracy)
print("CV Precision:", precision)
print("CV Recall:", recall)

bestModel = cvModel.bestModel
cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics)]

In [None]:
# Train with selected params
# NOTE(review): while the two lines below stay commented out, `pipeline` is
# fitted with whatever `lr` is currently defined (LogisticRegression() with
# default parameters in this notebook), not with tuned values. Fill in the
# placeholders from the grid-search output and uncomment them to apply it.
# lr = LogisticRegression(regParam = ' ', maxIter= ' ', elasticNetParam = ' ')
# pipeline = Pipeline(stages=[label, tokenizer, countVectors, idf, lr])

lrModel = pipeline.fit(trainData)
predictions = lrModel.transform(testData)

# Evaluate the same predictions under three metrics, in this order.
accuracy, precision, recall = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("accuracy", "weightedPrecision", "weightedRecall")
)

# Print the results
for caption, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(caption, score)

# lrModel.write().overwrite().save("/Model")