In [111]:
import pyspark
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StringIndexer, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark= SparkSession.builder.getOrCreate()

path = './data_raw'
paths = [path+'/'+str(i)+'.csv' for i in range(23)]


In [99]:
# data load
df = spark.read.option("delimiter", "\t").csv(paths,header=True)
df = df.dropna()

In [100]:
df.printSchema()

root
 |-- rawData: string (nullable = true)
 |-- tokens: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- Category: string (nullable = true)



In [101]:
(trainData, testData) = df.randomSplit([0.7, 0.3], seed = 100)
testData.write.csv('./data_test', header=True, mode="overwrite",sep="\t")

                                                                                

In [80]:
df.count()

                                                                                

1349850

In [81]:
testData.count()

                                                                                

404376

In [96]:
trainData.printSchema()

root
 |-- rawData: string (nullable = true)
 |-- tokens: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- Category: string (nullable = true)



In [102]:
# tokenization
tokenizer = Tokenizer(inputCol="tokens", outputCol="words")
trainData = tokenizer.transform(trainData)
trainData = trainData.drop("rawData").drop("tokens")
trainData.printSchema()

root
 |-- sentiment: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [108]:
testData2 = tokenizer.transform(testData)
testData2 = testData2.drop("rawData").drop("tokens")
testData2.printSchema()

root
 |-- sentiment: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [103]:
# pipeline 
countVectors = CountVectorizer(inputCol="words", outputCol="cv", vocabSize=30000, minDF=5)
idf = IDF(inputCol='cv',outputCol='features',minDocFreq=5)
label = StringIndexer(inputCol = "Category", outputCol = "label")
lr = LogisticRegression()
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
pipeline = Pipeline(stages=[label, countVectors, idf, lr])

In [109]:
lrModel = pipeline.fit(trainData)
predictions = lrModel.transform(testData2)

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

# Print the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

lrModel.write().overwrite().save("./model")

23/11/27 23:56:21 WARN DAGScheduler: Broadcasting large task binary with size 1571.8 KiB
23/11/27 23:56:34 WARN DAGScheduler: Broadcasting large task binary with size 1571.8 KiB
23/11/27 23:56:46 WARN DAGScheduler: Broadcasting large task binary with size 1571.8 KiB
                                                                                

Accuracy: 0.8697449898114626
Precision: 0.8675089703191299
Recall: 0.8697449898114626


                                                                                

In [112]:
load_model = PipelineModel.load("./model")

                                                                                

In [113]:
predictions = load_model.transform(testData2)

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

23/11/28 00:00:08 WARN DAGScheduler: Broadcasting large task binary with size 1570.7 KiB
23/11/28 00:00:20 WARN DAGScheduler: Broadcasting large task binary with size 1570.7 KiB
23/11/28 00:00:31 WARN DAGScheduler: Broadcasting large task binary with size 1570.7 KiB
                                                                                

In [114]:
predictions.printSchema()

root
 |-- sentiment: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- label: double (nullable = false)
 |-- cv: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [115]:
predictions.select("prediction", "label", "Category", "features").show()

23/11/28 00:01:22 WARN DAGScheduler: Broadcasting large task binary with size 1558.1 KiB
[Stage 325:>                                                        (0 + 1) / 1]

+----------+-----+--------+--------------------+
|prediction|label|Category|            features|
+----------+-----+--------+--------------------+
|       0.0|  0.0| Neutral|(30000,[2,116,123...|
|       0.0|  0.0| Neutral|(30000,[61,125],[...|
|       0.0|  0.0| Neutral|(30000,[0,5,17,18...|
|       0.0|  0.0| Neutral|(30000,[3325],[7....|
|       0.0|  0.0| Neutral|(30000,[2065,1851...|
|       0.0|  0.0| Neutral|(30000,[63],[3.65...|
|       0.0|  0.0| Neutral|(30000,[0,8,14,16...|
|       0.0|  0.0| Neutral|(30000,[15,63],[2...|
|       0.0|  0.0| Neutral|(30000,[277,698,9...|
|       0.0|  0.0| Neutral|(30000,[3,63,487,...|
|       0.0|  0.0| Neutral|(30000,[15,44],[2...|
|       0.0|  0.0| Neutral|(30000,[2647],[7....|
|       0.0|  0.0| Neutral|(30000,[16,328],[...|
|       0.0|  0.0| Neutral|(30000,[62,256,38...|
|       0.0|  0.0| Neutral|(30000,[62,256,38...|
|       0.0|  0.0| Neutral|(30000,[0,7,35,10...|
|       0.0|  0.0| Neutral|(30000,[1,5,14,20...|
|       0.0|  0.0| N

                                                                                