In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline


In [4]:
spark = SparkSession.builder.appName("FBLiveTH").getOrCreate()

In [6]:
df = spark.read.csv("fb_live_thailand.csv", header=True, inferSchema=True)

In [7]:
indexer1 = StringIndexer(inputCol="status_type", outputCol="status_type_ind")
indexer2 = StringIndexer(inputCol="status_published", outputCol="status_published_ind")

In [8]:
assembler = VectorAssembler(
    inputCols=["status_type_ind", "status_published_ind"],
    outputCol="features"
)

In [9]:
lr = LogisticRegression(
    labelCol="status_type_ind",
    featuresCol="features",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8
)

In [10]:
pipeline = Pipeline(stages=[indexer1, indexer2, assembler, lr])


In [11]:
model = pipeline.fit(df)
predictions = model.transform(df)


In [12]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="status_type_ind",
    predictionCol="prediction"
)

In [13]:
# Show metrics
print("Accuracy:", evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
print("Precision:", evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}))
print("Recall:", evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))
print("F1:", evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))

spark.stop()

Accuracy: 0.6082269503546099
Precision: 0.3938503719602186
Recall: 0.6082269503546099
F1: 0.4781076376023038
