# Training model logistic regression dengan menggunakan spark

In [1]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import split, regexp_replace, col, when, count, abs
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

In [2]:
# membuat spark session
spark = SparkSession.builder \
	.appName("Rekomendasi Mobil") \
	.config("spark.app.Chatbot", "Chatbot.Rekomendasi Mobil") \
	.getOrCreate()

In [3]:
df_chatbot_spark = spark.read.csv("Data/prep-data-chatbot.csv", header=True, inferSchema=True)

In [4]:
# Create pipeline stages
tokenizer = Tokenizer(inputCol="prep", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Create StringIndexer for the tag column
label_indexer = StringIndexer(inputCol="tag", outputCol="label")

# Create the pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, label_indexer])

# Prepare the data by selecting only prep and tag columns
data = df_chatbot_spark.select("prep", "tag")

# Split data into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Fit the pipeline on training data
pipeline_model = pipeline.fit(train_data)

# Transform both training and testing data
train_df = pipeline_model.transform(train_data)
test_df = pipeline_model.transform(test_data)

In [5]:
#Model pertama menggunakan Logistic Regression
log_reg = LogisticRegression(maxIter=100)
log_reg_Model = log_reg.fit(train_df)
predictions = log_reg_Model.transform(test_df)

In [6]:
# Calculate accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(
	labelCol="label", 
	predictionCol="prediction", 
	metricName="accuracy"
)

# Calculate f1 score
evaluator_f1 = MulticlassClassificationEvaluator(
	labelCol="label", 
	predictionCol="prediction", 
	metricName="f1"
)

accuracy = evaluator_accuracy.evaluate(predictions)
f1_score = evaluator_f1.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("F1 Score: {0:.4f}".format(f1_score))

NameError: name 'MulticlassClassificationEvaluator' is not defined