# Import Spark SQL, library machine learning, dan membuat session

In [1]:
#mengimport modul yang dibutuhkan
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover

# Create the Spark session for this notebook, or reuse one that is already
# running. No extra configuration is set: the previous
# "spark.some.config.option" entry was only a documentation placeholder.
appName = "Sentiment Analysis di Spark"
spark = (
    SparkSession.builder
    .appName(appName)
    .getOrCreate()
)

# Memuat data dari file

In [2]:
# Read the tweets CSV into a DataFrame: the first row is treated as the
# header and the column types are inferred by Spark.
tweets_csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('dataset/tweets.csv')
)
tweets_csv.show(n=3, truncate=False)

+------+---------+---------------+---------------------------------+
|ItemID|Sentiment|SentimentSource|SentimentText                    |
+------+---------+---------------+---------------------------------+
|1038  |1        |Sentiment140   |that film is fantastic #brilliant|
|1804  |1        |Sentiment140   |this music is really bad #myband |
|1693  |0        |Sentiment140   |winter is terrible #thumbs-down  |
+------+---------+---------------+---------------------------------+
only showing top 3 rows



# Menyiapkan data

In [3]:
# Keep only the tweet text and its sentiment. The "Sentiment" column is cast
# to an integer and renamed to "label".
label_column = col("Sentiment").cast("Int").alias("label")
data = tweets_csv.select("SentimentText", label_column)
data.show(n=5, truncate=False)

+---------------------------------+-----+
|SentimentText                    |label|
+---------------------------------+-----+
|that film is fantastic #brilliant|1    |
|this music is really bad #myband |1    |
|winter is terrible #thumbs-down  |0    |
|this game is awful #nightmare    |0    |
|I love jam #loveit               |1    |
+---------------------------------+-----+
only showing top 5 rows



# Memisahkan data training dan testing

In [4]:
# Split the data roughly 70/30 into training and testing sets. A fixed seed
# keeps the split (and the row counts and accuracy derived from it) repeatable
# across runs for the same input data and partitioning.
SPLIT_SEED = 42
dataTerpisah = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train = dataTerpisah[0]
# In the testing set the "label" column is renamed to "trueLabel".
test = dataTerpisah[1].withColumnRenamed("label", "trueLabel")
train_rows = train.count()
test_rows = test.count()
print("Jumlah baris data training:", train_rows, ", jumlah baris data testing:", test_rows)

Jumlah baris data training: 1335 , jumlah baris data testing: 597


# Menyiapkan data training

In [5]:
# Split the text of each training tweet into a list of words, stored in a new
# "SentimentWords" column.
tokenizer = Tokenizer(outputCol="SentimentWords", inputCol="SentimentText")
tokenizedTrain = tokenizer.transform(train)
tokenizedTrain.show(n=5, truncate=False)

+---------------------------------+-----+---------------------------------------+
|SentimentText                    |label|SentimentWords                         |
+---------------------------------+-----+---------------------------------------+
|I adore cheese #bestever         |1    |[i, adore, cheese, #bestever]          |
|I adore cheese #brilliant        |1    |[i, adore, cheese, #brilliant]         |
|I adore cheese #thumbs-up        |1    |[i, adore, cheese, #thumbs-up]         |
|I adore cheese #toptastic        |1    |[i, adore, cheese, #toptastic]         |
|I adore classical music #bestever|1    |[i, adore, classical, music, #bestever]|
+---------------------------------+-----+---------------------------------------+
only showing top 5 rows



In [6]:
# Remove stop words from the tokenizer's output column; the words that remain
# are written to "MeaningfulWords".
tokens_column = tokenizer.getOutputCol()
swr = StopWordsRemover(outputCol="MeaningfulWords", inputCol=tokens_column)
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show(n=5, truncate=False)

+---------------------------------+-----+---------------------------------------+------------------------------------+
|SentimentText                    |label|SentimentWords                         |MeaningfulWords                     |
+---------------------------------+-----+---------------------------------------+------------------------------------+
|I adore cheese #bestever         |1    |[i, adore, cheese, #bestever]          |[adore, cheese, #bestever]          |
|I adore cheese #brilliant        |1    |[i, adore, cheese, #brilliant]         |[adore, cheese, #brilliant]         |
|I adore cheese #thumbs-up        |1    |[i, adore, cheese, #thumbs-up]         |[adore, cheese, #thumbs-up]         |
|I adore cheese #toptastic        |1    |[i, adore, cheese, #toptastic]         |[adore, cheese, #toptastic]         |
|I adore classical music #bestever|1    |[i, adore, classical, music, #bestever]|[adore, classical, music, #bestever]|
+---------------------------------+-----+-------

In [7]:
# Hash the remaining words of each tweet into a term-frequency vector stored
# in the "features" column, then keep only the columns needed for training.
hashTF = HashingTF(outputCol="features", inputCol=swr.getOutputCol())
hashedTrain = hashTF.transform(SwRemovedTrain)
numericTrain = hashedTrain.select('label', 'MeaningfulWords', 'features')
numericTrain.show(n=3, truncate=False)

+-----+---------------------------+-------------------------------------------+
|label|MeaningfulWords            |features                                   |
+-----+---------------------------+-------------------------------------------+
|1    |[adore, cheese, #bestever] |(262144,[65702,69876,108823],[1.0,1.0,1.0])|
|1    |[adore, cheese, #brilliant]|(262144,[61111,65702,69876],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #thumbs-up]|(262144,[3984,65702,69876],[1.0,1.0,1.0])  |
+-----+---------------------------+-------------------------------------------+
only showing top 3 rows



# Mentraining model dengan data training

In [8]:
# Fit a logistic-regression classifier on the hashed training features.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    regParam=0.01,
    maxIter=10,
)
model = lr.fit(numericTrain)
print("Training selesai!")

Training selesai!


# Menyiapkan data testing

In [9]:
# Run the testing set through the same tokenizer, stop-word remover and
# hashing step that were applied to the training set.
tokenizedTest = tokenizer.transform(test)
SwRemovedTest = swr.transform(tokenizedTest)
hashedTest = hashTF.transform(SwRemovedTest)
numericTest = hashedTest.select('trueLabel', 'MeaningfulWords', 'features')
numericTest.show(n=2, truncate=False)


+---------+--------------------------+-------------------------------------------+
|trueLabel|MeaningfulWords           |features                                   |
+---------+--------------------------+-------------------------------------------+
|1        |[adore, cheese, #favorite]|(262144,[65702,69876,156543],[1.0,1.0,1.0])|
|1        |[adore, cheese, #loveit]  |(262144,[65702,65728,69876],[1.0,1.0,1.0]) |
+---------+--------------------------+-------------------------------------------+
only showing top 2 rows



# Memprediksi dan menghitung akurasi model

In [10]:
# Score the testing set with the trained model and compare each prediction
# with the true label to compute the accuracy.
prediksiMentah = model.transform(numericTest)
prediksiFinal = prediksiMentah.select(
    "MeaningfulWords", "prediction", "trueLabel")
prediksiFinal.show(n=4, truncate=False)
prediksiBenar = prediksiFinal.filter(
    prediksiFinal['prediction'] == prediksiFinal['trueLabel']).count()
totalData = prediksiFinal.count()
# An empty testing set would otherwise raise ZeroDivisionError; report NaN
# for the accuracy in that case instead of crashing the cell.
akurasi = prediksiBenar / totalData if totalData else float("nan")
print("prediksi benar: ", prediksiBenar, ", total data: ", 
      totalData, ", akurasi: ", akurasi)

+-------------------------------------+----------+---------+
|MeaningfulWords                      |prediction|trueLabel|
+-------------------------------------+----------+---------+
|[adore, cheese, #favorite]           |1.0       |1        |
|[adore, cheese, #loveit]             |1.0       |1        |
|[adore, classical, music, #loveit]   |1.0       |1        |
|[adore, classical, music, #thumbs-up]|1.0       |1        |
+-------------------------------------+----------+---------+
only showing top 4 rows

prediksi benar:  591 , total data:  597 , akurasi:  0.9899497487437185
