# Read Me
- Pada case ini saya menggunakan pyspark.

## 0. Inisiasi Spark

In [1]:
#Inisiasi spark
import findspark
findspark.init()

In [20]:
#Import Library
import pyspark as ps
from pyspark.sql import SparkSession

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [3]:
#Testing spark
spark = SparkSession.builder.getOrCreate()
df = spark.sql("select 'spark' as hello ")
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



- Karena spark sudah jalan, maka sudah bisa dilanjutkan ke tahapan selanjutnya

## 1. Load Datasets
- Memuat dataset yang akan digunakan.

In [4]:
df = spark.read.csv('clean_tweet.csv', header=True)

In [5]:
#Melihat 5 data teratas
df.show(5)

+--------------------+------+
|                text|target|
+--------------------+------+
|awww that s a bum...|     0|
|is upset that he ...|     0|
|i dived many time...|     0|
|my whole body fee...|     0|
|no it s not behav...|     0|
+--------------------+------+
only showing top 5 rows



## 2. Handling Missing Values
- Crosscheck apabila ada missing values pada data.

In [6]:
#Total data di awal
df.count()

1600000

In [7]:
#Drop missing values
df = df.dropna()

In [8]:
#Karena jumlahnya berkurang, maka ada missing values pada data.
df.count()

1596753

## 3. Modelling

### 3.1.1 Menggunakan TF, IDF dan Logistic Regression

In [9]:
#Split dataset ke dalam training dan testing dengan proporsi 80 % untuk training dan 20% untuk testing
(train_set, test_set) = df.randomSplit([0.8, 0.2])

In [10]:
#Membuat Pipeline Staging untuk Model
tokenization = Tokenizer(inputCol="text", outputCol="words")
tf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)
label_string = StringIndexer(inputCol = "target", outputCol = "label")
pipeline = Pipeline(stages=[tokenization, tf, idf, label_string])

In [11]:
#Melakukan training model
pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
test_df = pipelineFit.transform(test_set)

In [12]:
#Model pertama menggunakan Logistic Regression
log_reg = LogisticRegression(maxIter=100)
log_reg_Model = log_reg.fit(train_df)
predictions = log_reg_Model.transform(test_df)

In [13]:
#Melakukan Evalusi
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test_set.count())
roc_auc = evaluator.evaluate(predictions)

print ("Accuracy Score: {0:.4f}".format(accuracy))
print ("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7878
ROC-AUC: 0.8579


- Dari model kombinasi menggunakan TF, IDF, dan Logistic Regression didapatkan AUC Score sebesar 85%.

### 3.1.2 Menggunakan CountVectorizer, IDF, dan Logistic Regression

In [14]:
#Membuat Pipeline untuk Model
tokenizer = Tokenizer(inputCol="text", outputCol="words")
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5)
label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx, lr])

#Melakukan Training Model
pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(test_set)

#Evaluasi Model
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test_set.count())
roc_auc = evaluator.evaluate(predictions)

print ("Accuracy Score: {0:.4f}".format(accuracy))
print ("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7940
ROC-AUC: 0.8639


### 3.2.1 Menggunakan TF, IDF, Naive Bayes

In [15]:
#Membuat Pipeline untuk Model
tokenization = Tokenizer(inputCol="text", outputCol="words")
tf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)
label_string = StringIndexer(inputCol = "target", outputCol = "label")
pipeline = Pipeline(stages=[tokenization, tf, idf, label_string])

In [16]:
#Melakukan training model
pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
test_df = pipelineFit.transform(test_set)

In [17]:
#Model Kedua menggunakan Naive Bayes
naive = NaiveBayes()
naive_model = naive.fit(train_df)
predictions = naive_model.transform(test_df)

In [18]:
#Melakukan Evalusi
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test_set.count())
roc_auc = evaluator.evaluate(predictions)

print ("Accuracy Score: {0:.4f}".format(accuracy))
print ("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7629
ROC-AUC: 0.4874


### 3.2.2 Menggunakan CountVectorizer, IDF, Naive Bayes

In [19]:
#Membuat Pipeline untuk Model
tokenizer = Tokenizer(inputCol="text", outputCol="words")
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5)
label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")
naive = NaiveBayes()
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx, naive])

#Melakukan Training Model
pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(test_set)

#Evaluasi Model
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test_set.count())
roc_auc = evaluator.evaluate(predictions)

print ("Accuracy Score: {0:.4f}".format(accuracy))
print ("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7698
ROC-AUC: 0.4850


### 3.3.1 Menggunakan TF, IDF dan Linear SVC

In [22]:
#Membuat Pipeline Staging untuk Model
tokenization = Tokenizer(inputCol="text", outputCol="words")
tf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)
label_string = StringIndexer(inputCol = "target", outputCol = "label")
pipeline = Pipeline(stages=[tokenization, tf, idf, label_string])

#Melakukan training model
pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
test_df = pipelineFit.transform(test_set)

#Model pertama menggunakan Logistic Regression
linear_svc = LinearSVC()
linear_svc_model = linear_svc.fit(train_df)
predictions = linear_svc_model.transform(test_df)

#Melakukan Evalusi
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test_set.count())
roc_auc = evaluator.evaluate(predictions)

print ("Accuracy Score: {0:.4f}".format(accuracy))
print ("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7895
ROC-AUC: 0.8597


### 3.3.2 Menggunakan CountVectorizer, IDF, dan Linear SVC

In [24]:
#Membuat Pipeline untuk Model
tokenizer = Tokenizer(inputCol="text", outputCol="words")
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5)
label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")
linear_svc = LinearSVC()
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx, linear_svc])

#Melakukan Training Model
pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(test_set)

#Evaluasi Model
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test_set.count())
roc_auc = evaluator.evaluate(predictions)

print ("Accuracy Score: {0:.4f}".format(accuracy))
print ("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7957
ROC-AUC: 0.8658


## 4. Kesimpulan

***
- Setelah melakukan analisis sentiment, dapat ditarik beberapa kesimpulan:
    - Dari keempat model yang telah dijalankan, maka dapat ditarik kesimpulan bahwa model yang menggunakan CountVectorizer, IDF, dan Linear SVC memberikan nilai AUC Score yang paling tinggi yaitu 86.58%.
    - Model ini scorenya sangat beda tipis dengan nilai score Logistic Regression yaitu 86.39.
***