# 1 manual process for analyzing data.

- ### Import modules and create spark session

In [None]:
pip install numpy 

In [3]:
#import modules
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover
from pyspark.sql import SparkSession
#create Spark session
appName = "Sentiment Analysis"
spark = SparkSession \
    .builder \
    .master("local[*]")\
    .appName(appName) \
    .getOrCreate()

- ### Read data file into Spark dataFrame

In [64]:
tweets = spark.read.csv('dataset/tweets.csv', inferSchema=True, header=True)
tweets.show(truncate=False, n=3)

+------+---------+---------------+---------------------------------+
|ItemID|Sentiment|SentimentSource|SentimentText                    |
+------+---------+---------------+---------------------------------+
|1038  |1        |Sentiment140   |that film is fantastic #brilliant|
|1804  |1        |Sentiment140   |this music is really bad #myband |
|1693  |0        |Sentiment140   |winter is terrible #thumbs-down  |
+------+---------+---------------+---------------------------------+
only showing top 3 rows



- ### Select the related data

In [5]:
tweets.select("SentimentText", col('Sentiment').cast("Int").alias("label")).show(truncate=False,n=3)

+---------------------------------+-----+
|SentimentText                    |label|
+---------------------------------+-----+
|that film is fantastic #brilliant|1    |
|this music is really bad #myband |1    |
|winter is terrible #thumbs-down  |0    |
+---------------------------------+-----+
only showing top 3 rows



In [6]:

data = tweets.select("SentimentText", col("Sentiment").cast("Int").alias("label"))
data.show(truncate = False,n=5)

+---------------------------------+-----+
|SentimentText                    |label|
+---------------------------------+-----+
|that film is fantastic #brilliant|1    |
|this music is really bad #myband |1    |
|winter is terrible #thumbs-down  |0    |
|this game is awful #nightmare    |0    |
|I love jam #loveit               |1    |
+---------------------------------+-----+
only showing top 5 rows



- ### Split the data into training and testing sets

In [21]:
#divide data, 70% for training, 30% for testing
dividedData = data.randomSplit([0.7, 0.3]) 
trainingData = dividedData[0] 
testingData = dividedData[1] 
train_rows = trainingData.count()
test_rows = testingData.count()
print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)

Training data rows: 1344 ; Testing data rows: 588


- ### Prepare training data

###### Use a tokenizer to separate the SentimentText into individual words.

In [18]:
tokenizer = Tokenizer(inputCol="SentimentText", outputCol="SentimentWords")
tokenizedTrain = tokenizer.transform(trainingData)
tokenizedTrain.show(truncate=False, n=5)

+-------------------------+-----+------------------------------+
|SentimentText            |label|SentimentWords                |
+-------------------------+-----+------------------------------+
|I adore cheese #brilliant|1    |[i, adore, cheese, #brilliant]|
|I adore cheese #favorite |1    |[i, adore, cheese, #favorite] |
|I adore cheese #loveit   |1    |[i, adore, cheese, #loveit]   |
|I adore cheese #thumbs-up|1    |[i, adore, cheese, #thumbs-up]|
|I adore cheese #toptastic|1    |[i, adore, cheese, #toptastic]|
+-------------------------+-----+------------------------------+
only showing top 5 rows



In [21]:
tokenizer.getOutputCol()

'SentimentWords'

###### Remove stop words (unimportant words that will not be used as features).


In [22]:
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), 
                       outputCol="MeaningfulWords")
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show(truncate=False, n=5)

+-------------------------+-----+------------------------------+---------------------------+
|SentimentText            |label|SentimentWords                |MeaningfulWords            |
+-------------------------+-----+------------------------------+---------------------------+
|I adore cheese #brilliant|1    |[i, adore, cheese, #brilliant]|[adore, cheese, #brilliant]|
|I adore cheese #favorite |1    |[i, adore, cheese, #favorite] |[adore, cheese, #favorite] |
|I adore cheese #loveit   |1    |[i, adore, cheese, #loveit]   |[adore, cheese, #loveit]   |
|I adore cheese #thumbs-up|1    |[i, adore, cheese, #thumbs-up]|[adore, cheese, #thumbs-up]|
|I adore cheese #toptastic|1    |[i, adore, cheese, #toptastic]|[adore, cheese, #toptastic]|
+-------------------------+-----+------------------------------+---------------------------+
only showing top 5 rows



- ### Convert the words that will be used as features into numerical values. 
-  In Spark 2.2.1, this is implemented using the HashingTF function 
-  Austin Appleby's MurmurHash 3 algorithm.

In [27]:
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
hashTF.transform(SwRemovedTrain).columns

['SentimentText', 'label', 'SentimentWords', 'MeaningfulWords', 'features']

In [28]:
numericTrainData = hashTF.transform(SwRemovedTrain).select(
            'label', 'MeaningfulWords', 'features')
numericTrainData.show(truncate=False, n=3)

+-----+---------------------------+-------------------------------------------+
|label|MeaningfulWords            |features                                   |
+-----+---------------------------+-------------------------------------------+
|1    |[adore, cheese, #brilliant]|(262144,[1689,45361,100089],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #favorite] |(262144,[1689,100089,108624],[1.0,1.0,1.0])|
|1    |[adore, cheese, #loveit]   |(262144,[1689,100089,254974],[1.0,1.0,1.0])|
+-----+---------------------------+-------------------------------------------+
only showing top 3 rows



In [49]:
#import modules
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LinearSVC

- ### Use the training data to train our classifier model.

In [61]:
lr = LogisticRegression(labelCol="label", featuresCol="features", 
                        maxIter=10, regParam=0.01)
svc = LinearSVC(featuresCol='features',labelCol='label')

model = lr.fit(numericTrainData)
model1 = svc.fit(numericTrainData) 
print ("Training is done!")

Training is done!


- ### Prepare the testing data for use in evaluating the model.

In [62]:
tokenizedTest = tokenizer.transform(testingData)
SwRemovedTest = swr.transform(tokenizedTest)
numericTest = hashTF.transform(SwRemovedTest).select(
    'Label', 'MeaningfulWords', 'features')
numericTest.show(truncate=False, n=2)


+-----+------------------------------------+-------------------------------------------------------+
|Label|MeaningfulWords                     |features                                               |
+-----+------------------------------------+-------------------------------------------------------+
|1    |[adore, cheese, #bestever]          |(262144,[1689,91011,100089],[1.0,1.0,1.0])             |
|1    |[adore, classical, music, #bestever]|(262144,[91011,100089,102383,131250],[1.0,1.0,1.0,1.0])|
+-----+------------------------------------+-------------------------------------------------------+
only showing top 2 rows



- ### Predict testing data and calculate the accuracy model

In [63]:
prediction = model.transform(numericTest)
prediction1 = model1.transform(numericTest)

In [64]:
predictionFinal = prediction.select(
    "MeaningfulWords", "prediction", "Label")
predictionFinal1 = prediction1.select(
    "MeaningfulWords", "prediction", "Label")

In [65]:
predictionFinal1.show(3)

+--------------------+----------+-----+
|     MeaningfulWords|prediction|Label|
+--------------------+----------+-----+
|[adore, cheese, #...|       1.0|    1|
|[adore, classical...|       1.0|    1|
|[adore, classical...|       1.0|    1|
+--------------------+----------+-----+
only showing top 3 rows



22/12/10 17:18:14 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB


In [66]:
correctPrediction = predictionFinal.filter(
    predictionFinal['prediction'] == predictionFinal['Label']).count()
correctPrediction1 = predictionFinal1.filter(
    predictionFinal1['prediction'] == predictionFinal1['Label']).count()
totalData = predictionFinal.count()

22/12/10 17:18:15 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB


In [67]:
print("correct prediction:", correctPrediction, ", total data:", totalData, 
      ", accuracy:", correctPrediction/totalData)
print("correct prediction1:", correctPrediction1, ", total data:", totalData, 
      ", accuracy:", correctPrediction1/totalData)

correct prediction: 543 , total data: 554 , accuracy: 0.98014440433213
correct prediction1: 543 , total data: 554 , accuracy: 0.98014440433213


# 2 Create pipeline to automate all stages of the process.

In [58]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, VectorAssembler
from pyspark.ml.classification import LinearSVC

tweets = spark.read.csv('dataset/tweets.csv', inferSchema=True, header=True)
data = tweets.select("SentimentText", col("Sentiment").cast("Int").alias("label"))

dividedData = data.randomSplit([0.7, 0.3]) 
trainingData = dividedData[0] 
testingData = dividedData[1] 

In [59]:
# Import libraries
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression, LinearSVC
from pyspark.ml import Pipeline

# Create a tokenizer to split the SentimentText into individual words
tokenizer = Tokenizer(inputCol='SentimentText', outputCol='words')

# Create a stop words remover
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='filtered_words')

# Create a CountVectorizer or HashingTF to convert the words into numerical values
cv = CountVectorizer(inputCol=remover.getOutputCol(), outputCol='features')

# Use a feature selector to select the relevant fields
selector = VectorAssembler(inputCols=['features'], outputCol='selected_features')

# Create a classification model
classifier = LinearSVC(featuresCol=selector.getOutputCol(),labelCol='label' )

# Put everything in a pipeline
pipeline = Pipeline(stages=[tokenizer, remover, cv, selector, classifier])

# Fit the model to the data
model = pipeline.fit(trainingData)


In [60]:
predictions1 = model.transform(testingData)
predictions.show(2)

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|       SentimentText|label|      SentimentWords|     MeaningfulWords|            features|       rawPrediction|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|I adore cheese #b...|    1|[i, adore, cheese...|[adore, cheese, #...|(262144,[1689,910...|[-1.0898936908271...|       1.0|
|I adore cheese #b...|    1|[i, adore, cheese...|[adore, cheese, #...|(262144,[1689,453...|[-1.0899028757255...|       1.0|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 2 rows



22/12/11 08:26:14 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


In [61]:
predictionFinal =  predictions.select(
    "MeaningfulWords", "prediction", "Label")
# accuracy
predictionFinal.show(2)

+--------------------+----------+-----+
|     MeaningfulWords|prediction|Label|
+--------------------+----------+-----+
|[adore, cheese, #...|       1.0|    1|
|[adore, cheese, #...|       1.0|    1|
+--------------------+----------+-----+
only showing top 2 rows



22/12/11 08:26:27 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB


In [62]:
correctPrediction = predictionFinal.filter(
    predictionFinal['prediction'] == predictionFinal['Label']).count()
totalData = predictionFinal.count()
print( f"Accuracy {correctPrediction/totalData}")

22/12/11 08:26:31 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB


Accuracy 0.9735099337748344


In [63]:
model.save("SentimentAnalysis")