Vivekn Sentiment analysis computes Vivek Nayamanan algorithm from a set of positive and negative corpus

We call necessary imports

In [1]:
#Imports
import time
import sys
import os
sys.path.append('../../')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel

We load SparkSession if not already there

In [2]:
spark = SparkSession.builder \
    .appName("ner")\
    .master("local[1]")\
    .config("spark.driver.memory","8G")\
    .config("spark.driver.maxResultSize", "2G")\
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.6.2")\
    .config("spark.kryoserializer.buffer.max", "500m")\
    .getOrCreate()

In [3]:
from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher

We load a spark dataset and put it in memory

In [4]:
#Load the input data to be annotated
data = spark. \
        read. \
        parquet("file:///" + os.getcwd() + "../../../data/sentiment.parquet"). \
        limit(1000)
data.cache()
data.count()
data.show()

+------+---------+--------------------+
|itemid|sentiment|                text|
+------+---------+--------------------+
|     1|        0|                 ...|
|     2|        0|                 ...|
|     3|        1|              omg...|
|     4|        0|          .. Omga...|
|     5|        0|         i think ...|
|     6|        0|         or i jus...|
|     7|        1|       Juuuuuuuuu...|
|     8|        0|       Sunny Agai...|
|     9|        1|      handed in m...|
|    10|        1|      hmmmm.... i...|
|    11|        0|      I must thin...|
|    12|        1|      thanks to a...|
|    13|        0|      this weeken...|
|    14|        0|     jb isnt show...|
|    15|        0|     ok thats it ...|
|    16|        0|    &lt;-------- ...|
|    17|        0|    awhhe man.......|
|    18|        1|    Feeling stran...|
|    19|        0|    HUGE roll of ...|
|    20|        0|    I just cut my...|
+------+---------+--------------------+
only showing top 20 rows



We creat the document assemblerr, which will put target text column into Annotation form

In [5]:
### Define the dataframe
document_assembler = DocumentAssembler() \
            .setInputCol("text")
        
### Transform input to appropriate schema
#assembled = document_assembler.transform(data)

The sentence detector will parse sub sentences in every line

In [6]:
### Sentence detector
sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")
#sentence_data = sentence_detector.transform(checked)

The tokenizer will match standard tokens

In [7]:
### Tokenizer
tokenizer = Tokenizer() \
            .setInputCols(["sentence"]) \
            .setOutputCol("token")
#tokenized = tokenizer.transform(assembled)

Normalizer will clean out the tokens

In [8]:
normalizer = Normalizer() \
            .setInputCols(["token"]) \
            .setOutputCol("normal")

The spell checker will correct normalized tokens, this trains with a dictionary of english words

In [9]:
### Spell Checker
spell_checker = NorvigSweetingApproach() \
            .setInputCols(["normal"]) \
            .setOutputCol("spell") \
            .setDictionary("file:///" + os.getcwd() + "/../../data/spell/words.txt")

#checked = spell_checker.fit(tokenized).transform(tokenized)

We creat the ViveknSentimentApproach and set resources to train it

In [10]:
sentiment_detector = ViveknSentimentApproach() \
    .setInputCols(["spell", "sentence"]) \
    .setOutputCol("sentiment") \
    .setPruneCorpus(0) \
    .setPositiveSource("file:///" + os.getcwd() + "/../../data/vivekn/positive") \
    .setNegativeSource("file:///" + os.getcwd() + "/../../data/vivekn/negative") \


The finisher will utilize sentiment analysis output

In [11]:
finisher = Finisher() \
    .setInputCols(["sentiment"]) \
    .setIncludeKeys(True)

Dummy training text

In [12]:
training = spark.sparkContext.parallelize([["hello world"], ["this is some more text"], ["and here another sentence"]]).toDF().toDF("text")
training.show()

+--------------------+
|                text|
+--------------------+
|         hello world|
|this is some more...|
|and here another ...|
+--------------------+



We fit and predict over data

In [13]:
pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    normalizer,
    spell_checker,
    sentiment_detector,
    finisher
])

start = time.time()
sentiment_data = pipeline.fit(training).transform(data)
sentiment_data.show()
end = time.time()
print("Time elapsed pipeline process: " + str(end - start))

+------+--------------------+--------------------+
|itemid|                text|  finished_sentiment|
+------+--------------------+--------------------+
|     1|                 ...|          result->na|
|     2|                 ...|          result->na|
|     3|              omg...|          result->na|
|     4|          .. Omga...|result->na@result...|
|     5|         i think ...|result->na@result...|
|     6|         or i jus...|          result->na|
|     7|       Juuuuuuuuu...|          result->na|
|     8|       Sunny Agai...|          result->na|
|     9|      handed in m...|result->na@result...|
|    10|      hmmmm.... i...|result->na@result...|
|    11|      I must thin...|          result->na|
|    12|      thanks to a...|          result->na|
|    13|      this weeken...|          result->na|
|    14|     jb isnt show...|          result->na|
|    15|     ok thats it ...|          result->na|
|    16|    &lt;-------- ...|result->na@result...|
|    17|    awhhe man.......|re

We can take a sample back into the driver

In [14]:
for r in sentiment_data.take(5):
    print(r)

Row(itemid=1, text='                     is so sad for my APL friend.............', finished_sentiment='result->na')
Row(itemid=2, text='                   I missed the New Moon trailer...', finished_sentiment='result->na')
Row(itemid=3, text='              omg its already 7:30 :O', finished_sentiment='result->na')
Row(itemid=4, text="          .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...", finished_sentiment='result->na@result->na@result->na@result->negative')
Row(itemid=5, text='         i think mi bf is cheating on me!!!       T_T', finished_sentiment='result->na@result->na')


We save it to disk and re read it. Either after or before fitting the model

In [15]:
start = time.time()
pipeline.write().overwrite().save("./ps")
pipeline.fit(data).write().overwrite().save("./ms")
end = time.time()
print("Time elapsed in write pipelines: " + str(end - start))

Time elapsed in write pipelines: 34.681615352630615


In [16]:
start = time.time()
p = Pipeline.read().load("./ps")
pm = PipelineModel.read().load("./ms")
end = time.time()
print("Time elapsed in read pipelines: " + str(end - start))

Time elapsed in read pipelines: 23.841302633285522


In [17]:
start = time.time()
pm.transform(data).where("finished_sentiment not like '%negative%'").show()
print(pm.transform(data).count())
end = time.time()
print("Time elapsed in using loaded pipelines: " + str(end - start))

+------+--------------------+--------------------+
|itemid|                text|  finished_sentiment|
+------+--------------------+--------------------+
|     1|                 ...|          result->na|
|     2|                 ...|          result->na|
|     3|              omg...|          result->na|
|     5|         i think ...|result->na@result...|
|     6|         or i jus...|          result->na|
|     7|       Juuuuuuuuu...|          result->na|
|     8|       Sunny Agai...|          result->na|
|     9|      handed in m...|result->na@result...|
|    10|      hmmmm.... i...|result->na@result...|
|    11|      I must thin...|          result->na|
|    12|      thanks to a...|          result->na|
|    13|      this weeken...|          result->na|
|    14|     jb isnt show...|          result->na|
|    15|     ok thats it ...|          result->na|
|    16|    &lt;-------- ...|result->na@result...|
|    17|    awhhe man.......|result->na@result...|
|    18|    Feeling stran...|re