Necessary imports

In [1]:
# Imports
import sys
# Add the directory two levels above the working directory to the module search
# path before the remaining imports run.
# NOTE(review): presumably this exposes a local `sparknlp` checkout — confirm.
sys.path.append('../../')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

Start Spark

In [2]:
# Start (or reuse) a Spark session that runs locally on a single thread.
spark = (
    SparkSession.builder
    .appName("sent")
    .master("local[1]")
    .config("spark.driver.memory", "8G")
    # The key was previously misspelled ("maxResultS1ize"); Spark ignores unknown
    # keys, so the 2G cap on results collected to the driver was never applied.
    .config("spark.driver.maxResultSize", "2G")
    # Resolve the Spark NLP package (version 1.6.2) when the session starts.
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.6.2")
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)

In [3]:
from sparknlp.annotator import *
from sparknlp.common import RegexRule
from sparknlp.base import DocumentAssembler, Finisher

Create a Spark dataset

In [4]:
# Load the sentiment corpus from parquet, keeping at most 10,000 rows.
data = (
    spark.read
    .parquet("../../data/sentiment.parquet")
    .limit(10000)
)
# Mark the frame for caching; the count below is the first action run on it,
# and its value is displayed as the cell output.
data.cache()
data.count()

10000

Create the appropriate annotators. We detect sentences, tokenize them, and find the lemmas of those tokens.
The Finisher will output only the sentiment.

In [5]:
# Document assembler: reads the raw "text" column. No output column is set here,
# so the annotator's default name is used.
# NOTE(review): presumably the default is "document", which the next stage reads — confirm.
document_assembler = DocumentAssembler().setInputCol("text")

# Sentence detector: "document" -> "sentence".
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: "sentence" -> "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Lemmatizer: "token" -> "lemma", driven by an external dictionary file that
# uses "->" as the key delimiter and a tab as the value delimiter.
lemmatizer = (
    Lemmatizer()
    .setInputCols(["token"])
    .setOutputCol("lemma")
    .setDictionary(
        "../../data/lemma-corpus-small/lemmas_small.txt",
        key_delimiter="->",
        value_delimiter="\t",
    )
)

# Sentiment detector: consumes lemmas and sentences, scoring them against a
# comma-delimited sentiment dictionary.
sentiment_detector = (
    SentimentDetector()
    .setInputCols(["lemma", "sentence"])
    .setOutputCol("sentiment_score")
    .setDictionary("../../data/sentiment-corpus/default-sentiment-dict.txt", ",")
)

# Finisher: takes the "sentiment_score" annotations and exposes them as the
# "sentiment" column.
finisher = (
    Finisher()
    .setInputCols(["sentiment_score"])
    .setOutputCols(["sentiment"])
)

Train the pipeline. It is trained only from external resources (the lemma and sentiment dictionaries), not from the dataset we pass in.
The prediction then runs on the target dataset.

In [6]:
# Chain the annotators in dependency order: each stage reads columns produced
# by an earlier one.
pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        lemmatizer,
        sentiment_detector,
        finisher,
    ]
)
# Fit on the dataset, then apply the fitted model to that same dataset.
model = pipeline.fit(data)
result = model.transform(data)

We filter the finisher output to find the lines whose sentiment is not positive (i.e. the negative ones)

In [7]:
# Keep only the rows whose sentiment is anything other than 'positive' and
# print the first rows of that selection.
result.where("sentiment != 'positive'").show()

+------+---------+--------------------+
|itemid|sentiment|                text|
+------+---------+--------------------+
|   102| negative|   not a cool night.|
|   135| negative| #squarespace bri...|
|   145| negative| &quot;I want you...|
|   190| negative| @copicmarker&quo...|
|   362| negative|            bad day.|
|   363| negative|   bad day.....da...|
|   456| negative| - My name is Amy...|
|   503| negative|  Another expensi...|
|   526| negative|  jus got some ba...|
|   588| negative| at Chris Brown ....|
|   612| negative| bad day 2day. Br...|
|   613| negative| bad night for th...|
|   632| negative|  Things are not ...|
|   634| negative|  To The Death! A...|
|   677| negative| .. why the jonas...|
|   684| negative| .........not rea...|
|   699| negative|- @alittlebit - t...|
|   716| negative| ... ...I don't w...|
|   772| negative|- @littlecharva -...|
|   816| negative| Carrie has a bad...|
+------+---------+--------------------+
only showing top 20 rows

