In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import sparknlp
sparknlp.start()

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

import pyspark.sql.functions as F
from pyspark.sql.types import StringType

In [4]:
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
#create or get Spark Session
#spark = sparknlp.start()
sparknlp.version()


'2.5.3'

In [5]:
# Import Spark NLP 
from sparknlp.base import *
from sparknlp.annotator import *
#from sparknlp.embeddings import *


In [6]:
spark_df = spark.read.csv('Sentiment.csv',header=True)

In [7]:
spark_df.show(truncate=10)

+----------+----------+--------------------+-----------+----------------------+----------+--------------------+--------------+-------------------------+--------------+----------+----------------+-------------+--------------+-------------------+----------+-----------+-------------+----------+--------------+-------------+
|        id| candidate|candidate_confidence|relevant_yn|relevant_yn_confidence| sentiment|sentiment_confidence|subject_matter|subject_matter_confidence|candidate_gold|      name|relevant_yn_gold|retweet_count|sentiment_gold|subject_matter_gold|      text|tweet_coord|tweet_created|  tweet_id|tweet_location|user_timezone|
+----------+----------+--------------------+-----------+----------------------+----------+--------------------+--------------+-------------------------+--------------+----------+----------------+-------------+--------------+-------------------+----------+-----------+-------------+----------+--------------+-------------+
|         1|No cand...|           

In [8]:
spark_df = spark_df.select('text','sentiment').filter("text is not null")

In [9]:
spark_df.columns

['text', 'sentiment']

In [10]:
spark_df = spark_df.withColumn('sentiment', F.lower(F.col('sentiment')))

In [11]:
spark_df.columns

['text', 'sentiment']

In [12]:
spark_df = spark_df.select(F.regexp_replace('text',"\w+:\/{2}[\d\w\-]+(\.[\d\w\-]+)*(?:(?:\/[^\s/]*))*","").alias('text'),'sentiment').filter("text is not null")\
                .select(F.regexp_replace('text',"http.{1,10}…","").alias('text'),'sentiment').select(F.regexp_replace('text',"…","").alias('text'),'sentiment').filter("text is not null")

In [13]:
spark_df.show(300,truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|text                                                                                                                                                   |sentiment|
+-------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|RT @NancyLeeGrahn: How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate                                             |neutral  |
|RT @ScottWalker: Didn't catch the full #GOPdebate last night. Here are some of Scott's best lines in 90 seconds. #Walker16                             |positive |
|RT @TJMShow: No mention of Tamir Rice and the #GOPDebate was held in Cleveland? Wow.                                                                   |neutral  |
|RT @RobGeorge: 

In [14]:
#spark_df = spark_df.select(F.regexp_replace('text',"RT\s.*\:\s","").alias('text'),'sentiment').filter("text is not null")

In [15]:
spark_df.show(300,truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|text                                                                                                                                                   |sentiment|
+-------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|RT @NancyLeeGrahn: How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate                                             |neutral  |
|RT @ScottWalker: Didn't catch the full #GOPdebate last night. Here are some of Scott's best lines in 90 seconds. #Walker16                             |positive |
|RT @TJMShow: No mention of Tamir Rice and the #GOPDebate was held in Cleveland? Wow.                                                                   |neutral  |
|RT @RobGeorge: 

In [16]:
spark_df = spark_df.withColumn('text',F.trim(F.col('text'))).filter(spark_df["text"] != '' )

In [17]:
MODEL_NAME = 'sentimentdl_use_twitter'

In [18]:
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
    
use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")


sentimentdl = SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("pr_sentiment")

nlpPipeline = Pipeline(
      stages = [
          documentAssembler,
          use,
          sentimentdl
      ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.9 MB
[OK!]


In [19]:
match_df = pipelineModel.transform(spark_df)


In [20]:
#match_df = match_df.withColumnRenamed('sentiment', 'pr_sentiment')

In [21]:
match_df.show(300)

+--------------------+---------+--------------------+--------------------+--------------------+
|                text|sentiment|            document| sentence_embeddings|        pr_sentiment|
+--------------------+---------+--------------------+--------------------+--------------------+
|RT @NancyLeeGrahn...|  neutral|[[document, 0, 10...|[[sentence_embedd...|[[category, 0, 10...|
|RT @ScottWalker: ...| positive|[[document, 0, 12...|[[sentence_embedd...|[[category, 0, 12...|
|RT @TJMShow: No m...|  neutral|[[document, 0, 83...|[[sentence_embedd...|[[category, 0, 83...|
|RT @RobGeorge: Th...| positive|[[document, 0, 13...|[[sentence_embedd...|[[category, 0, 13...|
|RT @DanScavino: #...| positive|[[document, 0, 12...|[[sentence_embedd...|[[category, 0, 12...|
|"RT @GregAbbott_T...| positive|[[document, 0, 13...|[[sentence_embedd...|[[category, 0, 13...|
|RT @warriorwoman9...| negative|[[document, 0, 12...|[[sentence_embedd...|[[category, 0, 12...|
|Going on #MSNBC L...|  neutral|[[docume

In [22]:
match_df = match_df.select(F.col("document.result").alias('text'),\
                F.col('sentiment'),\
                F.explode('pr_sentiment.result').alias('pr_sentiment'))

In [23]:
match_df.columns

['text', 'sentiment', 'pr_sentiment']

In [24]:
match_df = match_df.select(F.explode("text").alias('text'),\
                'sentiment',\
                'pr_sentiment').filter("sentiment is not null")

In [25]:
match_df.show(300,truncate=150)

+------------------------------------------------------------------------------------------------------------------------------------------------------+---------+------------+
|                                                                                                                                                  text|sentiment|pr_sentiment|
+------------------------------------------------------------------------------------------------------------------------------------------------------+---------+------------+
|                                            RT @NancyLeeGrahn: How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate|  neutral|    positive|
|                            RT @ScottWalker: Didn't catch the full #GOPdebate last night. Here are some of Scott's best lines in 90 seconds. #Walker16| positive|    positive|
|                                                                  RT @TJMShow: No mention of Tamir Rice and the #GOPDeb

In [26]:
#match_df.filter("text is not null").filter(match_df["text"]!='').select('text').take(300)

In [27]:
match_df.filter(F.col("sentiment")==F.col("pr_sentiment")).count()

5959

In [28]:
match_df.filter(match_df["pr_sentiment"]!=match_df["sentiment"]).count()

7904

In [29]:
5959/(5959+7904)

0.42984923898146143

### Second Dataset

In [100]:
spark_df1 = spark.read.csv('Tweets.csv',header=True)
spark_df1 = spark_df1.select('text','airline_sentiment').filter("text is not null")

In [101]:
spark_df1 = spark_df1.withColumnRenamed('airline_sentiment', 'sentiment')

In [103]:
spark_df1 =  spark_df1.select(F.regexp_replace('text',"\w+:\/{2}[\d\w\-]+(\.[\d\w\-]+)*(?:(?:\/[^\s/]*))*","").alias('text'),'sentiment').filter("text is not null")\
                .select(F.regexp_replace('text',"http.{1,10}…","").alias('text'),'sentiment').select(F.regexp_replace('text',"…","").alias('text'),'sentiment')

In [104]:
spark_df1.show(10,truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------+---------+
|text                                                                                                                              |sentiment|
+----------------------------------------------------------------------------------------------------------------------------------+---------+
|@VirginAmerica What @dhepburn said.                                                                                               |neutral  |
|@VirginAmerica plus you've added commercials to the experience... tacky.                                                          |positive |
|@VirginAmerica I didn't today... Must mean I need to take another trip!                                                           |neutral  |
|"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse"|negative |

In [106]:
spark_df1 = spark_df1.select(F.regexp_replace('text',"RT\s.*\:\s","").alias('text'),'sentiment').filter("text is not null")
                


In [107]:
spark_df1.show()

+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|@VirginAmerica Wh...|  neutral|
|@VirginAmerica pl...| positive|
|@VirginAmerica I ...|  neutral|
|"@VirginAmerica i...| negative|
|@VirginAmerica an...| negative|
|@VirginAmerica se...| negative|
|@VirginAmerica ye...| positive|
|@VirginAmerica Re...|  neutral|
|@virginamerica We...| positive|
|@VirginAmerica it...| positive|
|@VirginAmerica di...|  neutral|
|@VirginAmerica I ...| positive|
|@VirginAmerica Th...| positive|
|@VirginAmerica @v...| positive|
|@VirginAmerica Th...| positive|
|@VirginAmerica SF...| negative|
|@VirginAmerica So...| positive|
|@VirginAmerica  I...| negative|
|I ❤️ flying @Virg...| positive|
|@VirginAmerica yo...| positive|
+--------------------+---------+
only showing top 20 rows



In [108]:
MODEL_NAME='sentimentdl_use_twitter'

In [109]:
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")



use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")


sentimentdl = SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("pr_sentiment")

nlpPipeline = Pipeline(
      stages = [
          documentAssembler,
          use,
          sentimentdl
      ])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.9 MB
[OK!]


In [110]:
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)



In [111]:
#df = spark.createDataFrame(pd.DataFrame({"text":text_list}))
result3 = pipelineModel.transform(spark_df1)

In [112]:
result3=result3.withColumnRenamed('airline_sentiment','sentiment')

In [113]:
result3.show()

+--------------------+---------+--------------------+--------------------+--------------------+
|                text|sentiment|            document| sentence_embeddings|        pr_sentiment|
+--------------------+---------+--------------------+--------------------+--------------------+
|@VirginAmerica Wh...|  neutral|[[document, 0, 34...|[[sentence_embedd...|[[category, 0, 34...|
|@VirginAmerica pl...| positive|[[document, 0, 71...|[[sentence_embedd...|[[category, 0, 71...|
|@VirginAmerica I ...|  neutral|[[document, 0, 70...|[[sentence_embedd...|[[category, 0, 70...|
|"@VirginAmerica i...| negative|[[document, 0, 12...|[[sentence_embedd...|[[category, 0, 12...|
|@VirginAmerica an...| negative|[[document, 0, 54...|[[sentence_embedd...|[[category, 0, 54...|
|@VirginAmerica se...| negative|[[document, 0, 87...|[[sentence_embedd...|[[category, 0, 87...|
|@VirginAmerica ye...| positive|[[document, 0, 78...|[[sentence_embedd...|[[category, 0, 78...|
|@VirginAmerica Re...|  neutral|[[docume

In [114]:
result4=result3.select(F.explode(F.arrays_zip('document.result', 'pr_sentiment.result')).alias("cols"),'sentiment').filter("pr_sentiment is not null").filter("document.result is not null").filter("pr_sentiment.result is not null") \
.select(F.expr("cols['0']").alias("document"),
        F.expr("cols['1']").alias("pr_sentiment"),'sentiment')

In [115]:
result4.show(100,truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+---------+
|document                                                                                                                                                     |pr_sentiment|sentiment|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+---------+
|@VirginAmerica What @dhepburn said.                                                                                                                          |positive    |neutral  |
|@VirginAmerica plus you've added commercials to the experience... tacky.                                                                                     |positive    |positive |
|@VirginAmerica I didn't today... Must mean I need to take another trip!             

In [281]:
#result4=result4.withColumn('sentiment', F.lower(F.col('sentiment')))

In [297]:
result4.filter(result4["pr_sentiment"]==result4["sentiment"]).count()

9560

In [298]:
result4.filter(result4["pr_sentiment"]!=result4["sentiment"]).count()

5072

In [299]:
9560/(9560+5072)

0.6533624931656643

In [213]:
result4 = result4.withColumn('same',F.col("pr_sentiment")==F.col("sentiment"))

In [229]:
result4.filter(F.col('same')==1).select(F.count('same')).collect()

[Row(count(same)=5939)]

In [230]:
result4.filter(F.col('same')==0).select(F.count('same')).collect()

[Row(count(same)=7926)]

In [242]:
MODEL_NAME='classifierdl_use_cyberbullying'

In [243]:
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
    
use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")


sentimentdl = ClassifierDLModel.pretrained(name=MODEL_NAME)\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("sentiment")

nlpPipeline = Pipeline(
      stages = [
          documentAssembler,
          use,
          sentimentdl
      ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
classifierdl_use_cyberbullying download started this may take some time.
Approximate size to download 21.4 MB
[OK!]


In [244]:
pipelineModel = nlpPipeline.fit(empty_df)
result3 = pipelineModel.transform(spark_df1)

In [245]:
result3.select(F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("document"),
        F.expr("cols['1']").alias("sentiment")).show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+---------+
|document                                                                                                                                    |sentiment|
+--------------------------------------------------------------------------------------------------------------------------------------------+---------+
|RT @NancyLeeGrahn: How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate                                  |neutral  |
|RT @ScottWalker: Didn't catch the full #GOPdebate last night. Here are some of Scott's best lines in 90 seconds. #Walker16 http://t.co/ZSfF…|neutral  |
|RT @TJMShow: No mention of Tamir Rice and the #GOPDebate was held in Cleveland? Wow.                                                        |neutral  |
|RT @RobGeorge: That Carly Fiorina is trending -- hours after HER debate -- above 