In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import *
from sparknlp.annotator import *



from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
spark = sparknlp.start()
sparknlp.version()

'2.5.5'

In [3]:
# Import Spark NLP 
from sparknlp.base import *
from sparknlp.annotator import *
#from sparknlp.embeddings import *


In [35]:
spark_df = spark.read.csv('Sentiment.csv',header=True,encoding='utf-8')

In [36]:
spark_df.show(truncate=10)

+----------+----------+--------------------+-----------+----------------------+----------+--------------------+--------------+-------------------------+--------------+----------+----------------+-------------+--------------+-------------------+----------+-----------+-------------+----------+--------------+-------------+
|        id| candidate|candidate_confidence|relevant_yn|relevant_yn_confidence| sentiment|sentiment_confidence|subject_matter|subject_matter_confidence|candidate_gold|      name|relevant_yn_gold|retweet_count|sentiment_gold|subject_matter_gold|      text|tweet_coord|tweet_created|  tweet_id|tweet_location|user_timezone|
+----------+----------+--------------------+-----------+----------------------+----------+--------------------+--------------+-------------------------+--------------+----------+----------------+-------------+--------------+-------------------+----------+-----------+-------------+----------+--------------+-------------+
|         1|No cand...|           

In [37]:
spark_df = spark_df.select('text').filter("text is not null")

In [38]:
spark_df.columns

['text']

In [39]:
spark_df = spark_df.select(F.regexp_replace('text',"\w+:\/{2}[\d\w\-]+(\.[\d\w\-]+)*(?:(?:\/[^\s/]*))*","").alias('text')).filter("text is not null")\
                .select(F.regexp_replace('text',"http.{1,10}…","").alias('text')).select(F.regexp_replace('text',"…","").alias('text')).filter("text is not null")

In [40]:
spark_df.show(300,truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                   |
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|RT @NancyLeeGrahn: How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate                                             |
|RT @ScottWalker: Didn't catch the full #GOPdebate last night. Here are some of Scott's best lines in 90 seconds. #Walker16                             |
|RT @TJMShow: No mention of Tamir Rice and the #GOPDebate was held in Cleveland? Wow.                                                                   |
|RT @RobGeorge: That Carly Fiorina is trending -- hours after HER debate -- 

In [41]:
spark_df = spark_df.select(F.regexp_replace('text',"RT\s.*\:\s","").alias('text')).filter("text is not null")

In [42]:
spark_df.show(300,truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
|How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate                                                             |
|Didn't catch the full #GOPdebate last night. Here are some of Scott's best lines in 90 seconds. #Walker16                                           |
|No mention of Tamir Rice and the #GOPDebate was held in Cleveland? Wow.                                                                             |
|That Carly Fiorina is trending -- hours after HER debate -- above any of the men in just-comp

In [43]:
spark_df = spark_df.withColumn('text',F.trim(F.col('text'))).filter(spark_df["text"] != '' )

In [44]:
MODEL_NAME = 'sentimentdl_use_twitter'

In [45]:
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
    
use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")


sentimentdl = SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("pr_sentiment")

nlpPipeline = Pipeline(
      stages = [
          documentAssembler,
          use,
          sentimentdl
      ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.9 MB
[OK!]


In [46]:
match_df = pipelineModel.transform(spark_df)


In [47]:
#match_df = match_df.withColumnRenamed('sentiment', 'pr_sentiment')

In [48]:
match_df.show(30,truncate=50)

+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|                                              text|                                          document|                               sentence_embeddings|                                      pr_sentiment|
+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|How did everyone feel about the Climate Change ...|[[document, 0, 86, How did everyone feel about ...|[[sentence_embeddings, 0, 86, How did everyone ...|[[category, 0, 86, positive, [sentence -> 0, po...|
|Didn't catch the full #GOPdebate last night. He...|[[document, 0, 104, Didn't catch the full #GOPd...|[[sentence_embeddings, 0, 104, Didn't catch the...|[[category, 0, 104, po

In [49]:
match_df = match_df.select(F.col("document.result").alias('text'),\
                F.explode('pr_sentiment.result').alias('pr_sentiment'))

In [50]:
match_df.columns

['text', 'pr_sentiment']

In [51]:
match_df = match_df.select(F.explode("text").alias('text'),\
                'pr_sentiment').filter("sentiment is not null")

In [58]:
match_df.show(20,truncate=150)

+-----------------------------------------------------------------------------------------------------------------------------------------+------------+
|                                                                                                                                     text|pr_sentiment|
+-----------------------------------------------------------------------------------------------------------------------------------------+------------+
|                                                  How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate|    positive|
|                                Didn't catch the full #GOPdebate last night. Here are some of Scott's best lines in 90 seconds. #Walker16|    positive|
|                                                                  No mention of Tamir Rice and the #GOPDebate was held in Cleveland? Wow.|    negative|
|              That Carly Fiorina is trending -- hours after HER debate -- above a

In [57]:
match_df.filter("text is not null").filter(match_df["text"]!='').select('text').take(20)

[Row(text='How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate'),
 Row(text="Didn't catch the full #GOPdebate last night. Here are some of Scott's best lines in 90 seconds. #Walker16"),
 Row(text='No mention of Tamir Rice and the #GOPDebate was held in Cleveland? Wow.'),
 Row(text="That Carly Fiorina is trending -- hours after HER debate -- above any of the men in just-completed #GOPdebate says she's on"),
 Row(text='#GOPDebate w/ @realDonaldTrump delivered the highest ratings in the history of presidential debates. #Trump2016'),
 Row(text='"""On my first day I will rescind every illegal executive action taken by Barack Obama."" #GOPDebate @FoxNews"'),
 Row(text='I liked her and was happy when I heard she was going to be the moderator. Not anymore. #GOPDebate @megynkelly'),
 Row(text='Going on #MSNBC Live with @ThomasARoberts around 2 PM ET.  #GOPDebate'),
 Row(text='Deer in the headlights Ben Carson, may be the only brain surgeon who has performed a

### Second Dataset

In [60]:
spark_df1 = spark.read.csv('Tweets.csv',header=True,encoding='utf-8')
spark_df1 = spark_df1.select('text','airline_sentiment').filter("text is not null")

In [61]:
spark_df1 = spark_df1.withColumnRenamed('airline_sentiment', 'sentiment')

In [62]:
spark_df1 =  spark_df1.select(F.regexp_replace('text',"\w+:\/{2}[\d\w\-]+(\.[\d\w\-]+)*(?:(?:\/[^\s/]*))*","").alias('text'),'sentiment').filter("text is not null")\
                .select(F.regexp_replace('text',"http.{1,10}…","").alias('text'),'sentiment').select(F.regexp_replace('text',"…","").alias('text'),'sentiment')

In [63]:
spark_df1.show(10,truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------+---------+
|text                                                                                                                              |sentiment|
+----------------------------------------------------------------------------------------------------------------------------------+---------+
|@VirginAmerica What @dhepburn said.                                                                                               |neutral  |
|@VirginAmerica plus you've added commercials to the experience... tacky.                                                          |positive |
|@VirginAmerica I didn't today... Must mean I need to take another trip!                                                           |neutral  |
|"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse"|negative |

In [64]:
spark_df1 = spark_df1.select(F.regexp_replace('text',"RT\s.*\:\s","").alias('text'),'sentiment').filter("text is not null")
                


In [65]:
spark_df1.show()

+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|@VirginAmerica Wh...|  neutral|
|@VirginAmerica pl...| positive|
|@VirginAmerica I ...|  neutral|
|"@VirginAmerica i...| negative|
|@VirginAmerica an...| negative|
|@VirginAmerica se...| negative|
|@VirginAmerica ye...| positive|
|@VirginAmerica Re...|  neutral|
|@virginamerica We...| positive|
|@VirginAmerica it...| positive|
|@VirginAmerica di...|  neutral|
|@VirginAmerica I ...| positive|
|@VirginAmerica Th...| positive|
|@VirginAmerica @v...| positive|
|@VirginAmerica Th...| positive|
|@VirginAmerica SF...| negative|
|@VirginAmerica So...| positive|
|@VirginAmerica  I...| negative|
|I ❤️ flying @Virg...| positive|
|@VirginAmerica yo...| positive|
+--------------------+---------+
only showing top 20 rows



In [66]:
MODEL_NAME='sentimentdl_use_twitter'

In [67]:
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")



use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")


sentimentdl = SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("pr_sentiment")

nlpPipeline = Pipeline(
      stages = [
          documentAssembler,
          use,
          sentimentdl
      ])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.9 MB
[OK!]


In [68]:
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)



In [69]:
#df = spark.createDataFrame(pd.DataFrame({"text":text_list}))
result3 = pipelineModel.transform(spark_df1)

In [70]:
result3=result3.withColumnRenamed('airline_sentiment','sentiment')

In [71]:
result3.show()

+--------------------+---------+--------------------+--------------------+--------------------+
|                text|sentiment|            document| sentence_embeddings|        pr_sentiment|
+--------------------+---------+--------------------+--------------------+--------------------+
|@VirginAmerica Wh...|  neutral|[[document, 0, 34...|[[sentence_embedd...|[[category, 0, 34...|
|@VirginAmerica pl...| positive|[[document, 0, 71...|[[sentence_embedd...|[[category, 0, 71...|
|@VirginAmerica I ...|  neutral|[[document, 0, 70...|[[sentence_embedd...|[[category, 0, 70...|
|"@VirginAmerica i...| negative|[[document, 0, 12...|[[sentence_embedd...|[[category, 0, 12...|
|@VirginAmerica an...| negative|[[document, 0, 54...|[[sentence_embedd...|[[category, 0, 54...|
|@VirginAmerica se...| negative|[[document, 0, 87...|[[sentence_embedd...|[[category, 0, 87...|
|@VirginAmerica ye...| positive|[[document, 0, 78...|[[sentence_embedd...|[[category, 0, 78...|
|@VirginAmerica Re...|  neutral|[[docume

In [72]:
result4=result3.select(F.explode(F.arrays_zip('document.result', 'pr_sentiment.result')).alias("cols"),'sentiment').filter("pr_sentiment is not null").filter("document.result is not null").filter("pr_sentiment.result is not null") \
.select(F.expr("cols['0']").alias("document"),
        F.expr("cols['1']").alias("pr_sentiment"),'sentiment')

In [78]:
result4.show(20,truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+------------+---------+
|document                                                                                                                                    |pr_sentiment|sentiment|
+--------------------------------------------------------------------------------------------------------------------------------------------+------------+---------+
|@VirginAmerica What @dhepburn said.                                                                                                         |positive    |neutral  |
|@VirginAmerica plus you've added commercials to the experience... tacky.                                                                    |positive    |positive |
|@VirginAmerica I didn't today... Must mean I need to take another trip!                                                                     |negative    |neutral  |
|"@V