In [1]:
# https://nlp.johnsnowlabs.com/docs/en/annotators#languagedetectordl

# maven - com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.3
# pypi - spark-nlp, vaderSentiment, boto3==1.9.157

# example on sentiment analysis
  # https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/SentimentDL_train_multiclass_sentiment_classifier.ipynb
# https://nlp.johnsnowlabs.com/docs/en/pipelines#multi-language

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark import SparkFiles
from pyspark.ml.feature import (
    StringIndexer,
    HashingTF, 
    IDF
)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes

In [3]:
# get or create Spark session

app_name = "news_analysis"

spark = SparkSession.builder.appName(app_name) \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.3") \
    .getOrCreate()

In [4]:
file_uri = "/mnt/mnt_s3/BusinessTrainData.csv"

trainDataset = spark.read \
    .format("com.databricks.spark.csv") \
    .options(header='true', inferSchema="true") \
    .load(file_uri)
display(trainDataset)

_c0,index,category,headline,authors,link,short_description,date,short_description_new,label
148341,170325,BUSINESS,"Stores Open On Christmas Eve Include Walmart, Best Buy, Target, Among Others",Caroline Fairchild,https://www.huffingtonpost.com/entry/stores-open-on-christmas-eve-2012_us_5bb2e167e4b0480ca65de749,leave christmas shopping minute worry americas retailers got covered,2012-12-22,leave christmas shopping minute worry americas retailers got covered,Negative
94803,106274,BUSINESS,Trusting the Crowd and the Machines,"Despina Katsikakis, ContributorGlobally renowned expert on how to transform the workplace to ...",https://www.huffingtonpost.com/entry/trusting-the-crowd-and-th_b_6166406.html,business environment future needs trust people technology provide flexibility choice employees connect complementary skills network work challenges learn fast unlock passion improve performance,2014-11-19,business environment future needs trust people technology provide flexibility choice employees connect complementary skills network work challenges learn fast unlock passion improve performance,Positive
86152,90632,BUSINESS,"""The Sudden Implosion of """"Secret"""" May Reveal A Secret: Is Silicon Valley Out Of Touch Or Just Losing It's Touch?""","Tony Potts, ContributorPartner-SierraMaya360/PartnersTrust/Prev-Access Hollywood",https://www.huffingtonpost.com/entry/the-sudden-implosion-of-s_b_7310388.html,look secret timeline demise started august inverted hockey stick trajectory came days raised july investors nightmare doubt,2015-05-19,look secret timeline demise started august inverted hockey stick trajectory came days raised july investors nightmare doubt,Negative
40379,41278,BUSINESS,He's Grounded! Delta Bans Obnoxious Trump Supporter For Life,Carla Herreria,https://www.huffingtonpost.com/entry/delta-bans-trump-supporter_us_583ce595e4b0860d611640a2,ceo says passengers flight given refunds,2016-11-29,ceo says passengers flight given refunds,Neutral
34901,35480,BUSINESS,DON'T PANIC! That Bacon Shortage You Keep Hearing About Isn't Real,Ed Mazza,https://www.huffingtonpost.com/entry/no-bacon-shortage_us_5892dfbae4b0bf5206e63393,america isnt running bacon,2017-02-02,america running bacon,Neutral
57004,58878,BUSINESS,One Map That Shows How The Middle Class Is Getting Squeezed,Shane Ferro,https://www.huffingtonpost.com/entry/us-middle-class-squeezed_us_5734ea76e4b060aa7819c3c6,people arent earning,2016-05-13,people arent earning,Neutral
177642,200136,BUSINESS,Facebook Is No Longer Cool,"Max Kolonko, Contributor",,,,,
New York television producer,director,"author and the U.S. te...""",https://www.huffingtonpost.com/entry/facebook-public_us_5bb2b460e4b0480ca659ddfa,facebook billionaire mark zuckerberg joined establishment corporate giant cheated luring concept global friendship turned lined corporate greed,2012-02-05,facebook billionaire mark zuckerberg joined establishment corporate giant cheated luring concept global friendship turned lined corporate greed,Negative,,
158327,180430,MONEY,Master the Five Factors That Feed FICO This Fall,"Jeanne Kelly, Contributor",,,,,
Credit & Identity Theft Expert,Author,"CEO of The Kelly Gro...""",https://www.huffingtonpost.com/entry/master-the-five-factors-t_us_5b9c372ce4b03a1dcc7d278f,credit score taken summer vacation time commitments changes,2012-09-04,credit score summer vacation time commitments changes,Positive,,


In [5]:
file_uri = "/mnt/mnt_s3/BusinessTestData.csv"

testDataset = spark.read \
    .format("com.databricks.spark.csv") \
    .options(header='true', inferSchema="true") \
    .load(file_uri)
display(testDataset)

_c0,index,category,headline,authors,link,short_description,date,short_description_new,label
106103,127278,MONEY,Future Shock: Personal Finance Technology,"Jim Gibson, Contributor",,,,,
Personal Finance writer,"Your Finances Simplified""",https://www.huffingtonpost.com/entry/future-shockpersonal-fina_us_5b9df118e4b03a1dcc8f2e62,land way banking receipts growing trend alternative payment,2014-03-24,land banking receipts growing trend alternative payment,Positive,,,
84389,87560,BUSINESS,City Of Charleston Steps Up To Help Victims' Families In A Big Way,Andrew Lord,https://www.huffingtonpost.com/entry/business-charleston-victims_n_7644912.html,donations pouring local businesses,2015-06-23,donations pouring local businesses,Neutral
116537,137877,BUSINESS,The Shoestring Guide to Brand Publishing,,https://www.huffingtonpost.com/entry/brand-publishing-budget_us_5bb302d8e4b0480ca66147cf,contently creating stream content stands crowd brand map brand key,2013-11-30,contently creating stream content stands crowd brand map brand key,Positive
60976,63051,BUSINESS,One Of Ben Carson's Craziest Ideas Is Coming True,Ben Walsh,https://www.huffingtonpost.com/entry/us-tax-haven_us_56f56a7be4b014d3fe22f892,worlds hottest new tax haven,2016-03-26,worlds hottest new tax haven,Neutral
174267,196702,MONEY,St. Patrick's Day 2012: Saturday Holiday A Boon For Bars And Retailers,,https://www.huffingtonpost.com/entry/st-patricks-day-spending_us_5b9b7184e4b03a1dcc779415,celebrating day likely entail spending money st patricks day far expensive holiday,2012-03-13,celebrating day likely entail spending money patricks day far expensive holiday,Positive
92267,101497,BUSINESS,'Breastaurants' Thrive As The Restaurant Industry Struggles,Jillian Berman,https://www.huffingtonpost.com/entry/breastaurants-growth_n_6443274.html,serve burger cleavage crowds come pouring,2015-01-13,serve burger cleavage crowds come pouring,Neutral
97355,111221,BUSINESS,What Should I Do if an Employee Is a Liar?,"Russ Warner, ContributorVP Marketing at Converus, makers of EyeDetect, innovative solu...",https://www.huffingtonpost.com/entry/what-should-i-do-if-an-em_b_5864410.html,need careful assume individuals guaranteed repeat past behavior assumptions limit persons ability learn grow know employee going repeat past behavior illicit drug use theft bribery,2014-09-23,need careful assume individuals guaranteed repeat past behavior assumptions limit persons ability learn grow employee going repeat past behavior illicit drug use theft bribery,Positive
54038,55728,BUSINESS,IEX Wins SEC Approval As U.S. Stock Exchange,,https://www.huffingtonpost.com/entry/iex-sec-stock_us_576499dee4b015db1bc98569,builtin delay protect investors traders use tactics depicted michael lewis book flash boys wall street revolt,2016-06-18,builtin delay protect investors traders use tactics depicted michael lewis book flash boys wall street revolt,Positive
135872,157630,BUSINESS,"Mohammed Sohel Rana, Bangladesh Factory Owner, Faces Murder Complaint As Death Toll Rises To 622","Reuters, Reuters",https://www.huffingtonpost.com/entry/mohammed-sohel-rana-murder_us_5bb2f3b6e4b0480ca65f6045,guilty killings highest punishment capital punishment said abdul huq,2013-05-05,guilty killings highest punishment capital punishment said abdul huq,Negative


In [6]:
testDataset.toPandas()

Unnamed: 0,_c0,index,category,headline,authors,link,short_description,date,short_description_new,label
0,106103,127278,MONEY,Future Shock: Personal Finance Technology,"Jim Gibson, Contributor\n",,,,,
1,Personal Finance writer,"Your Finances Simplified""",https://www.huffingtonpost.com/entry/future-sh...,land way banking receipts growing trend altern...,2014-03-24,land banking receipts growing trend alternati...,Positive,,,
2,84389,87560,BUSINESS,City Of Charleston Steps Up To Help Victims' F...,Andrew Lord,https://www.huffingtonpost.com/entry/business-...,donations pouring local businesses,2015-06-23,donations pouring local businesses,Neutral
3,116537,137877,BUSINESS,The Shoestring Guide to Brand Publishing,,https://www.huffingtonpost.com/entry/brand-pub...,contently creating stream content stands crowd...,2013-11-30,contently creating stream content stands crowd...,Positive
4,60976,63051,BUSINESS,One Of Ben Carson's Craziest Ideas Is Coming True,Ben Walsh,https://www.huffingtonpost.com/entry/us-tax-ha...,worlds hottest new tax haven,2016-03-26,worlds hottest new tax haven,Neutral
5,174267,196702,MONEY,St. Patrick's Day 2012: Saturday Holiday A Boo...,,https://www.huffingtonpost.com/entry/st-patric...,celebrating day likely entail spending money s...,2012-03-13,celebrating day likely entail spending money ...,Positive
6,92267,101497,BUSINESS,'Breastaurants' Thrive As The Restaurant Indus...,Jillian Berman,https://www.huffingtonpost.com/entry/breastaur...,serve burger cleavage crowds come pouring,2015-01-13,serve burger cleavage crowds come pouring,Neutral
7,97355,111221,BUSINESS,What Should I Do if an Employee Is a Liar?,"Russ Warner, ContributorVP Marketing at Conver...",https://www.huffingtonpost.com/entry/what-shou...,need careful assume individuals guaranteed rep...,2014-09-23,need careful assume individuals guaranteed rep...,Positive
8,54038,55728,BUSINESS,IEX Wins SEC Approval As U.S. Stock Exchange,,https://www.huffingtonpost.com/entry/iex-sec-s...,builtin delay protect investors traders use ta...,2016-06-18,builtin delay protect investors traders use ta...,Positive
9,135872,157630,BUSINESS,"Mohammed Sohel Rana, Bangladesh Factory Owner,...","Reuters, Reuters",https://www.huffingtonpost.com/entry/mohammed-...,guilty killings highest punishment capital pun...,2013-05-05,guilty killings highest punishment capital pun...,Negative


In [7]:
trainDataset.toPandas().info()

In [8]:
trainDataset = trainDataset.na.drop()

In [9]:
trainDataset= trainDataset.filter((trainDataset["label"]=="Positive") | (trainDataset["label"]=="Negative") | (trainDataset["label"]=="Neutral"))


In [10]:
trainDataset.toPandas().info()

In [11]:
trainDataset.toPandas()['label'].value_counts()

In [12]:
trainDataset.show()

In [13]:
testDataset = testDataset.na.drop()

In [14]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline

sparknlp.start()

In [15]:
# actual content is inside description column
document = DocumentAssembler()\
    .setInputCol("short_description_new")\
    .setOutputCol("document")

use = UniversalSentenceEncoder.pretrained() \
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
sentimentdl = SentimentDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("label")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

pipeline = Pipeline(
    stages = [
        document,
        use,
        sentimentdl
    ])

In [16]:
trainDataset

In [17]:
trainDataset.filter("short_description_new IS NULL").count()

In [18]:

# pipelineModel = pipeline.fit(trainDataset.filter("label IN ('Positive','Negative')"))


In [19]:
pipelineModel.save("/mnt/mnt_s3/sentimentdl_pipeline_2")

In [20]:
loadedPipeline = PipelineModel.load("/mnt/mnt_s3/sentimentdl_pipeline_2")

In [21]:
import pandas as pd
pdf = pd.DataFrame({"text": [
  "This is so great! How are you", 
  "No way, never again!", 
  "Yes, bring on more!"
]})
df = spark.createDataFrame(pdf)

results = pipelineModel.transform(df)
display(
  results
)

In [22]:
display(
  results.selectExpr("text", "class.result")
)

In [23]:
analyzer.polarity_scores("This is so great! How are you")

In [24]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

@F.udf(returnType="struct<neg:double,neu:double,pos:double,compound:double>")
def getVADER(text):
	return analyzer.polarity_scores(text)

new_results = results.withColumn("vaderSentiment", getVADER("text"))
display(new_results)