In [1]:
# Spark init
!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz 
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/default-java"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
!pip install -q findspark
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [2]:
!head -5 /content/drive/MyDrive/data/2020-22 NYT Headlines.csv

head: cannot open '/content/drive/MyDrive/data/2020-22' for reading: No such file or directory
head: cannot open 'NYT' for reading: No such file or directory
head: cannot open 'Headlines.csv' for reading: No such file or directory


In [3]:
# Load the NYT headlines CSV as a Spark DataFrame: comma-separated, with the
# first row used as column names. No schema or schema inference is requested.
df = (
    spark.read
    .options(sep=',', header=True)
    .csv("/content/drive/MyDrive/data/2020-22 NYT Headlines.csv")
)

In [4]:
# Print the first 20 rows (the default count) with full, untruncated cell text.
df.show(n=20, truncate=False)

+--------------------------------------------------------------------------------------+--------+--------+---------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|headline                                                                              |date    |doc_type|URL                                                                                                                                    |snippet                                                                                                                                       

In [5]:
# Keep only the two text columns used for sentiment scoring, then preview them.
df_text = df.select('headline', 'snippet')
df_text.show()

+--------------------+--------------------+
|            headline|             snippet|
+--------------------+--------------------+
|Already Had Plent...|He’s a bad show, ...|
|Why Did One-Quart...|Swine fever devas...|
|Coast Guard Suspe...|Two of seven crew...|
|N.B.A. Superstars...|The longtime comm...|
|In Rose Bowl Vict...|The Ducks managed...|
|Where Darth Vader...|Ed Sessa turns th...|
|Don Larsen, Yanke...|He retired after ...|
|No Corrections: J...|No corrections ap...|
|India Cold Wave B...|Across the north,...|
|Quotation of the ...|Quotation of the ...|
|Head-Scarf Ban an...|After months of t...|
|Your Thursday Bri...|Austria, Australi...|
|What’s on TV Thur...|A drama based on ...|
|Word + Quiz: subsume|This word has app...|
|Taiwan’s Top Mili...|The helicopter wa...|
|  New Year, New You?|What are your res...|
|‘Surviving R. Kel...|A new Lifetime se...|
|How David Stern N...|Stern was known a...|
|Flash Floods in I...|At least 43 peopl...|
|Lesson of the Day...|In this le

In [6]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer



In [7]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

# Analyzer instance referenced by the scoring functions in this notebook.
analyser = SentimentIntensityAnalyzer()


def _positive_score(text):
    """Return VADER's 'pos' score for ``text``, or None when ``text`` is null.

    Spark hands SQL NULL values to Python UDFs as ``None``, and
    ``polarity_scores`` expects a string, so null inputs are passed through as
    null instead of failing the whole job.
    """
    if text is None:
        return None
    return analyser.polarity_scores(text).get('pos')


# Spark UDF wrapping the scorer above; the result column is a (nullable) float.
vader_udf = udf(_positive_score, FloatType())


In [8]:
# Add a "positivity" column by applying vader_udf to each snippet, then preview.
df_pos_sentiment = df_text.withColumn(
    colName="positivity",
    col=vader_udf(df_text["snippet"]),
)
df_pos_sentiment.show()

+--------------------+--------------------+----------+
|            headline|             snippet|positivity|
+--------------------+--------------------+----------+
|Already Had Plent...|He’s a bad show, ...|       0.0|
|Why Did One-Quart...|Swine fever devas...|       0.0|
|Coast Guard Suspe...|Two of seven crew...|     0.109|
|N.B.A. Superstars...|The longtime comm...|     0.057|
|In Rose Bowl Vict...|The Ducks managed...|     0.135|
|Where Darth Vader...|Ed Sessa turns th...|       0.0|
|Don Larsen, Yanke...|He retired after ...|     0.175|
|No Corrections: J...|No corrections ap...|       0.0|
|India Cold Wave B...|Across the north,...|       0.0|
|Quotation of the ...|Quotation of the ...|       0.0|
|Head-Scarf Ban an...|After months of t...|     0.084|
|Your Thursday Bri...|Austria, Australi...|       0.0|
|What’s on TV Thur...|A drama based on ...|       0.0|
|Word + Quiz: subsume|This word has app...|       0.0|
|Taiwan’s Top Mili...|The helicopter wa...|       0.0|
|  New Yea

In [9]:
def _negative_score(text):
    """Return VADER's 'neg' score for ``text``, or None when ``text`` is null.

    Spark hands SQL NULL values to Python UDFs as ``None``, and
    ``polarity_scores`` expects a string, so null inputs are passed through as
    null instead of failing the whole job.
    """
    if text is None:
        return None
    return analyser.polarity_scores(text).get('neg')


# Spark UDF wrapping the scorer above; the result column is a (nullable) float.
vader_neg_udf = udf(_negative_score, FloatType())

In [10]:
# Add a "negativity" column by applying vader_neg_udf to each snippet, then preview.
df_neg_sentiment = df_text.withColumn(
    colName="negativity",
    col=vader_neg_udf(df_text["snippet"]),
)
df_neg_sentiment.show()

+--------------------+--------------------+----------+
|            headline|             snippet|negativity|
+--------------------+--------------------+----------+
|Already Had Plent...|He’s a bad show, ...|     0.243|
|Why Did One-Quart...|Swine fever devas...|     0.343|
|Coast Guard Suspe...|Two of seven crew...|       0.0|
|N.B.A. Superstars...|The longtime comm...|     0.194|
|In Rose Bowl Vict...|The Ducks managed...|     0.082|
|Where Darth Vader...|Ed Sessa turns th...|       0.0|
|Don Larsen, Yanke...|He retired after ...|     0.062|
|No Corrections: J...|No corrections ap...|     0.196|
|India Cold Wave B...|Across the north,...|       0.0|
|Quotation of the ...|Quotation of the ...|       0.0|
|Head-Scarf Ban an...|After months of t...|       0.0|
|Your Thursday Bri...|Austria, Australi...|       0.0|
|What’s on TV Thur...|A drama based on ...|       0.0|
|Word + Quiz: subsume|This word has app...|       0.0|
|Taiwan’s Top Mili...|The helicopter wa...|       0.0|
|  New Yea