## AWS credentials for the Colab notebook

In [3]:
from google.colab import drive

# Attach Google Drive to the Colab VM; later cells read and write files
# below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
! pip install awscli

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting awscli
  Downloading awscli-1.25.0-py3-none-any.whl (3.9 MB)
[K     |████████████████████████████████| 3.9 MB 4.3 MB/s 
[?25hCollecting rsa<4.8,>=3.1.2
  Downloading rsa-4.7.2-py3-none-any.whl (34 kB)
Collecting colorama<0.4.5,>=0.2.5
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting botocore==1.27.0
  Downloading botocore-1.27.0-py3-none-any.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 45.6 MB/s 
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 7.7 MB/s 
[?25hCollecting docutils<0.17,>=0.10
  Downloading docutils-0.16-py2.py3-none-any.whl (548 kB)
[K     |████████████████████████████████| 548 kB 59.7 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.9-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 1

In [6]:
import os

# Template for the AWS shared-credentials file. Fill in the key id, secret and
# session token locally, and blank them again before sharing or committing the
# notebook so that no secret ends up in the saved source.
text = '''
[default]
aws_access_key_id = 
aws_secret_access_key = 
aws_session_token=
region = us-east-1
'''
path = "/content/drive/My Drive/config/awscli.ini"

# open(..., 'w') does not create missing parent folders, so make sure the
# config directory exists on Drive before writing the file.
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'w') as f:
    f.write(text)

In [7]:
import os

# Point the AWS CLI at the credentials file stored in `path`.
# The variable is set on the kernel process itself: a `! export ...` line runs
# in a throwaway subshell and would not be visible to any later cell, whereas
# values placed in os.environ are inherited by subsequent `!` commands.
os.environ['AWS_SHARED_CREDENTIALS_FILE'] = path

In [13]:
# Recursively copy every object in the lsc-projct bucket into the local ./data/ directory.
! aws s3 cp s3://lsc-projct ./data/ --recursive

download: s3://lsc-projct/3.csv to data/3.csv                     
download: s3://lsc-projct/4.csv to data/4.csv                 
download: s3://lsc-projct/2.csv to data/2.csv                 
download: s3://lsc-projct/5.csv to data/5.csv                 


## Import libraries

In [9]:
# Install PySpark (pinned to 3.1.2) and Spark NLP; -q keeps pip's output short.
# NOTE(review): spark-nlp is not pinned, so a re-run may pull a different release
# than the one these outputs were produced with — consider pinning it.
! pip install -q pyspark==3.1.2 spark-nlp

[K     |████████████████████████████████| 212.4 MB 61 kB/s 
[K     |████████████████████████████████| 145 kB 32.3 MB/s 
[K     |████████████████████████████████| 198 kB 43.0 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [10]:
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

In [11]:
# Create the Spark session through Spark NLP's start() helper; every later
# cell uses this `spark` handle.
spark = sparknlp.start()

In [14]:
# Load every CSV under ./data/ into a single DataFrame. The first row of each
# file is treated as the header, "multiline" allows quoted fields to span line
# breaks, and `"` is used as the escape character inside quoted fields.
data = (
    spark.read
    .option("header", True)
    .option("multiline", True)
    .option("escape", "\"")
    .csv('./data/*.csv')
)

# Discard rows whose `content` is null; `content` is the column the NLP
# pipelines below take as input.
df = data.dropna(subset=['content'])

## Emotion

In [15]:
MODEL_NAME = 'classifierdl_use_emotion'

# Stage 1: wrap the raw `content` column into Spark NLP's `document` annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("content")
    .setOutputCol("document")
)

# Stage 2: embed each document with the pretrained Universal Sentence Encoder.
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: classify the embeddings with the pretrained model named above;
# the predicted label is written to the `sentiment` column.
sentimentdl = (
    ClassifierDLModel.pretrained(name=MODEL_NAME)
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
classifierdl_use_emotion download started this may take some time.
Approximate size to download 21.3 MB
[OK!]


In [16]:
# Every stage of nlpPipeline is already pretrained, so fit() only needs a
# placeholder frame to produce a PipelineModel. The placeholder's column is
# named "content" to match the DocumentAssembler's input column (and the
# equivalent cell in the Sarcasm section) rather than an unrelated "text".
empty_df = spark.createDataFrame([['']]).toDF("content")
pipelineModel = nlpPipeline.fit(empty_df)

# Run the fitted pipeline over the cleaned posts.
result = pipelineModel.transform(df)

In [17]:
# Pair each document text with its predicted label, then explode the zipped
# array so that every (document, sentiment) pair becomes its own row.
zipped_results = F.arrays_zip('document.result', 'sentiment.result')
res = (
    result
    .select(F.explode(zipped_results).alias("cols"))
    .select(
        F.expr("cols['0']").alias("document"),
        F.expr("cols['1']").alias("sentiment"),
    )
)


In [18]:
# Count the rows of `res` for each predicted emotion label.
res.groupby('sentiment').count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
|      joy| 4429|
|     fear| 2876|
| surprise|  733|
|  sadness|  379|
+---------+-----+



In [19]:
# Show up to 10 documents, untruncated, whose predicted label is 'joy'.
(
    res
    .where(F.col('sentiment') == 'joy')
    .select('document')
    .show(10, truncate=False)
)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                                                                                                                                                                                                                                   |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Don't forget! GiveBIG is May 3-4. Make your donation to #habitatskc to provide #affordablehousing solutions for King County families.

Early 

In [20]:
# Show up to 10 documents, untruncated, whose predicted label is 'fear'.
(
    res
    .where(F.col('sentiment') == 'fear')
    .select('document')
    .show(10, truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                                                                                                                                                                                                                               |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|“The timing is right for this place to rise.” Petition for 
@NJGov
 to reinstate the monarchy with 
@IAMQUEENLATIFAH
 #Newark  #NJ #affordablehousing #QU

In [21]:
# Show up to 10 documents, untruncated, whose predicted label is 'surprise'.
(
    res
    .where(F.col('sentiment') == 'surprise')
    .select('document')
    .show(10, truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                                                                                                                                                                                                                                                                                     |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
# Show up to 10 documents, untruncated, whose predicted label is 'sadness'.
(
    res
    .where(F.col('sentiment') == 'sadness')
    .select('document')
    .show(10, truncate=False)
)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                                                                                                                                                                                                                                                                                              |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Sarcasm

In [None]:
MODEL_NAME = 'classifierdl_use_sarcasm'

# Stage 1: wrap the raw `content` column into Spark NLP's `document` annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("content")
    .setOutputCol("document")
)

# Stage 2: embed each document with the pretrained Universal Sentence Encoder.
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: classify the embeddings with the pretrained sarcasm model. The
# output column keeps the name `sentiment`, which the cells below select on.
sentimentdl = (
    ClassifierDLModel.pretrained(name=MODEL_NAME)
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
classifierdl_use_sarcasm download started this may take some time.
Approximate size to download 21.3 MB
[OK!]


In [None]:
# One-row placeholder frame with a single string column named "content";
# it is only used to turn the pipeline into a PipelineModel.
empty_df = spark.createDataFrame([['']], ["content"])
pipelineModel = nlpPipeline.fit(empty_df)

# Run the fitted pipeline over the cleaned posts.
result = pipelineModel.transform(df)

In [None]:
# Pair each document text with its predicted label, then explode the zipped
# array so that every (document, sentiment) pair becomes its own row.
zipped_results = F.arrays_zip('document.result', 'sentiment.result')
res = (
    result
    .select(F.explode(zipped_results).alias("cols"))
    .select(
        F.expr("cols['0']").alias("document"),
        F.expr("cols['1']").alias("sentiment"),
    )
)


In [None]:
# Count the rows of `res` for each predicted label of the sarcasm classifier.
res.groupby('sentiment').count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
|   normal| 8265|
|  sarcasm|  152|
+---------+-----+



In [None]:
# Show the documents labelled 'sarcasm', untruncated (show() defaults to 20 rows).
(
    res
    .where(F.col('sentiment') == 'sarcasm')
    .select('document')
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                                                                                                                                                                                                                               |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|43% of students at four-year universities experienced housing insecurity in 2020 and 14% of students experienced #homelessness in the last year.

For som