https://towardsdatascience.com/my-absolute-go-to-for-sentiment-analysis-textblob-3ac3a11d524

In [1]:
# Run configuration for the pipeline.
# is_realtime picks the streaming source: False reads files from the input
# folder, True reads the socket feed.
is_realtime = False
project_bucket_name = "cloud-project-bucket-ns-22"
topic = "Batman"

# Each topic has its own Input/ and Output/ sub-folder inside the bucket.
# Note - Output folder must be public to the internet
input_folder_name = "gs://" + project_bucket_name + "/Input/" + topic + "/"
output_folder_name = "gs://" + project_bucket_name + "/Output/" + topic + "/"

In [2]:
!pip install textblob
!pip install findspark

[0m

In [3]:
# import necessary packages
import os
import json
import time
import subprocess
import pyspark
import findspark
import socket
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from textblob import TextBlob

from IPython.display import clear_output

In [4]:
# Initialise findspark before any SparkSession is created.
# NOTE(review): pyspark is already imported in the imports cell, so this call
# presumably only matters for Spark environment setup - confirm whether it
# should run before `import pyspark` instead.
findspark.init()

In [5]:
def json_load(text):
    """Parse a JSON document and return the resulting Python object.

    Args:
        text: JSON document as a string.

    Returns:
        Whatever json.loads produces for the document (dict, list, ...).
    """
    parsed = json.loads(text)
    return parsed

In [6]:
def get_message(text):
    """Return the text of a tweet given its raw JSON string.

    Args:
        text: One tweet as a JSON document.

    Returns:
        str() of extended_tweet/full_text when the tweet carries an
        "extended_tweet" entry, otherwise str() of its "text" entry.
    """
    tweet = json.loads(text)
    # if tweet is longer than 140 characters the full body lives under
    # "extended_tweet"; otherwise "text" already holds all of it
    body = tweet['extended_tweet']['full_text'] if "extended_tweet" in tweet else tweet['text']
    return str(body)

In [7]:
def get_tweet_field(text, field):
    """Extract one field from a raw tweet JSON string, rendered as a string.

    Args:
        text: One tweet as a JSON document (one line of the input).
        field: Key to read. Nested keys are written as a '/'-separated path,
            e.g. 'place/country'.

    Returns:
        str(value) of the addressed value, or None when the line is empty,
        when a key on the path is absent, when an intermediate value is not a
        JSON object, or when the addressed value itself is JSON null.

    Raises:
        json.JSONDecodeError: if text is non-blank but not valid JSON.
    """
    # Blank lines carry no record; report "no value" instead of failing the task.
    if text is None or not text.strip():
        return None

    node = json.loads(text)
    # A plain key is simply a path of length one, so both forms share this walk.
    for key in field.split('/'):
        if not isinstance(node, dict):
            # Parent is null (e.g. "place": null) or not an object at all.
            return None
        node = node.get(key)

    # Keep JSON null as None rather than the literal string 'None', which
    # would otherwise show up as a real value in grouped results.
    return None if node is None else str(node)

In [8]:
def get_analysis(score):
    """Map a polarity score to a sentiment label.

    Args:
        score: Numeric polarity score.

    Returns:
        'Negative' for scores below zero, 'Neutral' for exactly zero,
        'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

In [9]:
def process_tweets(tweets):
    """Turn raw tweet JSON lines into the columns used by the analysis.

    Args:
        tweets: DataFrame (batch or streaming) with a string column "value"
            holding one tweet JSON document per row.

    Returns:
        The DataFrame with "value" dropped and these string columns added, in
        this order: id, created_at, place_full_name, place_country,
        place_country_code, message, words. "words" is "message" with URLs,
        @mentions, '#', the stand-alone retweet marker 'RT' and ':' removed.
    """
    # Output column -> path inside the tweet JSON ('/' separates nesting levels).
    field_paths = [
        ("id", 'id'),
        ("created_at", 'created_at'),
        ("place_full_name", 'place/full_name'),
        ("place_country", 'place/country'),
        ("place_country_code", 'place/country_code'),
    ]

    # One UDF serves every field; only the path literal differs per column.
    get_tweet_field_udf = udf(get_tweet_field, StringType())
    for column_name, path in field_paths:
        tweets = tweets.withColumn(column_name, get_tweet_field_udf("value", lit(path)))

    # Get message
    get_message_udf = udf(get_message, StringType())
    tweets = tweets.withColumn("message", get_message_udf("value"))

    # Noise stripped from the message before sentiment analysis. Order matters:
    # URLs go first so that the later ':' removal cannot break "http://..."
    # apart before the URL pattern has matched it.
    noise_patterns = [
        r'http\S+',   # URLs
        r'@\w+',      # @mentions
        '#',          # hashtag sign (the tag word itself is kept)
        r'\bRT\b',    # retweet marker; word boundaries keep words such as "HEART" intact
        ':',
    ]
    tweets = tweets.withColumn('words', tweets.message)
    for pattern in noise_patterns:
        tweets = tweets.withColumn('words', F.regexp_replace('words', pattern, ''))

    # Drop unnecessary data: the raw JSON is no longer needed once parsed.
    return tweets.drop("value")

In [10]:
# Text classification using TextBlob
def polarity_detection(text):
    """Return the TextBlob polarity score of the given text."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
def subjectivity_detection(text):
    """Return the TextBlob subjectivity score of the given text."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
def text_classification(tweets):
    """Add TextBlob sentiment columns to a DataFrame of cleaned tweets.

    Args:
        tweets: DataFrame (batch or streaming) with a string column "words".

    Returns:
        The DataFrame with three columns added:
        polarity (double) and subjectivity (double), both computed from
        "words", and analysis (string), the label get_analysis assigns to the
        polarity: 'Negative', 'Neutral' or 'Positive'.
    """
    # The scores are numbers, so the UDFs are declared DoubleType. Declaring
    # them as strings would make avg(polarity) depend on an implicit cast and
    # would hand get_analysis text instead of a number whenever the column is
    # read back rather than chained straight into the next UDF.
    polarity_detection_udf = udf(polarity_detection, DoubleType())
    tweets = tweets.withColumn("polarity", polarity_detection_udf("words"))

    subjectivity_detection_udf = udf(subjectivity_detection, DoubleType())
    tweets = tweets.withColumn("subjectivity", subjectivity_detection_udf("words"))

    # Label each tweet from its polarity score.
    get_analysis_udf = udf(get_analysis, StringType())
    tweets = tweets.withColumn('analysis', get_analysis_udf('polarity'))

    return tweets

In [11]:
def start_offline_batch_processing(spark):
    """Run the pipeline once over everything in the input folder.

    Reads input_folder_name as text, derives the tweet and sentiment columns,
    registers the result as temp view "tweets", aggregates sentiment per
    country and writes that aggregate as one CSV under output_folder_name.

    Args:
        spark: SparkSession used for reading and for the SQL query.

    Returns:
        The processed and classified tweets DataFrame.
    """
    # Read files from input folder
    raw_lines = spark.read.text(input_folder_name)

    # Process and classify tweets
    classified = text_classification(process_tweets(raw_lines))
    classified.createOrReplaceTempView("tweets")

    # Per-country aggregate and the file it is written to
    country_csv_path = output_folder_name + "df_sentiment_by_country.csv"
    country_sql = 'SELECT place_country as Country, place_country_code as CountryCode, avg(polarity) as Sentiment, count(id) as TweetCount FROM tweets GROUP BY place_country, place_country_code ORDER BY place_country_code'
    sentiment_by_country = spark.sql(country_sql)

    # Write to a single file by going through a pandas DataFrame
    country_pdf = sentiment_by_country.toPandas()
    country_pdf.to_csv(country_csv_path, index=False)
    print("File '{}' updated...".format(country_csv_path))

    return classified

In [12]:
def process_stream_batch(df, batch_id, output_file_name):
    """Write one micro-batch result to a CSV file and log the update.

    Args:
        df: DataFrame holding the result for this batch.
        batch_id: Identifier of the batch; only used in the log line.
        output_file_name: Path the CSV is written to.
    """
    # Write to single file by going through a pandas DataFrame
    batch_pdf = df.toPandas()
    batch_pdf.to_csv(output_file_name, index=False)

    status_line = "File '{}' updated, batch_id = {}...".format(output_file_name, batch_id)
    print(status_line)

In [13]:
def get_batched_file_stream():
    """Open the input folder as a streaming text source.

    Uses the module-level `spark` session and `input_folder_name`; the
    "maxFilesPerTrigger" option is set to 5.

    Returns:
        The streaming DataFrame produced by load().
    """
    # Stream files from input folder
    reader = spark.readStream.format('text')
    reader = reader.option("maxFilesPerTrigger", 5)
    batched_stream = reader.load(input_folder_name)

    print("Batched file stream...")
    return batched_stream

In [14]:
def get_realtime_stream():
    """Open a streaming socket source on this machine's host name, port 5555.

    Uses the module-level `spark` session.

    Returns:
        The streaming DataFrame produced by load().
    """
    # Socket connection stream
    reader = spark.readStream.format("socket")
    local_host = socket.gethostname()
    reader = reader.option("host", local_host)
    reader = reader.option("port", 5555)
    realtime_stream = reader.load()

    print("Real-time socket stream...")
    return realtime_stream

In [15]:
def start_stream_processing(spark, is_realtime=False):
    """Start the two streaming aggregations and block until one terminates.

    The tweet stream is processed and classified, registered as temp view
    "tweets", and two 'complete'-mode queries are started on it: sentiment
    per country and tweet count per sentiment label. Every batch of each
    query is handed to process_stream_batch with that query's CSV path
    under output_folder_name.

    Args:
        spark: SparkSession used for the SQL queries and to await termination.
        is_realtime: True reads from the socket stream, False (default) from
            the batched file stream.
    """
    # Get tweet stream
    if is_realtime:
        raw_lines = get_realtime_stream()
    else:
        raw_lines = get_batched_file_stream()

    # Process and classify tweets
    classified = text_classification(process_tweets(raw_lines))
    classified.createOrReplaceTempView("tweets")

    # Create output file names
    country_csv_path = output_folder_name + "df_sentiment_by_country_streaming.csv"
    category_csv_path = output_folder_name + "df_sentiment_by_category_streaming.csv"

    # Batch handlers: each binds its query to its own output file.
    def write_country_batch(df, epoch_id):
        return process_stream_batch(df, epoch_id, country_csv_path)

    def write_category_batch(df, epoch_id):
        return process_stream_batch(df, epoch_id, category_csv_path)

    # Create streaming queries
    sentiment_by_country = spark.sql('SELECT place_country as Country, place_country_code as CountryCode, avg(polarity) as Sentiment, count(id) as TweetCount FROM tweets GROUP BY place_country, place_country_code')
    country_writer = sentiment_by_country.writeStream.outputMode('complete')
    country_query = country_writer.foreachBatch(write_country_batch).start()

    sentiment_by_category = spark.sql('SELECT analysis as Sentiment, count(id) as TweetCount FROM tweets GROUP BY analysis')
    category_writer = sentiment_by_category.writeStream.outputMode('complete')
    category_query = category_writer.foreachBatch(write_category_batch).start()

    spark.streams.awaitAnyTermination()

In [None]:
# Notebook entry point: build the session, then run one of the two pipelines.

# Create Spark session
spark = (
    SparkSession.builder
    .appName("TwitterTopicSentimentAnalysis")
    .getOrCreate()
)

# Start offline batch processing (alternative to streaming; uncomment to use)
#tweets = start_offline_batch_processing(spark)

# Start stream processing; this call blocks while the streaming queries run
start_stream_processing(spark, is_realtime=is_realtime)

Batched file stream...


22/03/16 21:16:22 WARN org.apache.spark.sql.streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-36a7fac8-59da-4d04-8692-1f9edbae1cc4. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/03/16 21:16:22 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
22/03/16 21:16:22 WARN org.apache.spark.sql.streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-cc5ac82f-a2b2-4bb4-9c0f-1ee508ec4bcd. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is bes

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 0...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 0...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 1...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 1...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 2...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 2...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 3...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 3...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 4...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 4...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 5...
File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 5...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 6...
File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 6...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 7...
File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 7...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 8...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 8...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 9...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 9...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 10...
File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 10...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 11...
File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 11...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 12...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 12...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 13...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 13...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 14...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 14...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 15...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 15...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 16...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 16...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 17...
File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 17...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 18...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 18...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 19...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 19...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 20...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 20...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_country_streaming.csv' updated, batch_id = 21...


                                                                                

File 'gs://cloud-project-bucket-ns-22/Output/Batman/df_sentiment_by_category_streaming.csv' updated, batch_id = 21...


22/03/16 21:20:47 WARN org.apache.spark.sql.execution.streaming.FileStreamSource: Listed 110 file(s) in 5538 ms
