https://towardsdatascience.com/my-absolute-go-to-for-sentiment-analysis-textblob-3ac3a11d524

In [None]:
!pip install textblob
!pip install findspark

In [None]:
# import necessary packages
import os
import json
import time
import subprocess
import pyspark
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from textblob import TextBlob

from IPython.display import clear_output

In [None]:
# Initialise findspark with its default settings so the notebook can use a
# local Spark installation.
# NOTE(review): `pyspark` is already imported in the imports cell above, i.e.
# before this call runs; findspark is normally initialised before importing
# pyspark -- confirm the ordering is intended.
findspark.init()

In [None]:
def json_load(text):
    """Deserialize a JSON document held in a string.

    Args:
        text: JSON-encoded string.

    Returns:
        The decoded Python object (dict, list, str, number, bool or None).

    Raises:
        json.JSONDecodeError: if ``text`` is not valid JSON.
    """
    parsed = json.loads(text)
    return parsed

In [None]:
def get_message(text):
    """Return the text of a tweet given its JSON encoding.

    Args:
        text: JSON-encoded tweet object.

    Returns:
        ``extended_tweet.full_text`` when the tweet has an "extended_tweet"
        entry (tweets longer than 140 characters), otherwise ``text``;
        always converted with ``str``.

    Raises:
        KeyError: if the key being read is absent from the tweet.
    """
    tweet = json.loads(text)
    has_extended = "extended_tweet" in tweet
    body = tweet['extended_tweet']['full_text'] if has_extended else tweet['text']
    return str(body)

In [None]:
def get_tweet_field(text, field):
    """Return one field of a JSON-encoded tweet as a string.

    Args:
        text: JSON-encoded tweet (an object at the top level).
        field: Key to read. Nested keys are separated by '/',
            e.g. ``'place/full_name'``.

    Returns:
        ``str(value)`` of the addressed field, or None when the field -- or
        any object on the way to it -- is missing or JSON null.
    """
    node = json.loads(text)
    for key in field.split('/'):
        # A null or absent parent (e.g. "place": null) means there is no value
        # to read; report that as None instead of raising inside the UDF.
        if not isinstance(node, dict) or key not in node:
            return None
        node = node[key]
    # Keep JSON null as None so it does not turn into the literal string 'None'.
    return None if node is None else str(node)

In [None]:
def get_analysis(score):
    """Map a polarity score to a sentiment label.

    Args:
        score: Polarity as a number, or as a numeric string (the score may be
            read back from a string-typed column). None means "no score".

    Returns:
        'Negative' for scores below zero, 'Neutral' for exactly zero,
        'Positive' otherwise; None when ``score`` is None.

    Raises:
        ValueError: if ``score`` is a string that is not a valid number.
    """
    if score is None:
        return None
    # Comparing a str with 0 raises TypeError on Python 3, so normalise first.
    if isinstance(score, str):
        score = float(score)
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

In [None]:
def process_tweets(tweets):
    """Extract tweet fields and a cleaned text column from raw tweet JSON.

    Args:
        tweets: Spark DataFrame with a string column "value" holding one
            JSON-encoded tweet per row.

    Returns:
        DataFrame with the columns id, created_at, place_full_name,
        place_country, place_country_code, message and words (the message
        with URLs, @mentions, '#', the "RT" marker and ':' removed). The raw
        "value" column is dropped.
    """
    get_tweet_field_udf = udf(get_tweet_field, StringType())
    get_message_udf = udf(get_message, StringType())

    # Output column -> '/'-separated path of the field inside the tweet JSON.
    field_paths = [
        ('id', 'id'),
        ('created_at', 'created_at'),
        ('place_full_name', 'place/full_name'),
        ('place_country', 'place/country'),
        ('place_country_code', 'place/country_code'),
    ]
    for column_name, path in field_paths:
        tweets = tweets.withColumn(column_name, get_tweet_field_udf('value', lit(path)))

    # Get message
    tweets = tweets.withColumn('message', get_message_udf('value'))

    # Get cleaned words from message for analysis. Patterns are applied in
    # order; URLs go first so the later ':' removal cannot break them up.
    cleanup_patterns = [
        r'http\S+',   # URLs
        r'@\w+',      # @mentions
        r'#',         # the hash sign only; the tag text is kept
        r'\bRT\b',    # retweet marker as a standalone token, not letters inside a word
        r':',         # colons
    ]
    words = F.col('message')
    for pattern in cleanup_patterns:
        words = F.regexp_replace(words, pattern, '')
    tweets = tweets.withColumn('words', words)

    # Drop unnecessary data
    return tweets.drop('value')

In [None]:
# Text classification using TextBlob
def polarity_detection(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Per the TextBlob documentation the polarity is a float in [-1.0, 1.0].
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
def subjectivity_detection(text):
    """Return the TextBlob sentiment subjectivity of ``text``.

    Per the TextBlob documentation the subjectivity is a float in [0.0, 1.0].
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
def text_classification(tweets):
    """Add TextBlob sentiment columns to a DataFrame of cleaned tweets.

    Args:
        tweets: Spark DataFrame with a string column "words".

    Returns:
        The DataFrame with three added columns: "polarity" and "subjectivity"
        (doubles) and "analysis" (the label get_analysis derives from the
        polarity).
    """
    # The detection functions return Python floats, so the UDFs are declared
    # as DoubleType: the columns stay numeric and get_analysis is handed a
    # number rather than its string rendering.
    polarity_detection_udf = udf(polarity_detection, DoubleType())
    subjectivity_detection_udf = udf(subjectivity_detection, DoubleType())
    get_analysis_udf = udf(get_analysis, StringType())

    # polarity detection
    tweets = tweets.withColumn("polarity", polarity_detection_udf("words"))

    # subjectivity detection
    tweets = tweets.withColumn("subjectivity", subjectivity_detection_udf("words"))

    # analysis
    tweets = tweets.withColumn('analysis', get_analysis_udf('polarity'))

    return tweets

In [None]:
#import libraries to visualize the results from stream
import time
from IPython import display
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def visualize_results_from_batch(tweets):
    """Draw a bar chart of the number of tweets per sentiment label.

    Args:
        tweets: Spark DataFrame with the columns "analysis" and "id".

    Returns:
        None. The chart is drawn inline with seaborn; the bar height is the
        number of rows with a non-null id for each label.
    """
    # Aggregate in Spark so that only one row per label is collected to the
    # driver instead of every tweet. Rows without a label are left out and the
    # labels are sorted, matching what a pandas groupby would produce.
    label_counts = (
        tweets
        .where(F.col('analysis').isNotNull())
        .groupBy('analysis')
        .agg(F.count('id').alias('id'))
        .orderBy('analysis')
        .toPandas()
    )
    get_ipython().run_line_magic('matplotlib', 'inline')
    sns.barplot(x='analysis', y='id', data=label_counts)


In [None]:
def start_batch_processing(spark, path='gs://cloud-project-bucket-22/Data/*'):
    """Read raw tweet files and run the full processing pipeline on them.

    Args:
        spark: SparkSession used to read the input.
        path: Location (glob allowed) of the text files to read, one
            JSON-encoded tweet per line. Defaults to the project bucket.

    Returns:
        The DataFrame produced by process_tweets followed by
        text_classification.
    """
    raw_lines = spark.read.text(path)
    tweets = process_tweets(raw_lines)
    return text_classification(tweets)

In [None]:
# Driver cell: create the session, run the batch pipeline and plot the result.
# NOTE(review): `tweets` is not cached here, and the following cells run
# further actions on it (count, SQL queries); presumably each of them re-runs
# the UDF pipeline -- consider caching the DataFrame if that is too slow.

# Create Spark session
spark = SparkSession.builder.appName("TwitterSentimentAnalysis").getOrCreate()

# Start batch processing
tweets = start_batch_processing(spark)

# Visualize results
visualize_results_from_batch(tweets)

In [None]:
# Print the total number of processed tweets.
print(tweets.count())
# Register the DataFrame as the temporary view "MyTable" so the cells below
# can query it through spark.sql.
tweets.createOrReplaceTempView("MyTable")

In [None]:
# Number of tweets labelled "Positive"; count(id) skips rows whose id is null.
df = spark.sql('SELECT count(id) FROM MyTable WHERE analysis = "Positive"')
df.show(5)

In [None]:
# Number of tweets labelled "Negative"; count(id) skips rows whose id is null.
df = spark.sql('SELECT count(id) FROM MyTable WHERE analysis = "Negative"')
df.show(5)

In [None]:
# Number of tweets labelled "Neutral"; count(id) skips rows whose id is null.
df = spark.sql('SELECT count(id) FROM MyTable WHERE analysis = "Neutral"')
df.show(5)