https://towardsdatascience.com/my-absolute-go-to-for-sentiment-analysis-textblob-3ac3a11d524

In [78]:
# Note - Output folder must be public to the internet
is_realtime = False
project_bucket_name = "cloud-project-bucket-ns-22"
topic = "Batman"
batch_size = 1000
input_folder_name = 'Input/{}/'.format(topic)

# request to get credentials at http://apps.twitter.com
consumer_key    = '**************************'
consumer_secret = '**************************'
access_token    = '**************************'
access_token_secret   = '**************************'

In [79]:
import os
import sys
import time
import json
import socket
from google.cloud import storage

from IPython.display import clear_output

In [81]:
import os
import re
import time
import json
import socket

import tweepy
from tweepy import Stream


# Documentation - https://docs.tweepy.org/en/stable/streaming.html
# Inherits from the Stream in tweepy - provides additional functionality
class TweetStream(Stream):
    
    def __init__(self, consumer_key, consumer_secret, access_token, access_token_secret, client_connection=None, rate_limit_delay=None, time_limit=None, tweet_limit=None, message_Handler=None, batch_size=None, batch_handler=None):
        
        # Set client connection
        self.client_connection = client_connection

        # Set rate limt
        self.rate_limit_delay = rate_limit_delay

        # Set time limit
        self.start_time = time.time()
        self.time_limit = time_limit
        
        # Set tweet limit
        self.tweet_limit = tweet_limit
        self.tweet_count = 0

        # Set batching
        self.batch_size = batch_size
        self.batch_handler = batch_handler
        self.batch = []

        # Set message handler
        self.message_handler = message_Handler
        
        # Initialize super
        super(TweetStream, self).__init__(consumer_key, consumer_secret, access_token, access_token_secret)
        
    def check_continue_time_limit(self):
        
        # Check time limit if set
        if self.time_limit is not None:
            if (time.time() - self.start_time) > self.time_limit:
                return False
        return True
    
    def check_continue_tweet_limit(self):
        # Check tweet limit if set
        if self.tweet_limit is not None:
            if self.tweet_limit <= self.tweet_count :
                return False
        return True

    def close_connections(self):
        
        # Process batch before disconnecting
        self.process_batch(is_disconnecting=True)

        print("Disconnecting...")

        # Disconnect client connection if provided
        if self.client_connection is not None:
           self.client_connection.close()

        # Disconnect stream
        self.disconnect()

        print("Disconnected.")


    def send_to_client(self, data):

        if self.client_connection is not None:
            # Send tweet data to client connection
            self.client_connection.send(data)

            # Send new line delimiter to client connection
            self.client_connection.send(str('\n').encode('utf-8'))


    def process_batch(self, is_disconnecting=False):
        if self.batch_size is not None:
            if len(self.batch) == self.batch_size or (is_disconnecting == True and len(self.batch) > 0):
                # Write to batch handler if provided
                if self.batch_handler is not None:
                    self.batch_handler(self.batch)
            
                # Write to client connection if provided
                self.send_to_client(self.batch)

                # Clear batch
                self.batch.clear()


    def on_data(self, data):
        try:
            
            # Check for time limit
            if self.check_continue_time_limit() is False:
                print('Time limit hit.')
                self.close_connections()
                return False

            # Check for tweet limit
            if self.check_continue_tweet_limit() is False:
                print('Tweet limit hit.')
                self.close_connections()
                return False
            
            # Check for limit message from twitter
            if '{"limit":' in str(data):
                print('Twitter rate limit - {}'.format(str(data)))
                return True

            # Process batch
            if self.batch_size is not None:
                self.batch.append(data)
                self.process_batch()

            # Write to message handler if provided
            if self.message_handler is not None:
                self.message_handler(data)

            # Write to client connection if provided
            if self.client_connection is not None:
                self.send_to_client(data)

            # Update tweet count
            self.tweet_count = self.tweet_count + 1

            # Check for rate limit delay
            if self.rate_limit_delay is not None:
                time.sleep(self.rate_limit_delay)
                
        except BaseException as e:
            print("Error on_data: %s" % str(e))
            self.close_connections()
            return False
        
        return True

    def if_error(self, status):
        print(status)
        return True

In [82]:
def upload_blob_from_memory(bucket_name, contents, destination_blob_name):
    """Uploads a file to the bucket."""

    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"

    # The contents to upload to the file
    # contents = "these are my contents"

    # The ID of your GCS object
    # destination_blob_name = "storage-object-name"

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_string(contents)

    print("{} uploaded to {}.".format(destination_blob_name, bucket_name))

In [83]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from the bucket."""
    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"

    # The ID of your GCS object
    # source_blob_name = "storage-object-name"

    # The path to which the file should be downloaded
    # destination_file_name = "local/path/to/file"

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Construct a client side representation of a blob.
    # Note `Bucket.blob` differs from `Bucket.get_blob` as it doesn't retrieve
    # any content from Google Cloud Storage. As we don't need additional data,
    # using `Bucket.blob` is preferred here.
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    #content = blob.download_as_string()

    print(
        "Downloaded storage object {} from bucket {} to local file {}.".format(
            source_blob_name, bucket_name, destination_file_name
        )
    )

In [84]:
def download_blob_as_string(bucket_name, source_blob_name):
    """Downloads a blob from the bucket."""
    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"

    # The ID of your GCS object
    # source_blob_name = "storage-object-name"

    # The path to which the file should be downloaded
    # destination_file_name = "local/path/to/file"

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Construct a client side representation of a blob.
    # Note `Bucket.blob` differs from `Bucket.get_blob` as it doesn't retrieve
    # any content from Google Cloud Storage. As we don't need additional data,
    # using `Bucket.blob` is preferred here.
    blob = bucket.blob(source_blob_name)
    content = blob.download_as_string()

    print("Downloaded storage object {} from bucket {}".format(source_blob_name, bucket_name))
    return content

In [85]:
def message_handler(data):
    clear_output(wait=True)
    print('Message Handler...')
    message = json.loads(data)
    print(message)

def batch_handler(batch):
    print('Batch Handler...')
    
    clear_output(wait=True)
    print('Batch Handler...')
    
    firstMessage = json.loads(batch[0])
    lastMessage = json.loads(batch[-1])
    
    # Create unique file name for batch
    fileName = input_folder_name + time.strftime("%Y-%m-%d-%H-%M-") + str(firstMessage['id']) + '-' + str(lastMessage['id']) + '.json'
    
    # Get content of batch
    content = ''
    for data in batch:
        message = json.loads(data)
        content = content + json.dumps(message) + '\n'
    
    # Upload batch to cloud storage
    upload_blob_from_memory(project_bucket_name, content, fileName)
    
    
def send_tweets_to_client_connection(client_connection, topic):
    # Create twitter stream
    twitter_stream = TweetStream(\
        consumer_key\
        , consumer_secret\
        , access_token\
        , access_token_secret\
        , client_connection\
        , message_Handler=message_handler\
        #, rate_limit_delay=1\
        #, tweet_limit=10\
        #, time_limit=5\
        #, batch_size=5\
        #, batch_handler=batch_handler\
        )
    
    # Filter for topic
    twitter_stream.filter(track=topic, languages=["en"])

In [86]:
def start_realtime_process(topic):
     # Get host name and port number for service.
     host = socket.gethostname()
     port = 5555
    
     # Initialize a socket
     s = socket.socket()
     s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    
     # Binding host and port
     s.bind((host, port))

     print("Listening on port {} with topic {}:".format(port,topic))

     # Waiting for client connection
     s.listen(5)
    
     # Establish connection with client
     client_connection, addr = s.accept()  

     # Start streaming tweets to client
     with client_connection:
         print("Connected with client: " + str(addr))

         # Send tweets to client through the socket connection
         send_tweets_to_client_connection(client_connection, topic)

In [87]:
def start_batch_process(topic):
    
    # Create twitter stream
    twitter_stream = TweetStream(\
        consumer_key\
        , consumer_secret\
        , access_token\
        , access_token_secret\
        #, message_Handler=message_handler\
        #, rate_limit_delay=0.1\
        #, tweet_limit=batch_size\
        #, time_limit=20\
        , batch_size=batch_size\
        , batch_handler=batch_handler\
        )
    
    # Filter for topic
    twitter_stream.filter(track=topic, languages=["en"])

In [88]:
if __name__ == "__main__":
    
#     # Get topic and is_realtime from args
#     args = sys.argv[1:]
#     if len(args) == 1:
#         topic = args[0]
#     if len(args) == 2:
#         is_realtime = bool(args[1])
        
    if is_realtime == True:
        print("Starting real-time process with topic = {}".format(topic))
        start_realtime_process(topic)
        print("Finished.")
    else:
        print("Starting batch process with topic = {}".format(topic))
        start_batch_process(topic)
        print("Finished.")

Message Handler...
{'created_at': 'Wed Mar 16 20:44:35 +0000 2022', 'id': 1504196956385648646, 'id_str': '1504196956385648646', 'text': 'RT @Thapz__: Having a supportive partner can literally change your life.', 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 2383286178, 'id_str': '2383286178', 'name': 'Ngwako Letter Serepe', 'screen_name': 'MasepengJnr', 'location': 'Gauteng', 'url': None, 'description': 'Content strategy & creation | PR & Media relations.', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 1878, 'friends_count': 767, 'listed_count': 646, 'favourites_count': 16956, 'statuses_count': 57487, 'created_at': 'Tue Mar 11 06:46:28 +0000 2014', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': 

KeyboardInterrupt: 