# Analyse Mastodon social media posts

### Install dependencies

In [None]:
!pip install confluent-kafka

### Import libraries

In [None]:
from confluent_kafka import Producer, Consumer, KafkaError # to produce and consume data from Apache Kafka topics
import boto3 # to programmatically create, configure, and manage AWS resources
import json # to work with social media messages that are represented as JSON objects
import re # for helper functionality to clean HTML tags from social media messages


## Prepare models 

### Deploy Hugging Face model to identify offensive posts

The model we use here is https://huggingface.co/cardiffnlp/twitter-roberta-base-offensive

In [None]:
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel

# Use the SageMaker execution role when it can be resolved; if that raises
# ValueError, look the role up in IAM by its name instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Environment passed to the model container: which Hugging Face Hub model to
# load (see https://huggingface.co/models) and which task it serves.
hub = {
    'HF_MODEL_ID': 'cardiffnlp/twitter-roberta-base-offensive',
    'HF_TASK': 'text-classification',
}

# Describe the model: framework versions, container environment and IAM role.
huggingface_model = HuggingFaceModel(
    transformers_version='4.26.0',
    pytorch_version='1.13.1',
    py_version='py39',
    env=hub,
    role=role,
)

# Deploy the model to a SageMaker inference endpoint backed by a single
# ml.m5.xlarge instance; the returned predictor is used to query it.
offensive_predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
)

### Use a JumpStart model to predict if a message is positive or negative

In [None]:
# Translates the raw labels emitted by the sentiment model into the
# human-readable 'negative' / 'positive' names.
label_mapping = {
    'LABEL_0': 'negative',
    'LABEL_1': 'positive',
}

# TODO: Set endpoint name of your model for sentiment analysis
SENTIMENT_ENDPOINT_NAME = ''

# Shared SageMaker runtime client; created on the first prediction so that a
# new client is not built for every processed message.
_sagemaker_runtime_client = None


def _get_sagemaker_runtime_client():
    """Return the shared SageMaker runtime client, creating it on first use."""
    global _sagemaker_runtime_client
    if _sagemaker_runtime_client is None:
        _sagemaker_runtime_client = boto3.client('runtime.sagemaker')
    return _sagemaker_runtime_client


def get_sentiment_prediction(text, endpoint_name=None):
    """Classify the sentiment of ``text`` with a SageMaker endpoint.

    Args:
        text: Text to classify; sent as the request body with content type
            ``application/x-text``.
        endpoint_name: Name of the SageMaker endpoint to invoke. When omitted,
            ``SENTIMENT_ENDPOINT_NAME`` is used.

    Returns:
        A ``(probabilities, labels, predicted_label)`` tuple read from the
        endpoint's JSON response. ``predicted_label`` is translated through
        ``label_mapping``; a label missing from the mapping is returned as-is.
    """
    if endpoint_name is None:
        endpoint_name = SENTIMENT_ENDPOINT_NAME

    query_response = _get_sagemaker_runtime_client().invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/x-text',
        Body=text,
        Accept='application/json;verbose',
    )
    model_predictions = json.loads(query_response['Body'].read())

    probabilities = model_predictions['probabilities']
    labels = model_predictions['labels']
    # Map the model's predicted label to the readable label using the mapping dictionary
    predicted_label = label_mapping.get(
        model_predictions['predicted_label'],
        model_predictions['predicted_label'],
    )
    return probabilities, labels, predicted_label


## Apache Kafka workflow

### Define Apache Kafka connection properties

In [None]:
# TODO: Load Apache Kafka certificates into certificates folder
# TLS settings shared by every Kafka client created in this notebook.
apache_kafka_ssl_config = {
    "ssl.ca.location": "certificates/ca.pem",
    "ssl.certificate.location": "certificates/service.cert",
    "ssl.key.location": "certificates/service.key",
    "security.protocol": "ssl",
}

# TODO: Set URI for Apache Kafka
apache_kafka_uri = ""

# Topic the posts are read from, and the topics that receive enriched posts
# and processing errors respectively.
apache_kafka_input_topic_name = "mastodon"
apache_kafka_enriched_output_topic_name = "enriched_data"
apache_kafka_processing_errors_topic_name = "processing_errors"


### Create Apache Kafka Consumer

In [None]:
# Kafka consumer for the input topic; the TLS settings are merged in last,
# after the consumer-specific options.
consumer = Consumer(
    {
        'bootstrap.servers': apache_kafka_uri,
        'group.id': 'mygroup10',
        'auto.offset.reset': 'earliest',
        **apache_kafka_ssl_config,
    }
)
consumer.subscribe([apache_kafka_input_topic_name])

# Non-greedy pattern matching anything between '<' and '>' (used to strip
# HTML tags from post content).
CLEANR = re.compile('<.*?>')

def get_json_body(message):
    """Return the payload of a Kafka ``message`` parsed as JSON.

    The raw bytes from ``message.value()`` are decoded as UTF-8 and then
    handed to ``json.loads``.
    """
    return json.loads(message.value().decode('utf-8'))

def get_clean_content(json_object):
    """Return the ``content`` property of a post with HTML tags removed.

    Args:
        json_object: Parsed post (a dict); ``content`` is optional.

    Returns:
        The content with every ``CLEANR`` match removed, or ``""`` when the
        property is missing, ``None`` or empty.
    """
    # The key may be present with a null value; a plain `.get("content", "")`
    # would then pass None to the regex and raise TypeError.
    content = json_object.get("content") or ""
    return CLEANR.sub('', content)


### Create Apache Kafka Producer

In [None]:
# Kafka producer pointed at the same cluster and using the same TLS settings
# as the rest of the notebook.
producer = Producer(
    {
        'bootstrap.servers': apache_kafka_uri,
        **apache_kafka_ssl_config,
    }
)

def send_message(message, topic_name):
    """Publish ``message`` to the Kafka topic ``topic_name``.

    The message is serialised to JSON, encoded as UTF-8 and produced; the
    producer is flushed before returning.
    """
    payload = json.dumps(message).encode('utf-8')
    producer.produce(topic_name, payload)
    producer.flush()
    
def send_enriched_data(message, probabilities, predicted_label,
                       offensive_score=None, offensive_label=None):
    """Attach model results to ``message`` and publish it to the enriched topic.

    Args:
        message: Parsed post (a dict); it is modified in place.
        probabilities: Sentiment probabilities, stored under ``probabilities``.
        predicted_label: Sentiment label, stored under ``predition``.
        offensive_score: Optional score from the offensive-content model,
            stored under ``offensive_score`` when given.
        offensive_label: Optional label from the offensive-content model,
            stored under ``offensive_label`` when given.
    """
    message['probabilities'] = probabilities
    # NOTE(review): 'predition' is misspelled, but it is kept unchanged because
    # consumers of the enriched topic may already depend on this key.
    message['predition'] = predicted_label
    # The offensive-model fields are optional so that existing three-argument
    # calls keep producing exactly the same payload as before.
    if offensive_score is not None:
        message['offensive_score'] = offensive_score
    if offensive_label is not None:
        message['offensive_label'] = offensive_label
    send_message(message, apache_kafka_enriched_output_topic_name)
    
def report_processing_error(message, error_code, error_message):
    """Tag ``message`` with error details and publish it to the errors topic.

    ``message`` is modified in place: the code and message are stored under
    ``processing_error_code`` and ``processing_error_message``.
    """
    message.update(
        processing_error_code=error_code,
        processing_error_message=error_message,
    )
    send_message(message, apache_kafka_processing_errors_topic_name)
    


### Read messages from Apache Kafka **input topic** and push processed data back to **output topic**

In [None]:
# Main processing loop: poll the input topic, run both models on every post
# with non-empty text, and publish either the enriched post or an error record.
print("Processing messages")
try:
    while True:
        message = consumer.poll(1.0)  # Poll for messages, with a timeout of 1 second

        if message is None:
            continue

        if message.error():
            if message.error().code() == KafkaError._PARTITION_EOF:
                # End of partition event
                print(f"Reached end of partition for topic {message.topic()} [{message.partition()}]")
            else:
                print(f"Error while consuming message: {message.error()}")
            continue

        # Parse the payload. A single malformed message (invalid UTF-8 or JSON,
        # empty payload, non-object JSON, non-text content) must not stop the
        # loop, so it is logged and skipped.
        try:
            json_body = get_json_body(message)
            content_property = get_clean_content(json_body)
        except (ValueError, AttributeError, TypeError) as e:
            print(f"Skipping malformed message: {e}")
            continue

        if content_property == "":
            continue

        try:
            probabilities, _labels, predicted_label = get_sentiment_prediction(content_property)
            offensive_value = offensive_predictor.predict({
                "inputs": content_property
            })[0]

            print(f"Inference:\n"
                  f"Input text: '{content_property}'\n"
                  f"Sentiment prediction: {probabilities}\n"
                  f"The message is {predicted_label}\n"
                  f"Offencive score: {offensive_value['score']}\n"
                  f"The message is: {offensive_value['label']}\n")

            # Attach the offensive-model result to the post before publishing;
            # the sentiment fields are added by send_enriched_data.
            json_body['offensive_score'] = offensive_value['score']
            json_body['offensive_label'] = offensive_value['label']
            send_enriched_data(json_body, probabilities, predicted_label)

        except Exception as e:
            print(f"An error occurred: {e}")
            # Errors that carry a dict `response` attribute expose a code and
            # message under "Error"; for anything else fall back to the
            # exception text so the cause is not lost.
            response = getattr(e, "response", {})
            if not isinstance(response, dict):
                response = {}
            error_details = response.get("Error", {})
            error_code = error_details.get("Code", "Unknown")
            error_message = error_details.get("Message", str(e))
            report_processing_error(json_body, error_code, error_message)
finally:
    # Close the consumer, including when the loop is interrupted or fails
    consumer.close()
