In [None]:
############## COLLECTING REAL DATA AND CREATING CASE STUDY DATASET##################### 
import pandas
import praw
from dotenv import dotenv_values
import os
import nltk
from nltk import word_tokenize
import re
import numpy as np
from bs4 import BeautifulSoup

#nltk.download('punkt')
#nltk.download('stopwords')


# Read the Reddit API credentials from the local .env file
env = dotenv_values(".env")

# Build the PRAW client; each credential key maps to the lower-cased
# keyword argument of the same name.
credential_keys = ("CLIENT_ID", "CLIENT_SECRET", "USER_AGENT", "REDIRECT_URI", "REFRESH_TOKEN")
reddit = praw.Reddit(**{key.lower(): env[key] for key in credential_keys})

# Start from the previously saved posts when the CSV is present,
# otherwise from an empty frame with the same three columns.
csv_file_name = "reddit_posts_with_comments.csv"
if not os.path.exists(csv_file_name):
    print("CSV file does not exist. It will be created after fetching new data.")
    df = pandas.DataFrame(columns=["Title", "Id", "Comments"])
else:
    print("CSV file already exists. Appending new data and avoiding duplicates.")
    df = pandas.read_csv(csv_file_name)  # existing rows are kept and extended below

# Subreddits to sample for the case-study dataset
targetObjects = ['conspiracy',
                 'WhitePeopleTwitter', 'politics', 'Republican', 'worldnews', 'CombatFootage', 'UkraineRussiaReport']

# Submission IDs that are already stored; a set keeps the duplicate check O(1)
known_ids = set(df["Id"])

# One dict per newly fetched submission, collected across all subreddits
new_rows = []

# Compiled once because it is applied to every comment
url_pattern = re.compile(r'http\S+|www\S+')

for subreddit_name in targetObjects:
    subreddit = reddit.subreddit(subreddit_name)

    # Print subreddit name
    print(subreddit.display_name)

    # Loop through the 20 currently "hot" submissions in the subreddit
    for iteration, submission in enumerate(subreddit.hot(limit=20), start=1):
        print(f"post {iteration}/20")

        # Skip submissions that are already in the dataset
        if submission.id in known_ids:
            continue
        known_ids.add(submission.id)

        # Expand up to 40 "load more comments" stubs, then flatten the tree
        submission.comments.replace_more(limit=40)
        submission_comments = []
        for comment in submission.comments.list():
            # Filter on the author's username. `comment.name` is the comment's
            # own fullname ("t1_<id>"), not the author, so it must not be used
            # here. `comment.author` is None when the account was deleted.
            author_name = comment.author.name if comment.author else ""
            if 'bot' in author_name.lower():
                continue

            # Use BeautifulSoup to remove HTML tags from content
            filtered_content = BeautifulSoup(comment.body, 'html.parser').get_text()

            # Remove URLs, drop '#' characters and lower-case the text
            filtered_content = url_pattern.sub('', filtered_content)
            filtered_content = filtered_content.replace('#', '').lower()
            submission_comments.append(filtered_content)

        new_rows.append(
            {"Title": submission.title, "Id": submission.id, "Comments": submission_comments}
        )

# Merge everything fetched in this run into the existing data in one step
if new_rows:
    new_data = pandas.DataFrame(new_rows, columns=["Title", "Id", "Comments"])
    df = pandas.concat([df, new_data], ignore_index=True)

# Drop duplicates based on the 'Id' column (submission IDs)
df.drop_duplicates(subset="Id", keep="last", inplace=True)

# Save the DataFrame to the CSV file
df.to_csv(csv_file_name, index=False)

# Print the shape of the DataFrame and display the first 10 rows
print(df.shape)
print(df.head(10))

print(f"CSV file '{csv_file_name}' has been generated/updated with the new Reddit posts and comments while avoiding duplicates.")



In [None]:
##############CLEANING THE DATASET ####################
import pandas as pd
import ast
import re
import emoji

# Source file with the scraped posts and destination for the cleaned copy
input_csv, output_csv = "reddit_posts_with_comments.csv", "cleaned_reddit_posts.csv"

# Load the scraped posts
df = pd.read_csv(input_csv)

# Strip emojis from a string
def remove_emojis(text):
    """Return *text* with every emoji replaced by the empty string."""
    without_emojis = emoji.replace_emoji(text, replace='')
    return without_emojis

# Function to clean and filter the comments of a single post
def clean_and_filter_comment(comment):
    """Parse one 'Comments' cell and return its cleaned comments.

    Args:
        comment: String representation of a list of comment strings, as
            stored in the CSV.

    Returns:
        A list of cleaned comment strings. Placeholder comments
        ("[deleted]" / "[removed]"), comments containing "[meta]" and
        comments that are empty after cleaning are left out.

    Raises:
        ValueError, SyntaxError: If the cell is not a valid Python literal.
    """
    # Convert the string representation of the list to an actual list
    comment_list = ast.literal_eval(comment)

    cleaned_comments = []
    for c in comment_list:
        # Skip the placeholders left behind for deleted or removed comments
        if c.strip() in ("[deleted]", "[removed]"):
            continue

        # Remove emojis and links
        c = remove_emojis(c)
        c = re.sub(r'http[s]?://\S+', '', c)

        # Skip comments carrying the "[meta]" marker (case-insensitive)
        if re.search(r'\[meta\]', c, re.IGNORECASE):
            continue

        # Collapse runs of whitespace into single spaces
        c = re.sub(r'\s+', ' ', c.strip())

        # A comment made only of emojis/links is empty by now. Keeping it
        # would leave blank entries and stop the caller's "no comments left"
        # row filter from ever matching.
        if c:
            cleaned_comments.append(c)
    return cleaned_comments

# Replace every 'Comments' cell with its cleaned list of comments
df['Comments'] = df['Comments'].map(clean_and_filter_comment)

# Keep only the rows that still have at least one comment
has_comments = df['Comments'].map(len) > 0
df = df[has_comments]

# Write the cleaned posts to the output CSV
df.to_csv(output_csv, index=False)




In [None]:
############## TOKENIZING THE TARGET DATASET ##################### 
import pandas as pd
import re
import numpy as np
import nltk
from nltk import word_tokenize

# Input: the cleaned posts; output: the tokenized text
csv_input = "cleaned_reddit_posts.csv"
csv_tokenized = "tokenized_csv.csv"
df = pd.read_csv(csv_input)

# Reduce a value to lower-case ASCII letters separated by spaces
def clean_text(text):
    """Return str(text) with each non-letter replaced by a space, stripped and lower-cased."""
    letters_only = re.sub(r'[^a-zA-Z]', ' ', str(text))
    return letters_only.strip().lower()

# Normalise the comment text, then split it into word tokens
df['Comments'] = df['Comments'].map(clean_text)
df['tokenized_text'] = df['Comments'].map(word_tokenize)

# English stopwords, held in a set for fast membership tests
stopwords_english = set(nltk.corpus.stopwords.words("english"))


def _drop_noise_tokens(tokens):
    """Discard stopwords and one-character tokens, keeping the original order."""
    return [
        token
        for token in tokens
        if token not in stopwords_english and len(token) > 1
    ]


df['tokenized_text'] = df['tokenized_text'].map(_drop_noise_tokens)

# Only the token lists are written out
df_cleaned = df[['tokenized_text']]
df_cleaned.to_csv(csv_tokenized, index=False)

# Show the size of the result and its first 10 rows
print(df_cleaned.shape)
print(df_cleaned.head(10))

print(
    f"CSV file '{csv_tokenized}' has been generated/updated with the tokenized text while avoiding duplicates and cleaning the data."
)


In [None]:
##################### DOWNLOADING AND CACHING MODELS ################################
import logging
import requests
from transformers import AutoTokenizer

# Log INFO and above with a timestamp and the level name
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Language code -> name of the hate-speech model used for that language
language_to_model = dict(
    en="IMSyPP/hate_speech_en",
    it="IMSyPP/hate_speech_it",
    nl="IMSyPP/hate_speech_nl",
    sl="IMSyPP/hate_speech_slo",
)

# Function to download models to the cache directory
def download_models_to_cache():
    """Download the tokenizer and the weights of every mapped model into ".cache".

    A failure for one model is logged with its traceback and does not stop
    the remaining downloads.
    """
    # Imported here because this cell's import list only brings in
    # AutoTokenizer.
    from transformers import AutoModelForSequenceClassification

    # Specify the cache directory for local caching
    cache_dir = ".cache"

    # Loop over the models and download them to the cache
    for model_name in language_to_model.values():
        try:
            logger.info(f"Downloading and caching model '{model_name}'...")
            # Fetch both parts. The tokenizer alone does not pull the weights,
            # so the first real use would still have to download them.
            AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
            AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=cache_dir)
            logger.info(f"Model '{model_name}' downloaded and cached successfully.")
        except Exception as e:
            # Best effort: report the failure and carry on with the next model
            logger.exception(f"An error occurred while downloading the model '{model_name}': {e}")

# Call the function to download models to the cache
download_models_to_cache()


In [None]:
############## PROCESSING DATA AND CREATING THE SENTIMENT ANALYSIS ####################
import pandas as pd
import torch
from langdetect import detect
from tqdm import tqdm
import logging
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Input: the cleaned posts; output: the per-comment classification results
input_csv = "cleaned_reddit_posts.csv"
output_csv = "reddit_posts_with_labels.csv"

# Language code -> name of the hate-speech model used for that language
language_to_model = dict(
    en="IMSyPP/hate_speech_en",
    it="IMSyPP/hate_speech_it",
    nl="IMSyPP/hate_speech_nl",
    sl="IMSyPP/hate_speech_slo",
)

# Model used for languages that have no entry in the mapping
default_model_name = language_to_model["en"]

# Quieten the transformers library: only CRITICAL records are let through
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.CRITICAL)

# Log INFO and above with a timestamp and the level name
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# (tokenizer, model) pairs already loaded in this session, keyed by model name
_loaded_models = {}


def load_model(language):
    """Return the (tokenizer, model) pair for *language*.

    Languages without an entry in ``language_to_model`` use
    ``default_model_name``. Each pair is loaded once and then reused, because
    the caller asks for a model on every batch and reloading from disk each
    time is slow.

    Args:
        language: Language code such as 'en' or 'it'.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    model_name = language_to_model.get(language, default_model_name)

    # Key on the model name so that every unmapped language shares the
    # default model's entry.
    if model_name in _loaded_models:
        return _loaded_models[model_name]

    cache_dir = ".cache"
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

    # Set special tokens (if applicable)
    # NOTE(review): tokens added here enlarge the tokenizer vocabulary while
    # the model's embedding matrix is left as is - confirm these tokens never
    # end up in model inputs, or resize the embeddings.
    special_tokens = {}
    if tokenizer.bos_token is None:
        special_tokens["bos_token"] = "[BOS]"  # Use the string token here
    if tokenizer.eos_token is None:
        special_tokens["eos_token"] = "[EOS]"  # Use the string token here

    if special_tokens:
        tokenizer.add_special_tokens(special_tokens)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=cache_dir)

    _loaded_models[model_name] = (tokenizer, model)
    return tokenizer, model

# Function to analyze a comment and return the results
def analyze_comment(comment, language, tokenizer, model):
    """Classify one comment and return its class probabilities.

    Args:
        comment: Text to classify.
        language: Language code; passed through unchanged to the result.
        tokenizer: Callable turning the text into keyword inputs for *model*.
        model: Callable returning an object with a ``logits`` tensor whose
            second dimension holds the classes.

    Returns:
        Tuple ``(probabilities, language, analyze_error)``. On success
        ``probabilities`` is the softmax over the logits of the single input
        and ``analyze_error`` is None. On any failure ``probabilities`` is
        four zeros and ``analyze_error`` is the exception message.
    """
    try:
        # Truncate to the model's maximum length, as the batch path does.
        # Without it an over-long comment raises inside the model and is
        # silently turned into an all-zero row.
        inputs = tokenizer(comment, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        probabilities = torch.softmax(outputs.logits, dim=1).tolist()[0]
        analyze_error = None
    except Exception as e:
        # Deliberate best effort: report the failure through the return value
        probabilities = [0.0] * 4
        analyze_error = str(e)
    return probabilities, language, analyze_error


# Imported here because this cell's import list provides neither name
import ast
from langdetect.lang_detect_exception import LangDetectException

# Read the CSV file
df = pd.read_csv(input_csv)

# Every 'Comments' cell is the text form of a list. Parse each one once with
# literal_eval, which only accepts literals and so, unlike eval, cannot run
# code embedded in the file.
comment_lists = [ast.literal_eval(cell) for cell in df['Comments']]

# Total number of comments to process (list entries, not characters)
total_comments = sum(len(comment_list) for comment_list in comment_lists)

# Number of comments classified per model call
batch_size = 100

# Counters for classified comments and completed batches
processed_comments = 0
processed_batch_count = 0

# All result rows of this run
final_results = []
result_columns = ['comment', 'probabilities_acceptable', 'probabilities_hate', 'probabilities_offensive', 'probabilities_violent', 'language', 'errors']

# Load the default model once. The dict keeps one (tokenizer, model) pair per
# model name so that no model is reloaded for every batch.
default_tokenizer, default_model = load_model('en')
loaded_models = {default_model_name: (default_tokenizer, default_model)}

# True once the output file has been started in this run
results_written = False

progress_bar = tqdm(total=total_comments, desc="Processing Comments", leave=False)

for comment_list in comment_lists:
    for start in range(0, len(comment_list), batch_size):
        raw_batch = comment_list[start:start + batch_size]

        # Filter out empty or very short comments
        batch = [comment for comment in raw_batch if len(comment.strip()) > 7]  # Adjust the length threshold as needed

        if batch:
            try:
                # Detect the language from the first comment of the batch.
                # When detection fails the default model is used instead of
                # dropping the whole batch.
                try:
                    language = detect(batch[0])
                except LangDetectException:
                    language = "unknown"

                model_name = language_to_model.get(language, default_model_name)
                if model_name not in loaded_models:
                    loaded_models[model_name] = load_model(language)
                tokenizer, model = loaded_models[model_name]

                inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
                with torch.no_grad():
                    outputs = model(**inputs)
                probabilities = torch.softmax(outputs.logits, dim=1).tolist()

                # One row per comment; probabilities are stored as "12.34%"
                batch_rows = [
                    (
                        comment,
                        f"{probs[0] * 100:.2f}%",
                        f"{probs[1] * 100:.2f}%",
                        f"{probs[2] * 100:.2f}%",
                        f"{probs[3] * 100:.2f}%",
                        language,
                        "None",
                    )
                    for comment, probs in zip(batch, probabilities)
                ]

                # Checkpoint: write only the new rows. The first write starts
                # the file with a header; later writes append without one, so
                # the file never contains repeated headers or repeated rows.
                pd.DataFrame(batch_rows, columns=result_columns).to_csv(
                    output_csv,
                    index=False,
                    mode='a' if results_written else 'w',
                    header=not results_written,
                )
                results_written = True

                final_results.extend(batch_rows)
                processed_comments += len(batch_rows)
                processed_batch_count += 1
            except Exception as e:
                # Keep going with the next batch, but record the full traceback
                logger.exception("Error processing batch: %s", e)

        # Advance by every comment looked at, including filtered-out ones,
        # so the bar reaches total_comments.
        progress_bar.update(len(raw_batch))
        progress_bar.set_postfix({"Processed": f"{processed_comments}/{total_comments}"})

# Close the progress bar
progress_bar.close()

# Save the final results, replacing the checkpoint file with the complete set
result_df = pd.DataFrame(final_results, columns=result_columns)
result_df.to_csv(output_csv, index=False)

logger.info("Analysis completed. Results saved to: %s", output_csv)

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import base64
import os
import asyncio
from azure.servicebus.aio import ServiceBusClient
from azure.servicebus import ServiceBusMessage
from dotenv import dotenv_values
from datetime import date
import json

env = dotenv_values(".env")


#### send the images to the queue
async def send_single_message(sender, data):
    """Serialize *data* to JSON and send it through *sender* as one Service Bus message."""
    payload = ServiceBusMessage(json.dumps(data))
    await sender.send_messages(payload)
    print("Sent a single message")


namespace_connection_str = env["NAMESPACE_CONNECTION_STR"]
queue_name = env["QUEUE_NAME"]
servicebus_client = ServiceBusClient.from_connection_string(
    conn_str=namespace_connection_str
)

# Get the queue sender
sender = servicebus_client.get_queue_sender(queue_name=queue_name)

# Load the CSV data into a pandas DataFrame
csv_path = "reddit_posts_with_labels.csv"
data = pd.read_csv(csv_path)

# Clean the data by removing rows with non-numeric values
data = data[data["probabilities_acceptable"].str.match(r"^\d+\.\d+%$")]

# Convert percentage strings to numeric values
data["probabilities_acceptable"] = pd.to_numeric(
    data["probabilities_acceptable"].str.rstrip("%")
)
data["probabilities_hate"] = data["probabilities_hate"].str.rstrip("%").astype(float)
data["probabilities_offensive"] = (
    data["probabilities_offensive"].str.rstrip("%").astype(float)
)
data["probabilities_violent"] = (
    data["probabilities_violent"].str.rstrip("%").astype(float)
)

# Calculate average probabilities for each content type
avg_accept = data["probabilities_acceptable"].mean()
avg_hate = data["probabilities_hate"].mean()
avg_offensive = data["probabilities_offensive"].mean()
avg_violent = data["probabilities_violent"].mean()

# Create a bar graph using Matplotlib
labels = ["Acceptable", "Hate", "Offensive", "Violent"]
values = [avg_accept, avg_hate, avg_offensive, avg_violent]

plt.bar(labels, values)
plt.ylabel("Average Probability")
plt.title("Average Probabilities of Content Types")
plt.tight_layout()

# Save the bar graph as an image
image_path = "bar_graph.png"
plt.savefig(image_path)
plt.close()

# Read the image as binary data and encode it as base64
with open(image_path, "rb") as image_file:
    image_binary = image_file.read()
    image_base64 = base64.b64encode(image_binary).decode("utf-8")

# Create a dictionary to store image data
image_data = {"chart_type": "Bar Graph", "image_base64": image_base64}

# Remove the temporary image file
os.remove(image_path)
await send_single_message(sender, image_data)

# Close the sender and the client
await sender.close()
await servicebus_client.close()

2023-12-05 22:33:48,077 - INFO - Connection state changed: None -> <ConnectionState.START: 0>
2023-12-05 22:33:48,236 - INFO - Connection state changed: <ConnectionState.START: 0> -> <ConnectionState.HDR_SENT: 2>
2023-12-05 22:33:48,237 - INFO - Connection state changed: <ConnectionState.HDR_SENT: 2> -> <ConnectionState.HDR_SENT: 2>
2023-12-05 22:33:48,238 - INFO - Connection state changed: <ConnectionState.HDR_SENT: 2> -> <ConnectionState.OPEN_PIPE: 4>
2023-12-05 22:33:48,238 - INFO - Session state changed: <SessionState.UNMAPPED: 0> -> <SessionState.BEGIN_SENT: 1>
2023-12-05 22:33:48,239 - INFO - Link state changed: <LinkState.DETACHED: 0> -> <LinkState.ATTACH_SENT: 1>
2023-12-05 22:33:48,240 - INFO - Management link receiver state changed: <LinkState.DETACHED: 0> -> <LinkState.ATTACH_SENT: 1>
2023-12-05 22:33:48,241 - INFO - Link state changed: <LinkState.DETACHED: 0> -> <LinkState.ATTACH_SENT: 1>
2023-12-05 22:33:48,241 - INFO - Management link sender state changed: <LinkState.DETA

Sent a single message


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import base64
import os
import asyncio
from azure.servicebus.aio import ServiceBusClient
from azure.servicebus import ServiceBusMessage
from dotenv import dotenv_values
from datetime import date
import json

env = dotenv_values(".env")


# Function to send a single message
async def send_single_message(sender, data):
    """Serialize *data* to JSON and send it through *sender* as one Service Bus message."""
    payload = ServiceBusMessage(json.dumps(data))
    await sender.send_messages(payload)
    print("Sent a single message")


namespace_connection_str = env["NAMESPACE_CONNECTION_STR"]
queue_name = env["QUEUE_NAME"]
servicebus_client = ServiceBusClient.from_connection_string(
    conn_str=namespace_connection_str
)

# Get the queue sender
sender = servicebus_client.get_queue_sender(queue_name=queue_name)

# Read the CSV file
dataset = pd.read_csv("reddit_posts_with_labels.csv", encoding="utf-8")

# Clean the data by removing rows with non-numeric values
dataset = dataset[dataset["probabilities_acceptable"].str.match(r"^\d+\.\d+%$")]

# Convert probability columns to numeric (float) type
probability_columns = [
    "probabilities_acceptable",
    "probabilities_hate",
    "probabilities_offensive",
    "probabilities_violent",
]
for column in probability_columns:
    dataset[column] = pd.to_numeric(dataset[column].str.rstrip("%"))

# Create a histogram plot
plt.hist(dataset["probabilities_acceptable"], bins=20, alpha=0.5, label="Acceptable")
plt.hist(dataset["probabilities_hate"], bins=20, alpha=0.5, label="Hate")
plt.hist(dataset["probabilities_offensive"], bins=20, alpha=0.5, label="Offensive")
plt.hist(dataset["probabilities_violent"], bins=20, alpha=0.5, label="Violent")
plt.xlabel("Probability")
plt.ylabel("Frequency")
plt.title("Distribution of Probabilities")
plt.legend(loc="upper right")
plt.tight_layout()

# Save the histogram plot as an image
image_path = "histogram.png"
plt.savefig(image_path)
plt.close()

# Read the image as binary data and encode it as base64
with open(image_path, "rb") as image_file:
    image_binary = image_file.read()
    image_base64 = base64.b64encode(image_binary).decode("utf-8")

# Create a dictionary to store image data
image_data = {"chart_type": "Histogram", "image_base64": image_base64}

# Remove the temporary image file
os.remove(image_path)

# Send the image data to the queue
await send_single_message(sender, image_data)

# Close the sender and the client
await sender.close()
await servicebus_client.close()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import base64
import os
import asyncio
from azure.servicebus.aio import ServiceBusClient
from azure.servicebus import ServiceBusMessage
from dotenv import dotenv_values
from datetime import date
import json

env = dotenv_values(".env")


# Function to send a single message
async def send_single_message(sender, data):
    """Serialize *data* to JSON and send it through *sender* as one Service Bus message."""
    payload = ServiceBusMessage(json.dumps(data))
    await sender.send_messages(payload)
    print("Sent a single message")


namespace_connection_str = env["NAMESPACE_CONNECTION_STR"]
queue_name = env["QUEUE_NAME"]
servicebus_client = ServiceBusClient.from_connection_string(
    conn_str=namespace_connection_str
)

# Get the queue sender
sender = servicebus_client.get_queue_sender(queue_name=queue_name)

# Read the CSV file
dataset = pd.read_csv("reddit_posts_with_labels.csv", encoding="utf-8")

# Clean the data by removing rows with non-numeric values
dataset = dataset[dataset["probabilities_acceptable"].str.match(r"^\d+\.\d+%$")]

# Convert probability columns to numeric (float) type
probability_columns = [
    "probabilities_acceptable",
    "probabilities_hate",
    "probabilities_offensive",
    "probabilities_violent",
]
for column in probability_columns:
    dataset[column] = pd.to_numeric(dataset[column].str.rstrip("%"))

# Create a scatter plot
scatter = plt.scatter(
    dataset["probabilities_acceptable"],
    dataset["probabilities_hate"],
    c=dataset["probabilities_violent"],  # Use 'probabilities_violent' for color
    cmap="viridis",
    s=dataset["probabilities_offensive"]
    * 100,  # Use 'probabilities_offensive' for marker size
    alpha=0.7,
)

plt.xlabel("Acceptable Probability")
plt.ylabel("Hate Probability")
plt.title("Scatter Plot of Hate vs. Acceptable Probabilities")
plt.colorbar(label="Violent Probability")
plt.tight_layout()

# Save the scatter plot as an image
image_path = "scatterplot.png"
plt.savefig(image_path)
plt.close()

# Read the image as binary data and encode it as base64
with open(image_path, "rb") as image_file:
    image_binary = image_file.read()
    image_base64 = base64.b64encode(image_binary).decode("utf-8")

# Create a dictionary to store image data
image_data = {"chart_type": "Scatter Plot", "image_base64": image_base64}

# Remove the temporary image file
os.remove(image_path)

# Send the image data to the queue
await send_single_message(sender, image_data)

# Close the sender and the client
await sender.close()
await servicebus_client.close()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import base64
import os
from azure.servicebus.aio import ServiceBusClient
from azure.servicebus import ServiceBusMessage
from dotenv import dotenv_values
import asyncio
import json

env = dotenv_values(".env")


# Function to send a single message
async def send_single_message(sender, data):
    """Serialize *data* to JSON and send it through *sender* as one Service Bus message."""
    payload = ServiceBusMessage(json.dumps(data))
    await sender.send_messages(payload)
    print("Sent a single message")


namespace_connection_str = env["NAMESPACE_CONNECTION_STR"]
queue_name = env["QUEUE_NAME"]
servicebus_client = ServiceBusClient.from_connection_string(
    conn_str=namespace_connection_str
)

# Get the queue sender
sender = servicebus_client.get_queue_sender(queue_name=queue_name)

# Read the CSV file
dataset = pd.read_csv("reddit_posts_with_labels.csv", encoding="utf-8")

# Clean the data by removing rows with non-numeric values
dataset = dataset[dataset["probabilities_acceptable"].str.match(r"^\d+\.\d+%$")]

# Convert probability columns to numeric (float) type
probability_columns = [
    "probabilities_acceptable",
    "probabilities_hate",
    "probabilities_offensive",
    "probabilities_violent",
]
for column in probability_columns:
    dataset[column] = pd.to_numeric(dataset[column].str.rstrip("%"))

# Create and save the bar chart as an image
data = dataset[
    [
        "probabilities_acceptable",
        "probabilities_hate",
        "probabilities_offensive",
        "probabilities_violent",
    ]
]
ax = data.plot(kind="bar", stacked=True)
plt.xlabel("Comment")
plt.ylabel("Probability")
plt.title("Content Type Probabilities in Each Comment")
plt.legend(title="Content Type")
plt.xticks(rotation=90)
plt.tight_layout()
image_path = "barchart.png"
plt.savefig(image_path)
plt.close()

# Read the image as binary data and encode it as base64
with open(image_path, "rb") as image_file:
    image_binary = image_file.read()
    image_base64 = base64.b64encode(image_binary).decode("utf-8")

# Create a dictionary to store image data
image_data = {"chart_type": "Bar Chart", "image_base64": image_base64}

# Remove the temporary image file
os.remove(image_path)

# Send the image data to the queue
await send_single_message(sender, image_data)

# Close the sender and the client
await sender.close()
await servicebus_client.close()

In [None]:
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
import base64
import os
from azure.servicebus.aio import ServiceBusClient
from azure.servicebus import ServiceBusMessage
from dotenv import dotenv_values
import asyncio
import json

env = dotenv_values(".env")


# Function to send a single message
async def send_single_message(sender, data):
    """Serialize *data* to JSON and send it through *sender* as one Service Bus message."""
    payload = ServiceBusMessage(json.dumps(data))
    await sender.send_messages(payload)
    print("Sent a single message")


namespace_connection_str = env["NAMESPACE_CONNECTION_STR"]
queue_name = env["QUEUE_NAME"]
servicebus_client = ServiceBusClient.from_connection_string(
    conn_str=namespace_connection_str
)

# Get the queue sender
sender = servicebus_client.get_queue_sender(queue_name=queue_name)

# Read the CSV file
dataset = pd.read_csv("reddit_posts_with_labels.csv", encoding="utf-8")

# Clean the data by removing rows with non-numeric values
dataset = dataset[dataset["probabilities_acceptable"].str.match(r"^\d+\.\d+%$")]

# Convert probability columns to numeric (float) type
probability_columns = [
    "probabilities_acceptable",
    "probabilities_hate",
    "probabilities_offensive",
    "probabilities_violent",
]
for column in probability_columns:
    dataset[column] = pd.to_numeric(dataset[column].str.rstrip("%"))

# Generate word clouds for each probability column
for column in probability_columns:
    filtered_comments = " ".join(
        [
            text
            for text, prob_value in zip(dataset["comment"], dataset[column])
            if prob_value >= 10
        ]
    )

    # Check if there are words to generate a word cloud
    if filtered_comments:
        # Generate and display the word cloud
        wordcloud = WordCloud(
            width=800, height=500, random_state=21, max_font_size=110
        ).generate(filtered_comments)

        # Save the word cloud as an image
        image_path = f"wordcloud_{column}.png"
        wordcloud.to_file(image_path)

        # Read the image as binary data and encode it as base64
        with open(image_path, "rb") as image_file:
            image_binary = image_file.read()
            image_base64 = base64.b64encode(image_binary).decode("utf-8")

        # Create a dictionary to store image data
        image_data = {
            "chart_type": f"Word Cloud - {column}",
            "image_base64": image_base64,
        }

        # Send the image data to the queue
        await send_single_message(sender, image_data)

        # Remove the temporary image file
        os.remove(image_path)
    else:
        print(f"No words found for {column}")

# Close the sender and the client
await sender.close()
await servicebus_client.close()

In [None]:
import asyncio
from azure.servicebus.aio import ServiceBusClient
from azure.servicebus import ServiceBusMessage
from dotenv import dotenv_values
import csv
import json

env = dotenv_values(".env")


# Function to send a single message
async def send_single_message(sender, data):
    """Serialize *data* to JSON and send it through *sender* as one Service Bus message."""
    payload = ServiceBusMessage(json.dumps(data))
    await sender.send_messages(payload)
    print("Sent a single message")


namespace_connection_str = env["NAMESPACE_CONNECTION_STR"]
queue_name = env["QUEUE_NAME"]
servicebus_client = ServiceBusClient.from_connection_string(
    conn_str=namespace_connection_str
)

# Get the queue sender
sender = servicebus_client.get_queue_sender(queue_name=queue_name)

# Read CSV file and convert to a list of dictionaries
csv_file = "./reddit_posts_with_labels.csv"
json_data = []

with open(csv_file, "r", encoding="utf-8") as csvfile:
    csv_reader = csv.DictReader(csvfile)
    for row in csv_reader:
        json_data.append(row)

# Remove the "_id" field from the JSON documents
for doc in json_data:
    doc.pop("_id", None)

# Write the list of dictionaries to a JSON file
json_file = "data.json"
with open(json_file, "w") as jsonfile:
    json.dump(json_data, jsonfile, indent=4)

print("CSV data converted to JSON successfully.")

# Load the JSON data from the file
with open(json_file, "r") as file:
    data_to_insert = json.load(file)

# Check for duplicates in the JSON data based on the "comment" field and remove them
seen_comments = set()
unique_data_to_insert = []

for doc in data_to_insert:
    comment = doc["comment"]
    if comment not in seen_comments:
        unique_data_to_insert.append(doc)
        seen_comments.add(comment)

# Send the unique data to the queue
await send_single_message(sender, unique_data_to_insert)

# Close the sender and the client
await sender.close()
await servicebus_client.close()