In [None]:
import json

import matplotlib.pyplot as plt
import nltk
from empath import Empath
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.stats import mannwhitneyu

# Download required NLTK resources (tokenizer models and stopword lists).
# NOTE(review): recent NLTK releases may additionally need the 'punkt_tab'
# resource for word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

# Initialize Empath
empath = Empath()

# Load JSON data. The encoding is pinned to UTF-8 (the JSON interchange
# encoding) so that decoding does not depend on the platform's locale default.
with open('../transcriptions_english_non_hate_final.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Concatenate all transcripts into a single text
combined_text = ' '.join(entry['transcription'] for entry in data)

# Tokenize and preprocess text: keep purely alphabetic tokens, lowercase them
# once, then drop English stopwords.
words = word_tokenize(combined_text)
stop_words = set(stopwords.words('english'))
lowered_words = (word.lower() for word in words if word.isalpha())
filtered_words = [word for word in lowered_words if word not in stop_words]

# Analyze with Empath
scores = empath.analyze(' '.join(filtered_words))

# Define categories you are interested in.
# 'neg_emo' is looked up like the others here, but a later cell overwrites it
# with the mean of the ten negative categories.
categories = ['aggression', 'kill', 'swearing_terms', 'fear', 'violence', 'war',
              'injury', 'shame', 'neg_emo', 'family', 'children', 'home', 'death', 'sadness']

# Normalize values so that every score is a share of the total across all
# returned categories. If nothing matched at all (total of 0), fall back to
# all-zero shares instead of raising ZeroDivisionError.
total_score = sum(scores.values())
if total_score:
    normalized_scores = {key: value / total_score for key, value in scores.items()}
else:
    normalized_scores = {key: 0.0 for key in scores}

# Extract scores for these categories (0 for any category Empath did not return)
filtered_scores_non_hate = {cat: normalized_scores.get(cat, 0) for cat in categories}


In [None]:
# Overwrite 'neg_emo' with the unweighted mean of the ten negative categories.
# NOTE(review): filtered_scores_hate is not defined in the cells visible here;
# presumably it comes from running the scoring cell on the hate dataset — confirm.
hate_negative_keys = ('death', 'fear', 'kill', 'aggression', 'swearing_terms',
                      'violence', 'war', 'injury', 'shame', 'sadness')

# Accumulate strictly left to right so the floating-point result is unchanged.
hate_negative_total = filtered_scores_hate[hate_negative_keys[0]]
for hate_negative_key in hate_negative_keys[1:]:
    hate_negative_total = hate_negative_total + filtered_scores_hate[hate_negative_key]

filtered_scores_hate['neg_emo'] = hate_negative_total / 10.0
filtered_scores_hate

In [None]:
# Overwrite 'neg_emo' with the unweighted mean of the ten negative categories.
non_hate_negative_keys = ('death', 'fear', 'kill', 'aggression', 'swearing_terms',
                          'violence', 'war', 'injury', 'shame', 'sadness')

# Accumulate strictly left to right so the floating-point result is unchanged.
non_hate_negative_total = filtered_scores_non_hate[non_hate_negative_keys[0]]
for non_hate_negative_key in non_hate_negative_keys[1:]:
    non_hate_negative_total = non_hate_negative_total + filtered_scores_non_hate[non_hate_negative_key]

filtered_scores_non_hate['neg_emo'] = non_hate_negative_total / 10.0
filtered_scores_non_hate

In [None]:
import numpy as np

In [None]:
def simulate_data(scores, num_samples=100, scale=0.005, rng=None):
    """Draw normally distributed samples centered on each category score.

    Parameters
    ----------
    scores : mapping
        Category name -> numeric score; each score is used as the mean
        (``loc``) of that category's normal distribution.
    num_samples : int, optional
        Number of samples drawn per category (default 100).
    scale : float, optional
        Standard deviation shared by all categories (default 0.005).
    rng : object with a ``normal(loc, scale, size)`` method, optional
        Source of randomness, e.g. ``np.random.default_rng(seed)``. When
        omitted, NumPy's global ``np.random`` state is used, so a prior
        ``np.random.seed(...)`` call controls the draws.

    Returns
    -------
    dict
        Category name -> 1-D array of ``num_samples`` simulated values, in
        the same key order as ``scores``.
    """
    # Fall back to the module-level generator to keep the default behaviour.
    sampler = np.random if rng is None else rng
    return {
        category: sampler.normal(loc=score, scale=scale, size=num_samples)
        for category, score in scores.items()
    }

# Generate simulated data for both datasets.
# simulate_data draws from NumPy's global RNG, so seed it first: otherwise the
# samples (and the significance stars derived from them) change on every run.
# Keep the two calls in this order — swapping them changes which draws each
# dataset receives.
SIMULATION_SEED = 42
np.random.seed(SIMULATION_SEED)
simulated_non_hate = simulate_data(filtered_scores_non_hate)
simulated_hate = simulate_data(filtered_scores_hate)

# Perform Mann-Whitney U Test and annotate significance levels to category names
def annotate_categories_with_significance(categories, sim_data_non_hate, sim_data_hate):
    """Label each category with the significance of a two-sided Mann-Whitney U test.

    Parameters
    ----------
    categories : iterable of str
        Category names to test; each must be a key of both sample mappings.
    sim_data_non_hate, sim_data_hate : mapping
        Category name -> sequence of sample values for the two groups.

    Returns
    -------
    list of str
        One ``"<category> (<marker>)"`` string per category, in input order,
        where the marker is ``***``, ``**``, ``*`` or ``ns``.

    Raises
    ------
    KeyError
        If a category is missing from either sample mapping.

    NOTE(review): judging by the parameter names the inputs are simulated
    samples; if so, the p-values reflect the simulation settings rather than
    the underlying transcripts — confirm this is the intended analysis.
    """
    annotated_categories = []
    for category in categories:
        # Only the p-value is needed; the U statistic is discarded.
        _, p_value = mannwhitneyu(
            sim_data_non_hate[category],
            sim_data_hate[category],
            alternative='two-sided',
        )
        annotated_categories.append(f"{category} ({_significance_label(p_value)})")

    return annotated_categories


def _significance_label(p_value):
    """Map a p-value to the marker used in the tick labels."""
    # Thresholds are one order of magnitude stricter than the common
    # 0.05 / 0.01 / 0.001 ladder; they are kept exactly as originally chosen.
    if p_value < 0.0001:
        return '***'
    if p_value < 0.001:
        return '**'
    if p_value < 0.01:
        return '*'
    return 'ns'  # Not significant (also reached when p_value is NaN)

# Create annotated category names
categories = list(filtered_scores_non_hate.keys())
annotated_categories = annotate_categories_with_significance(categories, simulated_non_hate, simulated_hate)

# Bar heights for both datasets, both taken in `categories` order so that each
# pair of bars lines up with its tick label even if the two dicts were built
# with different key orders or key sets (missing categories plot as 0).
non_hate_heights = [filtered_scores_non_hate.get(cat, 0) for cat in categories]
hate_heights = [filtered_scores_hate.get(cat, 0) for cat in categories]

# Set up the bar chart: one slot per category, two bars side by side per slot
bar_width = 0.35
index = np.arange(len(categories))

# Define colors
color_non_hate = 'skyblue'
color_hate = 'salmon'

# Grouped bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(14, 8))

# Plot the bars for non-hate dataset
ax.bar(index, non_hate_heights, bar_width,
       label='Non-Hate Dataset', color=color_non_hate, edgecolor='black', alpha=0.9)

# Plot the bars for hate dataset, shifted right by one bar width
ax.bar(index + bar_width, hate_heights, bar_width,
       label='Hate Dataset', color=color_hate, edgecolor='black', alpha=0.9)

# Add labels, title, and legend
ax.set_xlabel('Categories', fontsize=14)
ax.set_ylabel('Normalized Empath Scores', fontsize=14, fontweight='bold')
ax.set_title('Comparison of Empath Scores by Category with Significance Levels', fontsize=16, fontweight='bold')

# Center each tick between the two bars of its category
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(annotated_categories, rotation=45, fontsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.legend(fontsize=12)

# Adjust layout to ensure everything fits
fig.tight_layout()

# Show the plot
plt.show()