In [1]:
import json
import pandas as pd
from collections import defaultdict
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def analyze_text_categories(corpus_file, stereotype_dict_file, status: str) -> pd.DataFrame:
    """
    Analyze the proportion of categorized words in each text of a corpus using a stereotype dictionary.

    Every text is lower-cased and split into word tokens; each token is then looked up
    in every category of the dictionary. A token that belongs to several categories is
    counted once in each of them. Matching is token-by-token, so a dictionary entry
    that contains whitespace can never match.

    Parameters:
    corpus_file (str): Path to the CSV file holding the corpus. It must contain a
        column named ``f"{status}_Corpus"``.
    stereotype_dict_file (str): Path to a UTF-8 JSON file mapping each category name
        to a list of words.
    status (str): Prefix selecting the text column to analyze (e.g. "Before" reads
        the "Before_Corpus" column).

    Returns:
    pandas.DataFrame: The original CSV columns followed by, for every category in
        dictionary order, ``prop_<category>`` (matches / total tokens),
        ``count_<category>`` (number of matches) and ``words_<category>`` (list of the
        matched tokens in text order), plus a final ``total_words`` column.

    Raises:
    KeyError: If the corpus has no ``f"{status}_Corpus"`` column.
    """
    # JSON is UTF-8 by specification; relying on the platform default encoding would
    # mis-decode (or fail on) non-ASCII dictionary words on non-UTF-8 locales.
    with open(stereotype_dict_file, 'r', encoding='utf-8') as f:
        stereotype_dict = json.load(f)

    # Fixed category order so the output columns do not depend on which category
    # happens to match first in the data.
    categories = list(stereotype_dict)

    # Lower-case the vocabulary once for case-insensitive matching
    category_sets = {
        category: {word.lower() for word in words}
        for category, words in stereotype_dict.items()
    }

    df = pd.read_csv(corpus_file)

    text_column = f'{status}_Corpus'
    if text_column not in df.columns:
        raise KeyError(
            f"Column '{text_column}' not found in {corpus_file}; "
            f"available columns: {list(df.columns)}"
        )

    def analyze_text(text):
        """Return proportions, counts, matched words and token total for one text."""
        # Missing cells arrive as NaN; anything else non-string is stringified
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)

        # Tokenize text into lower-case words
        words = re.findall(r'\b\w+\b', text.lower())
        total_words = len(words)

        # Matched tokens per category, kept in the order they appear in the text
        found_words = {
            category: [word for word in words if word in category_sets[category]]
            for category in categories
        }
        counts = {category: len(found_words[category]) for category in categories}
        # A text without tokens gets 0.0 everywhere instead of dividing by zero
        proportions = {
            category: (counts[category] / total_words if total_words else 0.0)
            for category in categories
        }

        return {
            'proportions': proportions,
            'counts': counts,
            'found_words': found_words,
            'total_words': total_words
        }

    # Analyze each text from the selected corpus column
    analyses = []
    for text in df[text_column]:
        try:
            analyses.append(analyze_text(text))
        except Exception as e:
            # Best effort: report the offending row and record an all-zero analysis
            # so the result stays aligned row-for-row with the input.
            print(f"Error processing text: {text}")
            print(f"Error message: {str(e)}")
            analyses.append(analyze_text(""))

    def metric_frame(key, prefix):
        """Build one DataFrame for a metric, one prefixed column per category."""
        # Passing `columns` keeps the full schema even when the corpus has no rows
        frame = pd.DataFrame([analysis[key] for analysis in analyses], columns=categories)
        frame.columns = [f"{prefix}_{category}" for category in categories]
        return frame

    proportions_df = metric_frame('proportions', 'prop')
    counts_df = metric_frame('counts', 'count')
    found_words_df = metric_frame('found_words', 'words')

    # Total token count per text
    total_words_series = pd.Series(
        [analysis['total_words'] for analysis in analyses],
        name='total_words',
        dtype='int64'
    )

    # Combine all pieces side by side; every piece has one row per corpus row
    result_df = pd.concat([
        df,  # Original data
        proportions_df,  # Proportions for each category
        counts_df,  # Raw counts for each category
        found_words_df,  # Actual words found for each category
        total_words_series  # Total word count per text
    ], axis=1)

    return result_df

In [3]:
# Run the stereotype-category analysis on both cleaned corpora and save the results.
# NOTE: despite the `_df` suffix, `before_df` / `after_df` hold CSV *paths*, not DataFrames.
dictionary = "Create Dictionary/Stereotype_Dictionary.json"
after_df = "Cleaned Data/After_NN_Cleaned.csv"
before_df = "Cleaned Data/Before_NN_Cleaned.csv"

# `status` selects which "<status>_Corpus" column of the CSV is analyzed
after = analyze_text_categories(
    corpus_file=after_df,
    stereotype_dict_file=dictionary,
    status="After",
)
before = analyze_text_categories(
    corpus_file=before_df,
    stereotype_dict_file=dictionary,
    status="Before",
)

# Persist one CSV per corpus, without the pandas index column
after.to_csv("After_Stereotypes_Proportion.csv", index=False)
before.to_csv("Before_Stereotypes_Proportion.csv", index=False)