In [2]:
import os
import pandas as pd
from collections import Counter

def compute_word_frequencies_from_texts(word_list, subcorpus_folder, token_counts_df):
    """
    Computes word frequencies for a given list of words/phrases in each subcorpus text file.

    Parameters:
    - word_list: List of words/phrases to analyze.
    - subcorpus_folder: Path to the folder containing 12 subcorpus text files.
    - token_counts_df: DataFrame with columns ['subcorpus', 'total_tokens'].

    Returns:
    - A DataFrame with columns: ['word', 'subcorpus', 'count', 'normalized_frequency']
    """
    results = []

    # Loop through each text file in the folder
    for filename in os.listdir(subcorpus_folder):
        if filename.endswith(".txt"):
            subcorpus_name = filename.replace("text_", "").replace(".txt", "").strip()
          # Extract subcorpus name
            file_path = os.path.join(subcorpus_folder, filename)

            # Read the text content
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()

            # Count occurrences of each word/phrase
            word_counts = Counter()
            for word in word_list:
                word_counts[word] = text.count(word.lower())  # Count occurrences

            # Get total tokens for normalization
            total_tokens = token_counts_df.loc[token_counts_df["subcorpus"] == subcorpus_name, "token_count"]
            total_tokens = total_tokens.iloc[0] if not total_tokens.empty else 1  # Avoid division by zero

            # Store results
            for word, count in word_counts.items():
                results.append({
                    "word": word,
                    "subcorpus": subcorpus_name,
                    "count": count,
                    "normalized_frequency": count / total_tokens
                })

    # Convert results to DataFrame
    result_df = pd.DataFrame(results)
    
    return result_df

In [3]:
word_list = [
    "Democracy", "Dictatorship",
    "Polyarchy", "Autocracy",
    "Pluralism", "Authoritarianism",
    "Egalitarian", "Populism",
    "Consensual", "Erosion",
    "Deliberative", "Coup",
    "Participatory", "Fascism",
    "Liberal", "Conservatism",
    "Feminism", "Backsliding",
    "Social democracy", "Totalitarianism",
    "Representative", "Oligarchy",
    
    # Electoral Principle of Democracy
    "Directly elected", "Appointed", "Appointment",
    "Electoral competition", "Electoral authoritarianism",
    "Electoral integrity", "Electoral fraud",
    "Multiparty", "Multi party", "Single party",
    "Free speech", "Censorship",
    "Free press", "Propaganda",
    
    # Liberal Principle of Democracy
    "Independent judiciary", "State intervention",
    "Legislative oversight", "Legislative supremacy",
    "Transparent", "Transparency", "Corrupt", "Corruption",
    "Civil liberties", "State repression",
    "Privacy", "Surveillance",
    
    # Deliberative Principle of Democracy
    "Discretion", "Emotional appeals",
    "Rationale", "Hate speech",
    "Common good", "Self interest",
    "Ideological diversity", "Polarization",
    "Consensus", "Coercion",
    
    # Participatory Principle of Democracy
    "Suffrage", "Disenfranchisement",
    "Apportionment", "Malapportionment",
    "Direct democracy", "Bureaucratic politics",
    "Politically active", "Apolitical",
    "Civic engagement", "Apathy", 
    "Mass participation", "Indifferent",
    "Decentralize", "Decentralized", "Decentralization",
    "Centralize", "Centralized", "Centralization",
    
    # Egalitarian Principle of Democracy
    "Equality","Inequality",
    "Minorities", "Ruling elite",
    "Ethnic minorities", "Racial discrimination",
    "Gender equality", "Discrimination",
    "Redistribution", "Income inequality"
]

subcorpus_folder = '/Users/yvette/Desktop/data/Final/preprocessed grouped txt'
token_counts_df = pd.read_csv('/Users/yvette/Desktop/data/Final/group_token_counts.csv')
result = compute_word_frequencies_from_texts(word_list, subcorpus_folder, token_counts_df)
print(result)
result.to_csv('/Users/yvette/Desktop/data/Final/word_frequencies.csv', index=False)

                       word                                        subcorpus  \
0                 Democracy  2011-2020_American Journal of Political Science   
1              Dictatorship  2011-2020_American Journal of Political Science   
2                 Polyarchy  2011-2020_American Journal of Political Science   
3                 Autocracy  2011-2020_American Journal of Political Science   
4                 Pluralism  2011-2020_American Journal of Political Science   
...                     ...                                              ...   
1027  Racial discrimination  1971-1980_American Journal of Political Science   
1028        Gender equality  1971-1980_American Journal of Political Science   
1029         Discrimination  1971-1980_American Journal of Political Science   
1030         Redistribution  1971-1980_American Journal of Political Science   
1031      Income inequality  1971-1980_American Journal of Political Science   

      count  normalized_frequency  
0  

In [4]:
### combine word variations
# Define mapping for word variations
word_mapping = {
    # "Directly elected": ["Directly elected", "Election"],
    "Appointed": ["Appointed", "Appointment"],
    "Transparency": ["Transparent", "Transparency"],
    "Corruption": ["Corrupt", "Corruption"],
    "Decentralization": ["Decentralize", "Decentralized", "Decentralization"],
    "Centralization": ["Centralize", "Centralized", "Centralization"],
    #"Equality": ["Equal", "Equally", "Equality"],
    #"Inequality": ["Inequal", "Unequally", "Inequality"],
    "Equal right": ["Equal right", "Equal rights"],
    "Multiparty": ["Multiparty", "Multi party"],
    # "Ruling elite": ["Ruling elite", "Ruling elites"]
}

# Create a new column with the standardized word
result["word_standardized"] = result["word"].replace(
    {v: k for k, values in word_mapping.items() for v in values}
)

# Group by standardized words and subcorpus, summing count and normalized frequency
df_grouped = result.groupby(["word_standardized", "subcorpus"], as_index=False).agg(
    {"count": "sum", "normalized_frequency": "sum"}
)

# Rename column back to "word" for clarity
df_grouped.rename(columns={"word_standardized": "word"}, inplace=True)

# Save the cleaned dataset
df_grouped.to_csv("combined_word_frequencies.csv", index=False)

# Display result
print(df_grouped.head())

     word                                        subcorpus  count  \
0  Apathy  1971-1980_American Journal of Political Science     30   
1  Apathy   1971-1980_British Journal of Political Science     46   
2  Apathy  1981-1990_American Journal of Political Science     12   
3  Apathy   1981-1990_British Journal of Political Science     19   
4  Apathy  1991-2000_American Journal of Political Science     30   

   normalized_frequency  
0              0.000017  
1              0.000032  
2              0.000005  
3              0.000013  
4              0.000010  


In [6]:
### Descriptive statistics

# Define word groupings
word_groups = {
    "Democracy vs. Dictatorship": [
        "Democracy", "Dictatorship", "Polyarchy", "Autocracy", "Pluralism", "Authoritarianism",
        "Egalitarian", "Populism", "Consensual", "Erosion", "Deliberative", "Coup",
        "Participatory", "Fascism", "Liberal", "Conservatism", "Feminism", "Backsliding",
        "Social democracy", "Totalitarianism", "Representative", "Oligarchy"
    ],
    "Electoral Principle of Democracy": [
        "Directly elected", "Appointed", "Appointment", "Electoral competition",
        "Electoral authoritarianism", "Electoral integrity", "Electoral fraud", "Multiparty",
        "Multi party", "Single party", "Free speech", "Censorship", "Free press", "Propaganda"
    ],
    "Liberal Principle of Democracy": [
        "Independent judiciary", "State intervention", "Legislative oversight",
        "Legislative supremacy", "Transparent", "Transparency", "Corrupt", "Corruption",
        "Civil liberties", "State repression", "Privacy", "Surveillance"
    ],
    "Deliberative Principle of Democracy": [
        "Discretion", "Emotional appeals", "Rationale", "Hate speech", "Common good",
        "Self interest", "Ideological diversity", "Polarization", "Consensus", "Coercion"
    ],
    "Participatory Principle of Democracy": [
        "Suffrage", "Disenfranchisement", "Apportionment", "Malapportionment",
        "Direct democracy", "Bureaucratic politics", "Politically active", "Apolitical",
        "Civic engagement", "Apathy", "Mass participation","Indifferent", "Decentralize", "Decentralized",
        "Decentralization", "Centralize", "Centralized", "Centralization"
    ],
    "Egalitarian Principle of Democracy": [
        "Equality", "Inequality", "Minorities", "Ruling elite",
        "Ethnic minorities", "Racial discrimination", "Gender equality","Discrimination", 
        "Redistribution", "Income inequality", "Equal right", "Equal rights","Privilege"
    ]
}

# Load dataset
csv_file = "/Users/yvette/Desktop/data/Final/combined_word_frequencies.csv"  # Placeholder, replace with actual file path
df = pd.read_csv(csv_file)

# Standardize journal labels
df["Journal"] = df["subcorpus"].apply(lambda x: "American" if "American" in x else "British")

# Scale normalized frequency
df["scaled_frequency"] = df["normalized_frequency"] * 1000  # Convert to per 1,000 tokens

# Initialize results
stats_data = []

# Compute descriptive statistics per dimension and journal
for dimension, words in word_groups.items():
    for journal in ["American", "British"]:
        subset = df[(df["word"].isin(words)) & (df["Journal"] == journal)]
        total_frequency = subset["scaled_frequency"].sum()
        
        if not subset.empty:
            stats_data.append({
                "Dimension": dimension,
                "Journal": journal,
                "Mean": subset["scaled_frequency"].mean(),
                "SD": subset["scaled_frequency"].std(),
                "Min": subset["scaled_frequency"].min(),
                "Max": subset["scaled_frequency"].max(),
                "Total Frequency": total_frequency
            })

# Convert to DataFrame
stats_df = pd.DataFrame(stats_data)
stats_df[["Mean", "SD", "Min", "Max", "Total Frequency"]] = stats_df[["Mean", "SD", "Min", "Max", "Total Frequency"]].round(3)
stats_df.to_csv("/Users/yvette/Desktop/data/Final/word_frequency_descriptive_statistics.csv", index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_all_words(csv_file, output_folder):
    """
    Plot the word frequency over the decades for all words in the dataset,
    distinguishing American and British journals.
    
    Parameters:
    - csv_file (str): Path to the CSV file containing 'word', 'subcorpus', 'count', 'normalized_frequency'.
    - output_folder (str): Folder to save the generated plots.
    """
    # Load the dataset
    df = pd.read_csv(csv_file)
    
    # Add a column to identify the journal (American or British)
    df['Journal'] = df['subcorpus'].apply(lambda x: 'American' if 'American' in x else 'British')
    
    # Add a 'Decade' column for grouping by decades
    df['Decade'] = df['subcorpus'].apply(lambda x: x.split('_')[0][:4] + 's')
    
    # Get all unique words
    words = df['word'].unique()
    
    # Define colors for American and British journals (culturally meaningful)
    palette = {'American': '#005AB5', 'British': '#DC3220'}  # Deep Blue for U.S., Deep Red for UK
    
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    for word in words:
        # Filter for the specific word
        word_df = df[df['word'] == word]
        
        # Aggregate normalized frequency by Decade and Journal
        aggregated_df = word_df.groupby(['Decade', 'Journal'], as_index=False)['normalized_frequency'].sum()
        
        # Plotting
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=aggregated_df, x='Decade', y='normalized_frequency', hue='Journal', 
                     style='Journal', markers=True, dashes=False, palette=palette)
        
        # Customize the plot
        plt.title(f'"{word}" Frequency Across Decades')
        plt.xlabel('Decade')
        plt.ylabel('Word Frequency')
        plt.xticks(rotation=45)
        
        # Save the plot
        plot_path = os.path.join(output_folder, f"{word.replace(' ', '_')}.png")
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()

# Example usage
plot_all_words("/Users/yvette/Desktop/data/Final/combined_word_frequencies.csv", "output_plots_word_frequency")

In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math
import matplotlib.gridspec as gridspec
from matplotlib.lines import Line2D  

# Define the anchor words based on Table 2 from the document
# This dictionary maps each dimension to its positive and negative anchor words.
anchor_words_map = {
    'Electoral': {
        'Positive': ["Directly elected", "Electoral competition", "Electoral integrity", "Multiparty", "Free speech", "Free press"],
        'Negative': ["Appointed", "Electoral authoritarianism", "Electoral fraud", "Single party", "Censorship", "Propaganda"]
    },
    'Liberal': {
        'Positive': ["Independent judiciary", "Legislative oversight", "Civil liberties", "Privacy", "Transparency"],
        'Negative': ["State intervention", "Legislative supremacy", "State repression", "Surveillance", "Corruption"]
    },
    'Deliberative': {
        'Positive': ["Discretion", "Rationale", "Common good", "Ideological diversity", "Consensus"],
        'Negative': ["Emotional appeals", "Hate speech", "Self interest", "Polarization", "Coercion"]
    },
    'Participatory': {
        'Positive': ["Suffrage", "Apportionment", "Direct democracy", "Politically active", "Civic engagement", "Mass participation", "Decentralization"],
        'Negative': ["Disenfranchisement", "Malapportionment", "Bureaucratic politics", "Apolitical", "Apathy", "Indifferent", "Centralization"]
    },
    'Egalitarian': {
        'Positive': ["Equality", "Minorities", "Ethnic minorities", "Gender equality", "Redistribution"],
        'Negative': ["Inequality", "Ruling elite", "Racial discrimination", "Discrimination", "Income inequality"]
    },
    'Democracy-Authoritarian': {
        'Positive': ["Democracy", "Polyarchy", "Pluralism", "Egalitarian", "Consensual", "Deliberative", "Participatory", "Liberal", "Feminism", "Social democracy", "Representative"],
        'Negative': ["Dictatorship", "Autocracy", "Authoritarianism", "Populism", "Erosion", "Coup", "Fascism", "Conservatism", "Backsliding", "Totalitarianism", "Oligarchy"]
    }
}



In [79]:
def plot_anchor_word_optimized(csv_file, output_folder, anchor_words_map):
    """
    Create optimized plots for word frequency analysis with specific requirements:
    1. Detailed view only (no aggregated or summary views)
    2. Positive words on left column, negative words on right column
    3. Same y-axis scale within each dimension
    4. No grid lines
    5. Y-axis label centered
    6. No legend on first graph
    7. 4 columns for Democracy-Authoritarian dimension
    """
    

    # Load and preprocess data 
    df = pd.read_csv(csv_file)
    df['Journal'] = df['subcorpus'].apply(lambda x: 'American' if 'American' in x else 'British')
    df['Decade'] = df['subcorpus'].apply(lambda x: x.split('_')[0])
    
    # Sort decades chronologically (assuming format like "1970s")
    decades = sorted(df['Decade'].unique(), key=lambda x: int(x[:4]))
    
    # Filter and map anchor words
    all_anchor_words = [word for dim in anchor_words_map.values() 
                       for pol in dim.values() for word in pol]
    anchor_df = df[df['word'].isin(all_anchor_words)].copy()
    
    # Add dimension and polarity columns
    for dim, pol_dict in anchor_words_map.items():
        for pol, words in pol_dict.items():
            mask = anchor_df['word'].isin(words)
            anchor_df.loc[mask, 'Dimension'] = dim
            anchor_df.loc[mask, 'Polarity'] = pol

    # Convert frequencies to per thousand format
    anchor_df['frequency_per_thousand'] = anchor_df['normalized_frequency']
    
    # Visualization parameters
    journal_palette = {'American': '#005AB5', 'British': '#DC3220'}
    os.makedirs(output_folder, exist_ok=True)
    
    # Create custom legend elements
    custom_lines = [
        Line2D([0], [0], color=journal_palette['American'], lw=2, marker='o', markersize=5),
        Line2D([0], [0], color=journal_palette['British'], lw=2, marker='o', markersize=5)
    ]
    
    # Iterate through dimensions
    for dimension in anchor_words_map.keys():
        dim_data = anchor_df[anchor_df['Dimension'] == dimension].copy()
        
        # Get positive and negative words for this dimension
        pos_words = anchor_words_map[dimension]['Positive']
        neg_words = anchor_words_map[dimension]['Negative']
        
        # Get max y value for consistent scaling across all subplots in this dimension
        y_max = dim_data['frequency_per_thousand'].max() * 1.1
        
        # Special case for Democracy-Authoritarian dimension - use 4 columns
        if dimension == "Democracy-Authoritarian":
            # Calculate number of rows needed
            total_words = len(pos_words) + len(neg_words)
            num_columns = 4
            num_rows = math.ceil(total_words / num_columns)
            
            # Create figure with extra space at top for the dimension title and legend
            fig = plt.figure(figsize=(20, num_rows * 3 + 1))
            
            # Create grid with 4 columns
            gs = gridspec.GridSpec(num_rows, num_columns, wspace=0.3, hspace=0.4)
            
            # Plot all words in 4 columns
            # First half of the columns for positive words, second half for negative
            pos_col_span = num_columns // 2
            neg_col_span = num_columns - pos_col_span
            
            # Plot positive words
            for i, word in enumerate(pos_words):
                row = i // pos_col_span
                col = i % pos_col_span
                ax = fig.add_subplot(gs[row, col])
                
                plot_data = dim_data[(dim_data['word'] == word) & (dim_data['Polarity'] == 'Positive')]
                
                sns.lineplot(
                    data=plot_data, 
                    x='Decade', 
                    y='frequency_per_thousand',
                    hue='Journal', 
                    style='Journal',
                    markers=True, 
                    dashes=False,
                    palette=journal_palette,
                    ax=ax,
                    linewidth=3,
                    legend=False  # No legend as requested
                )
                
                ax.set_title(f'{word}', fontsize=20)
                ax.set_ylim(0, y_max)  # Use consistent y-axis scale
                ax.grid(False)  # Remove grid as requested
                
                # X-axis formatting
                if row == num_rows - 1 or word == "Social democracy":
                    ax.set_xlabel('')
                    ax.tick_params(axis='x', labelsize=16, rotation=45)
                else:
                    ax.set_xlabel('')
                    ax.tick_params(axis='x', labelbottom=False)
                
                # Y-axis formatting
                ax.set_ylabel('')
                ax.tick_params(axis='y', labelbottom=False)

            # Add a centered y-axis label
            fig.text(0.04, 0.5, 'Frequency', va='center', rotation='vertical', fontsize=20)
            fig.text(0.5, 0.02, 'Year', va='center', fontsize=20)
            
            # Plot negative words
            for i, word in enumerate(neg_words):
                row = i // neg_col_span
                col = (i % neg_col_span) + pos_col_span
                ax = fig.add_subplot(gs[row, col])
                
                plot_data = dim_data[(dim_data['word'] == word) & (dim_data['Polarity'] == 'Negative')]
                
                sns.lineplot(
                    data=plot_data, 
                    x='Decade', 
                    y='frequency_per_thousand',
                    hue='Journal', 
                    style='Journal',
                    markers=True, 
                    dashes=False,
                    palette=journal_palette,
                    ax=ax,
                    linewidth=3,
                    legend=False  # No legend
                )
                
                ax.set_title(f'{word}', fontsize=20)
                ax.set_ylim(0, y_max)  # Use consistent y-axis scale
                ax.grid(False)  # Remove grid as requested
                ax.set_ylabel('')  # No y-label on right columns
                
                # X-axis formatting
                if row == num_rows - 1 or word == "Totalitarianism":
                    ax.set_xlabel('')
                    ax.tick_params(axis='x', labelsize=16, rotation=45)
                else:
                    ax.set_xlabel('')
                    ax.tick_params(axis='x', labelbottom=False)
            
            # Add column titles
            fig.text(0.28, 0.9, 'Pro Democracy', ha='center', fontsize=22)
            fig.text(0.7, 0.9, 'Anti Democracy', ha='center', fontsize=22)
            
        else:
            # Standard 2-column layout for other dimensions
            # Create figure with extra space at top for legend
            max_rows = max(len(pos_words), len(neg_words))
            fig = plt.figure(figsize=(16, max_rows * 2.5 + 1))
            
            # Create grid with 2 columns - left for positive, right for negative words
            gs = gridspec.GridSpec(max_rows, 2, wspace=0.2, hspace=0.4)
            
            # Plot positive words (left column)
            for i, word in enumerate(pos_words):
                ax = fig.add_subplot(gs[i, 0])
                
                plot_data = dim_data[(dim_data['word'] == word) & (dim_data['Polarity'] == 'Positive')]
                
                sns.lineplot(
                    data=plot_data, 
                    x='Decade', 
                    y='frequency_per_thousand',
                    hue='Journal', 
                    style='Journal',
                    markers=True, 
                    dashes=False,
                    palette=journal_palette,
                    ax=ax,
                    linewidth=3,
                    legend=False  # No legend as requested
                )
                
                ax.set_title(f'{word}', fontsize=20)
                ax.set_ylim(0, y_max)  # Use consistent y-axis scale
                ax.grid(False)  # Remove grid as requested

                
                # X-axis formatting
                if i == len(pos_words) - 1:
                    ax.set_xlabel('')
                    ax.tick_params(axis='x', labelsize=16, rotation=45)
                else:
                    ax.set_xlabel('')
                    ax.tick_params(axis='x', labelbottom=False)
                
                # Y-axis formatting
                ax.set_ylabel('')
                ax.tick_params(axis='y', labelbottom=False)

            # Plot negative words (right column)
            for i, word in enumerate(neg_words):
                ax = fig.add_subplot(gs[i, 1])
                
                plot_data = dim_data[(dim_data['word'] == word) & (dim_data['Polarity'] == 'Negative')]
                
                sns.lineplot(
                    data=plot_data, 
                    x='Decade', 
                    y='frequency_per_thousand',
                    hue='Journal', 
                    style='Journal',
                    markers=True, 
                    dashes=False,
                    palette=journal_palette,
                    ax=ax,
                    linewidth=3,
                    legend=False  # No legend
                )
                
                ax.set_title(f'{word}', fontsize=20)
                ax.set_ylim(0, y_max)  # Use consistent y-axis scale
                ax.grid(False)  # Remove grid as requested

                # Y-axis formatting
                ax.set_ylabel('')  # No y-label on right column
                ax.tick_params(axis='y', labelbottom=False)

                # X-axis formatting
                if i == len(neg_words) - 1:
                    ax.set_xlabel('')
                    ax.tick_params(axis='x', labelsize=16, rotation=45)
                else:
                    ax.set_xlabel('')
                    ax.tick_params(axis='x', labelbottom=False)

            # Add a centered y-axis label
            fig.text(0.04, 0.5, 'Frequency', va='center', rotation='vertical', fontsize=20)
            fig.text(0.5, 0.02, 'Year', va='center', fontsize=20)

            # Add column titles
            fig.text(0.28, 0.91, 'Pro Democracy', ha='center', fontsize=22)
            fig.text(0.71, 0.91, 'Anti Democracy', ha='center', fontsize=22)
        
        # Add a custom legend at the top of the figure
        fig.legend(custom_lines, ['American', 'British'], 
                   loc='upper center', 
                   bbox_to_anchor=(0.5, 0.95), 
                   ncol=2, 
                   frameon=False,
                   prop={'size': 16},  
                   title='Journal Type',
                   title_fontsize=18,  
                   handlelength=2,     
                   handleheight=2,     
                   borderaxespad=1.5
                   )
        

        # Adjust layout to account for the legend
        plt.subplots_adjust(top=0.85, bottom=0.1, left=0.1, right=0.9, hspace=0.5, wspace=0.4)
        
        # Save figure
        filename = f"{dimension.replace(' ', '_')}_analysis.png"
        plt.savefig(os.path.join(output_folder, filename), dpi=500, bbox_inches='tight')
        plt.close()
        
        print(f"Saved plot for {dimension} dimension")

In [80]:

plot_anchor_word_optimized(
     "/Users/yvette/Desktop/data/Final/combined_word_frequencies.csv",
    "output_plots_grouped_frequency",
   anchor_words_map
 )



Saved plot for Electoral dimension
Saved plot for Liberal dimension
Saved plot for Deliberative dimension
Saved plot for Participatory dimension
Saved plot for Egalitarian dimension
Saved plot for Democracy-Authoritarian dimension


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set style
sns.set_theme(style="white")
plt.style.use('default')

# Define colors
colors = {
    'American Positive': '#005AB5',  # Solid blue
    'American Negative': '#6C9BD2',  # Lighter blue
    'British Positive': '#DC3220',   # Solid red
    'British Negative': '#E88B8B'    # Lighter red
}

# Load and prepare data
anchor_words = pd.read_csv('List of anchor words.csv')
word_freq = pd.read_csv('combined_word_frequencies.csv')

word_freq['journal'] = word_freq['subcorpus'].apply(lambda x: 'American' if 'American' in x else 'British')
word_freq['time'] = word_freq['subcorpus'].str.extract(r'(\d{4}-\d{4})')

# Process data
plot_data = []
for dim in anchor_words['dimension'].unique():
    dim_data = anchor_words[anchor_words['dimension'] == dim]
    pos_words = dim_data['positive'].dropna().tolist()
    neg_words = dim_data['negative'].dropna().tolist()
    
    for time in word_freq['time'].unique():
        time_data = word_freq[word_freq['time'] == time]
        
        for journal in ['American', 'British']:
            journal_data = time_data[time_data['journal'] == journal]
            pos = journal_data[journal_data['word'].isin(pos_words)]['normalized_frequency'].sum()
            neg = journal_data[journal_data['word'].isin(neg_words)]['normalized_frequency'].sum()
            
            plot_data.append({
                'Dimension': dim,
                'Time': time,
                'Journal': journal,
                'Positive': pos,
                'Negative': neg,
                'Total': pos + neg
            })

df = pd.DataFrame(plot_data)

# Calculate layout - 3 rows, 2 columns
n_rows = 3
n_cols = 2
fig, axs = plt.subplots(n_rows, n_cols, figsize=(16, 5*n_rows))
axs = axs.flatten()

# Find max y-value (excluding Democracy vs. Dictatorship)
max_y = df[df['Dimension'] != "Democracy  vs.  Dictatorship"]['Total'].max() * 1.1

# Plot each dimension
for i, dim in enumerate(df['Dimension'].unique()):
    ax = axs[i]
    dim_df = df[df['Dimension'] == dim].sort_values('Time')
    time_periods = dim_df['Time'].unique()
    
    # Set bar positions
    x = np.arange(len(time_periods))
    width = 0.35
    
    for j, time in enumerate(time_periods):
        time_df = dim_df[dim_df['Time'] == time]
        
        # American stacked bar
        am = time_df[time_df['Journal'] == 'American'].iloc[0]
        ax.bar(x[j] - width/2, am['Positive'], width, 
               color=colors['American Positive'], label='Am Positive' if j==0 else "")
        ax.bar(x[j] - width/2, am['Negative'], width, 
               bottom=am['Positive'],
               color=colors['American Negative'], label='Am Negative' if j==0 else "")
        
        # British stacked bar
        br = time_df[time_df['Journal'] == 'British'].iloc[0]
        ax.bar(x[j] + width/2, br['Positive'], width, 
               color=colors['British Positive'], label='Br Positive' if j==0 else "")
        ax.bar(x[j] + width/2, br['Negative'], width, 
               bottom=br['Positive'],
               color=colors['British Negative'], label='Br Negative' if j==0 else "")
    
    # Formatting
    if dim == 'Democracy  vs.  Dictatorship':
        ax.set_title('Democracy-Authoritarian Dimension', fontsize=22, pad=15)
    else:
        ax.set_title(f'{dim} Dimension', fontsize=22, pad=15)

    ax.set_xticks(x)
    
    # Only show x-axis labels for bottom row (plots 4 and 5 in 0-based index)
    if i >= (n_rows-1)*n_cols:  # Bottom row
        ax.set_xticklabels(time_periods, fontsize=20, rotation=45)
        ax.set_xlabel('', fontsize=22)
    else:
        ax.set_xticklabels([])
        ax.set_xlabel('')

    
    if i % n_cols == 0:  # Left column
        ax.set_ylabel('', fontsize=22)
    else:
        ax.set_ylabel('')
    
    ax.yaxis.set_major_formatter(plt.FormatStrFormatter('%.3f'))
    ax.tick_params(axis='y', labelsize=20)
    
    if dim != "Democracy  vs.  Dictatorship":
        ax.set_ylim(0, max_y)
    
    # Clean spines
    for spine in ['top', 'right']:
        ax.spines[spine].set_visible(False)


# Hide empty subplots
for j in range(i+1, len(axs)):
    axs[j].axis('off')

# Create unified legend
legend_elements = [
    plt.Rectangle((0,0),1,1, color=colors['American Positive'], label='American Positive'),
    plt.Rectangle((0,0),1,1, color=colors['American Negative'], label='American Negative'),
    plt.Rectangle((0,0),1,1, color=colors['British Positive'], label='British Positive'),
    plt.Rectangle((0,0),1,1, color=colors['British Negative'], label='British Negative')
]

fig.legend(handles=legend_elements, 
           loc='lower center',
           bbox_to_anchor=(0.5, 1),
           ncol=4, fontsize=22)

# Final layout
plt.tight_layout()
fig.subplots_adjust(bottom=0.10)
plt.savefig('word_frequency.png', dpi=500, bbox_inches='tight')
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'List of anchor words.csv'