In [1]:
import pandas as pd
import math
import re
import string
import numpy as np
from scipy import stats
import csv
import plotly.graph_objects as go
from generate_count_file import get_keyword_extraction_counts
import urllib.request
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/reggie/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/reggie/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/reggie/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reggie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/reggie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Local file names of the corpus (CSV) and keyword dictionary (JSON) used by the notebook
corpus_file = 'democracy_reports_corpus.csv'
# Alternative corpus path, kept for reference; uncomment to use it instead of the default
#corpus_file = '../../data/democracy_reports_corpus_merged_wbacksliding_040724.csv'
dictionary_file = 'dimension_dictionary.json'

# Remote locations of the same two files (used to fetch them when they are missing locally)
corpus_file_url = "https://github.com/backdem/democracy-datasets/raw/main/democracy_reports_corpus.csv"
dictionary_file_url = "https://raw.githubusercontent.com/backdem/democracy-datasets/main/dimension_dictionary.json"

In [3]:
# Download each dataset only when no local copy exists yet
for local_path, remote_url in (
    (corpus_file, corpus_file_url),
    (dictionary_file, dictionary_file_url),
):
    already_present = os.path.exists(local_path)
    if not already_present:
        urllib.request.urlretrieve(remote_url, local_path)

In [4]:
# Build the keyword-count table from the corpus and the dimension dictionary
# using the helper imported from generate_count_file.
# NOTE(review): regenerate=False presumably reuses a previously generated count
# file instead of recomputing it — confirm against generate_count_file.
df = get_keyword_extraction_counts(corpus_file, dictionary_file, regenerate=False)

  all_countries_data = pd.read_csv(csv_file, dtype={'year': str}, comment='#')


building DataFrame


In [5]:
# Print df to see its structure; the output shows the columns
# dictionary, country, year, dimension, source and count
print(df)

       dictionary  country  year      dimension                      source  \
0         elected   turkey  2021      electoral  freedomhouse_freedom-world   
1       elections   turkey  2021      electoral  freedomhouse_freedom-world   
2           votes   turkey  2021      electoral  freedomhouse_freedom-world   
3          voting   turkey  2021      electoral  freedomhouse_freedom-world   
4      referendum   turkey  2021  participatory  freedomhouse_freedom-world   
...           ...      ...   ...            ...                         ...   
60086      gender  czechia  2020        liberal              eu_rule_of_law   
60087     website  czechia  2020          media              eu_rule_of_law   
60088    protests  czechia  2020  participatory              eu_rule_of_law   
60089       press  czechia  2020          media              eu_rule_of_law   
60090      unions  czechia  2020  participatory              eu_rule_of_law   

       count  
0          2  
1          8  
2     

In [6]:
def get_stats(data, confidence_level=0.95):
    """Summarise ``data`` and build a normal-approximation confidence interval for its mean.

    Parameters
    ----------
    data : sequence of numbers
        Observations; must support ``len()`` and be accepted by ``np.mean`` / ``np.std``.
    confidence_level : float, default 0.95
        Two-sided confidence level used to pick the normal critical value.

    Returns
    -------
    tuple
        ``(confidence_interval, margin_of_error, mean, std_dev, critical_value,
        standard_error)`` where ``confidence_interval`` is ``(lower, upper)``.
    """
    centre = np.mean(data)
    spread = np.std(data)  # np.std default, i.e. no Bessel correction

    # Two-sided critical value of the standard normal distribution
    z_crit = stats.norm.ppf((1 + confidence_level) / 2)

    # Standard error of the mean and the resulting half-width of the interval
    sem = spread / np.sqrt(len(data))
    half_width = z_crit * sem

    bounds = (centre - half_width, centre + half_width)
    return (bounds, half_width, centre, spread, z_crit, sem)

In [7]:
def plot_country_dimension_corpus(df, countries=None, title=None):
    """Show horizontal bars of counts per country, with one bar series per dimension.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``country``, ``dimension`` and ``count``.
    countries : list, optional
        When given (and non-empty), only rows whose ``country`` is in this
        list are plotted; otherwise every row is used.
    title : str, optional
        Figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, which has also been displayed via ``fig.show()``.
    """
    # A falsy `countries` (None / empty) means "no filtering"
    selected = df[df['country'].isin(countries)] if countries else df

    # One horizontal bar trace per dimension, in order of first appearance
    bars = []
    for dim_name in selected['dimension'].unique():
        dim_rows = selected[selected['dimension'] == dim_name]
        bars.append(
            go.Bar(
                y=dim_rows['country'],
                x=dim_rows['count'],
                name=dim_name,
                orientation='h',
            )
        )

    fig = go.Figure(data=bars)
    fig.update_layout(title=title, yaxis_title=None, xaxis_title='counts')

    fig.show()
    return fig

In [12]:
# Sum the keyword counts for every (country, year, source, dimension) combination
df_agg = (
    df.groupby(['country', 'year', 'source', 'dimension'])['count']
    .sum()
    .reset_index()
)
print(df_agg)
# Disabled steps kept for reference: plotting and exporting the aggregated table
#fig = plot_country_dimension_corpus(df_grouped, title=None)
#df_agg.to_csv("../../data/democracy_keywordextraction_aggregated_counts_150724.csv", index=False)
# Write pdf image
# fig.write_image("images/all_countries_counts2.pdf", width=1024, height=1024)

             country  year                      source      dimension  count
0            albania  2002                       greco      electoral     19
1            albania  2002                       greco        liberal    290
2            albania  2002                       greco          media     19
3            albania  2002                       greco  participatory     41
4            albania  2003                         bti      electoral     29
...              ...   ...                         ...            ...    ...
5079  united-kingdom  2022  freedomhouse_freedom-world  participatory     20
5080  united-kingdom  2023  freedomhouse_freedom-world      electoral     26
5081  united-kingdom  2023  freedomhouse_freedom-world        liberal     56
5082  united-kingdom  2023  freedomhouse_freedom-world          media     20
5083  united-kingdom  2023  freedomhouse_freedom-world  participatory     23

[5084 rows x 5 columns]


In [13]:
# Sum counts over years and sources so that each (country, dimension) pair has one total
df_grouped = (
    df.groupby(['country', 'dimension'])['count']
    .sum()
    .reset_index()
)
fig = plot_country_dimension_corpus(df_grouped, title=None)
print(df_grouped)
# Write pdf image
# fig.write_image("images/all_countries_counts2.pdf", width=1024, height=1024)

            country      dimension  count
0           albania      electoral   1597
1           albania        liberal   3607
2           albania          media    928
3           albania  participatory   1147
4           andorra      electoral    850
..              ...            ...    ...
195         ukraine  participatory   1564
196  united-kingdom      electoral    919
197  united-kingdom        liberal   1636
198  united-kingdom          media   1572
199  united-kingdom  participatory    384

[200 rows x 3 columns]


In [14]:
def plot_country_groups(df, title='Democratic dimensions'):
    """Plot the mean count per country group, with one grouped bar series per dimension.

    Countries are assigned to four fixed groups (``west_eu``, ``cee``,
    ``eu_candidates``, ``non_eu``). For every dimension the mean of ``count``
    over the rows of each group is drawn as a bar, with the margin of error
    returned by ``get_stats`` as the error bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``country``, ``dimension`` and ``count``.
    title : str, default 'Democratic dimensions'
        Figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, which has also been displayed via ``fig.show()``.
    """
    # Fixed country groupings; the dict keys are the x-axis category labels.
    country_groups = {
        'west_eu': ['greece', 'italy', 'ireland', 'liechtenstein', 'monaco', 'luxembourg',
                    'portugal', 'finland', 'austria', 'norway', 'france', 'denmark',
                    'spain', 'san-marino', 'switzerland', 'sweden', 'germany', 'united-kingdom',
                    'belgium', 'netherlands', 'iceland', 'andorra'],
        'cee': ['slovenia', 'slovakia', 'latvia', 'poland', 'malta',
                'lithuania', 'croatia', 'cyprus', 'hungary',
                'estonia', 'romania', 'bulgaria', 'czechia'],
        'eu_candidates': ['kosovo', 'moldova', 'ukraine', 'serbia',
                          'north-macedonia', 'turkey', 'bosnia-herzegovina', 'montenegro',
                          'albania', 'georgia'],
        'non_eu': ['armenia', 'azerbaijan', 'belarus', 'russia'],
    }
    category_names = list(country_groups)
    groups = [df[df['country'].isin(members)] for members in country_groups.values()]

    dimensions = pd.Series(df['dimension']).unique()

    # Create one bar trace per dimension, each with one bar per country group
    traces = []
    for dimension in dimensions:
        # Counts of this single dimension, split by country group
        counts_by_group = [g.loc[g['dimension'] == dimension, 'count'] for g in groups]
        # Mean of the counts and margin of error of that mean.
        # A group with no rows yields NaN for both.
        mean_counts = [counts.mean() for counts in counts_by_group]
        margin_error = [get_stats(counts)[1] for counts in counts_by_group]
        traces.append(go.Bar(x=category_names,
                             y=mean_counts,
                             error_y=dict(type='data', array=margin_error, visible=True),
                             name=dimension))

    fig = go.Figure(data=traces)

    # Update the layout
    fig.update_layout(
        title=title,
        xaxis_title='country groups',
        yaxis_title='counts',
        barmode='group'  # Set barmode to 'group' to create grouped bars
    )

    # Show the plot
    fig.show()
    return fig
    

In [15]:
# Country-group comparison on the per-country totals (all sources and years combined)
fig = plot_country_groups(df_grouped)

In [16]:
# Repeat the country-group plot separately for every source.
# Per-(country, dimension, source) totals come first, then one figure per source
# in the order the sources first appear in the data.
df_grouped_source = (
    df.groupby(['country', 'dimension', 'source'])['count']
    .sum()
    .reset_index()
)
sources = df['source'].unique()
for source in sources:
    df_one_source = df_grouped_source[df_grouped_source['source'] == source]
    fig = plot_country_groups(
        df_one_source,
        title=f'Democratic dimensions for {source}',
    )

In [None]:
# Stats imports
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_white, het_breuschpagan

In [None]:
# Define linear regression function
def linear_regression(df, x_name, y_name):
    """Fit ``df[y_name] ~ df[x_name]`` with scikit-learn and return the fitted values.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both columns.
    x_name : str
        Name of the single predictor column.
    y_name : str
        Name of the response column.

    Returns
    -------
    numpy.ndarray
        In-sample predictions, one per row of ``df``.
    """
    # Double brackets keep the predictor two-dimensional, as scikit-learn requires
    predictors = df[[x_name]]
    response = df[y_name]

    fitted_model = LinearRegression().fit(predictors, response)
    return fitted_model.predict(predictors)

In [None]:
# Plot linear regression model on country corpus size vs keyword hits per dimension
def plot_corpus_vs_topic(df, x='count', y='sentence_length', y_pred=None, topic_name='', title=''):
    """Scatter ``df[y]`` against ``df[x]`` and optionally overlay a fitted line.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns named by ``x`` and ``y`` plus ``country``
        (used as hover text).
    x, y : str
        Column names for the horizontal and vertical axis. The axis titles are
        fixed to 'corpus size' and '<topic_name> counts', so callers should
        pass the corpus-size column as ``x`` and the count column as ``y``.
    y_pred : sequence, optional
        Predicted values aligned with ``df[x]``. When omitted or empty, no
        fit line is drawn.
    topic_name : str
        Legend name of the scatter trace and prefix of the y-axis title.
    title : str
        Figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, which has also been displayed via ``fig.show()``.
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df[x], y=df[y], name=topic_name, mode='markers', text=df['country']))
    # Plot the linear regression model if predictions were given.
    # `None` replaces the former mutable `[]` default; both mean "no fit line".
    if y_pred is not None and len(y_pred) > 0:
        fig.add_trace(go.Scatter(x=df[x], y=y_pred, name='linear fit'))
    # Customize the layout
    fig.update_layout(title=title, xaxis_title='corpus size', yaxis_title=f'{topic_name} counts')

    # Display the chart
    fig.show()
    return fig

In [None]:
# Plot residuals to check for heteroskedasticity in data set
def plot_fitted_vs_residuals(df, y='count', x='sentence_length', topic_name='', title=''):
    """Plot OLS residuals of ``df[y] ~ const + df[x]`` against ``df[x]``.

    Every point is labelled with its ``country`` value, and a zero line named
    'predicted' is added as a visual reference.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns named by ``x`` and ``y`` plus ``country``.
    y : str
        Response column.
    x : str
        Predictor column, also used for the horizontal axis.
    topic_name : str
        Legend name of the residual trace and prefix of the y-axis title.
    title : str
        Figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, which has also been displayed via ``fig.show()``.
    """
    fig = go.Figure()

    # Ordinary least squares with an explicit intercept column
    design = sm.add_constant(df[x])
    observed = df[y]
    ols_fit = sm.OLS(observed, design).fit()

    # Residual = observed value minus the model's in-sample prediction
    residuals = observed - ols_fit.predict(design)
    zero_line = np.zeros(len(residuals))

    fig.add_trace(go.Scatter(x=df[x], y=residuals, name=topic_name,
                             mode='markers+text', text=df['country']))
    fig.add_trace(go.Scatter(x=df[x], y=zero_line, name='predicted'))
    fig.update_layout(title=title, xaxis_title='corpus size',
                      yaxis_title=f'{topic_name} residuals')

    fig.show()
    return fig

In [None]:
# Plot the residuals as z-scores, marking points with |z-score| > 2 as outliers
def plot_residuals_zscore(df, x='count', y='sentence_length', topic_name='', title=''):
    """Plot z-score-standardised OLS residuals and label the outliers.

    Fits ``df[y] ~ const + df[x]`` with statsmodels OLS, converts the residuals
    to z-scores and plots them against ``df[x]``. Points with ``|z| > 2`` are
    drawn a second time with their ``country`` value as a text label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns named by ``x`` and ``y`` plus ``country``.
    x : str
        Predictor column, also used for the horizontal axis (the axis title is
        fixed to 'corpus size').
    y : str
        Response column.
    topic_name : str
        Legend name of the z-score trace and prefix of the y-axis title.
    title : str
        Figure title.

    Returns
    -------
    tuple
        ``(fig, df['country'], z_score)``: the displayed figure, the country
        column and the z-scores.
    """
    fig = go.Figure()

    X = sm.add_constant(df[x])
    observed = df[y]

    model = sm.OLS(observed, X).fit()

    # Calculate residuals and standardise them
    residuals = observed - model.predict(X)
    mean_residuals = np.mean(residuals)
    std_residuals = np.std(residuals)
    z_score = (residuals - mean_residuals) / (std_residuals)

    # Select outliers with a positional boolean mask. Using enumerate()
    # positions as index labels (df[x][i]) is only correct when df carries a
    # default 0..n-1 index. On any other index it picks wrong rows or raises.
    is_outlier = (z_score.abs() > 2).to_numpy()
    outlier_x = df[x][is_outlier].tolist()
    outlier_y = z_score[is_outlier].tolist()
    outlier_labels = df['country'][is_outlier].tolist()

    fig.add_trace(go.Scatter(x=df[x], y=z_score, name=topic_name, mode='markers', text=df['country']))
    fig.add_trace(go.Scatter(x=df[x], y=np.zeros(len(residuals)), name='predicted'))
    fig.add_trace(go.Scatter(x=outlier_x, y=outlier_y, text=outlier_labels, mode="markers+text", name="outliers"))

    # Customize the layout; the labelled outlier trace is hidden from the legend
    fig.update_layout(title=title, xaxis_title='corpus size', yaxis_title=f'{topic_name} z_score')
    fig.update_traces(selector=dict(name='outliers'), showlegend=False)

    # Display the chart
    fig.show()
    return (fig, df['country'], z_score)

In [None]:
# Tests for Heteroskedasticity in dataset
def breusch_pagan_test(df, x_name, y_name, topic_name='', alpha=0.05):
    """Run White's and Breusch-Pagan heteroskedasticity tests and report the slope's significance.

    Fits ``df[y_name] ~ const + df[x_name]`` with statsmodels OLS, applies both
    tests to the residuals and prints each p-value with a verdict at level
    ``alpha``. It then prints the coefficient of ``x_name``, its standard
    error, t-statistic and p-value, followed by a significance statement.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both columns.
    x_name : str
        Predictor column.
    y_name : str
        Response column.
    topic_name : str
        Name used in the printed significance statement.
    alpha : float, default 0.05
        Significance level for all verdicts.

    Returns
    -------
    tuple
        ``(white_p_value, breusch_pagan_p_value)``.
    """
    # Design matrix with an intercept column, then the OLS fit
    design = sm.add_constant(df[x_name])
    response = df[y_name]
    fit = sm.OLS(response, design).fit()
    resid = fit.resid

    # Both tests share the same call shape; element [1] of each result is read as the p-value
    p_values = []
    for label, run_test in (("White's test", het_white),
                            ("Breusch-Pagan test", het_breuschpagan)):
        p_val = run_test(resid, design)[1]
        print(f"{label} p-value:", p_val)
        if p_val < alpha:
            print("Heteroskedasticity present")
        else:
            print("NO heteroskedasticity present")
        p_values.append(p_val)

    # Slope estimate, its standard error, and the derived t-statistic
    slope = fit.params[x_name]
    slope_se = fit.bse[x_name]
    slope_t = slope / slope_se
    slope_p = fit.pvalues[x_name]

    print("Coefficient of X:", slope)
    print("Standard error of coefficient of X:", slope_se)
    print("t-statistic:", slope_t)
    print("p-value:", slope_p)

    if slope_p < alpha:
        print(f"Reject the null hypothesis: {x_name} has a significant effect on {topic_name}.")
    else:
        print(f"Fail to reject the null hypothesis: There is not enough evidence to conclude that {x_name} has a significant effect on {topic_name}.")

    return (p_values[0], p_values[1])

In [None]:
# Load corpus data to calculate corpus sizes for countries
df_corpus = pd.read_csv(corpus_file, dtype={'year': str, 'sentence': str}, comment='#')
# Add column with sentence length to corpus
df_corpus["sentence_length"] = df_corpus["sentence"].apply(lambda x: len(x.split()))
# Calculate corpus size for each country: Group data by country and sum up sentence length
df_corpus_country_sizes = pd.DataFrame(df_corpus.groupby(['country'])['sentence_length'].sum()).reset_index()


In [None]:
# Bar chart of the total corpus size (word count over all years) per country,
# ordered from the smallest to the largest corpus
df_sorted = df_corpus_country_sizes.sort_values('sentence_length')
fig = go.Figure()
fig.add_trace(go.Bar(x=df_sorted['country'], y=df_sorted['sentence_length']))

# Title and axis labels
fig.update_layout(
    title='Corpus size per country',
    xaxis_title='Country',
    yaxis_title='No of Words',
)

# Render the figure
fig.show()
#fig.write_image("images/corpus_size_all_years.pdf", width=1024, height=512)

In [None]:
# For every dimension: regress the per-country keyword count on the per-country
# corpus size, plot the fit and its residuals, and test for heteroskedasticity
dimensions = pd.Series(df['dimension']).unique()
all_z_scores = {}
for dim in dimensions:
    print(f"---PLOTS FOR {dim} DIMENSION---")
    # Filter by dimension and total the counts per country
    df_dimension = df[df['dimension'] == dim]
    df_counts = pd.DataFrame(df_dimension.groupby(['country'])['count'].sum()).reset_index()
    # Attach each country's corpus size; countries missing on either side are dropped
    df_merged = pd.merge(df_counts, df_corpus_country_sizes, on='country', how='inner')
    y_pred = linear_regression(df_merged, 'sentence_length', 'count')
    fig = plot_corpus_vs_topic(df_merged, x='sentence_length', y='count', y_pred=y_pred, topic_name=dim)
    fig = plot_fitted_vs_residuals(df_merged, x='sentence_length', y='count', topic_name=dim)
    fig, x, y = plot_residuals_zscore(df_merged, x='sentence_length', y='count', topic_name=dim)
    # Stored as (countries, z-scores) for the combined plot
    all_z_scores[dim] = (x, y)
    # Test the same model as plotted above (count ~ sentence_length).
    # The arguments were previously swapped, which tested sentence_length ~ count.
    breusch_pagan_test(df_merged, x_name='sentence_length', y_name='count', topic_name=dim)
    print()

In [None]:
# Combine the per-dimension z-scores into one horizontal bar chart
dimensions = df['dimension'].unique()
fig = go.Figure()
for dim in dimensions:
    # Each entry is stored as (countries, z-scores); countries go on the
    # vertical axis and z-scores on the horizontal axis
    y, x = all_z_scores[dim]
    bar = go.Bar(y=y, x=x, orientation='h', name=dim)
    fig.add_trace(bar)

# Axis labels and a fixed square canvas
fig.update_layout(
    title='',
    yaxis_title='',
    xaxis_title='z_score normalized residuals',
    width=1024,
    height=1024,
)

# Render the figure
fig.show()

In [None]:
# Total keyword counts per (dimension, year), summed over all countries and sources
df_grouped_with_years = pd.DataFrame(df.groupby(['dimension', 'year'])['count'].sum()).reset_index()

In [None]:
# Plot keywords changing over years and dimension
def plot_years(df, title=None):
    """Plot counts over the years, with one series per dimension.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``year``, ``dimension`` and ``count``.
    title : str, optional
        Figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, which has also been displayed via ``fig.show()``.
    """
    # Sort by year so every series is drawn in ascending year order
    by_year = df.sort_values('year')

    fig = go.Figure()
    for dim_name in by_year['dimension'].unique():
        series = by_year[by_year['dimension'] == dim_name]
        fig.add_trace(go.Scatter(x=series['year'], y=series['count'], name=dim_name))

    fig.update_layout(title=title, xaxis_title='year', yaxis_title='counts')

    fig.show()
    return fig

In [None]:
# Yearly keyword totals, one line per dimension
fig = plot_years(df_grouped_with_years)

In [None]:
# Total keyword counts per (dimension, source), summed over all countries and years
df_grouped_with_sources = pd.DataFrame(df.groupby(['dimension', 'source'])['count'].sum()).reset_index()

In [None]:
def plot_sources(df, x='source', y='count', title=None):
    """Draw one bar series per dimension, with ``df[x]`` as categories and ``df[y]`` as heights.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the column ``dimension`` plus the columns named by ``x`` and ``y``.
    x : str, default 'source'
        Column used for the bar categories; also used as the x-axis title.
    y : str, default 'count'
        Column used for the bar heights.
    title : str, optional
        Figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, which has also been displayed via ``fig.show()``.
    """
    bars = []
    for dim_name in df['dimension'].unique():
        rows = df[df['dimension'] == dim_name]
        bars.append(go.Bar(x=rows[x], y=rows[y], name=dim_name))

    fig = go.Figure(data=bars)
    fig.update_layout(title=title, xaxis_title=x, yaxis_title='counts')

    fig.show()
    return fig

In [None]:
# Keyword totals per source, one bar series per dimension
fig = plot_sources(df_grouped_with_sources)

In [None]:
# Total count of every dictionary keyword within its dimension
df_grouped_with_keywords = (
    df.groupby(['dictionary', 'dimension'])['count']
    .sum()
    .reset_index()
)
print(df_grouped_with_keywords)

In [None]:
# Plot the weight of terms contributing a dimension
def plot_terms_in_dimension(df, dimension, title='', plot_logs=False):
    """Plot how much each dictionary term contributes to one dimension.

    Terms are shown as bars sorted by descending count.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``dictionary``, ``dimension`` and ``count``.
    dimension : str
        Dimension whose terms are plotted; also used as the trace name.
    title : str
        Figure title.
    plot_logs : bool, default False
        When True, plot the natural logarithm of the counts instead of the
        raw counts. Counts that are not positive have no logarithm and are
        left out (NaN) rather than raising an error.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, which has also been displayed via ``fig.show()``.
    """
    df_dim = df[df['dimension'] == dimension].sort_values(by='count', ascending=False)

    if plot_logs:
        # Mask non-positive counts to NaN so the log is defined everywhere it is taken
        counts = df_dim['count']
        values = np.log(counts.where(counts > 0))
        y_title = 'log(counts)'
    else:
        values = df_dim['count']
        y_title = 'counts'

    fig = go.Figure()
    fig.add_trace(go.Bar(x=df_dim['dictionary'], y=values, name=dimension))
    # Customize the layout
    fig.update_layout(title=title, xaxis_title='', yaxis_title=y_title, width=1024, height=368)

    # Display the chart
    fig.show()
    # Return the figure like the other plot helpers, so callers can reuse it
    return fig

In [None]:
# For each dimension, plot its keyword totals first on a linear scale, then as logarithms
dimensions = df['dimension'].unique()
for dim in dimensions:
    print(f"---PLOT KEYWORDS IN {dim}---")
    for use_log in (False, True):
        fig = plot_terms_in_dimension(df_grouped_with_keywords, dim, plot_logs=use_log)