In [None]:
import pandas as pd
import math
import csv
import json
import string
import re
import utils
import nltk
from collections import Counter
import itertools
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer #PorterStemmer
from nltk.tokenize import word_tokenize
import plotly.graph_objects as go
from scipy.stats import norm
from sklearn.linear_model import LinearRegression
from scipy.stats import zscore
from scipy import stats
#import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_white, het_breuschpagan

# Download the NLTK resources used below and initialize the stemmer.
nltk.download('stopwords')
nltk.download('punkt')
# NOTE: this rebinds the name `stopwords` from the imported nltk corpus reader
# to a plain set of English stopwords. The tokenizing helpers below rely on the
# set for membership tests; re-running this line without re-running the import
# above fails, because a set has no `.words`.
stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [None]:
def get_stemmed_tokens(data_list):
    """Return the Porter stems of every content word found in ``data_list``.

    Each text has digit runs and ASCII punctuation removed and is split with
    ``word_tokenize``. Tokens that are stopwords (compared in lower case) or
    are not alphanumeric are discarded; the remaining tokens are stemmed.

    Parameters
    ----------
    data_list : iterable of str
        Texts to process.

    Returns
    -------
    list of str
        All stems, flattened into one list in input order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    stems_out = []
    for raw in data_list:
        cleaned = re.sub(r'\d+', '', raw).translate(punctuation_table)
        stems_out.extend(
            stemmer.stem(tok)
            for tok in word_tokenize(cleaned)
            if tok.isalnum() and tok.lower() not in stopwords
        )
    return stems_out

In [None]:
def get_stemmed_mapping(data_list):
    """Return parallel lists of stems and the words they were derived from.

    The cleaning is the same as in ``get_stemmed_tokens``: digits and ASCII
    punctuation are stripped, the text is tokenized, and stopwords and
    non-alphanumeric tokens are dropped.

    Parameters
    ----------
    data_list : iterable of str
        Texts to process.

    Returns
    -------
    tuple (list of str, list of str)
        ``(stems, words)`` where ``stems[i]`` is the stem of ``words[i]``.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    stems_out = []
    words_out = []
    for raw in data_list:
        cleaned = re.sub(r'\d+', '', raw).translate(punctuation_table)
        kept = [
            tok for tok in word_tokenize(cleaned)
            if tok.isalnum() and tok.lower() not in stopwords
        ]
        stems_out.extend(stemmer.stem(tok) for tok in kept)
        words_out.extend(kept)
    return (stems_out, words_out)

In [None]:
def get_stem_word_dict(data_list, stem_word_dict):
    """Extend a stem -> surface-words mapping with the words in ``data_list``.

    Unlike ``get_stemmed_tokens``, no stopword or alphanumeric filtering is
    applied here: every token produced by ``word_tokenize`` is recorded.

    Parameters
    ----------
    data_list : iterable of str
        Texts to process.
    stem_word_dict : dict or None
        Existing mapping to extend. A falsy value (``None`` or an empty dict)
        is replaced by a fresh dict.

    Returns
    -------
    dict
        Maps each stem to a de-duplicated list of the words that produced it.
        The order inside each list is not defined (it comes from a ``set``).
    """
    mapping = stem_word_dict if stem_word_dict else {}
    punctuation_table = str.maketrans('', '', string.punctuation)

    for raw in data_list:
        cleaned = re.sub(r'\d+', '', raw).translate(punctuation_table)
        for word in word_tokenize(cleaned):
            mapping.setdefault(stemmer.stem(word), []).append(word)

    # Collapse repeated surface forms under every stem, including stems that
    # were already present in the mapping passed in.
    for stem in mapping:
        mapping[stem] = list(set(mapping[stem]))

    return mapping

In [None]:
def get_words(data_list):
    """Tokenize every text in ``data_list`` into one flat list of words.

    Digit runs and ASCII punctuation are removed before tokenizing; case is
    preserved and no stopword filtering is applied.

    Parameters
    ----------
    data_list : iterable of str
        Texts to process.

    Returns
    -------
    list of str
        Tokens of all texts, concatenated in input order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    collected = []
    for raw in data_list:
        cleaned = re.sub(r'\d+', '', raw).translate(punctuation_table)
        collected.extend(word_tokenize(cleaned))
    return collected

In [None]:
def get_ngrams(data_list, n):
    """Build lower-cased, space-joined n-grams over the words of ``data_list``.

    The words of all texts are concatenated first (see ``get_words``), so an
    n-gram may span the boundary between two consecutive texts.

    Parameters
    ----------
    data_list : iterable of str
        Texts to process.
    n : int
        Number of words per n-gram.

    Returns
    -------
    list of str
        One string per sliding window of ``n`` words; empty when there are
        fewer than ``n`` words.
    """
    tokens = get_words(data_list)
    window_count = len(tokens) - n + 1
    return [
        ' '.join(tokens[start:start + n]).lower()
        for start in range(window_count)
    ]


In [None]:
def get_dict_counts(tokens, dictionary):
    """Count how many tokens fall into each entry of ``dictionary``.

    Parameters
    ----------
    tokens : sequence
        Tokens (or n-gram strings) to look up.
    dictionary : sequence
        One container per topic; membership is tested with ``in``.

    Returns
    -------
    tuple (list, list of dict)
        ``counts[i]`` is the number of tokens found in ``dictionary[i]`` and
        ``token_counts[i]`` maps each matching token to its frequency.
        ``counts`` is an empty list when ``tokens`` is empty.
    """
    n_topics = len(dictionary)
    totals = np.zeros(n_topics)
    per_topic_terms = [dict() for _ in range(n_topics)]

    for tok in tokens:
        for idx, entries in enumerate(dictionary):
            if tok not in entries:
                continue
            totals[idx] += 1
            per_topic_terms[idx][tok] = per_topic_terms[idx].get(tok, 0) + 1

    # Existing contract: no tokens at all yields an empty counts list.
    counts_list = list(totals) if len(tokens) > 0 else []
    return (counts_list, per_topic_terms)

In [None]:
def get_stem_token_counts(tokens, words):
    """Count how many tokens fall into each entry of ``words``.

    NOTE(review): the previous body looked tokens up in an undefined name
    ``dictionary`` and never used ``words``; it also sized its accumulators by
    ``len(tokens)`` while indexing them by lookup position. This version
    assumes ``words`` is the intended lookup (one container per group) —
    confirm against the intended caller.

    Parameters
    ----------
    tokens : sequence
        Tokens to look up.
    words : sequence
        One container per group; membership is tested with ``in``.

    Returns
    -------
    tuple (list, list of dict)
        ``counts[i]`` is the number of tokens found in ``words[i]`` and
        ``token_counts[i]`` maps each matching token to its frequency.
        ``counts`` is an empty list when ``tokens`` is empty.
    """
    counts = np.zeros(len(words))
    token_counts = [{} for _ in range(len(words))]
    for token in tokens:
        for i in range(len(words)):
            if token in words[i]:
                counts[i] += 1
                token_counts[i][token] = token_counts[i].get(token, 0) + 1

    return ([c for c in counts if len(tokens) > 0], token_counts)

In [None]:
def get_vocab_counts(tokens):
    """Return token frequencies ordered from most to least frequent.

    Parameters
    ----------
    tokens : iterable
        Hashable tokens to count.

    Returns
    -------
    dict
        Token -> count, in descending count order. Tokens with equal counts
        keep the order in which they first appeared.
    """
    tally = Counter(tokens)
    return dict(tally.most_common())

In [None]:
def load_json_dict(dict_file):
    """Load and return the JSON document stored at ``dict_file``.

    The file is read as UTF-8 explicitly so that non-ASCII terms decode the
    same way regardless of the platform's default locale encoding.

    Parameters
    ----------
    dict_file : str or path-like
        Path of the JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's content.
    """
    with open(dict_file, 'r', encoding='utf-8') as file:
        return json.load(file)

In [None]:
# Paths of the data set (all countries, years and sources) and of the topic
# dictionary; the dictionary JSON is loaded once here.
data_file = '../data/all_countries_0.0.6.csv'
dict_file = "../data/dict_6.json"
json_dict = load_json_dict(dict_file)

In [None]:
def get_topic_name(n, dict_file=json_dict):
    """Return the ``"name"`` of the ``n``-th topic (1-based).

    Parameters
    ----------
    n : int
        1-based topic number.
    dict_file : sequence of dict
        Loaded topic dictionary to look the name up in. Despite its name this
        is the parsed JSON content, not a path; it defaults to the
        module-level ``json_dict``. The argument was previously ignored.

    Returns
    -------
    The value stored under ``"name"`` for entry ``n - 1``.
    """
    return dict_file[n-1]["name"]

def get_no_topics():
    """Return the number of entries in the module-level ``json_dict``."""
    topic_total = len(json_dict)
    return topic_total

In [None]:
# Load the corpus once at module level. Several functions below read
# `countries`, `years`, `all_countries_data` and `sources` as globals.
countries, years, all_countries_data, sources = utils.get_countries_data(data_file)

In [None]:
def get_one_ngram_results(dict_file, data_file, year):
    """Count stemmed single-word dictionary hits per country.

    Parameters
    ----------
    dict_file : path handed to ``utils.get_seed_lists`` to load the 1-gram seeds.
    data_file : path handed to ``utils.get_countries_data`` to load the corpus.
    year : value compared against the ``year`` column, or the string ``"All"``
        to use every year.

    Returns
    -------
    dict
        Lists under ``"country"`` and ``"no_words"`` (number of stemmed tokens)
        plus one list of counts per topic under the integer keys ``1..k``.
        Countries that yield no tokens are omitted.
    """
    countries, years, all_countries_data, sources = utils.get_countries_data(data_file)
    dictionary, topics = utils.get_seed_lists(dict_file, 1, exact=True)
    dictionary = np.squeeze(list(dictionary))
    # Stem the dictionary entries so they compare against stemmed corpus tokens.
    ngram_dict = [list(get_stemmed_tokens(l)) for l in dictionary]

    results = {"country": [], "no_words": []}
    for i in range(len(ngram_dict)):
        results[i+1] = []

    df = all_countries_data
    for country in countries:
        if year == "All":
            country_data = df[df['country'] == country]
        else:
            country_data = df[(df['year'] == year) & (df['country'] == country)]

        # Take the column directly as a 1-D array. The earlier
        # np.squeeze(df[["sentence"]].to_numpy()) collapsed a single-row
        # selection into a 0-d array, which cannot be iterated.
        data_list = country_data["sentence"].to_numpy()
        tokens = get_stemmed_tokens(data_list)

        if len(tokens) == 0:
            continue

        counts, token_counts = get_dict_counts(tokens, ngram_dict)
        results["country"].append(country)
        results["no_words"].append(len(tokens))
        for i in range(len(counts)):
            results[i+1].append(counts[i])

    return results

In [None]:
def get_one_ngram_results_by_year(dict_file, data_file):
    """Count stemmed single-word dictionary hits per year.

    Parameters
    ----------
    dict_file : path handed to ``utils.get_seed_lists`` to load the 1-gram seeds.
    data_file : path handed to ``utils.get_countries_data`` to load the corpus.

    Returns
    -------
    tuple (dict, dict)
        ``results`` holds lists under ``"year"`` and ``"no_words"`` plus one
        list of counts per topic under the integer keys ``1..k``;
        ``token_counts_all_years`` maps each year to the per-topic term
        frequencies returned by ``get_dict_counts``. Years that yield no
        tokens are omitted from both.
    """
    countries, years, all_countries_data, sources = utils.get_countries_data(data_file)
    dictionary, topics = utils.get_seed_lists(dict_file, 1, exact=True)
    dictionary = np.squeeze(list(dictionary))
    # Stem the dictionary entries so they compare against stemmed corpus tokens.
    ngram_dict = [list(get_stemmed_tokens(l)) for l in dictionary]

    results = {"year": [], "no_words": []}
    token_counts_all_years = {}
    for i in range(len(ngram_dict)):
        results[i+1] = []

    df = all_countries_data
    for year in years:
        year_data = df[(df['year'] == year)]

        # 1-D column access; np.squeeze on a single-row selection produced a
        # 0-d array that cannot be iterated.
        data_list = year_data["sentence"].to_numpy()
        tokens = get_stemmed_tokens(data_list)

        if len(tokens) == 0:
            continue

        counts, token_counts = get_dict_counts(tokens, ngram_dict)
        results["year"].append(year)
        results["no_words"].append(len(tokens))
        token_counts_all_years[year] = token_counts
        for i in range(len(counts)):
            results[i+1].append(counts[i])

    return (results, token_counts_all_years)

In [None]:
def get_one_ngram_results_by_source(dict_file, data_file):
    """Count stemmed single-word dictionary hits per source.

    Parameters
    ----------
    dict_file : path handed to ``utils.get_seed_lists`` to load the 1-gram seeds.
    data_file : path handed to ``utils.get_countries_data`` to load the corpus.

    Returns
    -------
    tuple (dict, dict)
        ``results`` holds lists under ``"source"`` and ``"no_words"`` plus one
        list of counts per topic under the integer keys ``1..k``;
        ``token_counts_all_source`` maps each source to the per-topic term
        frequencies returned by ``get_dict_counts``. Sources that yield no
        tokens are omitted from both.
    """
    countries, years, all_countries_data, sources = utils.get_countries_data(data_file)
    dictionary, topics = utils.get_seed_lists(dict_file, 1, exact=True)
    dictionary = np.squeeze(list(dictionary))
    # Stem the dictionary entries so they compare against stemmed corpus tokens.
    ngram_dict = [list(get_stemmed_tokens(l)) for l in dictionary]

    results = {"source": [], "no_words": []}
    token_counts_all_source = {}
    for i in range(len(ngram_dict)):
        results[i+1] = []

    df = all_countries_data
    for source in sources:
        source_data = df[(df['source'] == source)]

        # 1-D column access; np.squeeze on a single-row selection produced a
        # 0-d array that cannot be iterated.
        data_list = source_data["sentence"].to_numpy()
        tokens = get_stemmed_tokens(data_list)

        if len(tokens) == 0:
            continue

        counts, token_counts = get_dict_counts(tokens, ngram_dict)
        results["source"].append(source)
        results["no_words"].append(len(tokens))
        token_counts_all_source[source] = token_counts
        for i in range(len(counts)):
            results[i+1].append(counts[i])

    return (results, token_counts_all_source)

In [None]:
def get_one_ngram_results_by_source_and_country(dict_file, data_file):
    """Count stemmed single-word dictionary hits per (country, source) pair.

    Parameters
    ----------
    dict_file : path handed to ``utils.get_seed_lists`` to load the 1-gram seeds.
    data_file : path handed to ``utils.get_countries_data`` to load the corpus.

    Returns
    -------
    tuple (dict, dict)
        ``results`` holds lists under ``"source"``, ``"country"`` and
        ``"no_words"`` plus one list of counts per topic under the integer
        keys ``1..k``. Pairs that yield no tokens are omitted.
        ``token_counts_all_source`` maps a source to per-topic term
        frequencies.
    """
    countries, years, all_countries_data, sources = utils.get_countries_data(data_file)
    dictionary, topics = utils.get_seed_lists(dict_file, 1, exact=True)
    dictionary = np.squeeze(list(dictionary))
    # Stem the dictionary entries so they compare against stemmed corpus tokens.
    ngram_dict = [list(get_stemmed_tokens(l)) for l in dictionary]

    results = {"source": [], "country": [], "no_words": []}
    token_counts_all_source = {}
    for i in range(len(ngram_dict)):
        results[i+1] = []

    df = all_countries_data
    for country in countries:
        for source in sources:
            source_data = df[(df['source'] == source) & (df["country"] == country)]

            # 1-D column access; np.squeeze on a single-row selection produced
            # a 0-d array that cannot be iterated.
            data_list = source_data["sentence"].to_numpy()
            tokens = get_stemmed_tokens(data_list)

            if len(tokens) == 0:
                continue

            counts, token_counts = get_dict_counts(tokens, ngram_dict)
            results["source"].append(source)
            results["country"].append(country)
            results["no_words"].append(len(tokens))
            # NOTE(review): keyed by source only, so each country overwrites
            # the previous one and only the last country's term frequencies
            # survive per source. Kept as-is to preserve the return shape.
            token_counts_all_source[source] = token_counts
            for i in range(len(counts)):
                results[i+1].append(counts[i])

    return (results, token_counts_all_source)

In [None]:
def get_ngram_results(n, dict_file, data_file, year):
    """Count dictionary n-gram hits per country.

    For ``n == 1`` the work is delegated to ``get_one_ngram_results`` (stemmed
    matching, corpus loaded from ``data_file``). For larger ``n`` the
    module-level ``countries`` and ``all_countries_data`` are used and
    ``data_file`` is not read.

    Parameters
    ----------
    n : int
        Number of words per n-gram.
    dict_file : path handed to ``utils.get_seed_lists`` to load the n-gram seeds.
    data_file : corpus path; only used when ``n == 1``.
    year : value compared against the ``year`` column, or the string ``"All"``
        to use every year.

    Returns
    -------
    dict
        Lists under ``"country"`` and ``"no_words"`` (number of n-grams) plus
        one list of counts per topic under the integer keys ``1..k``.
        Countries that yield no n-grams are omitted.
    """
    if n == 1:
        return get_one_ngram_results(dict_file, data_file, year)

    dictionary, topics = utils.get_seed_lists(dict_file, n, exact=True)
    ngram_dict = np.squeeze(list(dictionary))

    results = {"country": [], "no_words": []}
    for i in range(len(ngram_dict)):
        results[i+1] = []

    df = all_countries_data
    for country in countries:
        if year == "All":
            country_data = df[df['country'] == country]
        else:
            country_data = df[(df['year'] == year) & (df['country'] == country)]

        # 1-D column access; np.squeeze on a single-row selection produced a
        # 0-d array that cannot be iterated.
        data_list = country_data["sentence"].to_numpy()
        ngrams = get_ngrams(data_list, n)

        if len(ngrams) == 0:
            continue
        counts, token_counts = get_dict_counts(ngrams, ngram_dict)

        results["country"].append(country)
        results["no_words"].append(len(ngrams))
        for i in range(len(counts)):
            results[i+1].append(counts[i])

    return results

In [None]:
def get_ngram_results_by_year(n, dict_file, data_file):
    """Count dictionary n-gram hits per year.

    For ``n == 1`` the work is delegated to ``get_one_ngram_results_by_year``.
    For larger ``n`` the module-level ``years`` and ``all_countries_data`` are
    used and ``data_file`` is not read.

    Parameters
    ----------
    n : int
        Number of words per n-gram.
    dict_file : path handed to ``utils.get_seed_lists`` to load the n-gram seeds.
    data_file : corpus path; only used when ``n == 1``.

    Returns
    -------
    tuple (dict, dict)
        ``results`` holds lists under ``"year"`` and ``"no_words"`` plus one
        list of counts per topic under the integer keys ``1..k``;
        ``token_counts_all_years`` maps each year to the per-topic n-gram
        frequencies. Years that yield no n-grams are omitted from both.
    """
    if n == 1:
        return get_one_ngram_results_by_year(dict_file, data_file)

    dictionary, topics = utils.get_seed_lists(dict_file, n, exact=True)
    ngram_dict = np.squeeze(list(dictionary))

    results = {"year": [], "no_words": []}
    token_counts_all_years = {}
    for i in range(len(ngram_dict)):
        results[i+1] = []

    df = all_countries_data
    for year in years:
        year_data = df[(df['year'] == year)]

        # 1-D column access; np.squeeze on a single-row selection produced a
        # 0-d array that cannot be iterated.
        data_list = year_data["sentence"].to_numpy()
        ngrams = get_ngrams(data_list, n)

        if len(ngrams) == 0:
            continue
        counts, token_counts = get_dict_counts(ngrams, ngram_dict)

        results["year"].append(year)
        results["no_words"].append(len(ngrams))
        token_counts_all_years[year] = token_counts
        for i in range(len(counts)):
            results[i+1].append(counts[i])

    return (results, token_counts_all_years)

In [None]:
def get_ngram_results_by_source(n, dict_file, data_file):
    """Count dictionary n-gram hits per source.

    For ``n == 1`` the work is delegated to
    ``get_one_ngram_results_by_source``. For larger ``n`` the module-level
    ``sources`` and ``all_countries_data`` are used and ``data_file`` is not
    read.

    Parameters
    ----------
    n : int
        Number of words per n-gram.
    dict_file : path handed to ``utils.get_seed_lists`` to load the n-gram seeds.
    data_file : corpus path; only used when ``n == 1``.

    Returns
    -------
    tuple (dict, dict)
        ``results`` holds lists under ``"source"`` and ``"no_words"`` plus one
        list of counts per topic under the integer keys ``1..k``;
        ``token_counts_all_source`` maps each source to the per-topic n-gram
        frequencies. Sources that yield no n-grams are omitted from both.
    """
    if n == 1:
        return get_one_ngram_results_by_source(dict_file, data_file)

    dictionary, topics = utils.get_seed_lists(dict_file, n, exact=True)
    ngram_dict = np.squeeze(list(dictionary))

    results = {"source": [], "no_words": []}
    token_counts_all_source = {}
    for i in range(len(ngram_dict)):
        results[i+1] = []

    df = all_countries_data
    for source in sources:
        source_data = df[(df['source'] == source)]

        # 1-D column access; np.squeeze on a single-row selection produced a
        # 0-d array that cannot be iterated.
        data_list = source_data["sentence"].to_numpy()
        ngrams = get_ngrams(data_list, n)

        if len(ngrams) == 0:
            continue
        counts, token_counts = get_dict_counts(ngrams, ngram_dict)

        results["source"].append(source)
        results["no_words"].append(len(ngrams))
        token_counts_all_source[source] = token_counts
        for i in range(len(counts)):
            results[i+1].append(counts[i])

    return (results, token_counts_all_source)

In [None]:
def get_ngram_results_by_source_and_country(n, dict_file, data_file):
    """Count dictionary n-gram hits per (country, source) pair.

    For ``n == 1`` the work is delegated to
    ``get_one_ngram_results_by_source_and_country``. For larger ``n`` the
    module-level ``countries``, ``sources`` and ``all_countries_data`` are
    used and ``data_file`` is not read.

    Parameters
    ----------
    n : int
        Number of words per n-gram.
    dict_file : path handed to ``utils.get_seed_lists`` to load the n-gram seeds.
    data_file : corpus path; only used when ``n == 1``.

    Returns
    -------
    tuple (dict, dict)
        ``results`` holds lists under ``"source"``, ``"country"`` and
        ``"no_words"`` plus one list of counts per topic under the integer
        keys ``1..k``. Pairs that yield no n-grams are omitted.
        ``token_counts_all_source`` maps a source to per-topic n-gram
        frequencies.
    """
    if n == 1:
        return get_one_ngram_results_by_source_and_country(dict_file, data_file)

    dictionary, topics = utils.get_seed_lists(dict_file, n, exact=True)
    ngram_dict = np.squeeze(list(dictionary))

    results = {"source": [], "country": [], "no_words": []}
    token_counts_all_source = {}
    for i in range(len(ngram_dict)):
        results[i+1] = []

    df = all_countries_data
    for country in countries:
        for source in sources:
            source_data = df[(df['source'] == source) & (df['country'] == country)]

            # 1-D column access; np.squeeze on a single-row selection produced
            # a 0-d array that cannot be iterated.
            data_list = source_data["sentence"].to_numpy()
            ngrams = get_ngrams(data_list, n)

            if len(ngrams) == 0:
                continue
            counts, token_counts = get_dict_counts(ngrams, ngram_dict)

            results["source"].append(source)
            results["country"].append(country)
            results["no_words"].append(len(ngrams))
            # NOTE(review): keyed by source only, so each country overwrites
            # the previous one and only the last country's frequencies survive
            # per source. Kept as-is to preserve the return shape.
            token_counts_all_source[source] = token_counts
            for i in range(len(counts)):
                results[i+1].append(counts[i])

    return (results, token_counts_all_source)

In [None]:
def get_corpus_sizes(data_file, year, n=1):
    """Return the corpus size of every country as a DataFrame.

    Reads the module-level ``countries`` and ``all_countries_data``;
    ``data_file`` is accepted for signature compatibility but not read.

    Parameters
    ----------
    data_file : unused.
    year : value compared against the ``year`` column, or the string ``"All"``
        to use every year (same convention as ``get_ngram_results``).
    n : int, default 1
        Size of the n-grams being counted. The body previously referred to an
        ``n`` that was neither a parameter nor a local, so it only ran when a
        global of that name happened to exist. With the default of 1 the
        result is the number of words.

    Returns
    -------
    pandas.DataFrame
        Columns ``"country"`` and ``"no_words"``, one row per country
        (including countries with zero n-grams).
    """
    results = {"country": [], "no_words": []}
    df = all_countries_data
    for country in countries:
        if year == "All":
            country_data = df[df['country'] == country]
        else:
            country_data = df[(df['year'] == year) & (df['country'] == country)]

        # 1-D column access; np.squeeze on a single-row selection produced a
        # 0-d array that cannot be iterated.
        data_list = country_data["sentence"].to_numpy()
        ngrams = get_ngrams(data_list, n)

        results["country"].append(country)
        results["no_words"].append(len(ngrams))
    return pd.DataFrame(results)

In [None]:
def normalize_column(df, col):
    """Min-max scale ``df[col]`` to the range [0, 1], in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    col : hashable
        Label of the column to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.
    """
    min_value = df[col].min()
    max_value = df[col].max()
    value_range = max_value - min_value

    # A constant column has a zero range; dividing by it would turn every value
    # into NaN. Dividing by 1 instead maps such a column to all zeros.
    if value_range == 0:
        value_range = 1

    df[col] = (df[col] - min_value) / value_range
    return df

In [None]:
def normalize_columns(df, cols):
    """Apply ``normalize_column`` to each column in ``cols`` and return the result.

    No copy is made here: the frame passed in is the one handed to
    ``normalize_column`` for every column.
    """
    scaled = df
    for column in cols:
        scaled = normalize_column(scaled, column)
    return scaled

In [None]:
def plot(df, x='country', start_col=2, year=None):
    """Show a bar chart with one trace per topic column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; topic columns must be labelled with topic numbers
        accepted by ``get_topic_name``.
    x : str
        Column used for the x axis.
    start_col : int
        Position of the first topic column; earlier columns are skipped.
        This argument was previously ignored in favour of a hard-coded 2.
    year : any
        Only used in the chart title.
    """
    fig = go.Figure()

    # One bar trace per topic column.
    for col in df.columns[start_col:]:
        fig.add_trace(go.Bar(x=df[x], y=df[col], name=get_topic_name(col)))

    fig.update_layout(title=f'dimensions by {x} year {year}', xaxis_title=x, yaxis_title='coverage')

    fig.show()

In [None]:
def plot_years(df, x='year', start_col=2):
    """Show a line chart with one trace per topic column, ordered by year.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain a ``'year'`` column, which is used for
        sorting regardless of ``x``.
    x : str
        Column used for the x axis.
    start_col : int
        Position of the first topic column; earlier columns are skipped.
        This argument was previously ignored in favour of a hard-coded 2.
    """
    df = df.sort_values(by='year', ascending=True)
    fig = go.Figure()

    # One line trace per topic column.
    for col in df.columns[start_col:]:
        fig.add_trace(go.Scatter(x=df[x], y=df[col], name=get_topic_name(col)))

    fig.update_layout(title=f'dimensions by {x}', xaxis_title=x, yaxis_title='coverage')

    fig.show()

In [None]:
def plot_sources(df, x='source', start_col=2):
    """Show a bar chart with one trace per topic column, grouped by source.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; topic columns must be labelled with topic numbers
        accepted by ``get_topic_name``.
    x : str
        Column used for the x axis.
    start_col : int
        Position of the first topic column; earlier columns are skipped.
        This argument was previously ignored in favour of a hard-coded 2.
    """
    fig = go.Figure()

    # One bar trace per topic column.
    for col in df.columns[start_col:]:
        fig.add_trace(go.Bar(x=df[x], y=df[col], name=get_topic_name(col)))

    fig.update_layout(title=f'dimensions by {x}', xaxis_title=x, yaxis_title='coverage')

    fig.show()

In [None]:
def plot_corpus_vs_topic(df, x='no_words', col=None, y_pred=[], topic_name='', title=''):
    """Scatter one topic column against ``df[x]``, optionally with a fitted line.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``x``, ``col`` and a ``'country'`` column (hover text).
    x : str
        Column used for the x axis.
    col : hashable
        Column plotted on the y axis.
    y_pred : sequence
        Predicted y values; a second trace is added only when it is non-empty.
        The default list is never modified.
    topic_name : str
        Legend entry and y-axis label prefix.
    title : str
        Chart title.
    """
    traces = [
        go.Scatter(x=df[x], y=df[col], name=topic_name, mode='markers', text=df['country'])
    ]
    # Overlay the regression line only when predictions were supplied.
    if len(y_pred) > 0:
        traces.append(go.Scatter(x=df[x], y=y_pred, name='linear fit'))

    chart = go.Figure(data=traces)
    chart.update_layout(title=title, xaxis_title='corpus size', yaxis_title=f'{topic_name} counts')
    chart.show()

In [None]:
def plot_fitted_vs_residuals(df, x='no_words', col=None, topic_name='', title=''):
    """Plot OLS residuals of ``df[col]`` on ``df[x]`` against ``df[x]``.

    An ordinary least squares model with intercept is fitted and its residuals
    are drawn as markers, together with a zero reference line, as a visual
    check for heteroskedasticity.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``x``, ``col`` and a ``'country'`` column (hover text).
    x : str
        Regressor column, also used for the x axis.
    col : hashable
        Response column.
    topic_name : str
        Legend entry and y-axis label prefix.
    title : str
        Chart title.
    """
    design = sm.add_constant(df[x])
    response = df[col]
    fitted = sm.OLS(response, design).fit()
    resid = response - fitted.predict(design)

    traces = [
        go.Scatter(x=df[x], y=resid, name=topic_name, mode='markers', text=df['country']),
        # Zero line: where the residuals would sit for a perfect fit.
        go.Scatter(x=df[x], y=np.zeros(len(resid)), name='predicted'),
    ]
    chart = go.Figure(data=traces)
    chart.update_layout(title=title, xaxis_title='corpus size', yaxis_title=f'{topic_name} residuals')
    chart.show()

In [None]:
def plot_corpus_mean_deviation(df, x='country', col=None, topic_name='', title=''):
    """Bar-plot rescaled OLS residuals of several topic columns.

    For each column in ``col`` an OLS model of that column on ``'no_words'``
    (with intercept) is fitted; its residuals are rescaled linearly so the
    smallest becomes -1 and the largest +1, and drawn as one bar trace.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'no_words'``, ``x`` and every column in ``col``.
    x : str
        Column used for the x axis and, now, for the x-axis title (the title
        was previously hard-coded to 'country').
    col : iterable
        Topic columns to plot; each must be accepted by ``get_topic_name``.
    topic_name : str
        Unused; kept for signature compatibility.
    title : str
        Chart title.
    """
    fig = go.Figure()
    X = sm.add_constant(df['no_words'])
    for c in col:
        y = df[c]
        model = sm.OLS(y, X).fit()

        residuals = y - model.predict(X)
        min_value = residuals.min()
        max_value = residuals.max()

        # Rescale to [-1, 1] so topics with different magnitudes are comparable.
        norm_residuals = 2 * (residuals - min_value) / (max_value - min_value) - 1

        fig.add_trace(go.Bar(x=df[x], y=norm_residuals, name=get_topic_name(c)))

    fig.update_layout(title=title, xaxis_title=x, yaxis_title='residuals')

    fig.show()

In [None]:
def plot_topic(df, n):
    """Show a bar chart of topic ``n`` per country, sorted ascending.

    The body previously referred to a name ``topic`` that is not a parameter,
    so it only worked when a global of that name happened to exist; the column
    is now selected with ``n``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'country'`` column and a column labelled ``n``.
    n : int
        Topic number, used both as column label and for ``get_topic_name``.
    """
    topic_label = get_topic_name(n)
    df_sorted = df.sort_values(by=n, ascending=True)
    fig = go.Figure(data=[go.Bar(x=df_sorted['country'], y=df_sorted[n])])

    fig.update_layout(title=topic_label, xaxis_title='Country', yaxis_title=f'{topic_label}')

    fig.show()

In [None]:
def plot_z_score(df, n):
    """Show a bar chart of the z-score column of topic ``n`` per country.

    The chart title previously used a name ``topic`` that is not defined in
    this function; it now uses the same label as the y axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'country'`` column and a ``"<n>_z_score"`` column
        (as created by ``calc_mean_std``).
    n : int
        Topic number.
    """
    col_name = str(n) + "_z_score"
    df_sorted = df.sort_values(by=col_name, ascending=True)
    fig = go.Figure(data=[go.Bar(x=df_sorted['country'], y=df_sorted[col_name])])

    name = get_topic_name(n) + " z_score"
    fig.update_layout(title=name, xaxis_title='Country', yaxis_title=name)

    fig.show()

In [None]:
def plot_terms_in_dimension(freq, topic):
    """Show a bar chart of log term counts for one topic.

    Parameters
    ----------
    freq : mapping
        Maps a topic to a dict of term -> count.
    topic : hashable
        Key into ``freq``; also passed to ``get_topic_name`` for the labels.
    """
    # Most frequent terms first.
    ranked = sorted(freq[topic].items(), key=lambda pair: pair[1], reverse=True)
    terms = [term for term, _ in ranked]
    log_counts = [math.log(count) for _, count in ranked]

    chart = go.Figure(data=[go.Bar(x=terms, y=log_counts)])
    chart.update_layout(
        xaxis_title='keywords',
        yaxis_title=f'{get_topic_name(topic)} log counts',
        title=f'{get_topic_name(topic)} dimension'
    )
    chart.show()

In [None]:
def get_most_freq_terms_per_topic(tks):
    """Sum term frequencies per topic over several per-year count tables.

    Parameters
    ----------
    tks : iterable of dict
        Each dict maps a year to a list of term -> count dicts, one per topic
        in topic order.

    Returns
    -------
    dict
        Maps the 1-based topic id to a dict of term -> total count across all
        tables and years.
    """
    totals = {}
    for per_year in tks:
        for topic_tables in per_year.values():
            for topic_id, term_counts in enumerate(topic_tables, start=1):
                bucket = totals.setdefault(topic_id, {})
                for term, count in term_counts.items():
                    bucket[term] = bucket.get(term, 0) + count
    return totals


In [None]:
def calc_mean_std(df, cols):
    """Add a ``"<col>_z_score"`` column to ``df`` for every column in ``cols``.

    The z-score uses the column mean and the pandas default standard
    deviation. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend.
    cols : iterable
        Labels of the columns to standardize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns.
    """
    for label in cols:
        center = df[label].mean()
        spread = df[label].std()
        df[str(label) + "_z_score"] = df[label].apply(
            lambda value, m=center, s=spread: (value - m) / s
        )
    return df

In [None]:
def plot_mean(df, c):
    """Print mean/std of ``df[c]`` and plot the matching normal density curve.

    The curve is evaluated at 100 evenly spaced points within three standard
    deviations of the mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    c : hashable
        Label of the column to summarize.
    """
    mu = df[c].mean()
    sigma = df[c].std()
    print(f'mean: {mu}, std: {sigma}')

    grid = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
    density = norm.pdf(grid, mu, sigma)

    curve = go.Scatter(
        x=grid,
        y=density,
        mode='lines',
        line=dict(color='blue', width=2),
        name='Normal Distribution'
    )
    chart = go.Figure()
    chart.add_trace(curve)
    chart.update_layout(
        title='Normal Distribution Curve',
        xaxis_title='Value',
        yaxis_title='Probability Density',
        showlegend=False,
        template='plotly_white'
    )
    chart.show()


In [None]:
def linear_regression(df, x_name, y_name):
    """Fit ``df[y_name]`` on ``df[x_name]`` and return the in-sample predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    x_name : hashable
        Label of the single regressor column.
    y_name : hashable
        Label of the response column.

    Returns
    -------
    The output of ``LinearRegression.predict`` on the training regressor.
    """
    features = df[[x_name]]
    target = df[y_name]
    fitted = LinearRegression().fit(features, target)
    return fitted.predict(features)

In [None]:
def calc_vec_mean_std_z_score(df, cols):
    """Return column means, standard deviations and z-scores for ``df[cols]``.

    NOTE(review): the standard deviations come from pandas ``.std()`` while the
    z-scores come from ``scipy.stats.zscore``; the two use different default
    ``ddof`` values, so the z-scores are not ``(x - mean) / std`` with the
    returned ``std`` — confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; not modified.
    cols : list
        Labels of the columns to analyse.

    Returns
    -------
    tuple
        ``(mean_vector, std_vector, z_scores_df)`` where the z-score frame has
        the columns ``cols`` and a fresh default index.
    """
    subset = df[cols].copy()
    z_scores_df = pd.DataFrame(zscore(subset), columns=cols)
    return (subset.mean(), subset.std(), z_scores_df)


In [None]:
def merge_results(dfs, start_from=2):
    """Sum the count columns of several result frames into one frame.

    The first frame supplies the rows, the leading (non-count) columns and the
    column order. Counts from the other frames are matched to those rows by
    their identifier columns — the leading columns other than ``"no_words"`` —
    instead of by row position. Row-position matching shifted counts onto the
    wrong rows whenever a frame lacked a row (for example a country skipped
    because it had no n-grams of that size). Rows missing from a later frame
    contribute zero.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames with the same column layout; identifier values are expected to
        be unique within each frame.
    start_from : int
        Position of the first count column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dfs[0]`` whose count columns hold the sums over all frames.
    """
    result_df = dfs[0].copy()
    all_columns = result_df.columns.tolist()
    count_cols = all_columns[start_from:]
    if not count_cols:
        return result_df

    id_cols = [c for c in all_columns[:start_from] if c != "no_words"]
    # Without identifier columns fall back to matching on the row index.
    if id_cols:
        target_index = result_df.set_index(id_cols).index
    else:
        target_index = result_df.index

    for other in dfs[1:]:
        counts = other.set_index(id_cols)[count_cols] if id_cols else other[count_cols]
        # Put the other frame's rows in the order of the first frame.
        aligned = counts.reindex(target_index, fill_value=0).fillna(0)
        result_df[count_cols] = result_df[count_cols].fillna(0) + aligned.to_numpy()

    return result_df

In [None]:
def breusch_pagan_test(df, x_name, y_name, alpha=0.05):
    """Run White and Breusch-Pagan tests on an OLS fit and report the slope.

    Fits ``df[y_name]`` on ``df[x_name]`` with an intercept, prints the p-value
    and verdict of both heteroskedasticity tests, then prints the slope
    coefficient, its standard error, t-statistic and p-value together with a
    significance verdict at level ``alpha``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    x_name : str
        Regressor column.
    y_name : hashable
        Response column; also passed to ``get_topic_name`` for the messages.
    alpha : float
        Significance level for all three verdicts.

    Returns
    -------
    tuple
        ``(white_p_value, breusch_pagan_p_value)``.
    """
    def _report_heteroskedasticity(p):
        # Same verdict wording for both tests.
        if p < alpha:
            print("Heteroskedasticity present")
        else:
            print("NO heteroskedasticity present")

    design = sm.add_constant(df[x_name])  # adds the intercept column
    response = df[y_name]
    fitted = sm.OLS(response, design).fit()
    resid = fitted.resid

    white_p = het_white(resid, design)[1]
    print("White's test p-value:", white_p)
    _report_heteroskedasticity(white_p)

    bp_p = het_breuschpagan(resid, design)[1]
    print("Breusch-Pagan test p-value:", bp_p)
    _report_heteroskedasticity(bp_p)

    slope = fitted.params[x_name]
    slope_se = fitted.bse[x_name]
    slope_p = fitted.pvalues[x_name]

    print("Coefficient of X:", slope)
    print("Standard error of coefficient of X:", slope_se)
    print("t-statistic:", slope / slope_se)
    print("p-value:", slope_p)

    if slope_p < alpha:
        print(f"Reject the null hypothesis: {x_name} has a significant effect on {get_topic_name(y_name)}.")
    else:
        print(f"Fail to reject the null hypothesis: There is not enough evidence to conclude that {x_name} has a significant effect on {get_topic_name(y_name)}.")

    return (white_p, bp_p)

In [None]:
def get_stats(data, confidence_level=0.95):
    """Compute a normal-approximation confidence interval for the mean of ``data``.

    Parameters
    ----------
    data : sequence of numbers
        Sample values.
    confidence_level : float
        Two-sided confidence level, e.g. 0.95.

    Returns
    -------
    tuple
        ``(confidence_interval, margin_of_error, mean, std_dev,
        critical_value, standard_error)`` where ``confidence_interval`` is a
        ``(low, high)`` pair and ``std_dev`` is ``np.std(data)``.
    """
    center = np.mean(data)
    spread = np.std(data)
    # Two-sided critical value of the standard normal distribution.
    z_crit = stats.norm.ppf((1 + confidence_level) / 2)
    sem = spread / np.sqrt(len(data))
    half_width = z_crit * sem
    interval = (center - half_width, center + half_width)
    return (interval, half_width, center, spread, z_crit, sem)

In [None]:
# Count dictionary hits for n-grams of size 1 to 4, one DataFrame per size.
dfs = []
#year = "2020"
year = "All"
for n in range(1, 5):    
    results = get_ngram_results(n, dict_file, data_file, year=year)
    df = pd.DataFrame(results)
    dfs.append(df)
    
# Merge the per-size counts together into one frame.
merged_df = merge_results(dfs)
# Save the merged data frame.
merged_df.to_csv('dims_all_years_all_countries.csv', index=False)

In [None]:
def plot_country_groups(merged_df, title='Democratic dimensions'):
    """Show mean topic counts per country group with error bars.

    Rows of ``merged_df`` are split into four fixed country groups; for topics
    1 to 5 the group mean is drawn as a bar and the margin of error returned
    by ``get_stats`` as its error bar.

    'denmark' used to be listed in both the western and the CEE group, so its
    rows were counted in two groups; it is now only in the western group.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain a ``'country'`` column and count columns labelled 1..5.
    title : str
        Chart title.
    """
    west_22 = ['greece', 'italy', 'ireland', 'liechtenstein', 'monaco', 'luxembourg',
               'portugal', 'finland', 'austria', 'norway', 'france', 'denmark',
               'spain', 'san-marino', 'switzerland', 'sweden', 'germany', 'united-kingdom',
               'belgium', 'netherlands', 'iceland', 'andorra']
    cee = ['slovenia', 'slovakia', 'latvia', 'poland', 'malta',
           'lithuania', 'croatia', 'cyprus', 'hungary',
           'estonia', 'romania', 'bulgaria', 'czechia']
    eu_candidates = ['kosovo', 'moldova', 'ukraine', 'serbia',
                     'north-macedonia', 'turkey', 'bosnia-herzegovina', 'montenegro',
                     'albania', 'georgia']
    non_eu = ['armenia', 'azerbaijan', 'belarus', 'russia']

    # Group order must match category_names below.
    groups = [
        merged_df[merged_df['country'].isin(members)]
        for members in (west_22, cee, eu_candidates, non_eu)
    ]
    category_names = ['west_eu', 'cee', 'eu_candidates', 'non_eu']

    # One bar trace per topic, with one bar per country group.
    traces = []
    for t in range(1, 6):
        vals_mean = [g[t].mean() for g in groups]
        margin_error = [get_stats(g[t])[1] for g in groups]
        trace = go.Bar(x=category_names,
                       y=vals_mean,
                       error_y=dict(type='data', array=margin_error, visible=True),
                       name=get_topic_name(t))
        traces.append(trace)

    fig = go.Figure(data=traces)

    fig.update_layout(
        title=title,
        xaxis_title='country groups',
        yaxis_title='counts',
        barmode='group'  # draw the topic bars side by side within each group
    )

    fig.show()
    
# Plot group means over the merged all-years counts.
plot_country_groups(merged_df)
    
# statistical significance 
# Perform the two-sample t-test
#topics = [1]
#for topic in topics:
#    for i, g in enumerate(groups):
#        for k, j in enumerate(groups):
#            if (i == k):
#                continue
#            t_stat, p_value = stats.ttest_ind(g[topic], j[topic])
#            alpha = 0.05
            # Print the results
            #print("t-statistic:", t_stat)
            #print("p-value:", p_value)

#            if p_value < alpha:
#                print(f"Statistically significant difference between {category_names[i]} , {category_names[k]} on topic {get_topic_name(topic)} with alpha {alpha}.")
#                #print("Reject the null hypothesis: There is a statistically significant difference between the two groups.")
#            else:
#                print(f"NO statistically significant difference between {category_names[i]} , {category_names[k]} on topic {get_topic_name(topic)} with alpha {alpha}.")
                #print("Fail to reject the null hypothesis: There is no statistically significant difference between the two groups.")
        

#t_stat, p_value = stats.ttest_ind(group_1, group_3)
#alpha = 0.05
# Print the results
#print("t-statistic:", t_stat)
#print("p-value:", p_value)

#if p_value < alpha:
#    print("Reject the null hypothesis: There is a statistically significant difference between the two groups.")
#else:
#    print("Fail to reject the null hypothesis: There is no statistically significant difference between the two groups.")

In [None]:
# Count n-grams of order 1 to 4, grouped by source and by country.
# One DataFrame per n-gram order is collected in `dfs` (index 0 -> n = 1).
dfs = []

for n in range(1, 5):
    # The helper returns a (results, token_counts) pair; only the results are kept here.
    results, token_counts = get_ngram_results_by_source_and_country(n, dict_file, data_file)
    dfs.append(pd.DataFrame(results))

In [None]:
# Merge the per-n-gram frames in `dfs` together into a single frame.
# NOTE(review): the meaning of start_from=3 is defined by merge_results (not shown here) — confirm.
# NOTE(review): `dfs` is reassigned by several cells, so this must run right after the
# source-and-country counting cell.
source_country_df = merge_results(dfs, start_from=3)
# Save the merged data frame as CSV, without the index column.
source_country_df.to_csv('source_and_country_counts.csv', index=False)

In [None]:
# Plot the country groups once per source, each chart restricted to that source's rows.
for source in sources:
    is_source = source_country_df['source'] == source
    df = source_country_df[is_source]
    plot_country_groups(
        df,
        title=f'Democratic dimensions from source: {source}',
    )

In [None]:
# Per-country plots: regression fit, residual plot and Breusch-Pagan test for topics 1-4.
for topic in range(1, 5):
    # linear regression model fit
    y_pred = linear_regression(merged_df, 'no_words', topic)
    topic_name = get_topic_name(topic)
    plot_corpus_vs_topic(
        merged_df,
        col=topic,
        y_pred=y_pred,
        topic_name=topic_name,
        title=f'{year} {topic_name} counts vs corpus',
    )

    # plot residuals to check for heteroscedasticity
    plot_fitted_vs_residuals(
        merged_df,
        col=topic,
        topic_name=topic_name,
        title=f'{year} {topic_name} residuals',
    )
    breusch_pagan_test(merged_df, x_name="no_words", y_name=topic)

# NOTE(review): `topic` below is the value left over from the loop above (4), while
# `col` lists topics 1-5 — confirm which topic name this plot is meant to receive.
plot_corpus_mean_deviation(
    merged_df,
    col=[1, 2, 3, 4, 5],
    topic_name=get_topic_name(topic),
    title='Dimensions over all years',
)

# testing for heteroskedasticity
#X = sm.add_constant(merged_df['no_words'])
#y = merged_df[topic]

#model = sm.OLS(y,X).fit()

# Calculate residuals
#residuals = y - model.predict(X)
#merged_df[str(topic)+"_residuals"] = residuals
#plot_corpus_mean_deviation(merged_df, col=str(topic)+"_residuals", y_pred=np.zeros(len(residuals)), topic_name=get_topic_name(topic), title=f'{year} {get_topic_name(topic)} mean difference')

# Get heteroskedasticity-consistent standard errors (HC3)
# HC3 is one of the methods for robust standard errors
#robust_se = model.get_robustcov_results(cov_type='HC3').bse

#print("Coefficients:")
#print(model.params)
#print("\nHeteroskedasticity-consistent standard errors:")
#print(robust_se)
#print("\nP-values:")
#print(model.pvalues)
#print("\nConfidence Intervals at 95%")
#i = 0
#lower_bound = model.params[i] - 1.96 * robust_se[i]
#upper_bound = model.params[i] + 1.96 * robust_se[i]
#print(lower_bound)
#print(upper_bound)
#print("\nt-statistic")
#t_stat = model.tvalues[i]
#p_value = model.pvalues[i]
#print(f't-stat: {t_stat}, p-value: {p_value}')



# mean, std, z_score = calc_vec_mean_std_z_score(merged_df, ["no_words", 1])
# print(f'{mean} {std} {z_score}')

#print(merged_df)
# normalize the counts using min-max
#df = normalize_columns(merged_df, [1,2,3,4])
#plot(merged_df, year=year)
#print(df)


In [None]:
# Count n-grams of order 1 to 4, grouped by year.
# `dfs` collects one results frame per n-gram order; `tks` the matching token counts.
dfs, tks = [], []

for n in range(1, 5):
    results, token_counts = get_ngram_results_by_year(n, dict_file, data_file)
    dfs.append(pd.DataFrame(results))
    tks.append(token_counts)

In [None]:
# plots by year
# Merge the per-n-gram frames in `dfs`, save the merged frame to CSV (no index column),
# then plot it with x="year".
# NOTE(review): `dfs` is reassigned by several cells; this expects the by-year counts,
# so it must run right after the by-year counting cell.
merged_df_years = merge_results(dfs)
merged_df_years.to_csv('dims_vs_years.csv', index=False)
plot_years(merged_df_years, x="year")
#print(merged_df_years)

In [None]:
# Count n-grams of order 1 to 4, grouped by source, then merge and save the result.
# `dfs` collects one results frame per n-gram order; `tks` the matching token counts.
dfs, tks = [], []

for n in range(1, 5):
    results, token_counts = get_ngram_results_by_source(n, dict_file, data_file)
    dfs.append(pd.DataFrame(results))
    tks.append(token_counts)

# Merge the per-order frames and write the merged frame to CSV without the index column.
merged_df_sources = merge_results(dfs)
merged_df_sources.to_csv('corpus_sources.csv', index=False)


In [None]:
# plots by source
# Only the plot_sources call below is live; the rest of this cell is commented-out scratch code.

#topic = 5
#y_pred = linear_regression(merged_df, 'no_words', topic)
#plot_corpus_vs_topic(merged_df, col=topic, y_pred=y_pred, topic_name=get_topic_name(topic))

# mean, std, z_score = calc_vec_mean_std_z_score(merged_df, ["no_words", 1])
# print(f'{mean} {std} {z_score}')

#print(merged_df)
# normalize the counts using min-max
#df = normalize_columns(merged_df, [1,2,3,4])
# Plot the merged by-source frame; x="source" is presumably the column used for the
# x axis — verify against plot_sources.
plot_sources(merged_df_sources, x="source")
#print(df)

In [None]:
# Plot the most frequent terms for every dimension (dimension ids are 1-based).
# NOTE(review): `tks` is assigned by both the by-year and the by-source counting cells;
# whichever ran last determines what is plotted here.
r = get_most_freq_terms_per_topic(tks)
for dimension in range(1, get_no_topics() + 1):
    plot_terms_in_dimension(r, dimension)

In [None]:
# calc means and std for columns 1-5 of merged_df
# NOTE(review): `merged_df` is not assigned in this part of the notebook — it must
# already exist from an earlier cell.
df = calc_mean_std(merged_df, [1,2,3,4,5])
# Topic id passed to both plots below.
topic = 5
plot_z_score(df, topic)
plot_topic(df, topic)



In [None]:
# Bar chart of corpus size ('no_words') per country, smallest corpus first.
# NOTE(review): the "2022" argument is hard-coded here — confirm it should not follow
# the `year` variable used by other cells.
df = get_corpus_sizes(data_file, "2022")
df_sorted = df.sort_values('no_words')

fig = go.Figure()
fig.add_trace(go.Bar(x=df_sorted['country'], y=df_sorted['no_words']))

# Chart title and axis labels
fig.update_layout(
    title='Corpus Size',
    xaxis_title='Country',
    yaxis_title='Size',
)

# Display the chart
fig.show()

In [None]:
# Standard-normal quantile at (1 + confidence_level) / 2, i.e. the two-sided critical
# z-value for that confidence level (assuming `stats` is still scipy.stats here).
# NOTE(review): `confidence_level` is not assigned in this cell — confirm it exists as a
# global, otherwise this raises NameError on a fresh kernel.
stats.norm.ppf((1 + confidence_level) / 2)