In [None]:
import pandas as pd
import os
import csv
from functools import reduce
import json
import configparser
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Output directory of the topic-modelling run analysed by this notebook
root_folder = "../../output_data/topic_modelling/run_20230601_113109"
# Sub-directories of the run that hold the report CSVs and the topic CSVs
report_folder, topic_folder = (
    os.path.join(root_folder, sub_dir) for sub_dir in ("reports", "topics")
)

In [None]:
def get_country_year(filename):
    """Extract the country name and the year from an underscore-separated file name.

    The file extension is dropped, the remaining stem is split on "_", and the
    first token (other than the leading one) made of exactly four digits is
    taken as the year; every token before it, re-joined with "_", is the
    country. This handles country names of any number of words, e.g.
    "south_africa_2020.csv" or "united_arab_emirates_2020_report.csv".

    If no four-digit token is found, the function falls back to the purely
    positional rule: with more than three tokens the first two form the
    country and the third is the year, otherwise the first token is the
    country and the second is the year.

    Args:
        filename: File name (not a full path), e.g. "france_2020_report.csv".

    Returns:
        A ``(country, year)`` tuple of strings.

    Raises:
        IndexError: If the name has no four-digit year token and contains no
            underscore at all.
    """
    stem = os.path.splitext(filename)[0]
    tokens = stem.split("_")
    for position, token in enumerate(tokens):
        # position > 0 guarantees a non-empty country part before the year
        if position > 0 and len(token) == 4 and token.isdigit():
            return ("_".join(tokens[:position]), token)

    # No recognisable year token: keep the original positional behaviour
    parts = filename.split("_")
    if len(parts) > 3:
        return (parts[0] + "_" + parts[1], parts[2])
    return (parts[0], parts[1])
        

In [None]:
# Load every report CSV into one long dataframe, tagging each row with the
# country and year parsed from its file name.
dataframes = []
countries = []  # distinct countries, in order of first appearance
years = []      # distinct years, in order of first appearance

# The files are read with header=None, so their columns arrive labelled
# 0..n-1; these are the names given to the first five of them.
report_column_names = {
    0: 'attribute',
    1: 'topic_no',
    2: 'word_from_dict',
    3: 'similarity',
    4: 'normalized_count',
}

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and the order of `countries` / `years`) reproducible across runs.
for filename in sorted(os.listdir(report_folder)):
    if not filename.endswith(".csv"):
        continue

    country, year = get_country_year(filename)
    if country not in countries:
        countries.append(country)
    if year not in years:
        years.append(year)

    file_path = os.path.join(report_folder, filename)
    # Label the columns through the public rename API. Writing into
    # df.columns.values mutates the Index's backing array behind pandas' back,
    # which can leave label lookups stale and is not a supported operation.
    df = pd.read_csv(file_path, header=None).rename(columns=report_column_names)
    df["country"] = country
    df["year"] = year

    dataframes.append(df)

# Raises ValueError if no report CSV was found in report_folder
combined_df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Gather the per-file topic tables, tagging each with the country and year
# parsed from its file name, then stack them into a single dataframe.
topics_df = []
csv_names = [name for name in os.listdir(topic_folder) if name.endswith(".csv")]
for filename in csv_names:
    country, year = get_country_year(filename)
    # Unlike the report files, these CSVs are read with their own header row
    topic_table = pd.read_csv(os.path.join(topic_folder, filename))
    topic_table["country"] = country
    topic_table["year"] = year
    topics_df.append(topic_table)

combined_topics_df = pd.concat(topics_df, ignore_index=True)
print(combined_topics_df.head())

In [None]:
# Corpus size per (country, year): the sum of the topic 'Count' column over
# the rows for that pair. Pairs without any topic rows get a sum of 0.
tdf = combined_topics_df
corpus_records = [
    (
        country,
        year,
        tdf[(tdf['year'] == year) & (tdf['country'] == country)]['Count'].sum(),
    )
    for country in countries
    for year in years
]

corpus_df = pd.DataFrame(corpus_records, columns=['country', 'year', 'corpus_size'])
print(corpus_df.head())

In [None]:
# Select one attribute in one year across all countries, ordered by coverage.
attribute = 'corruption'
year = '2020'  # years are kept as strings, exactly as parsed from the file names
selection = (combined_df['year'] == year) & (combined_df['attribute'] == attribute)
filtered_df = combined_df[selection]
# Cast to float BEFORE sorting: if the column was read as strings, sorting
# first would order it lexicographically ("10.0" < "9.0"). Using assign on the
# selection also avoids writing into a slice of combined_df.
a_sorted_df = (
    filtered_df
    .assign(normalized_count=lambda frame: frame['normalized_count'].astype(float))
    .sort_values('normalized_count')
    .reset_index(drop=True)
)

In [None]:
# Bar chart of the selected attribute's normalized count for each country
axis_titles = {
    'yaxis_title': f'coverage of {attribute}',
    'xaxis_title': f'country {year}',
}
fig = px.bar(a_sorted_df, x='country', y='normalized_count')
fig.update_layout(**axis_titles)
fig.show()

In [None]:
# Corpus sizes for one year, normalized by the largest corpus, combined with
# the attribute coverage in a_sorted_df into two compound metrics per row.
# NOTE: `year` should match the year used to build a_sorted_df.
year = '2020'
has_corpus = (corpus_df['year'] == year) & (corpus_df['corpus_size'] > 0)
filtered_df = corpus_df[has_corpus]
max_corpus = filtered_df['corpus_size'].max()
# assign() returns a new frame, so nothing is written into a slice of corpus_df
sorted_df = (
    filtered_df
    .sort_values('corpus_size')
    .assign(normalized_corpus=lambda frame: (frame['corpus_size'] / max_corpus).astype(float))
)

# Inner join on country: rows of a_sorted_df whose country has no positive-size
# corpus for this year are dropped instead of raising IndexError on an empty
# lookup. The row order of a_sorted_df is preserved.
new_df = a_sorted_df[['country', 'normalized_count']].merge(
    sorted_df[['country', 'normalized_corpus']],
    on='country',
    how='inner',
)
# normalized_corpus is > 0 here (corpus_size > 0 filter), so the division is safe
new_df['compound_metric1'] = new_df['normalized_count'] / new_df['normalized_corpus']
new_df['compound_metric2'] = (new_df['normalized_count'] * new_df['normalized_corpus']) ** 2
new_df = new_df[
    ['country', 'compound_metric1', 'compound_metric2', 'normalized_count', 'normalized_corpus']
]

new_sorted_df = new_df.sort_values('compound_metric1')
print(new_sorted_df)

In [None]:
# Bar chart of compound_metric1 (normalized count divided by normalized corpus
# size) for each country. The axis title uses the selected `attribute` rather
# than a hard-coded, misspelled label.
fig = px.bar(new_sorted_df, x='country', y='compound_metric1')
fig.update_layout(
    yaxis_title=f'coverage of {attribute}',
    xaxis_title=f'country {year}',
)
fig.show()

In [None]:
# Bar chart of the raw corpus size for each country in the selected year
fig = px.bar(sorted_df, x='country', y='corpus_size')
fig.update_layout(
    yaxis_title='corpus size',
    xaxis_title=f'country {year}',
)
fig.show()