In [None]:
import pandas as pd
import random
import seaborn as sns
import utils
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [None]:
# Load the data set of all countries, years and sources.
data_file = '../data/all_countries_0.0.6.csv'
# The helper returns four values, unpacked in this order.
# NOTE(review): presumably it reads the CSV at data_file -- confirm in utils.
countries, years, all_countries_data, sources = utils.get_countries_data(data_file)
# Disabled alternative that reads the CSV directly with pandas (kept for reference):
#all_countries_data = pd.read_csv(data_file, dtype={'year': str}, comment='#')
# Cast the sentence column to string so the later word-splitting step can call
# str.split() on every row.
all_countries_data['sentence'] = all_countries_data['sentence'].astype(str)

In [None]:
# Distinct country labels found in the data; this rebinds the `countries`
# value that was unpacked from utils.get_countries_data.
countries = set(all_countries_data['country'])
print(countries)


In [None]:
# Show the sources value returned by utils.get_countries_data.
print(sources)

In [None]:
# Show the years value returned by utils.get_countries_data.
print(years)

In [None]:
# Report how many rows (sentences) the dataset holds.
print(f"number of sentences: {len(all_countries_data)}.")
# Spot-check one row picked uniformly at random (not necessarily the first one).
k = random.randrange(len(all_countries_data))
print(all_countries_data.iloc[k])

In [None]:
# Sentence length = number of whitespace-separated tokens in each sentence.
# The column is stored on the frame because later cells filter and sum on it.
all_countries_data["sentence_len"] = all_countries_data["sentence"].apply(lambda x: len(x.split()))

# Histogram of sentence lengths. sns.displot is a figure-level function that
# creates its own figure, so no plt.figure() call is made beforehand (doing so
# only produces an additional, empty figure).
sns.displot(all_countries_data["sentence_len"], kde=False)
plt.show()

In [None]:
# Sanity check: the corpus must not contain sentences shorter than 3 words.
df = pd.DataFrame(all_countries_data)
is_short = df['sentence_len'] < 3
short_s = df[is_short]
assert len(short_s['sentence']) == 0


In [None]:
# Long sentences: count rows with more than 200 words and print one at random.

long_s = df[df['sentence_len'] > 200]
l = len(long_s)
print(l)
if l > 0:
    # Pick a single random row once and print its fields, one per line.
    k = random.randrange(l)
    long_row = long_s.iloc[k]
    for field in ('sentence', 'source', 'country', 'year'):
        print(long_row[field])
else:
    # Drawing a random index from an empty range would raise ValueError.
    print("no sentences longer than 200 words")


In [None]:
# Corpus size per (source, year): the sum of sentence_len over matching rows.

data = { 'source': [], 'year': [], 'corpus_size': [] }
for source in sources:
    # The source mask does not depend on the year, so build it once per source.
    is_source = df['source'] == source
    for year in years:
        corpus_size = df.loc[is_source & (df['year'] == year), 'sentence_len'].sum()
        if not corpus_size > 0:
            # Nothing recorded for this (source, year) pair; leave it out.
            continue
        data['source'].append(source)
        data['year'].append(year)
        data['corpus_size'].append(corpus_size)

s_df = pd.DataFrame(data)

In [None]:
# Bar chart of corpus size over years, one trace per source.
fig = go.Figure()
for source in sources:
    # Sort each source's rows by year so its bars are added in chronological order.
    source_df = s_df[s_df['source'] == source].sort_values(by='year', ascending=True)
    fig.add_trace(go.Bar(x=source_df['year'], y=source_df['corpus_size'], name=source))

# Axis ordering and layout do not depend on any single trace, so they are
# applied once after the loop (this also covers the case of no sources at all).
fig.update_xaxes(categoryorder='array', categoryarray=np.sort(np.array(list(years))))
fig.update_layout(title="Corpus size by source over years", xaxis_title='year', yaxis_title='corpus size')

# Display the chart
fig.show()

In [None]:
# Corpus size per (source, country): the sum of sentence_len over matching rows.

data = { 'source': [], 'country': [], 'corpus_size': [] }
# `countries` is a set, whose iteration order can change between kernel runs.
# Iterate in sorted order so the table rows (and the bar order of any chart
# built from them) are reproducible. key=str keeps the sort from failing if a
# non-string label (e.g. a missing value) is present.
ordered_countries = sorted(countries, key=str)
for source in sources:
    # The source mask does not depend on the country, so build it once per source.
    is_source = df['source'] == source
    for country in ordered_countries:
        corpus_size = df.loc[is_source & (df['country'] == country), 'sentence_len'].sum()
        if corpus_size > 0:
            data['source'].append(source)
            data['country'].append(country)
            data['corpus_size'].append(corpus_size)

c_df = pd.DataFrame(data)

In [None]:
# Bar chart of corpus size by country, one trace per source.
fig = go.Figure()
for source in sources:
    source_df = c_df[c_df['source'] == source]
    fig.add_trace(go.Bar(x=source_df['country'], y=source_df['corpus_size'], name=source))

# The layout does not depend on any single trace, so set it once after the loop.
fig.update_layout(title="Corpus size by source by country", xaxis_title='country', yaxis_title='corpus size')

# Save a static copy of the chart. width/height are pixel sizes and are passed
# as integers, the type documented for write_image.
fig.write_image("fig1.svg", width=1221, height=360)

# Display the chart
fig.show()