In [7]:
# This is a Jupyter Notebook intended to help people interested in the corpus visualise relevant information

In [132]:
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.offline as py

In [197]:
# Read the corpus table from a CSV file; the path is relative to the
# notebook's working directory.
roman_df = pd.read_csv(filepath_or_buffer='../data/roman20.csv')

In [None]:
# Print a concise summary of the loaded frame (column names, non-null counts,
# dtypes and memory usage) as a quick sanity check of the import.
roman_df.info()

# Distributions per year
The following visualisations help to get an overall picture of the corpus. Each visualisation can be saved directly from the plot's toolbar by clicking `Download plot as png`.

### Visualisation of publications per year (first publication)

In [236]:
# Count the works per year of first publication.
# `rename_axis` + `reset_index(name=...)` set the output column names to
# YEAR / COUNT explicitly instead of relying on the names `value_counts()`
# assigns by itself: those changed in pandas 2.0 (the counts are now called
# 'count' and the index keeps the source column name), which would leave the
# frame without the YEAR and COUNT columns that the plot below requires.
count_publ_first_per_year = (
    roman_df['publ-first']
    .value_counts()
    .rename_axis('YEAR')
    .reset_index(name='COUNT')
    .sort_values('YEAR', ignore_index=True)  # chronological row order
)

# One bar per year; bar colour and the on-bar text both encode the count.
fig = px.bar(
    count_publ_first_per_year,
    x='YEAR',
    y='COUNT',
    hover_data=['COUNT'],
    color='COUNT',
    text_auto='.2s',  # d3-format: two significant digits with SI prefix
    labels={'YEAR': 'year of first publication'},
    title="Number of Publications: Distribution Over the Years 1933–2009",
    height=400,
)
fig.show()

### Visualisation of genre distributions per year

In [237]:
# Count the works per (year of first publication, subgenre) combination.
# `reset_index(name='COUNT')` names the counts column explicitly; renaming a
# column called `0` only works on pandas < 2.0, where `value_counts()` returned
# an unnamed Series. From pandas 2.0 on the counts are called 'count', so the
# COUNT column used by the plot below would be missing.
genre_dist_per_year = (
    roman_df
    .value_counts(["publ-first", "subgenre"])
    .reset_index(name='COUNT')
    .rename(columns={'publ-first': 'YEAR'})
    .sort_values('YEAR', ignore_index=True)  # chronological row order
)

# Stacked bars: one segment per subgenre and year, labelled with the subgenre.
fig = px.bar(
    genre_dist_per_year,
    x="YEAR",
    y="COUNT",
    labels={'YEAR': 'year of first publication'},
    color="subgenre",
    text="subgenre",
    title="Genre: Distribution Over the Years 1933–2009",
)
fig.show()

### Visualisation of type distributions per year

In [238]:
# Count the works per (year of first publication, type) combination.
# `reset_index(name='COUNT')` names the counts column explicitly; renaming a
# column called `0` only works on pandas < 2.0, where `value_counts()` returned
# an unnamed Series. From pandas 2.0 on the counts are called 'count', so the
# COUNT column used by the plot below would be missing.
type_dist_per_year = (
    roman_df
    .value_counts(["publ-first", "type"])
    .reset_index(name='COUNT')
    .rename(columns={'publ-first': 'YEAR'})
    .sort_values('YEAR', ignore_index=True)  # chronological row order
)

# Stacked bars: one segment per type and year, labelled with the type.
fig = px.bar(
    type_dist_per_year,
    x="YEAR",
    y="COUNT",
    labels={'YEAR': 'year of first publication'},
    color="type",
    text="type",
    title="Type of Literary Work (Populaire or Blanche?): Distribution Over the Years 1933–2009",
)
fig.show()

### Visualisation of authors' nationalities distributed over the years

In [239]:

# Count the works per (year of first publication, author nationality) combination.
# `reset_index(name='COUNT')` names the counts column explicitly; renaming a
# column called `0` only works on pandas < 2.0, where `value_counts()` returned
# an unnamed Series. From pandas 2.0 on the counts are called 'count', so the
# COUNT column used by the plot below would be missing.
nationality_dist_per_year = (
    roman_df
    .value_counts(["publ-first", "author-nationality"])
    .reset_index(name='COUNT')
    .rename(columns={'publ-first': 'YEAR'})
    .sort_values('YEAR', ignore_index=True)  # chronological row order
)

# Stacked bars: one segment per nationality and year, labelled with the nationality.
fig = px.bar(
    nationality_dist_per_year,
    x="YEAR",
    y="COUNT",
    labels={'YEAR': 'year of first publication'},
    color="author-nationality",
    text="author-nationality",
    title="Authors' Nationality: Distribution Over the Years 1933–2009",
)
fig.show()

### Visualisation of authors' gender distributions over the years

In [240]:
# Count the works per (year of first publication, author gender) combination.
# `reset_index(name='COUNT')` names the counts column explicitly; renaming a
# column called `0` only works on pandas < 2.0, where `value_counts()` returned
# an unnamed Series. From pandas 2.0 on the counts are called 'count', so the
# COUNT column used by the plot below would be missing.
# The year column is renamed to YEAR to match the other per-year cells; the
# axis label shown in the figure is unchanged.
gender_dist_per_year = (
    roman_df
    .value_counts(["publ-first", "author-gender"])
    .reset_index(name='COUNT')
    .rename(columns={'publ-first': 'YEAR'})
    .sort_values('YEAR', ignore_index=True)  # chronological row order
)

# One bar segment per gender and year, labelled with the gender.
fig = px.bar(
    gender_dist_per_year,
    x="YEAR",
    y="COUNT",
    labels={'YEAR': 'year of first publication'},
    color="author-gender",
    text="author-gender",
    title="Authors' Gender: Distribution Over the Years 1933–2009",
)
# Stack the gender segments of a year on top of each other.
fig.update_layout(barmode='stack')
fig.show()

# Overall distribution

In [206]:
# Overall number of works in the corpus, taken as the row count of the table
# (assumes one row per work — TODO confirm against the CSV).
number_of_works_in_corpus = roman_df.shape[0]

In [207]:
# GENDER
# Count the works per author gender, least frequent first.
# `rename_axis` + `reset_index(name=...)` set the column names GENDER / COUNT
# explicitly. Renaming the 'author-gender' column to COUNT only works on
# pandas < 2.0; from 2.0 on `value_counts()` calls the counts 'count', so the
# COUNT column used by the pie chart would be missing.
count_gender = (
    roman_df['author-gender']
    .value_counts(ascending=True)
    .rename_axis('GENDER')
    .reset_index(name='COUNT')
)

fig = px.pie(
    count_gender,
    values='COUNT',
    names='GENDER',
    title='Distribution of Genders in the Corpus',
)
fig.show()

In [126]:
# TYPE
# Count the works per type, least frequent first.
# `rename_axis` + `reset_index(name=...)` set the column names TYPE / COUNT
# explicitly. Renaming the 'type' column to COUNT only works on pandas < 2.0;
# from 2.0 on `value_counts()` calls the counts 'count', so the COUNT column
# used by the pie chart would be missing.
count_type = (
    roman_df['type']
    .value_counts(ascending=True)
    .rename_axis('TYPE')
    .reset_index(name='COUNT')
)

fig = px.pie(
    count_type,
    values='COUNT',
    names='TYPE',
    title='Distribution of Types in the Corpus',
)
fig.show()

In [127]:
# SUBGENRE
# Count the works per subgenre, least frequent first.
# `rename_axis` + `reset_index(name=...)` set the column names SUBGENRE / COUNT
# explicitly. Renaming the 'subgenre' column to COUNT only works on
# pandas < 2.0; from 2.0 on `value_counts()` calls the counts 'count', so the
# COUNT column used by the pie chart would be missing.
count_subgenre = (
    roman_df['subgenre']
    .value_counts(ascending=True)
    .rename_axis('SUBGENRE')
    .reset_index(name='COUNT')
)

fig = px.pie(
    count_subgenre,
    values='COUNT',
    names='SUBGENRE',
    title='Distribution of Subgenres in the Corpus',
)
fig.show()