# Descriptive Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import calendar
from scipy.stats import spearmanr
import seaborn as sns

In [None]:
# Load the single-file tables: meetings and parliament members.
dmeeting = pd.read_parquet('./data/data_meeting.parquet')
parMem = pd.read_parquet('./data/parliament_members.parquet')

# The agenda table is stored as three separate parquet files.
data_agenda1 = pd.read_parquet('./data/data_agenda1.parquet')
data_agenda2 = pd.read_parquet('./data/data_agenda2.parquet')
data_agenda3 = pd.read_parquet('./data/data_agenda3.parquet')

# The speech table is stored as three separate parquet files as well.
data_speech1 = pd.read_parquet('./data/data_speech1.parquet')
data_speech2 = pd.read_parquet('./data/data_speech2.parquet')
data_speech3 = pd.read_parquet('./data/data_speech3.parquet')

# Stack the parts of each table row-wise into one frame. The row labels of
# the individual parts are kept, so labels may repeat in the combined frames.
agenda_parts = [data_agenda1, data_agenda2, data_agenda3]
speech_parts = [data_speech1, data_speech2, data_speech3]
dagenda = pd.concat(agenda_parts, axis=0)
dspeech = pd.concat(speech_parts, axis=0)

In [None]:
# Print the (rows, columns) shape of the combined speech table.
print(dspeech.shape)

## Meeting Data

In [None]:
# Show the meeting table (a bare expression at the end of a notebook cell is
# rendered as the cell's output).
dmeeting

In [None]:
# Report the earliest and latest value found in the meetings' `date` column.
first_meeting_date = dmeeting['date'].min()
last_meeting_date = dmeeting['date'].max()
print(f"First Meeting date: {first_meeting_date}")
print(f"Last Meeting date: {last_meeting_date}")

In [None]:
# Count meetings per calendar year, ordered chronologically.
meeting_years = dmeeting['date'].dt.to_period('Y')
year_freq = meeting_years.value_counts().sort_index()

# Plot the yearly totals as a line chart with one marker per year.
year_fig, year_ax = plt.subplots(figsize=(10, 6))
year_freq.plot(kind='line', marker='o', ax=year_ax)
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Number of yearly meetings');

In [None]:
# Count meetings per calendar month, pooling all years together.
meeting_months = dmeeting['date'].dt.month
month_freq = dmeeting.groupby(meeting_months).size().sort_index()

# Plot the monthly totals as a line chart with one marker per month.
month_fig, month_ax = plt.subplots(figsize=(10, 6))
month_freq.plot(kind='line', marker='o', ax=month_ax)
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Number of monthly meetings (aggregated by all years)')

# Label each tick with the abbreviated month name instead of the month number.
month_names = [calendar.month_abbr[number] for number in month_freq.index]
month_ax.set_xticks(month_freq.index)
month_ax.set_xticklabels(month_names);

In [None]:
# Number of meetings for each distinct value of the `day` column, most
# frequent first.
dmeeting['day'].value_counts()

In [None]:
# The ten most frequent meeting start times (value_counts sorts by count,
# descending, so the first ten rows are the top ten).
dmeeting['time_start'].value_counts().head(10)

## Agenda Data

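Add derived columns to the agenda data: the concatenated speech text, the start and end time, the duration in seconds, and the number of words of each agenda item.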

In [None]:
# Derive per-agenda-item features from the speech table. An agenda item is
# identified by the (meeting_id, agenda_item_id) pair, which is both the
# grouping key for the speech rows and the join key into dagenda.
agenda_keys = ["meeting_id", "agenda_item_id"]
data_speech_group = dspeech.groupby(agenda_keys)

# Add speech items text: every speech text of an agenda item, joined with a
# single space in the row order of dspeech.
data_speech_group_text = (
    data_speech_group["speech_item_text"].apply(" ".join).reset_index()
)
dagenda = dagenda.merge(data_speech_group_text, on=agenda_keys, how="left")

# Add time_start and time_end from speech_data: the first start time and the
# last end time found within each agenda item.
# NOTE(review): this presumes the speech rows of an agenda item are stored in
# chronological order -- confirm against the data.
data_speech_group_time_start = data_speech_group["time_start"].first().reset_index()
data_speech_group_time_end = data_speech_group["time_end"].last().reset_index()
dagenda = dagenda.merge(data_speech_group_time_start, on=agenda_keys, how="left")
dagenda = dagenda.merge(data_speech_group_time_end, on=agenda_keys, how="left")

# Add duration column (seconds between time_start and time_end).
# The times carry no date, so an item that runs past midnight has an end
# time that is earlier than its start time and the raw difference is
# negative; adding one full day (86400 s) gives the real elapsed time.
# Comparing the full times (not only the hour) and wrapping by exactly one
# day avoids losing a second at the 23:59:59 -> 00:00:00 boundary.
time_start_f = pd.to_datetime(dagenda["time_start"], format="%H:%M:%S")
time_end_f = pd.to_datetime(dagenda["time_end"], format="%H:%M:%S")
elapsed_seconds = (time_end_f - time_start_f).dt.total_seconds()
seconds_per_day = 24 * 60 * 60
dagenda["duration"] = np.where(
    elapsed_seconds < 0,
    elapsed_seconds + seconds_per_day,
    # Same-day items, and items without times (NaN stays NaN).
    elapsed_seconds,
)

# Add number of words: the count of space-separated tokens in the text.
# Agenda items without any speech rows have no text after the left merge;
# `.str.len()` leaves those as NaN instead of raising a TypeError.
dagenda["number_of_words"] = dagenda["speech_item_text"].str.split(" ").str.len()

In [None]:
# Number of agenda items for each distinct value of the `type` column.
dagenda['type'].value_counts()

In [None]:
# Average word count of an agenda item, computed separately per `type`
# and shown as a two-column table.
mean_words_by_type = dagenda.groupby('type')['number_of_words'].mean()
mean_words_by_type.reset_index(name='Mean number of words per agenda item')

In [None]:
# Count the speech rows belonging to each (meeting_id, agenda_item_id) pair.
number_of_speech_items_agenda = (
    dspeech.groupby(['meeting_id', 'agenda_item_id'])
    .size()
    .reset_index(name='count')
)

# Attach the counts to the agenda items (joined on all columns the two
# frames share), then average the count per agenda `type`.
agenda_with_counts = dagenda.merge(number_of_speech_items_agenda)
mean_items_by_type = agenda_with_counts.groupby('type')['count'].mean()
mean_items_by_type.reset_index(name='Mean number of speech items per agenda item')

In [None]:
# Histogram (100 bins) of the agenda-item durations, followed by their
# summary statistics as the cell output.
agenda_durations = dagenda['duration']
plt.hist(agenda_durations, bins=100)
plt.xlabel('Duration in seconds')
plt.ylabel('Frequency')
agenda_durations.describe()

In [None]:
# Histogram (100 bins) of the agenda-item word counts, followed by their
# summary statistics as the cell output.
agenda_word_counts = dagenda['number_of_words']
plt.hist(agenda_word_counts, bins=100)
plt.xlabel('Number of words')
plt.ylabel('Frequency')
agenda_word_counts.describe()

In [None]:
# Scatter plot of word count against duration for every agenda item, coloured
# by the `type` column, followed by Spearman's rank correlation of the two.
# NOTE(review): `c=` expects numbers or valid colour values; if `type` holds
# text labels it has to be encoded numerically first -- confirm its dtype.
plt.scatter(x=dagenda['number_of_words'], y=dagenda['duration'], c=dagenda['type'], s=1, cmap='viridis');
plt.xlabel('Number of words');
plt.ylabel('Duration (s)');
correlation_coefficient, p_value = spearmanr(dagenda['number_of_words'], dagenda['duration'])
print(f"Spearman correlation coefficient: {correlation_coefficient}")
print(f"P-value: {p_value}")

In [None]:
from matplotlib.dates import DateFormatter
import datetime

# Parse the agenda items' start and end clock times (HH:MM:SS strings) into
# datetime objects so they can be positioned on a time axis.
clock_format = '%H:%M:%S'
start_times = [
    datetime.datetime.strptime(raw, clock_format) for raw in dagenda['time_start']
]
end_times = [
    datetime.datetime.strptime(raw, clock_format) for raw in dagenda['time_end']
]

fig, ax = plt.subplots()

# Draw one horizontal segment per agenda item, from its start to its end
# time, at a height equal to the item's position in the table.
for row, (begin, finish) in enumerate(zip(start_times, end_times)):
    ax.plot([begin, finish], [row, row], linewidth=1, solid_capstyle="butt", alpha=0.7)

# Render the x axis as hour:minute clock times and label the figure.
ax.xaxis_date()
ax.xaxis.set_major_formatter(DateFormatter('%H:%M'))
ax.set_xlabel('Time')
ax.set_ylabel('Agenda Item')
ax.set_title('Timeline of Agenda Time Intervals')
ax.grid(axis='x')

plt.show()


## Speech Data

In [None]:
# Show the combined speech table as the cell output.
dspeech

In [None]:
# Histogram (100 bins) of the speech-item durations, followed by their
# summary statistics as the cell output.
speech_durations = dspeech['duration']
plt.hist(speech_durations, bins=100)
plt.xlabel('Duration in seconds')
plt.ylabel('Frequency')
speech_durations.describe()

In [None]:
# Histogram (100 bins) of the speech-item word counts, followed by their
# summary statistics as the cell output.
speech_word_counts = dspeech['number_of_words']
plt.hist(speech_word_counts, bins=100)
plt.xlabel('Number of words')
plt.ylabel('Frequency')
speech_word_counts.describe()

In [None]:
# Scatter plot of word count against duration for every speech item,
# followed by Spearman's rank correlation between the two columns.
speech_word_counts = dspeech['number_of_words']
speech_durations = dspeech['duration']

plt.scatter(x=speech_word_counts, y=speech_durations, s=1)
plt.xlabel('Number of words')
plt.ylabel('Duration (s)');

correlation_coefficient, p_value = spearmanr(speech_word_counts, speech_durations)
print(f"Spearman correlation coefficient: {correlation_coefficient}")
print(f"P-value: {p_value}")

In [None]:
# The 25 speakers with the most speech items (value_counts sorts by count,
# descending, so the first 25 rows are the top 25).
dspeech['speaker_name'].value_counts().head(25)

In [None]:
# Number of speech items for each distinct value of `speaker_party`.
dspeech['speaker_party'].value_counts()

In [None]:
# Number of speech items for each distinct value of `speaker_role`.
dspeech['speaker_role'].value_counts()

In [None]:
# Attach each speech item's meeting date (joined on the columns the two
# frames share) and reduce the date to its calendar year.
dspeech_year = pd.merge(dspeech, dmeeting[['meeting_id', 'date']])
dspeech_year['year'] = dspeech_year['date'].dt.to_period('Y')

# Total speech items per year: the denominator of the yearly share.
number_of_speech_years = dspeech_year.groupby('year').size().reset_index(name='number_of_speech_year')

# Speech items per (party, year), expressed as a percentage of that year's
# total and rounded to one decimal.
party_year = dspeech_year.groupby(['speaker_party', 'year']).size().reset_index(name='Counts')
party_year = pd.merge(party_year, number_of_speech_years, on='year')
party_year['Percent'] = round((party_year['Counts'] / party_year['number_of_speech_year'])*100, 1)

# One row per year, one column per party; a party with no speech items in a
# year gets 0 instead of a gap in its line.
party_year_pivot = party_year.pivot(index='year', columns='speaker_party', values='Percent').fillna(0)

ax = party_year_pivot.plot(kind='line', marker='o', figsize=(12, 6))

plt.xlabel('Year')
plt.ylabel('Percent of speech items')
# A single legend call, anchored outside the plot area so it does not cover
# the lines (a second, default-placed legend would only be replaced by it).
ax.legend(title='Party', bbox_to_anchor=(1.0, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Attach each speech item's meeting date (joined on the columns the two
# frames share) and reduce the date to its calendar year.
dspeech_year = pd.merge(dspeech, dmeeting[['meeting_id', 'date']])
dspeech_year['year'] = dspeech_year['date'].dt.to_period('Y')

# Keep only speech items whose `speaker_party` is one of the listed party
# codes; every other value (including missing ones) is dropped, so the
# percentages below are shares among these parties only.
selected_parties = ['S', 'DF', 'ALT', 'DD', 'EL', 'FG', 'KD', 'KF', 'LA', 'M', 'NB', 'RV', 'SF', 'V']
dspeech_year = dspeech_year[dspeech_year['speaker_party'].isin(selected_parties)]

# Total (remaining) speech items per year: the denominator of the share.
number_of_speech_years = dspeech_year.groupby('year').size().reset_index(name='number_of_speech_year')

# Speech items per (party, year), expressed as a percentage of that year's
# total and rounded to one decimal.
party_year = dspeech_year.groupby(['speaker_party', 'year']).size().reset_index(name='Counts')
party_year = pd.merge(party_year, number_of_speech_years, on='year')
party_year['Percent'] = round((party_year['Counts'] / party_year['number_of_speech_year'])*100, 1)

# One row per year, one column per party; a party with no speech items in a
# year gets 0 instead of a gap in its line.
party_year_pivot = party_year.pivot(index='year', columns='speaker_party', values='Percent').fillna(0)

ax = party_year_pivot.plot(kind='line', marker='o', figsize=(12, 6))

plt.xlabel('Year')
plt.ylabel('Percent of speech items')
# A single legend call, anchored outside the plot area so it does not cover
# the lines (a second, default-placed legend would only be replaced by it).
ax.legend(title='Party', bbox_to_anchor=(1.0, 1), loc='upper left')
plt.tight_layout()
plt.show()

## Parliament Members

In [None]:
# Show the parliament-members table as the cell output.
parMem