In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

In [6]:
# Annotator labels. The agreement and label-combination cells further down
# read `annotation_data`, so it has to be loaded here for a fresh-kernel
# "Restart & Run All" to get past them.
# NOTE(review): assumes the CSV sits next to annotation_together.csv - confirm.
annotation_data = pd.read_csv('dataraw_annotation/annotation_data.csv')

# Meeting, agenda and speech tables stored as parquet files under ./data.
dmeeting = pd.read_parquet('./data/data_meeting.parquet')
dagenda = pd.read_parquet('./data/data_agenda.parquet')
data_speech1 = pd.read_parquet('./data/data_speech1.parquet')
data_speech2 = pd.read_parquet('./data/data_speech2.parquet')
data_speech3 = pd.read_parquet('./data/data_speech3.parquet')

### Sampling Agenda items to annotate

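We sample agenda items for annotation with a stratified 10-fold split, stratifying on the combination of year and agenda item type (the `group` column), and print a duplicate rate for each fold.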

In [None]:
def extract_year(text):
    """Return the ``year`` attribute of ``text``.

    Despite the parameter name, the argument has to be an object that exposes
    ``.year`` (for example a date or a timestamp); a plain string has no such
    attribute and raises ``AttributeError``.
    """
    year = text.year
    return year

# Year of each agenda item, read from its `date` value.
agenda_years = dagenda["date"].apply(extract_year)
dagenda["year"] = agenda_years

# Stratification key of the form "<year>_<type>".
year_text = dagenda["year"].astype(str)
type_text = dagenda["type"].astype(str)
dagenda["group"] = year_text + "_" + type_text

# Item identifier of the form "<meeting_id>_<agenda_item_id>".
meeting_text = dagenda["meeting_id"].astype(str)
item_text = dagenda["agenda_item_id"].astype(str)
dagenda["unique_id"] = meeting_text + "_" + item_text

In [None]:
# StratifiedKFold builds its folds from the labels only, so the feature
# argument is just a placeholder with one entry per agenda item.
X = [0] * len(dagenda)

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for i, (train_idx, val_idx) in enumerate(stratified_kfold.split(X, dagenda["group"].to_list())):
    val_index = list(val_idx)
    # split() yields positional row numbers, so the fold must be selected by
    # position (.iloc). Label-based .loc only gives the same rows when the
    # index happens to be exactly 0..n-1.
    temp_df = dagenda.iloc[val_index]

    # NOTE(review): the fold is merged against the full table it was drawn
    # from, so every fold row matches at least itself and the printed rate
    # cannot fall below 100%. If the goal is overlap with an earlier sample,
    # merge against that sample instead - confirm the intent.
    duplicates = pd.merge(temp_df, dagenda, on='unique_id', how='inner')
    dup_rate = len(duplicates) / len(temp_df) * 100
    print(f"Duplicate Rate for Fold{i}: {dup_rate:.2f}%")

In [None]:
# Pairwise inter-annotator agreement (Cohen's kappa).
#
# The labels are left as the strings 'C' / 'NC' / '?' on purpose. Recoding
# them in place to 0 / 1 / 2 would make every later comparison against '?'
# and 'C' silently evaluate to False (empty '?' subset, all combined labels
# 'NC'). Unweighted kappa does not depend on how the categories are encoded,
# so the scores are the same as with integer codes.
annotator_pairs = [
    ('Annotator1', 'Eisuke_label', 'Annotator2', 'Anders_label'),
    ('Annotator1', 'Eisuke_label', 'Annotator3', 'Andreas_label'),
    ('Annotator2', 'Anders_label', 'Annotator3', 'Andreas_label'),
]

for first_name, first_column, second_name, second_column in annotator_pairs:
    kappa_score = cohen_kappa_score(annotation_data[first_column], annotation_data[second_column])
    print(f'Kappa Score between {first_name} and {second_name}: {kappa_score}')

### Annotating '?'

Items that at least one annotator labelled '?' are annotated manually by all annotators together.

In [None]:
#annotation_together = pd.read_csv('dataraw_annotation/annotation_together.csv')

In [None]:
# Keep the rows where at least one of the three annotators answered '?'.
annotator_columns = ['Eisuke_label', 'Andreas_label', 'Anders_label']
any_question_mark = annotation_data[annotator_columns].eq('?').any(axis=1)
annotation_q = annotation_data[any_question_mark]

In [None]:
# Joint annotations written by an earlier run of this notebook; a later cell
# merges their 'title', 'group' and 'label' columns into the current '?' rows.
annotation_q_old = pd.read_csv(filepath_or_buffer='dataraw_annotation/annotation_together.csv')

In [None]:
# Attach the stored label to each '?' row, matching on title and group.
# A left join keeps every current '?' row, including ones without a stored label.
previous_labels = annotation_q_old[['title', 'group', 'label']]
annotation_q = annotation_q.merge(previous_labels, how='left', on=['title', 'group'])

In [None]:
# Write the merged '?' rows back to the file they were read from (no index column).
annotation_q.to_csv(path_or_buf='dataraw_annotation/annotation_together.csv', index=False)

### Combined labels

In [None]:
# Keep only the rows where none of the three annotators answered '?'.
annotator_columns = ['Eisuke_label', 'Andreas_label', 'Anders_label']
no_question_mark = annotation_data[annotator_columns].ne('?').all(axis=1)
annotation_data = annotation_data[no_question_mark]

In [None]:
def generate_combined_label(row):
    """Combine the three annotator labels of one row by majority vote.

    Args:
        row: Mapping-like object indexable by 'Eisuke_label',
            'Andreas_label' and 'Anders_label'.

    Returns:
        'C' when at least two of the three labels equal 'C', otherwise 'NC'.
    """
    annotator_columns = ('Eisuke_label', 'Andreas_label', 'Anders_label')
    votes_for_c = sum(1 for column in annotator_columns if row[column] == 'C')
    return 'C' if votes_for_c >= 2 else 'NC'

In [None]:
# Majority-vote label for every remaining row.
# `annotation_data` is a boolean-filtered frame at this point, so assigning a
# column in place would raise SettingWithCopyWarning; `.assign` builds a new
# frame instead. The function is passed directly, no lambda wrapper needed.
annotation_data = annotation_data.assign(
    label=annotation_data.apply(generate_combined_label, axis=1)
)

In [None]:
# Stack the majority-vote rows and the '?' rows into a single frame.
# Row indices are kept as they are in each input frame.
frames_to_stack = [annotation_data, annotation_q]
annotation_data = pd.concat(frames_to_stack)

In [None]:
# Label distribution of the combined data.
# dropna=False keeps missing labels as their own row: the '?' rows get their
# label through a left merge, so a row without a stored label ends up as NaN
# and would otherwise vanish from the counts unnoticed.
annotation_data['label'].value_counts(dropna=False)

In [None]:
#annotation_data.to_parquet('./data/data_annotation.parquet')