In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from tqdm import tqdm

from gensim.models import KeyedVectors

In [None]:
# Single-file tables: the meeting table and the parliament-member table.
dmeeting = pd.read_parquet('./data/data_meeting.parquet')
parMem = pd.read_parquet('./data/parliament_members.parquet')

# Agenda data is stored as three parquet parts. Each part stays available under
# its own name and the parts are stacked row-wise into one frame. The
# concatenation keeps every part's original index labels (no ignore_index).
data_agenda1, data_agenda2, data_agenda3 = (
    pd.read_parquet(agenda_part)
    for agenda_part in (
        './data/data_agenda1.parquet',
        './data/data_agenda2.parquet',
        './data/data_agenda3.parquet',
    )
)
dagenda = pd.concat([data_agenda1, data_agenda2, data_agenda3], axis=0)

# Speech data follows the same three-part layout.
data_speech1, data_speech2, data_speech3 = (
    pd.read_parquet(speech_part)
    for speech_part in (
        './data/data_speech1.parquet',
        './data/data_speech2.parquet',
        './data/data_speech3.parquet',
    )
)
dspeech = pd.concat([data_speech1, data_speech2, data_speech3], axis=0)

# Annotation table (CSV); its 'label' column drives the C / NC split further down.
annotation_data = pd.read_csv('data/annotation_data.csv')

In [None]:
# Columns that together identify one agenda item in both the speech table and
# the annotation table.
agenda_key_cols = ['meeting_id', 'agenda_item_id']

# (meeting_id, agenda_item_id) key of every speech, built once. The row-wise
# tuple conversion is the expensive part, so it is not repeated per label.
speech_agenda_keys = dspeech[agenda_key_cols].apply(tuple, axis=1)

# Speeches whose agenda item is annotated 'C'.
agendaC = annotation_data[annotation_data['label'] == 'C']
# .copy() makes speechC an independent frame, so adding the label column below
# is an unambiguous write instead of a chained assignment on a slice of dspeech.
speechC = dspeech[speech_agenda_keys.isin(agendaC[agenda_key_cols].apply(tuple, axis=1))].copy()
speechC['label'] = 'C'

# Speeches whose agenda item is annotated 'NC'; these additionally receive the
# annotation's 'group' column through a left merge (which returns a new frame).
# NOTE(review): speechC gets no 'group' column, so after the two sets are
# concatenated its 'group' values will be missing - confirm this is intended.
agendaNC = annotation_data[annotation_data['label'] == 'NC']
speechNC = dspeech[speech_agenda_keys.isin(agendaNC[agenda_key_cols].apply(tuple, axis=1))]
speechNC = pd.merge(speechNC, agendaNC[['meeting_id', 'agenda_item_id', 'group']], on=['meeting_id', 'agenda_item_id'], how='left')
speechNC['label'] = 'NC'

In [None]:
# Shared spaCy pipeline (the small Danish model); the preprocessing functions
# below use it for tokenisation, stop-word flags and lemmas.
nlp = spacy.load("da_core_news_sm")
# Exact token texts that the preprocessing filters always keep, even though
# they fail the alphabetic check (they contain a digit).
exception_list = ['CO2', 'co2']
def preprocess_text(text):
    """Tokenise ``text`` and return the kept tokens as lower-cased strings.

    The text is run through the module-level ``nlp`` pipeline. A token is kept
    when it is alphabetic and not a stop word, or when its exact text is listed
    in ``exception_list`` (the exception wins regardless of the other flags).

    Returns:
        list[str]: lower-cased surface forms of the kept tokens, in order.
    """
    kept_texts = []
    for tok in nlp(text):
        is_content_word = tok.is_alpha and not tok.is_stop
        if is_content_word or tok.text in exception_list:
            kept_texts.append(tok.text.lower())
    return kept_texts

def preprocess_text_lemma(text):
    """Tokenise ``text`` and return the kept tokens as lower-cased lemmas.

    Uses the same keep-rule as ``preprocess_text``: a token is kept when it is
    alphabetic and not a stop word, or when its exact text is listed in
    ``exception_list``. The lemma, not the surface form, is emitted.

    Returns:
        list[str]: lower-cased lemmas of the kept tokens, in order.
    """
    kept_lemmas = []
    for tok in nlp(text):
        is_content_word = tok.is_alpha and not tok.is_stop
        if is_content_word or tok.text in exception_list:
            kept_lemmas.append(tok.lemma_.lower())
    return kept_lemmas

In [None]:
# Stack both labelled speech sets; the agenda-level label gets an explicit name.
speech_comb = pd.concat([speechC, speechNC]).rename(columns={'label': 'label_agenda'})


def _tokens_and_lemmas(text):
    """Parse ``text`` once and return ``(surface_tokens, lemma_tokens)``.

    Applies the same keep-rule as ``preprocess_text`` and
    ``preprocess_text_lemma`` (alphabetic non-stop-words, plus anything in
    ``exception_list``). The spaCy pipeline runs a single time per speech
    instead of once per output column.
    """
    kept = [
        tok for tok in nlp(text)
        if (not tok.is_stop and tok.is_alpha) or tok.text in exception_list
    ]
    return [tok.text.lower() for tok in kept], [tok.lemma_.lower() for tok in kept]


# One spaCy pass per speech; tqdm shows progress because this is the slow step.
parsed_speeches = [
    _tokens_and_lemmas(speech_text)
    for speech_text in tqdm(speech_comb.speech_item_text, desc='spaCy preprocessing')
]
# Plain lists are assigned positionally, so the (non-unique) index left behind
# by the concatenation plays no role here.
speech_comb['processed_tokens'] = [tokens for tokens, _lemmas in parsed_speeches]
speech_comb['processed_tokens_lemma'] = [lemmas for _tokens, lemmas in parsed_speeches]

In [None]:
# The three identifier columns are collapsed into one string id of the form
# "<meeting_id>_<agenda_item_id>_<speech_item_id>" and then dropped.
id_part_cols = ["meeting_id", "agenda_item_id", "speech_item_id"]

# Built as one chain so the new column is added to a fresh frame; the original
# assigned into a column subset of speech_comb, which triggers pandas'
# SettingWithCopyWarning. Resulting column order is unchanged:
# label_agenda, processed_tokens, processed_tokens_lemma, id.
df = (
    speech_comb[id_part_cols + ["label_agenda", "processed_tokens", "processed_tokens_lemma"]]
    .assign(
        id=lambda frame: frame["meeting_id"].astype(str)
        + "_" + frame["agenda_item_id"].astype(str)
        + "_" + frame["speech_item_id"].astype(str)
    )
    .drop(columns=id_part_cols)
)

In [None]:
# Persist the labelled token table as CSV (without the index), two directories
# above the current working directory.
# NOTE(review): the token columns hold Python lists (see preprocess_text*);
# to_csv writes them as their string representation, so a reader has to parse
# them back into lists - confirm downstream code does.
df.to_csv('../../labeled_tokenized_data.csv', index=False)

### Word-count odds (C vs. NC)


In [None]:
def caluc_proportion(numerator, denominator):
    """Return ``numerator / denominator`` with zero-safe fallbacks.

    Rules, in order:
      * ``numerator == 0``   -> ``0`` (whatever the denominator is)
      * ``denominator == 0`` -> ``numerator`` itself (no division is attempted)
      * otherwise            -> the plain quotient

    NOTE(review): returning the numerator for a zero denominator means the
    result is not a ratio in that case - confirm this fallback is what the
    downstream odds columns are meant to contain.
    """
    if numerator == 0:
        return 0
    if denominator == 0:
        return numerator
    return numerator / denominator

In [None]:
from collections import Counter


def _lemma_counts(label, count_column):
    """Count lemma occurrences over all speeches with ``label_agenda == label``.

    Args:
        label: value of ``df['label_agenda']`` to select ('C' or 'NC').
        count_column: name given to the count column of the returned frame.

    Returns:
        tuple: ``(flat_lemmas, counter, counts_frame)`` where ``flat_lemmas`` is
        the concatenation of every selected speech's lemma list, ``counter`` is
        a ``Counter`` over it, and ``counts_frame`` has the columns ``word`` and
        ``count_column`` (one row per distinct lemma).
    """
    # Filter once per label. Re-filtering df inside the per-speech loop made
    # the collection quadratic in the number of speeches.
    lemma_lists = df.loc[df['label_agenda'] == label, 'processed_tokens_lemma']
    flat_lemmas = [lemma for lemmas in lemma_lists for lemma in lemmas]
    counter = Counter(flat_lemmas)
    counts_frame = (
        pd.DataFrame.from_dict(counter, orient='index', columns=[count_column])
        .reset_index()
        .rename(columns={'index': 'word'})
    )
    return flat_lemmas, counter, counts_frame


# Same names as before so later cells keep working.
unique_words_list_C, unique_words_C_counter, unique_words_C_df = _lemma_counts('C', 'word_count_C')
unique_words_list_NC, unique_words_NC_counter, unique_words_NC_df = _lemma_counts('NC', 'word_count_NC')

In [None]:
# One row per lemma seen under either label.
unique_words = pd.merge(unique_words_C_df, unique_words_NC_df, on='word', how='outer')

# A lemma seen under only one label has no count for the other one: use 0.
count_cols = ['word_count_C', 'word_count_NC']
unique_words[count_cols] = unique_words[count_cols].fillna(0)

# Relative frequency of each lemma within its label's total lemma count.
# After the fill, each column sums to the same total as its per-label table.
unique_words['frequency_C'] = unique_words['word_count_C'] / unique_words['word_count_C'].sum()
unique_words['frequency_NC'] = unique_words['word_count_NC'] / unique_words['word_count_NC'].sum()

# Frequency ratios in both directions, using caluc_proportion's zero rules
# (a lemma absent from the other label keeps its own frequency as the value).
# Iterating the two columns with zip avoids a row-wise DataFrame.apply, which
# is slow and does not return a Series when the frame is empty.
unique_words["odds_C"] = [
    caluc_proportion(freq_c, freq_nc)
    for freq_c, freq_nc in zip(unique_words['frequency_C'], unique_words['frequency_NC'])
]
unique_words["odds_NC"] = [
    caluc_proportion(freq_nc, freq_c)
    for freq_c, freq_nc in zip(unique_words['frequency_C'], unique_words['frequency_NC'])
]