In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from tqdm import tqdm

from gensim.models import KeyedVectors


In [None]:
# Meeting-level table (stored as a single parquet file).
dmeeting = pd.read_parquet('./data/data_meeting.parquet')

# The agenda and speech tables are each stored as three part files.
# The individual parts keep their own names; they are concatenated below.
data_agenda1, data_agenda2, data_agenda3 = (
    pd.read_parquet(part_path)
    for part_path in (
        './data/data_agenda1.parquet',
        './data/data_agenda2.parquet',
        './data/data_agenda3.parquet',
    )
)
data_speech1, data_speech2, data_speech3 = (
    pd.read_parquet(part_path)
    for part_path in (
        './data/data_speech1.parquet',
        './data/data_speech2.parquet',
        './data/data_speech3.parquet',
    )
)
parMem = pd.read_parquet('./data/parliament_members.parquet')

# Stack the parts row-wise. The original row labels of each part are kept,
# so the combined index may contain repeated labels.
dagenda = pd.concat([data_agenda1, data_agenda2, data_agenda3])
dspeech = pd.concat([data_speech1, data_speech2, data_speech3])

# Annotation table, read from CSV.
annotation_data = pd.read_csv('data/annotation_data.csv')

In [None]:
# Give both tables the same composite key: "<meeting_id>_<agenda_item_id>".
for frame in (annotation_data, dspeech):
    frame["id"] = frame["meeting_id"].astype(str) + "_" + frame["agenda_item_id"].astype(str)

# Keep only the speeches whose key does not occur in the annotation table.
# reset_index(drop=False) stores the previous row labels in a new column
# and leaves the frame with a fresh 0..n-1 index.
already_annotated = dspeech["id"].isin(annotation_data["id"])
dspeech = dspeech.loc[~already_annotated].reset_index(drop=False)

In [None]:

# Path (relative to the working directory) of the pretrained word vectors,
# stored in binary word2vec format.
# NOTE(review): "f500" in the file name presumably means 500-dimensional
# vectors, which is what preprocess_text's accumulator assumes -- confirm.
MODEL_FILE = '../../dsl_skipgram_2020_m5_f500_epoch2_w5.model.w2v.bin'
# Load the vectors as a gensim KeyedVectors object (lookup only).
model = KeyedVectors.load_word2vec_format(MODEL_FILE, binary=True)

In [None]:
# Token texts that preprocess_text checks for explicitly during filtering.
exception_list = ['CO2', 'co2']
# spaCy pipeline loaded from the "da_core_news_sm" package; preprocess_text
# calls it to tokenise each text.
nlp = spacy.load("da_core_news_sm")

def preprocess_text(text, nlp_pipeline=None, vectors=None, exceptions=None):
    """Tokenise ``text`` and compute the mean word vector of the kept tokens.

    A token is kept when its text is listed in ``exceptions``, or when it is
    alphabetic and not a stop word. Kept tokens are lower-cased. Every kept
    token is returned, but only tokens present in ``vectors`` contribute to
    the embedding, so out-of-vocabulary words no longer raise ``KeyError``.

    Parameters
    ----------
    text : str
        Raw text to process.
    nlp_pipeline : callable, optional
        Maps a string to an iterable of tokens exposing ``text``, ``is_stop``
        and ``is_alpha``. Defaults to the notebook-level ``nlp``.
    vectors : optional
        Word-vector store supporting ``word in vectors``, ``vectors[word]``
        and ``vectors.vector_size``. Defaults to the notebook-level ``model``.
    exceptions : iterable of str, optional
        Token texts that are always kept, even if they are stop words or
        non-alphabetic (e.g. "CO2"). Defaults to the notebook-level
        ``exception_list``.

    Returns
    -------
    processed_tokens : list of str
        Lower-cased kept tokens, in document order.
    vec_representation : numpy.ndarray
        Mean of the vectors of the kept in-vocabulary tokens, with shape
        ``(vectors.vector_size,)``; all zeros when no kept token has a vector.
    """
    # Fall back to the notebook-level objects only when nothing is injected,
    # so existing one-argument calls behave as before.
    nlp_pipeline = nlp if nlp_pipeline is None else nlp_pipeline
    vectors = model if vectors is None else vectors
    always_keep = set(exception_list if exceptions is None else exceptions)

    processed_tokens = []
    vec_sum = np.zeros(vectors.vector_size)
    n_embedded = 0
    for token in nlp_pipeline(text):
        # Compare the token's *text* with the exception list: a spaCy Token
        # object never equals a plain string. Exceptions bypass both filters.
        if token.text not in always_keep and (token.is_stop or not token.is_alpha):
            continue
        word = token.text.lower()
        processed_tokens.append(word)
        if word in vectors:
            vec_sum += vectors[word]
            n_embedded += 1

    # Average over the vectors actually summed, not over every token in the
    # document, so dropped tokens do not shrink the result.
    vec_representation = vec_sum / n_embedded if n_embedded else vec_sum
    return processed_tokens, vec_representation

In [None]:
tokenized_text = []
s_i_embeddings = []
# Process every speech, not just a 5-row debugging slice: the results are
# assigned back to dspeech as columns, which needs one entry per row.
for speech_item in tqdm(dspeech["speech_item_text"].to_list()):
    tokenized, s_i_e = preprocess_text(speech_item)
    tokenized_text.append(tokenized)
    s_i_embeddings.append(s_i_e)

In [None]:
# Attach the per-speech token lists and embedding vectors as new columns.
# Both lists must hold exactly one entry per row of dspeech; pandas raises
# ValueError on a length mismatch.
dspeech["tokenized_text"] = tokenized_text
dspeech["s_i_embeddings"] = s_i_embeddings

# Persist the enriched speech table as a gzip-compressed parquet file.
dspeech.to_parquet('data/data_speech_tokenized.parquet', compression='gzip') 

In [None]:
def _load_word_list(path):
    """Read a UTF-8 text file with one entry per line.

    Returns the lines as a list (file order, duplicates kept) together with
    a set of the same entries for fast membership tests.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        entries = handle.read().splitlines()
    return entries, set(entries)


C_words_list, C_words_set = _load_word_list('data/C_word_list.txt')
NC_words_list, NC_words_set = _load_word_list('data/NC_word_list.txt')
NC_words_all, NC_words_set_all = _load_word_list('data/NC_word_list_all.txt')


In [None]:
# NOTE(review): `dspeech_sample` is not defined in any cell visible here --
# confirm where it comes from before relying on Restart & Run All.

# Each word is credited to the FIRST set that contains it, checked in this
# order, so a word is counted at most once across the three tallies.
word_sets_by_priority = (C_words_set, NC_words_set, NC_words_set_all)

label_counts = []
for tokens in tqdm(dspeech_sample['tokenized_text'].to_list()):
    tallies = [0, 0, 0]  # [C, NC, NC_all]
    for word in tokens:
        for slot, vocabulary in enumerate(word_sets_by_priority):
            if word in vocabulary:
                tallies[slot] += 1
                break
    label_counts.append(tallies)

# One row of three counts per speech -> three new columns.
dspeech_sample[['C_counts', 'NC_counts', 'NC_counts_all']] = label_counts


In [None]:
#dspeech_sample.to_csv('data_additionaal/tokenized33000.csv')

In [None]:
# Preview the first 10 rows of the sample frame.
dspeech_sample.head(10)

In [None]:
# Select speeches with more than 10 C-list words whose C count also exceeds
# their NC_counts_all value.
has_many_c_words = dspeech_sample['C_counts'] > 10
c_exceeds_nc_all = dspeech_sample['NC_counts_all'] < dspeech_sample['C_counts']
qsi = dspeech_sample[has_many_c_words & c_exceeds_nc_all]

In [None]:
# those_eighty = dspeech_sample[(dspeech_sample['C_counts']>3) & (dspeech_sample['NC_counts']< dspeech_sample['C_counts']) & (dspeech_sample["NC_counts_all"]>= dspeech_sample['C_counts'])]

In [None]:
# (rows, columns) of the filtered selection.
qsi.shape

In [None]:
# Number of selected speeches per distinct value of 'speaker_party'.
qsi['speaker_party'].value_counts()

In [None]:
# Dump the full text of every selected speech, each followed by a rule line.
for text in qsi['speech_item_text'].to_list():
    print(text, '-----------------', sep='\n')