In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import umap

from gensim.models import KeyedVectors

from collections import Counter

In [None]:
# Execute the helper_functions script; IPython's %run makes the names it defines
# available in this notebook's namespace.
# NOTE(review): add_custom_features and calculate_custom_odds, called in later
# cells, are not defined in this notebook -- presumably they come from this script; confirm.
%run helper_functions

In [None]:
# Directory that holds every parquet input (and output) used by this notebook.
data_path = "./data/"


def _read_parquet(file_name):
    """Read one parquet file located directly under ``data_path``."""
    return pd.read_parquet(data_path + file_name)


# Speech tables, stored as three separate shards.
data_speech1 = _read_parquet('data_speech1.parquet')
data_speech2 = _read_parquet('data_speech2.parquet')
data_speech3 = _read_parquet('data_speech3.parquet')

# Tokenized speech tables.
# NOTE(review): only two tokenized shards are read versus three speech shards --
# confirm that no third tokenized file is expected.
data_speech1_tok = _read_parquet('data_speech1_tok.parquet')
data_speech2_tok = _read_parquet('data_speech2_tok.parquet')

# Stack the shards row-wise; the original row indices of each shard are kept.
dspeech = pd.concat([data_speech1, data_speech2, data_speech3], axis=0)
dspeech_tok = pd.concat([data_speech1_tok, data_speech2_tok], axis=0)

In [None]:
# Word list for the "C" analysis; its 'word' column is also kept as a set,
# which is what add_custom_features receives further down.
C_words = pd.read_parquet(f"{data_path}final_C_words.parquet")
C_word_set = set(C_words.loc[:, 'word'])

In [None]:
# Binary word2vec-format file with the pre-trained vectors, located under data_path.
MODEL_FILE = 'dsl_skipgram_2020_m5_f500_epoch2_w5.model.w2v.bin'

# Load the vectors only (KeyedVectors), reading the file in binary mode.
model_file_path = f"{data_path}{MODEL_FILE}"
model = KeyedVectors.load_word2vec_format(model_file_path, binary=True)

In [None]:
# Columns that jointly identify one speech item in both tables.
speech_keys = ["meeting_id", "agenda_item_id", "speech_item_id"]

# Identifiers (plus the label itself) of every speech labelled "C".
is_label_C = dspeech.label == "C"
speeches_C = dspeech.loc[is_label_C, speech_keys + ["label"]]

# Right join: every "C" speech is kept; one with no tokenized counterpart
# ends up with missing values in the tokenized columns.
dspeech_C = dspeech_tok.merge(speeches_C, on=speech_keys, how="right")

# add_custom_features is defined outside this notebook; later cells read the
# 'average_vec_C' and 'C_words' columns from its result.
dspeech_C = add_custom_features(dspeech_C, C_word_set, model)

In [None]:
# Single source of truth for the number of K-means groups: it drives the
# clustering itself and the discrete colorbar (previously hard-coded as 4, 5 and 4).
N_CLUSTERS = 4
# Shared seed so both the UMAP projection and the K-means fit are repeatable.
RANDOM_STATE = 42

# Project the per-speech 'average_vec_C' vectors with UMAP (default settings).
reducer = umap.UMAP(random_state=RANDOM_STATE)
embedding = reducer.fit_transform(dspeech_C['average_vec_C'].tolist())

# Cluster in the UMAP embedding space, not in the original vector space.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE).fit(embedding)

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
points = ax.scatter(embedding[:, 0], embedding[:, 1], c=kmeans.labels_, cmap='Spectral', s=5)
ax.set_aspect('equal', 'datalim')
ax.set_xlabel('UMAP 1')
ax.set_ylabel('UMAP 2')

# Boundaries at -0.5, 0.5, ..., N_CLUSTERS - 0.5 give every integer label its
# own colour band; ticks sit on the labels themselves.
colorbar = fig.colorbar(points, ax=ax, boundaries=np.arange(N_CLUSTERS + 1) - 0.5)
colorbar.set_ticks(np.arange(N_CLUSTERS))
colorbar.set_label('K-means group')

fig.savefig(data_path + 'SI_C_Kmeans.svg', format='svg')

In [None]:
# Attach each row's K-means label as a new column (in place); later cells
# filter and export by this 'kmeans_group' column.
dspeech_C["kmeans_group"] = kmeans.labels_
# Bare last expression: the notebook renders the frame as this cell's output.
dspeech_C

In [None]:
# Per-group word statistics.
#
# For every K-means group g this cell adds to `unique_words`:
#   word_count_g      -- occurrences of the word in the group's 'C_words' lists
#   frequency_g       -- word_count_g / total word occurrences in group g
#   frequency_g_rest  -- the same ratio computed over all *other* groups pooled
#   odds_g            -- calculate_custom_odds(frequency_g, frequency_g_rest, 1 / rest_total)
# The whole pipeline is written once and looped over the group ids instead of
# being copy-pasted per group.

# Group ids 0..max label, as stored in 'kmeans_group'.
group_ids = range(int(dspeech_C['kmeans_group'].max()) + 1)
count_cols = [f'word_count_{g}' for g in group_ids]

# Count words per group and outer-merge the per-group tables on 'word',
# one group after another (same merge chain and hence same row order as before).
unique_words = None
for g in group_ids:
    group_tokens = dspeech_C.loc[dspeech_C['kmeans_group'] == g, 'C_words']
    group_counts = Counter(word for tokens in group_tokens for word in tokens)
    group_df = (
        pd.DataFrame.from_dict(group_counts, orient='index', columns=[f'word_count_{g}'])
        .reset_index()
        .rename(columns={'index': 'word'})
    )
    if unique_words is None:
        unique_words = group_df
    else:
        unique_words = pd.merge(unique_words, group_df, on='word', how='outer')

# A word absent from a group comes out of the outer merge as NaN; treat it as zero occurrences.
unique_words[count_cols] = unique_words[count_cols].fillna(0)

# Totals are computed once. Counts are whole numbers, so "all groups minus own
# group" equals the sum over the other groups exactly.
group_totals = unique_words[count_cols].sum()
all_groups_total = group_totals.sum()
row_totals = unique_words[count_cols].sum(axis=1)

rest_totals = {}
for g in group_ids:
    own_col = f'word_count_{g}'
    rest_totals[g] = all_groups_total - group_totals[own_col]
    unique_words[f'frequency_{g}'] = unique_words[own_col] / group_totals[own_col]
    unique_words[f'frequency_{g}_rest'] = (row_totals - unique_words[own_col]) / rest_totals[g]

# Odds columns are added in a second pass so they stay after all frequency columns.
for g in group_ids:
    # Frequency a word occurring exactly once would have in the pooled other groups.
    # NOTE(review): how calculate_custom_odds uses this third argument is not
    # visible here (the function is defined outside this notebook) -- confirm.
    single_occurrence_freq = 1 / rest_totals[g]
    unique_words[f'odds_{g}'] = unique_words.apply(
        # Defaults bind the current loop values to this lambda.
        lambda row, g=g, single_occurrence_freq=single_occurrence_freq: calculate_custom_odds(
            row[f'frequency_{g}'], row[f'frequency_{g}_rest'], single_occurrence_freq
        ),
        axis=1,
    )

In [None]:
# Twenty words with the highest odds for group 0.
unique_words.sort_values(by="odds_0", ascending=False).head(20)

In [None]:
# Twenty words with the highest odds for group 1.
unique_words.sort_values(by="odds_1", ascending=False).head(20)

In [None]:
# Twenty words with the highest odds for group 2.
unique_words.sort_values(by="odds_2", ascending=False).head(20)

In [None]:
# Twenty words with the highest odds for group 3.
unique_words.sort_values(by="odds_3", ascending=False).head(20)

In [None]:
# Number of speeches assigned to each K-means group.
dspeech_C["kmeans_group"].value_counts()

In [None]:
# Export the clustered speeches. The token column is written as its str()
# representation, but the conversion is done on an export copy: dspeech_C keeps
# its original token values, so earlier cells can still be re-run after this one.
speech_export_cols = ["meeting_id", "agenda_item_id", "speech_item_id", "speech_item_tokenized", "kmeans_group"]
speech_export = dspeech_C[speech_export_cols].assign(
    speech_item_tokenized=lambda frame: frame["speech_item_tokenized"].map(str)
)
speech_export.to_parquet(data_path+'data_speech_C.parquet')

In [None]:
# Persist the per-word counts and odds for all four groups; the intermediate
# frequency columns are left out of the export.
word_export_cols = [
    "word",
    "word_count_0", "word_count_1", "word_count_2", "word_count_3",
    "odds_0", "odds_1", "odds_2", "odds_3",
]
unique_words.loc[:, word_export_cols].to_parquet(f"{data_path}unique_words_C.parquet")