In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from tqdm import tqdm
from gensim.models import KeyedVectors
import multiprocessing as mp
from spacy.tokens import Doc
from typing import List
from collections import Counter
import umap
import plotly.express as px
from sklearn.cluster import KMeans
import plotly.graph_objs as go
from scipy.spatial import distance
import plotly.io as pio

In [None]:
%run helper_functions

In [5]:
data_path = "./data/"

data_speech1 = pd.read_parquet(data_path + 'data_speech1.parquet')
data_speech2 = pd.read_parquet(data_path + 'data_speech2.parquet')
data_speech3 = pd.read_parquet(data_path + 'data_speech3.parquet')
data_speech1_tok = pd.read_parquet(data_path+'data_speech1_tok.parquet')
data_speech2_tok = pd.read_parquet(data_path+'data_speech2_tok.parquet')

dspeech = pd.concat([data_speech1, data_speech2, data_speech3], axis=0)
dspeech_tok = pd.concat([data_speech1_tok, data_speech2_tok], axis=0)

In [6]:
C_words = pd.read_parquet(data_path+'final_C_words.parquet')
C_word_set = set(C_words['word'])

In [7]:
MODEL_FILE = 'dsl_skipgram_2020_m5_f500_epoch2_w5.model.w2v.bin'
model = KeyedVectors.load_word2vec_format(data_path+MODEL_FILE, binary=True)

In [8]:
dspeech_C = pd.merge(dspeech_tok, dspeech.loc[dspeech.label=="C", ["meeting_id", "agenda_item_id", "speech_item_id", "label"]], on=["meeting_id", "agenda_item_id", "speech_item_id"], how="right")
dspeech_C = add_custom_features(dspeech_C, C_word_set, model)

In [9]:
reducer = umap.UMAP(n_components=3, random_state=None)
embedding = reducer.fit_transform(dspeech_C['average_vec_C'].tolist())

kmeans = KMeans(n_clusters=3, random_state=None).fit(embedding)
dspeech_C["kmeans_group_3d"] = kmeans.labels_

# 3D scatterplot
fig = px.scatter_3d(
    embedding, x=0, y=1, z=2, color=kmeans.labels_, size=0.1*np.ones(len(embedding)), opacity = 1,
    title='UMAP plot in 3D',
    labels={'0': 'comp. 1', '1': 'comp. 2', '2': 'comp. 3'},
    width=650, height=500
)

centroids = kmeans.cluster_centers_

fig.add_trace(go.Scatter3d(
    x=centroids[:, 0], y=centroids[:, 1], z=centroids[:, 2],
    mode='markers',
    marker=dict(
        color='rgb(255, 0, 0)', # set color to red
        size=10,
        symbol='cross'
    ),
    name='Centroids'
))

fig.show()



In [12]:
closest_points = [np.argmin(distance.cdist([centroid], embedding, 'euclidean')) for centroid in centroids]

In [14]:
similar_words0 = model.most_similar(positive=[dspeech_C.average_vec_C.iloc[closest_points[0]]], topn=10)

In [15]:
similar_words0

[('afledningsafgift', 0.7947692275047302),
 ('fossilbiler', 0.7908735871315002),
 ('iblandingskrav', 0.7893093824386597),
 ('afgiftsfritages', 0.7890761494636536),
 ('kilometerafgiften', 0.7884555459022522),
 ('afgiftsforøgelse', 0.7882301807403564),
 ('lavemissionsbiler', 0.7881604433059692),
 ('starthusleje', 0.7869181632995605),
 ('afgiftsfordele', 0.7866101264953613),
 ('nulemissionsbiler', 0.7863680720329285)]

In [16]:
similar_words1 = model.most_similar(positive=[dspeech_C.average_vec_C.iloc[closest_points[1]]], topn=10)

In [17]:
similar_words1

[('fossilfrie', 0.7684251666069031),
 ('lavenergisamfund', 0.7678879499435425),
 ('energisystemerne', 0.7658321261405945),
 ('energifremtid', 0.7571661472320557),
 ('datacentrenes', 0.7517784237861633),
 ('biomasseressourcer', 0.7504869103431702),
 ('energiudfordring', 0.7491044402122498),
 ('kulstoffattige', 0.7475237846374512),
 ('energikurs', 0.7473050951957703),
 ('opvarmningskilder', 0.7461227178573608)]

In [None]:
similar_words2 = model.most_similar(positive=[dspeech_C.average_vec_C.iloc[closest_points[2]]], topn=10)

In [None]:
similar_words2

[('energisystemerne', 0.7693237066268921),
 ('fossilfrie', 0.7534082531929016),
 ('plusenergihuse', 0.7417027354240417),
 ('biomasseressourcer', 0.7388436794281006),
 ('vindkraftel', 0.7346017956733704),
 ('kulstoffattig', 0.7341635823249817),
 ('kulafbrænding', 0.7339805364608765),
 ('energikurs', 0.7317765355110168),
 ('klimarigtige', 0.7314140200614929),
 ('miljøressourcer', 0.7311323285102844)]

In [None]:
dspeech_C.iloc[closest_points[2]].C_words

['fremtidige',
 'generationer',
 'energisystem',
 'forurener',
 'energi',
 'grønne',
 'miljøet',
 'kickstarte']

In [21]:
dspeech_C.kmeans_group_3d.value_counts()

1    13351
2     9165
0     5330
Name: kmeans_group_3d, dtype: int64

In [None]:
label_dict = {0: "Agriculture/Nature", 1: "Energy", 2: "Infrastructure"}
dspeech_C["kmeans_group_3d_names"] = dspeech_C["kmeans_group_3d"].map(label_dict)

fig = go.Figure()

for label, name in label_dict.items():
    fig.add_trace(go.Scatter3d(
        x=embedding[dspeech_C["kmeans_group_3d"]==label, 0],
        y=embedding[dspeech_C["kmeans_group_3d"]==label, 1],
        z=embedding[dspeech_C["kmeans_group_3d"]==label, 2],
        mode='markers',
        marker=dict(size=4),
        name=name
    ))

fig.update_layout(
    #title=dict(
    #    text='UMAP plot in 3D',
    #    font=dict(
    #        size=20
    #    )
    #),
    width=1000,
    height=750,
    legend=dict(
        font=dict(
            size=20,
        )
    ),
    scene=dict(
        camera=dict(
            eye=dict(x=-1, y=2.25, z=0.1)
        )
    )
)
fig.show()
fig.write_image(data_path+'umap_plot.svg')