In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import os
import random

# Other plotting libraries
import kaleido
import colorcet as cc
from colorcet.plotting import swatch, swatches
import holoviews as hv
# Select matplotlib as the HoloViews plotting backend
hv.extension("matplotlib")

import matplotlib

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Change working directory to one folder up so the relative 'Data/' paths
# used by the loading cells resolve. The guard keeps this cell idempotent:
# once the working directory already contains 'Data', re-running the cell
# no longer climbs another level.
# NOTE(review): assumes the notebook's own folder has no 'Data' subfolder.
if not os.path.isdir('Data'):
    os.chdir('..')

# Print working directory so the reader can confirm where paths resolve from
print(os.getcwd())

In [None]:
# Load the embeddings workbook (path is relative to the working directory)
embeddings_xlsx = 'Data/Embeddings_Visualizations_BestOverall.xlsx'
df_embeddings = pd.read_excel(embeddings_xlsx)

In [None]:
# Summarize the loaded table: columns, dtypes and non-null counts
df_embeddings.info()

In [None]:
# Scatter every row (clustered and noise alike) in the 2-D UMAP plane
fig = px.scatter(
    data_frame=df_embeddings,
    x='UMAP 1',
    y='UMAP 2',
)
fig.show()

In [None]:
# Keep only rows assigned to a real topic (topic id -1 is treated as noise)
is_clustered = df_embeddings['Topic'].ne(-1)
exclude_noise = df_embeddings.loc[is_clustered]

In [None]:
# Summarize the clustered subset (rows whose Topic is not -1)
exclude_noise.info()

In [None]:
# Rows with topic id -1, i.e. the points left out of every cluster
is_noise = df_embeddings['Topic'].eq(-1)
noise = df_embeddings.loc[is_noise]

In [None]:
# Scatter the clustered points, colored by the 'Name' column; the legend is
# switched off in the layout.
fig = px.scatter(
    data_frame=exclude_noise,
    x='UMAP 1',
    y='UMAP 2',
    color='Name',
).update_layout(showlegend=False)

fig.show()

In [None]:
# Clustered rows flagged as representative documents. The explicit comparison
# with True is kept so non-boolean / missing flags are treated as "not flagged".
is_representative = exclude_noise['Representative_document'].eq(True)
rep_docs = exclude_noise.loc[is_representative]

In [None]:
# Summarize the representative-document subset
rep_docs.info()

In [None]:
# Per-topic mean of every numeric column over the representative documents;
# the result is indexed by Topic.
rep_docs_by_topic = rep_docs.groupby('Topic')
mean_rep = rep_docs_by_topic.mean(numeric_only=True)

In [None]:
# Display the per-topic means (one row per Topic)
mean_rep

In [None]:
# Load the workbook holding the final topic labels ('Final Label' column is used below)
final_labels_xlsx = 'Data/Reviews_FinalLabels.xlsx'
topic_names = pd.read_excel(final_labels_xlsx)

In [None]:
# Summarize the topic-label table: columns, dtypes and non-null counts
topic_names.info()

In [None]:
# Place each topic's final label at the mean UMAP position of its
# representative documents. Markers are fully transparent (opacity=0), so
# only the label text is visible.
# NOTE(review): assumes the rows of topic_names are in the same order as the
# topics in mean_rep - confirm.
centroid_x = mean_rep['UMAP 1']
centroid_y = mean_rep['UMAP 2']
fig = px.scatter(
    x=centroid_x,
    y=centroid_y,
    text=topic_names['Final Label'],
    opacity=0,
)

fig.update_layout(font_size=10)

fig.show()

In [None]:
# Insert an HTML line break before the connecting words "and", "of" and "or"
# so long labels wrap onto several lines when drawn as plot text.
wrapped_labels = topic_names['Final Label']
for connector in ('and', 'of', 'or'):
    wrapped_labels = wrapped_labels.str.replace(f' {connector} ', f'<br>{connector} ')

# Fix some wonky behavior: undo the break in front of "and Associated"
wrapped_labels = wrapped_labels.str.replace('<br>and Associated', ' and Associated')

topic_names['Final Label'] = wrapped_labels

In [None]:
# List the distinct (sorted) topic ids present among the clustered points
np.unique(exclude_noise['Topic'])

In [None]:
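# (Disabled) Earlier approach, v2: choose one RGB color per topic by scaling the mean UMAP coordinates to 0-255 for the red and blue channels and drawing a random green channel. Kept commented out for reference only: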

#make_colors = mean_rep.copy()

#make_colors['UMAP 1'] = 255*(make_colors['UMAP 1'] - make_colors['UMAP 1'].min())/(make_colors['UMAP 1'].max() - make_colors['UMAP 1'].min())
#make_colors['UMAP 2'] = 255*(make_colors['UMAP 2'] - make_colors['UMAP 2'].min())/(make_colors['UMAP 2'].max() - make_colors['UMAP 2'].min())

#red = list(make_colors['UMAP 1'])
#blue = list(make_colors['UMAP 2'])

#red_blue = dict(zip(red,blue))

#color_list = list()

#for key, value in red_blue.items():
    #color = "rgb(" + str(int(key)) + "," + str(random.randint(0, 255)) + "," + str(int(value)) + ")"
    #color_list.append(color)

In [None]:
# Preview the "glasbey_bw_minc_20" colormap as a swatch.
# NOTE(review): the [0:46] slice is applied to the object returned by swatch();
# confirm it really limits the preview to the first 46 colors.
swatch("glasbey_bw_minc_20")[0:46]

In [None]:
# Display the first 46 entries of the Glasbey palette
cc.b_glasbey_bw_minc_20[:46]

In [None]:
# Earlier palette assembled from Plotly's qualitative color sets (disabled, kept for reference):
#color_list = px.colors.qualitative.Alphabet + px.colors.qualitative.Plotly + px.colors.qualitative.G10
#color_list.remove('#E2E2E2')

# Map every clustered topic id to its own Glasbey color. zip() pairs the
# sorted topic ids with the palette in order, so the palette is no longer cut
# to a hard-coded 46 entries: with up to 46 topics the mapping is unchanged,
# and additional topics now receive a color instead of being left unmapped.
topic_ids = np.unique(exclude_noise['Topic'])
glasbey_palette = cc.b_glasbey_bw_minc_20
if len(topic_ids) > len(glasbey_palette):
    # Fail loudly rather than silently leaving some topics without a color
    raise ValueError(
        f"{len(topic_ids)} topics but only {len(glasbey_palette)} palette colors available"
    )
color_map = dict(zip(topic_ids, glasbey_palette))

In [None]:
# Display the topic id -> color mapping
color_map

In [None]:
# Look up a color for every clustered point from its topic id
point_topics = exclude_noise['Topic']
cluster_color = point_topics.map(color_map)

In [None]:
# Display the per-point color series
cluster_color

In [None]:
# Combined topic map, built from three traces (in this order):
#   1. topic labels drawn as text at the per-topic mean positions,
#   2. clustered points colored by topic (zorder -2),
#   3. noise points in light grey (zorder -3, the lowest of the three).
# NOTE(review): labels are matched to positions by row order; this assumes
# topic_names is ordered like the topics in mean_rep - confirm.

topic_label_trace = go.Scatter(
    x=mean_rep['UMAP 1'],
    y=mean_rep['UMAP 2'],
    text=topic_names['Final Label'],
    mode='text',
)

clustered_points_trace = go.Scatter(
    x=exclude_noise['UMAP 1'],
    y=exclude_noise['UMAP 2'],
    mode='markers',
    marker={'color': cluster_color},
    zorder=-2,
)

noise_points_trace = go.Scatter(
    x=noise['UMAP 1'],
    y=noise['UMAP 2'],
    mode='markers',
    marker={'color': '#E2E2E2'},
    zorder=-3,
)

fig = go.Figure(data=[topic_label_trace, clustered_points_trace, noise_points_trace])

# Fixed-size canvas on a white background, no legend, small font for the labels
fig.update_layout(
    showlegend=False,
    autosize=False,
    width=1200,
    height=800,
    font=dict(size=8),
    paper_bgcolor='rgba(255,255,255,1)',
    plot_bgcolor='rgba(255,255,255,1)',
)

# No selector is given, so the marker opacity is set on every trace
fig.update_traces(marker_opacity=0.25)

fig.show()

In [None]:
# Export the topic map as PDF. The output folder is created first so the
# export does not fail when 'Results/' does not exist yet.
os.makedirs('Results', exist_ok=True)
fig.write_image('Results/Fig2_TopicMap.pdf')

In [None]:
# Export the topic map as PNG. The output folder is created first so the
# export does not fail when 'Results/' does not exist yet.
os.makedirs('Results', exist_ok=True)
fig.write_image('Results/Fig2_TopicMap.png')