# 3D visualization of UMAP embeddings with respect to different MSHA variables

## Load Data

In [14]:
import pandas as pd
import warnings 
# Silence every Python warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Read the CSV (path is relative to the working directory) and preview its first two rows.
data = pd.read_csv(
    filepath_or_buffer='data_with_kmeans_cluster_labels_and_umap_embeddings_of_different_scenarios.csv'
)
data.head(n=2)

Unnamed: 0,SUBUNIT,MINING_EQUIP,CLASSIFICATION,ACCIDENT_TYPE,INJURY_SOURCE,NATURE_INJURY,INJ_BODY_PART,DEGREE_INJURY,CAL_YR,MINING_EQUIP_C,...,umap1_2D_variables_only,umap2_2D_variables_only,umap1_reduced_embeddings_only_text_variables,umap2_reduced_embeddings_only_text_variables,umap3_reduced_embeddings_only_text_variables,9cluster_kmeans_merged_narratives,9cluster_kmeans_only_variables,9cluster_kmeans_only_narratives,sub_cluster_9cluster_kmeans_merged_narratives,final_clusters
0,Mill Operation/Preparation plant,"Bench grinder, drill press, Band/Table saw, Sa...",Machinery,Struck against stationary object,"Metal, NEC (pipe, wire, nail)","Cut, Lacer, Punct-Opn wound",Finger (s) /Thumb,"NO DYS AWY FRM WRK,NO RSTR ACT",2003,6.0,...,-6.036556,-1.21337,13.431704,5.581674,6.384573,4,3,3,4.1,2
1,"Strip, Quary, open pit",no_value,Handling of materials,Over-Exertion in lifting objects,"Broken rock, coal, ore, Wste","Sprain, strain Rupt disc",Forearm/Ulnar/Radius,DAYS RESTRICTED ACTIVITY ONLY,2003,7.0,...,0.715287,16.372698,3.784758,-1.64633,-4.721196,1,0,0,1.1,6


In [3]:
import numpy as np

# Replace every NaN in `data` with an empty string, modifying the frame in place.
# NOTE(review): np.nan is not a string pattern, so regex=True presumably has no
# effect here -- confirm before removing it.
# NOTE(review): once NaN is gone, `.isna()` filters on `data` (such as the one in
# the example chart call at the end of the notebook) should no longer exclude any rows.
data.replace(to_replace=np.nan, value='', regex=True, inplace=True)

In [4]:
# Number of rows in the loaded frame.
data.shape[0]

248873

## Visualization function

In [10]:
import plotly.express as px

def chart(X, color, title='Narratives embeddings', html_path=None):
    """Show an interactive 3D scatter plot of an embedding using Plotly Express.

    Parameters
    ----------
    X : 2D array supporting ``X[:, i]`` indexing (e.g. a NumPy array)
        Columns 0, 1 and 2 are used as the x, y and z coordinates.
    color : array-like
        Values forwarded to ``px.scatter_3d(color=...)``; expected to hold one
        entry per row of ``X``.
    title : str, optional
        Figure title. Defaults to 'Narratives embeddings', the value that was
        previously hard-coded.
    html_path : str or None, optional
        When given, the figure is additionally exported with
        ``fig.write_html(html_path)``. Nothing is written by default.

    Returns
    -------
    None
        The figure is rendered through ``fig.show()``; it is deliberately not
        returned so that a notebook cell ending in ``chart(...)`` does not
        display it a second time.
    """
    def _axis_style(background):
        # All three axes share the same styling; only the background colour differs.
        return dict(backgroundcolor=background,
                    color='black',
                    gridcolor='#f0f0f0',
                    title_font=dict(size=10),
                    tickfont=dict(size=10))

    # Create a 3D graph
    fig = px.scatter_3d(X, x=X[:, 0], y=X[:, 1], z=X[:, 2], color=color, height=900, width=950)

    # Update chart looks. `margin` is a layout-level setting (not part of the camera).
    fig.update_layout(
        title=title,
        showlegend=True,
        legend=dict(orientation="h", yanchor="top", y=0, xanchor="center", x=0.5),
        scene_camera=dict(up=dict(x=0, y=0, z=1),
                          center=dict(x=0, y=0, z=-0.1),
                          eye=dict(x=1.5, y=-1.4, z=0.5)),
        margin=dict(l=0, r=0, b=0, t=0),
        scene=dict(xaxis=_axis_style('white'),
                   yaxis=_axis_style('white'),
                   zaxis=_axis_style('lightgrey')),
    )

    # Update marker size
    fig.update_traces(marker=dict(size=3, line=dict(color='black', width=0.1)))
    fig.show()

    # Optional standalone HTML export (replaces the former commented-out line).
    if html_path is not None:
        fig.write_html(html_path)

In [11]:
# Display the column labels of `data`.
data.columns

Index(['SUBUNIT', 'MINING_EQUIP', 'CLASSIFICATION', 'ACCIDENT_TYPE',
       'INJURY_SOURCE', 'NATURE_INJURY', 'INJ_BODY_PART', 'DEGREE_INJURY',
       'CAL_YR', 'MINING_EQUIP_C', 'INJ_BODY_PART_C', 'ACCIDENT_TYPE_C',
       'NARRATIVE', 'MERGERED_NARRATIVE', 'umap1_2D_combined_text',
       'umap2_2D_combined_text', 'umap1_reduced_combined_text',
       'umap2_reduced_combined_text', 'umap3_reduced_combined_text',
       'umap1_2D_narratives_only', 'umap2_2D_narratives_only',
       'umap1_reduced_narratives_only', 'umap2_reduced_narratives_only',
       'umap3_reduced_narratives_only', 'umap1_2D_variables_only',
       'umap2_2D_variables_only',
       'umap1_reduced_embeddings_only_text_variables',
       'umap2_reduced_embeddings_only_text_variables',
       'umap3_reduced_embeddings_only_text_variables',
       '9cluster_kmeans_merged_narratives', '9cluster_kmeans_only_variables',
       '9cluster_kmeans_only_narratives',
       'sub_cluster_9cluster_kmeans_merged_narratives', 'fin

## Change the variable name in the call below to visualize any variable against the low-dimensional UMAP embeddings
### For example, the call below colors the 3D UMAP embeddings by MINING_EQUIP_C; replace it with INJ_BODY_PART_C
### to visualize that variable instead

In [19]:
# Example call (left commented out): plots the three `*_reduced_combined_text` UMAP
# columns and colors the points by MINING_EQUIP_C, keeping only rows where
# MINING_EQUIP_C is not NaN.
# NOTE(review): an earlier cell replaces NaN with '' in `data` in place, so confirm
# that the `.isna()` filter below still excludes the rows it is meant to.
# chart(data[~data['MINING_EQUIP_C'].isna()][['umap1_reduced_combined_text', 'umap2_reduced_combined_text', 'umap3_reduced_combined_text']].values, data[~data['MINING_EQUIP_C'].isna()]['MINING_EQUIP_C'])