# Setup and Settings

Necessary imports and settings:

In [2]:
import torch, os
from sklearn.decomposition import PCA
from sklearn.cluster import OPTICS
import pandas as pd
import plotly.io as pio
import plotly.express as px
# Use Plotly's "vscode" renderer as the default when figures are shown.
pio.renderers.default = "vscode"

Next, select a word to analyze. To visualize its data, you must first create the processed data file by running:
```sh
python main.py --word [your_word]
```

Processed data will be loaded from `cache/[your_word].pt`. 

In [3]:
# Word to visualize; its processed data is read from cache/<word>.pt.
word = 'right'

# Visualization

Running the following code block will create an interactive 3D PCA plot of the data extracted for the chosen word.

In [None]:

def format_author(author):
    """Convert a comma-separated author name into natural reading order.

    The comma-separated parts are reversed and joined with single spaces, so
    ``"Last, First"`` becomes ``"First Last"``. Whitespace around each part is
    stripped and empty parts are dropped; otherwise the space that follows the
    comma would end up as a leading blank in the result.

    Args:
        author: Author name string, e.g. ``"Austen, Jane"``.

    Returns:
        The reordered name. A name without a comma is returned stripped.
    """
    parts = [part.strip() for part in author.split(',')]
    return ' '.join(part for part in reversed(parts) if part)

# Load the cached data for this word and output the sample size.
# NOTE(review): torch.load unpickles arbitrary Python objects, so only open
# cache files that were produced locally by main.py. On PyTorch versions where
# weights_only defaults to True, loading the non-tensor metadata may require
# passing weights_only=False -- confirm against the installed version.
with open(os.path.join(os.getcwd(), f'cache/{word}.pt'), 'rb') as FILE:
    tensor, metadata = torch.load(FILE)

num_samples = len(metadata)
print(f'Number of samples: {num_samples}')

# Perform PCA on the embedded vectors, then cluster in the reduced space
pca = PCA(n_components=10)
transformed_data = pca.fit_transform(tensor.to('cpu').numpy())
clustering = OPTICS(xi=0.01).fit_predict(transformed_data)

# Format the dataframe for display purposes: metadata columns, the cluster
# label, and the PCA components (integer column names 0..9) side by side.
df = pd.concat(
    [
        metadata.reset_index(),
        pd.DataFrame(clustering, columns=['cluster']),
        pd.DataFrame(transformed_data),
    ],
    axis=1,
)

# OPTICS labels noise as -1 and numbers real clusters from 0, so keep every
# label >= 0; filtering with "> 0" would also throw away the first cluster.
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
df = df[df['cluster'] >= 0].copy()
df['size'] = 0.1
# Authors get a trailing ", " so they read naturally before the title in the
# hover text; non-string entries (e.g. missing values) become empty.
df['author'] = df['author'].apply(
    lambda author: format_author(author) + ", " if isinstance(author, str) else ""
)

# Display data: the first three principal components are the plot axes
fig = px.scatter_3d(
    df,
    x=0,
    y=1,
    z=2,
    custom_data=['context', 'author', 'title'],
    color='cluster',
    size='size',
    opacity=0.1,
    width=800,
    height=800,
)
fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>%{customdata[1]}<i>%{customdata[2]}</i>')
fig.show()

# Leave the dataframe as the cell's last expression so the notebook renders it
df