# Setup and Settings

Necessary imports and settings:

In [2]:
import torch, os
from sklearn.decomposition import PCA
from sklearn.cluster import OPTICS
import pandas as pd
import plotly.io as pio
import plotly.express as px
# Use Plotly's "vscode" renderer by default; change this value when running in a different frontend.
pio.renderers.default = "vscode"

We now select a word to analyze. Note that to visualize data, you must have already created the processed data file using 
```sh
python main.py --word [your_word]
```

Processed data will be loaded from `cache/[your_word].pt`. 

In [3]:
# Word to analyse; its processed data is read from cache/<word>.pt further down.
word = "right"

# Data Processing

The data stored in `cache` contains a table of metadata and the vector embedding of each use of the word. To visualize it, we must perform some analysis. The following code loads the data, performs PCA to reduce the dimension, and then applies a clustering algorithm to attempt to automatically separate the different meanings. 

In [None]:

def format_author(author):
    """Convert a comma-separated author name into natural reading order.

    The name is split on commas, the pieces are reversed, and they are joined
    with single spaces, so ``'Lincoln, Abraham'`` becomes ``'Abraham Lincoln'``.
    Whitespace around each piece is stripped and empty pieces are dropped, so
    the result never has leading, trailing, or doubled spaces.

    Args:
        author: Author name as a string, e.g. ``'Last, First'``.

    Returns:
        The reordered name as a single string.
    """
    # Strip each piece: the space that follows a comma would otherwise end up
    # at the front of the reversed name.
    parts = [part.strip() for part in author.split(',')]
    return ' '.join(part for part in reversed(parts) if part)

# Load the cached data for this word and output the sample size.
# NOTE(review): torch.load unpickles the file, so only load caches you created
# yourself with main.py. Recent PyTorch releases may also need
# weights_only=False for non-tensor payloads -- confirm against your version.
cache_path = os.path.join(os.getcwd(), 'cache', f'{word}.pt')
if not os.path.exists(cache_path):
    raise FileNotFoundError(
        f'{cache_path} not found; create it first with: python main.py --word {word}'
    )
with open(cache_path, 'rb') as FILE:
    # map_location keeps the load working on machines that lack the device
    # the tensor was saved from (e.g. a CPU-only machine reading a GPU cache).
    tensor, metadata = torch.load(FILE, map_location='cpu')

num_samples = len(metadata)
print(f'Number of samples: {num_samples}')

# Perform PCA & clustering on the embedded vectors.
# random_state pins the randomized SVD solver that PCA may select for large
# inputs, so rerunning the notebook gives the same projection.
pca = PCA(n_components = 10, random_state = 0)
# detach() allows the conversion even if the cached tensor tracks gradients.
transformed_data = pca.fit_transform(tensor.detach().to('cpu').numpy())
clustering = OPTICS(xi = 0.01).fit_predict(transformed_data)

# Visualization

Now that the data has been processed, we just need to make a plot! We will use a three-dimensional scatter plot along the first three principal components.

In [5]:
# Format the dataframe for display purposes: metadata, cluster label, and the
# PCA coordinates side by side (the PCA columns are named by integers from 0).
df = pd.concat([metadata.reset_index(), pd.DataFrame(clustering, columns=['cluster']), pd.DataFrame(transformed_data)], axis = 1)
df['size'] = 0.1  # constant marker size; a scalar is broadcast to every row
# Non-string authors (e.g. missing values) become an empty prefix in the hover text.
df['author'] = df['author'].apply(lambda author : format_author(author) + ", " if isinstance(author, str) else "")

# Keep only points that were assigned to a cluster. OPTICS labels noise as -1
# and numbers clusters from 0, so the test is >= 0 in order to retain cluster 0.
df = df[df['cluster'] >= 0]

# Display data: the first three PCA columns are the plot axes, coloured by cluster.
fig = px.scatter_3d(
    df, 
    x = 0, 
    y = 1, 
    z = 2, 
    custom_data=['context','author','title'],
    color = 'cluster',
    color_continuous_scale=px.colors.cyclical.Phase,
    size = 'size', 
    opacity = 0.1, 
    width = 800, 
    height = 800
    )
# Hover text: bold context, then "Author, " followed by the title in italics.
fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>%{customdata[1]}<i>%{customdata[2]}</i>')
fig.show()

df

Unnamed: 0,index,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,...,1,2,3,4,5,6,7,8,9,size
9,9,/home/andrew/polysemia/SPGC-tokens-2018-07-18/...,Abraham Lincoln's First Inaugural Address,"Abraham Lincoln,",1809.0,1865.0,['en'],36.0,{'United States -- Politics and government -- ...,Text,...,11.813222,-0.059437,0.479397,0.968950,-1.557299,-2.185760,-0.614403,-1.651793,-0.155131,0.1
10,9,/home/andrew/polysemia/SPGC-tokens-2018-07-18/...,Abraham Lincoln's First Inaugural Address,"Abraham Lincoln,",1809.0,1865.0,['en'],36.0,{'United States -- Politics and government -- ...,Text,...,9.686356,-0.388266,0.322470,-0.743470,-3.417445,0.815747,-2.752133,2.005354,-2.821510,0.1
14,9,/home/andrew/polysemia/SPGC-tokens-2018-07-18/...,Abraham Lincoln's First Inaugural Address,"Abraham Lincoln,",1809.0,1865.0,['en'],36.0,{'United States -- Politics and government -- ...,Text,...,11.999957,-2.216446,0.698786,1.536634,-0.384171,-1.639021,-1.200243,-0.147184,-2.168077,0.1
19,10,/home/andrew/polysemia/SPGC-tokens-2018-07-18/...,The King James Version of the Bible,,,,['en'],3818.0,{'Bible'},Text,...,-0.685811,0.238710,1.791875,-2.982142,-2.105741,1.624009,-1.448319,-0.174083,0.955424,0.1
30,10,/home/andrew/polysemia/SPGC-tokens-2018-07-18/...,The King James Version of the Bible,,,,['en'],3818.0,{'Bible'},Text,...,-1.329047,-2.129349,-5.069890,0.859514,3.004183,0.949253,-2.445701,0.197705,-1.230922,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46211,1099,/home/andrew/polysemia/SPGC-tokens-2018-07-18/...,The Riverman,"Stewart Edward White,",1873.0,1946.0,['en'],26.0,"{'Adventure and adventurers -- Fiction', 'Cond...",Text,...,-5.883408,-3.966858,-4.962946,4.538615,-3.674036,-1.303033,-1.565928,-1.233443,0.684646,0.1
46216,1099,/home/andrew/polysemia/SPGC-tokens-2018-07-18/...,The Riverman,"Stewart Edward White,",1873.0,1946.0,['en'],26.0,"{'Adventure and adventurers -- Fiction', 'Cond...",Text,...,-4.254066,-6.787354,-5.647851,2.382524,-1.617905,2.775594,-0.490758,-0.423125,-0.454031,0.1
46250,1100,/home/andrew/polysemia/SPGC-tokens-2018-07-18/...,The First Part of Henry the Sixth,"William Shakespeare,",1564.0,1616.0,['en'],10.0,"{'Henry VI, King of England, 1421-1471 -- Dram...",Text,...,6.666350,-0.634612,0.802814,-0.530209,-0.839166,1.760336,-0.328950,-1.963763,0.892909,0.1
46262,1101,/home/andrew/polysemia/SPGC-tokens-2018-07-18/...,The Second Part of King Henry the Sixth,"William Shakespeare,",1564.0,1616.0,['en'],20.0,"{'Henry VI, King of England, 1421-1471 -- Dram...",Text,...,4.367765,1.177364,2.344680,-3.279665,-4.940442,0.538068,-3.097991,0.478105,1.698400,0.1
