# Setup and Settings

Necessary imports and settings:

In [14]:
import torch, os
from sklearn.decomposition import PCA
from sklearn.cluster import OPTICS
import pandas as pd
import plotly.io as pio
import plotly.express as px
pio.renderers.default = "vscode"

We now select a word to analyze. Note that to visualize data, you must have already created the processed data file using 
```sh
python main.py --word [your_word]
```

Processed data will be loaded from `cache/[your_word].pt`. 

In [15]:
word = 'right'

# Data Processing

The data stored in `cache` will contain a table of the metadata and the vector embeddings of each use of the word. To visualize it, we must preform some analysis. The following code will load the data, preform PCA to reduce the dimension, and then apply a clustering algorithm to attempt to automatically differentiate the different meanings. 

In [None]:

def format_author(author):
    return ' '.join(author.split(',')[::-1])

# Load the cached data for this word and ouput the sample size
with open(os.path.join(os.getcwd(), f'cache/{word}.pt'), 'rb') as FILE:
    tensor, metadata = torch.load(FILE)

num_samples = len(metadata)
print(f'Number of samples: {num_samples}')

# Preform PCA & clustering on the embedded vectors
pca = PCA(n_components = 20)
transformed_data = pca.fit_transform(tensor.to('cpu').numpy())
clustering = OPTICS(xi = 0.001).fit_predict(transformed_data)

Number of samples: 70



You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.



In [17]:
metadata

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,context
9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,Anonymous,,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,... of the church elections a<i> right</i> rec...
9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,Anonymous,,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,... thereby be deprived of the<i> right</i> of...
9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,Anonymous,,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,... no one deny or delay<i> right</i> or justi...
9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,Anonymous,,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,... fee claims to own a<i> right</i> on our re...
9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,Anonymous,,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,... will we refuse or delay<i> right</i> or ju...
...,...,...,...,...,...,...,...,...,...,...
10001,/Users/andrewmoore/work/llc/polysemia/polysemi...,The House on the Borderland,"Hodgson, William Hope",1877.0,1918.0,['en'],593.0,{'Science fiction'},Text,... silences far to my<i> right</i> away up am...
10001,/Users/andrewmoore/work/llc/polysemia/polysemi...,The House on the Borderland,"Hodgson, William Hope",1877.0,1918.0,['en'],593.0,{'Science fiction'},Text,... study is situated there lying<i> right</i>...
10001,/Users/andrewmoore/work/llc/polysemia/polysemi...,The House on the Borderland,"Hodgson, William Hope",1877.0,1918.0,['en'],593.0,{'Science fiction'},Text,... at the window on my<i> right</i> as i do s...
10001,/Users/andrewmoore/work/llc/polysemia/polysemi...,The House on the Borderland,"Hodgson, William Hope",1877.0,1918.0,['en'],593.0,{'Science fiction'},Text,... it has covered all my<i> right</i> arm and...


# Visualization

Now that the data has been processed, we just need to make a plot! We will use a three dimensional scatter plot along the three most important principal axes.

In [18]:
# Format the dataframe for display purposes
df = pd.concat([metadata.reset_index(), pd.DataFrame(clustering.astype(int), columns=['cluster']), pd.DataFrame(transformed_data)], axis = 1)
df['size'] = [0.1] * len(df)
df['author'] = df['author'].apply(lambda author : format_author(author) + ", " if type(author) is str else "")

# Select only core points
df = df[df['cluster'] >= 0]

# Display data
fig = px.scatter_3d(
    df, 
    x = 0, 
    y = 1, 
    z = 2, 
    custom_data=['context','author','title'],
    color = 'cluster',
    color_discrete_sequence = px.colors.qualitative.D3,
    color_continuous_scale = px.colors.cyclical.Phase,
    size = 'size', 
    opacity = 1.0, 
    width = 800, 
    height = 800
    )
fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>%{customdata[1]}<i>%{customdata[2]}</i>')
fig.show()

df

Unnamed: 0,index,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,...,1,2,3,4,5,6,7,8,9,size
0,9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,"Anonymous,",,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,...,0.881886,-0.528351,-1.895368,0.613471,-1.317061,0.666994,-0.690474,1.178499,0.852587,0.1
1,9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,"Anonymous,",,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,...,-1.91169,0.076905,-0.488904,0.314233,-0.213403,3.445475,1.326531,-1.098274,-0.549277,0.1
2,9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,"Anonymous,",,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,...,0.617966,-4.746714,-0.502597,2.461632,-2.433477,-1.375246,-0.716915,-1.882135,2.516887,0.1
3,9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,"Anonymous,",,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,...,0.334003,1.001794,-1.550493,0.328001,-1.194864,2.698033,-2.074622,-0.28282,0.619872,0.1
4,9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,"Anonymous,",,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,...,1.582301,-4.303625,1.338778,1.234804,-1.112276,-3.238378,-1.080543,-1.398403,1.262514,0.1
7,9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,"Anonymous,",,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,...,-2.197036,4.382241,-2.018462,-2.064909,-1.512171,0.225321,-1.777179,0.159914,-0.383437,0.1
8,9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,"Anonymous,",,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,...,0.158496,-2.281681,-0.424756,1.609536,-1.421711,3.295506,-1.250035,-0.302559,0.964264,0.1
9,9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,"Anonymous,",,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,...,-1.145872,3.972378,-3.045503,-1.97816,0.412484,1.129781,-1.224859,-1.056059,1.642605,0.1
10,9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,"Anonymous,",,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,...,1.897795,-3.856409,1.358099,0.753512,-1.040547,-3.104825,-1.069534,-1.416816,1.282771,0.1
13,9999,/Users/andrewmoore/work/llc/polysemia/polysemi...,The Magna Carta,"Anonymous,",,,['en'],188.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text,...,-2.57602,4.269138,-1.877802,-2.189383,-1.072178,0.38636,-1.842466,0.131196,-0.320002,0.1
