In [1]:
import itertools
import numpy as np

import plotly.express as px
import plotly.graph_objects as go

In [2]:
import weaviate
from weaviate.classes.config import Property, DataType, Configure
from weaviate import collections as col
from weaviate.classes.query import Filter
from weaviate.classes.init import AdditionalConfig, Timeout

client = weaviate.connect_to_local(
    additional_config=AdditionalConfig(
        timeout=Timeout(init=20, query=120, insert=120)  # Values in seconds
    ))

# Populate

## Common Words Dataset

In [122]:
import nltk
from nltk.corpus import brown
from collections import Counter

# Download the brown corpus if you haven't already
nltk.download('brown')

# Get words from the Brown corpus
words = brown.words()

# Count the frequency of each word
word_freq = Counter(words)

# Get the most common words
most_common_words = word_freq.most_common(10000)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\cdalz\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [123]:
word = client.collections.create(
    "Word",
    properties=[
        Property(name="word", data_type=DataType.TEXT),
    ],
    generative_config=Configure().Generative().ollama(api_endpoint="http://weaviate_ollama_1:11434", model= "llama3"),
    vectorizer_config=Configure().Vectorizer().text2vec_ollama(api_endpoint="http://weaviate_ollama_1:11434")
)

In [128]:
import tqdm

with word.batch.dynamic() as batch:
    for w, _ in tqdm.tqdm(most_common_words):
        batch.add_object(
            properties={
        "word": w
    }
        )

100%|██████████| 10000/10000 [13:52<00:00, 12.01it/s]


## ML Paper Dataset

In [78]:
from datasets import load_dataset

In [93]:
dataset = load_dataset("CShorten/ML-ArXiv-Papers")

Downloading readme:   0%|          | 0.00/986 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/117592 [00:00<?, ? examples/s]


unclosed file <_io.BufferedReader name='C:\\Users\\cdalz\\.cache\\huggingface\\datasets\\downloads\\3f6f69cdd892afd03084d260fe5678f805316cdbfde216136f7d390a95b3cae8'>



In [117]:
print(str(dataset.data["train"]["abstract"][9]))

  Max-product belief propagation is a local, iterative algorithm to find the
mode/MAP estimate of a probability distribution. While it has been successfully
employed in a wide variety of applications, there are relatively few
theoretical guarantees of convergence and correctness for general loopy graphs
that may have many short cycles. Of these, even fewer provide exact ``necessary
and sufficient'' characterizations.
  In this paper we investigate the problem of using max-product to find the
maximum weight matching in an arbitrary graph with edge weights. This is done
by first constructing a probability distribution whose mode corresponds to the
optimal matching, and then running max-product. Weighted matching can also be
posed as an integer program, for which there is an LP relaxation. This
relaxation is not always tight. In this paper we show that \begin{enumerate}
\item If the LP relaxation is tight, then max-product always converges, and
that too to the correct answer. \item If the

In [119]:
len(dataset.data["train"]["title"])

117592

In [113]:
paper = client.collections.create(
    "Paper",
    properties=[
        Property(name="title", data_type=DataType.TEXT),
        Property(name="body", data_type=DataType.TEXT),
    ],
    generative_config=Configure().Generative().ollama(api_endpoint="http://weaviate_ollama_1:11434", model= "llama3"),
    vectorizer_config=Configure().Vectorizer().text2vec_ollama(api_endpoint="http://weaviate_ollama_1:11434")
)

In [143]:
import tqdm

with paper.batch.dynamic() as batch:
    for t, a in tqdm.tqdm(zip(dataset.data["train"]["title"], dataset.data["train"]["abstract"]), total=len(dataset.data["train"]["title"])):
        batch.add_object(
            properties={
        "title": str(t),
        "body": str(a),
    }
        )

100%|██████████| 117592/117592 [3:01:09<00:00, 10.82it/s] 


# Delete

# Vectors

In [3]:
paper = client.collections.get("Paper")

In [4]:
word = client.collections.get("Word")

In [5]:
n_samples = 10000
vs = np.zeros((n_samples, 768))
obj_properties = []
for i, p in enumerate(paper.iterator(include_vector=True)):  # take n_samples from paper dataset
    vs[i] = p.vector["default"]
    obj_properties.append(p.properties)
    if i == n_samples - 1:
        break

In [6]:
n_samples = 10000
wvs = np.zeros((n_samples, 768))
most_common_words = []
for i, w in enumerate(word.iterator(include_vector=True)):  # take n_samples from paper dataset
    wvs[i] = w.vector["default"]
    most_common_words.append(w.properties["word"])
    if i == n_samples - 1:
        break
most_common_words = np.array(most_common_words)

## PCA

In [7]:
from sklearn.decomposition import PCA

In [8]:
pca = PCA(n_components=768)
pca.fit(vs)

## Semantic Meaning

In [9]:
# compute vector norms
most_common_word_norms = np.array([np.linalg.norm(v) for v in wvs])

In [10]:
n_words = 30
most_common_word_norms = np.array([np.linalg.norm(v) for v in wvs])

def meaning_plot(v):
    dot_similarities = v @ wvs.T / most_common_word_norms
    softmax_similarities = np.exp(dot_similarities) / np.sum(np.exp(dot_similarities), axis=0)
    sorted_idx = np.argsort(-softmax_similarities)
    return px.bar(x=most_common_words[sorted_idx][:n_words], y=softmax_similarities[sorted_idx][:n_words])

## KL Divergence

Given the invariant measure $m(x)$ as the distribution of word embeddings (as defined here: https://en.wikipedia.org/wiki/Limiting_density_of_discrete_points), and $p(x)$ as the dataset of papers, what is the information gained about the papers from comparing the distributions: $H(X) = - D_{KL}(P || M)$?

Capture the approximate shape of both the distribution of words and papers with a joint gaussian model, which captures the variances in each dimension and covariances between them, assuming a gaussian relationship.

In [11]:
word_covariance_matrix = np.cov(wvs.T)

In [12]:
# extract only the variance - this is a quick and dirty way to remove many parameters, and "cheat" the distribution into a much broader more general one
word_covariance_matrix = np.diag(np.diagonal(word_covariance_matrix))

In [13]:
paper_covariance_matrix = np.cov(vs.T)

In [14]:
word_inv_cov = np.linalg.inv(word_covariance_matrix)
_, word_log_det_cov = np.linalg.slogdet(word_covariance_matrix)

In [15]:
paper_inv_cov = np.linalg.inv(paper_covariance_matrix)
_, paper_log_det_cov = np.linalg.slogdet(paper_covariance_matrix)

In [18]:
kl_divergence = 0.5 * (
    np.trace(word_inv_cov @ paper_covariance_matrix) + (np.mean(wvs, axis=0) - np.mean(vs, axis=0)).T @ word_inv_cov @ (np.mean(wvs, axis=0) - np.mean(vs, axis=0)) - paper_covariance_matrix.shape[0] + paper_log_det_cov - word_log_det_cov
) * np.log2(np.e)  # compute the total KL Divergence, and convert to bits

In [24]:
print("Total KL Divergence:", kl_divergence, "bits")

Total KL Divergence: -887.4033216100204 bits


In [19]:
def multivariance_log_likelihood(x, inverse, log_determinant):
    k = inverse.shape[0]
    return -0.5 * k * np.log(2 * np.pi) - 0.5 * log_determinant - 0.5 * x.T @ inverse @ x

#### Projected PCA Distribution Plot

In [20]:
N = 50

def plot_pca_pdfs(dims):
    # Project into relevant dimensions:
    projection_matrix = pca.components_[dims]
    projected_mean_paper = projection_matrix @ np.mean(vs, axis=0)
    projected_cov_paper = projection_matrix @ paper_covariance_matrix @ projection_matrix.T
    projected_mean_word = projection_matrix @ np.mean(wvs, axis=0)
    projected_cov_word = projection_matrix @ word_covariance_matrix @ projection_matrix.T
    
    # Find limits
    lims = np.maximum(
        np.max(np.abs(projected_mean_paper) + 2 * np.sqrt(np.diagonal(projected_cov_paper))),
        np.max(np.abs(projected_mean_word) + 2 * np.sqrt(np.diagonal(projected_cov_word))) 
    )

    # Create grid sampling:
    x = np.linspace(-lims, lims, N)
    y = np.linspace(-lims, lims, N)
    cov = projected_cov_paper
    icov = np.linalg.inv(cov)
    _, ldcov = np.linalg.slogdet(cov)
    covw = projected_cov_word
    icovw = np.linalg.inv(covw)
    _, ldcovw = np.linalg.slogdet(covw)
    
    kl_divergence = 0.5 * (
        np.trace(icovw @ cov) + (projected_mean_word - projected_mean_paper).T @ icovw @ (projected_mean_word - projected_mean_paper) - cov.shape[0] + ldcov - ldcovw
    ) * np.log2(np.e)

    # Compute PDFs:
    
    z_paper = np.array([np.exp(multivariance_log_likelihood(np.array(v) - projected_mean_paper, icov, ldcov)) for v in itertools.product(x, y)]).reshape(N, N)
    z_word = np.array([np.exp(multivariance_log_likelihood(np.array(v) - projected_mean_word, icovw, ldcovw)) for v in itertools.product(x, y)]).reshape(N, N)

    # Plot surfaces:
    fig = go.Figure([go.Surface(x=x, y=y, z=z_paper, opacity=0.5, colorscale=[[0, 'red'], [1, 'red']], showscale=False),
               go.Surface(x=x, y=y, z=z_word, opacity=0.5, colorscale=[[0, 'blue'], [1, 'blue']], showscale=False),
              ]).update_layout(height=1000, scene_zaxis_title="Probability Density", scene_zaxis_range=[0, max(min(np.max(z_word) * 3, np.max(z_paper)), np.max(z_word))],
                              title=f"PCA Dimensions: {dims}, Marginal KL Divergence: {kl_divergence} bits")
    
    # Add annotations for a legend-like effect
    fig.add_trace(go.Scatter3d(
        x=[0], y=[0], z=[0], 
        mode='lines',
        marker=dict(size=10, color='blue'),
        showlegend=True,
        name='Neutral Distribution'
    ))
    
    fig.add_trace(go.Scatter3d(
        x=[0], y=[0], z=[0], 
        mode='lines',
        marker=dict(size=10, color='red'),
        showlegend=True,
        name='ML Paper Distribution'
    ))

    return fig

### Compute Projected KL Divergences

In [21]:
def get_projected_kl_divergence(dims):
    # Project into relevant dimensions:
    projection_matrix = pca.components_[dims]
    projected_mean_paper = projection_matrix @ np.mean(vs, axis=0)
    projected_cov_paper = projection_matrix @ paper_covariance_matrix @ projection_matrix.T
    projected_mean_word = projection_matrix @ np.mean(wvs, axis=0)
    projected_cov_word = projection_matrix @ word_covariance_matrix @ projection_matrix.T
    
    # Find limits
    lims = np.maximum(
        np.max(np.abs(projected_mean_paper) + 2 * np.sqrt(np.diagonal(projected_cov_paper))),
        np.max(np.abs(projected_mean_word) + 2 * np.sqrt(np.diagonal(projected_cov_word))) 
    )

    # Create grid sampling:
    x = np.linspace(-lims, lims, N)
    y = np.linspace(-lims, lims, N)
    cov = projected_cov_paper
    icov = np.linalg.inv(cov)
    _, ldcov = np.linalg.slogdet(cov)
    covw = projected_cov_word
    icovw = np.linalg.inv(covw)
    _, ldcovw = np.linalg.slogdet(covw)

    return 0.5 * (
        np.trace(icovw @ cov) + (projected_mean_word - projected_mean_paper).T @ icovw @ (projected_mean_word - projected_mean_paper) - cov.shape[0] + ldcov - ldcovw
    ) * np.log2(np.e)

In [22]:
marginal_kl_divergences = [get_projected_kl_divergence([i]) for i in range(pca.n_components)]

# Flask Front End

In [23]:
from flask import Flask, render_template_string, jsonify, request
import plotly.graph_objs as go
from threading import Thread, Event

app = Flask(__name__)
stop_event = Event()

@app.route('/')
def index():
    return render_template_string('''
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>Plotly in Flask</title>
            <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
            <style>
                .container {
                    display: flex;
                    justify-content: space-around;
                }
                .left-column, .right-column {
                    width: 45%;
                    margin-bottom: 20px;
                }
                .plot-container {
                    width: 100%;
                }
                .plot {
                    width: 100%;
                    margin-bottom: 20px;
                }
                table {
                    width: 100%;
                    border-collapse: collapse;
                    margin-bottom: 20px;
                }
                th, td {
                    border: 1px solid black;
                    padding: 8px;
                    text-align: left;
                }
                th {
                    background-color: #f2f2f2;
                }
            </style>
        </head>
        <body>
            <h1>Embedding Entropy Analysis</h1>
            <div class="container">
                <div class="left-column">
                    <h2>PCA of ML Paper Embeddings</h2>
                    <table>
                        <tr>
                            {% for col in columns %}
                            <th>{{ col }}</th>
                            {% endfor %}
                        </tr>
                        {% for row in table_dataset %}
                        <tr>
                            {% for col in columns %}
                            <td>{{ row[col] }}</td>
                            {% endfor %}
                        </tr>
                        {% endfor %}
                    </table>
                </div>
                <div class="right-column">
                    <div class="plot-container">
                        <div class="plot">
                            <label for="datasetInput1">PCA Dimension x:</label>
                            <input type="text" id="datasetInput1">
                            <button onclick="submitDataset(1)">Plot</button>
                            <div id="plot1"></div>
                        </div>
                        <div class="plot">
                            <label for="datasetInput2">PCA Dimension y:</label>
                            <input type="text" id="datasetInput2">
                            <button onclick="submitDataset(2)">Plot</button>
                            <div id="plot2"></div>
                        </div>
                        <div class="plot">
                            <div id="plot3"></div>
                        </div>
                    </div>
                </div>
            </div>
            <script>
                var graphDiv1 = document.getElementById('plot1');
                var graphDiv2 = document.getElementById('plot2');
                var graphDiv3 = document.getElementById('plot3');
                var hasPlot1 = false;
                var hasPlot2 = false;
                var hasPlot3 = false;

                function fetchPlot(dataset, plotDiv) {
                    fetch(`/plot?dataset=${dataset}`)
                        .then(response => response.json())
                        .then(data => {
                            var parsedData = JSON.parse(data);
                            if (plotDiv === 'plot1') {
                                if (hasPlot1) {
                                    Plotly.react(graphDiv1, parsedData.data, parsedData.layout);
                                } else {
                                    Plotly.newPlot(graphDiv1, parsedData.data, parsedData.layout);
                                    hasPlot1 = true;
                                }
                            } else if (plotDiv === 'plot2') {
                                if (hasPlot2) {
                                    Plotly.react(graphDiv2, parsedData.data, parsedData.layout);
                                } else {
                                    Plotly.newPlot(graphDiv2, parsedData.data, parsedData.layout);
                                    hasPlot2 = true;
                                }
                            }
                        })
                        .catch(error => console.error('Error:', error));
                }

                function fetchPlotDistribution(dataset1, dataset2) {
                    fetch(`/plot_distribution?dataset1=${dataset1}&dataset2=${dataset2}`)
                        .then(response => response.json())
                        .then(data => {
                            var parsedData = JSON.parse(data);
                            if (hasPlot3) {
                                Plotly.react(graphDiv3, parsedData.data, parsedData.layout);
                            } else {
                                Plotly.newPlot(graphDiv3, parsedData.data, parsedData.layout);
                                hasPlot3 = true;
                            }
                        })
                        .catch(error => console.error('Error:', error));
                }

                function submitDataset(dataset) {
                    var datasetValue = parseFloat(document.getElementById(`datasetInput${dataset}`).value) || 0;
                    var plotDiv = `plot${dataset}`;
                    fetchPlot(datasetValue, plotDiv);
                    if (dataset === 1 || dataset === 2) {
                        updateCombinedPlot();
                    }
                }

                function updateCombinedPlot() {
                    var dataset1 = parseFloat(document.getElementById('datasetInput1').value) || 0;
                    var dataset2 = parseFloat(document.getElementById('datasetInput2').value) || 0;
                    fetchPlotDistribution(dataset1, dataset2);
                }

                // Initial plots (optional)
                // fetchPlot(1, 'plot1');
                // fetchPlot(2, 'plot2');
            </script>
        </body>
        </html>
    ''', table_dataset=[{"Dimension": d, "Explained Variance": ev, "KL Divergence": kld} for d, ev, kld in zip(np.arange(pca.n_components), pca.explained_variance_, marginal_kl_divergences)],
         columns=["Dimension", "Explained Variance", "KL Divergence"])

@app.route('/plot')
def plot():
    dataset = request.args.get('dataset', '1')  # Ensure dataset is a string
    fig = meaning_plot(pca.components_[int(dataset)])
    graphJSON = fig.to_json()  # Convert the figure to JSON format
    return jsonify(graphJSON)

@app.route('/plot_distribution')
def plot_distribution():
    pca_dim_1 = float(request.args.get('dataset1', '0'))
    pca_dim_2 = float(request.args.get('dataset2', '0'))
    fig = plot_pca_pdfs([int(pca_dim_1), int(pca_dim_2)])
    graphJSON = fig.to_json()  # Convert the figure to JSON format
    return jsonify(graphJSON)

@app.route('/stop')
def stop():
    stop_event.set()
    return 'Server stopping...'

def run_app():
    app.run(port=5000, use_reloader=False)

run_app()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [22/Jun/2024 17:13:49] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [22/Jun/2024 17:13:54] "GET /plot_distribution?dataset1=5&dataset2=0 HTTP/1.1" 200 -
127.0.0.1 - - [22/Jun/2024 17:13:54] "GET /plot?dataset=5 HTTP/1.1" 200 -
127.0.0.1 - - [22/Jun/2024 17:13:58] "GET /plot?dataset=8 HTTP/1.1" 200 -
127.0.0.1 - - [22/Jun/2024 17:13:58] "GET /plot_distribution?dataset1=5&dataset2=8 HTTP/1.1" 200 -
127.0.0.1 - - [22/Jun/2024 17:15:00] "GET /plot?dataset=550 HTTP/1.1" 200 -
127.0.0.1 - - [22/Jun/2024 17:15:00] "GET /plot_distribution?dataset1=550&dataset2=500 HTTP/1.1" 200 -
127.0.0.1 - - [22/Jun/2024 17:15:01] "GET /plot?dataset=500 HTTP/1.1" 200 -
127.0.0.1 - - [22/Jun/2024 17:15:01] "GET /plot_distribution?dataset1=550&dataset2=500 HTTP/1.1" 200 -
