# Nearest neighbors graph

Anton Antonov   
September 2024

-----

## Setup

In [1]:
use Data::Importers;
use LLM::Functions;
use XDG::BaseDirectory :terms;

use LLM::RetrievalAugmentedGeneration;
use LLM::RetrievalAugmentedGeneration::VectorDatabase;

use Data::Reshapers;
use Data::Summarizers;
use Math::Nearest;
use Math::DistanceFunctions::Native;
use Statistics::OutlierIdentifiers;

use NativeCall;

use Math::Nearest;
use Graph;
use JavaScript::D3;

### JavaScript

Here we prepare the notebook to visualize with JavaScript:

In [None]:
#% javascript
// Tell the notebook's module loader (`require`) where the 'd3' module lives.
require.config({
     paths: {
     d3: 'https://d3js.org/d3.v7.min'
}});

// Load the 'd3' module and log it to the browser console, so a successful
// load can be confirmed there.
require(['d3'], function(d3) {
     console.log(d3);
});

Verification:

In [None]:
#% js
# Smoke test of the JavaScript plotting setup: a line plot of 40 values,
# each produced by a separate evaluation of `10.rand`.
js-d3-list-line-plot(10.rand xx 40, background => 'none', stroke-width => 2)

Here we set a collection of visualization variables:

In [None]:
# Colors shared by the plots in this notebook.
my $title-color              = 'Ivory';
my $stroke-color             = 'SlateGray';
my $tooltip-color            = 'LightBlue';
my $tooltip-background-color = 'none';
my $background               = '1F1F1F';
my $color-scheme             = 'schemeTableau10';

# Default sizes for graph edges and vertices.
my $edge-thickness = 3;
my $vertex-size    = 6;

# Theme preamble text (theme 'forest', with 'lineColor' set to 'Ivory').
# NOTE(review): the name suggests this is prepended to Mermaid diagrams — confirm.
my $mmd-theme = q:to/END/;
%%{
  init: {
    'theme': 'forest',
    'themeVariables': {
      'lineColor': 'Ivory'
    }
  }
}%%
END

# Two presets of force-layout parameters, keyed by force name.
my %force =
    collision => {iterations => 0, radius => 10},
    link      => {distance => 180};

my %force2 =
    charge    => {strength => -30, iterations => 4},
    collision => {radius => 50, iterations => 4},
    link      => {distance => 30};

-------

## Small graph

Here is a set of words:

In [None]:
# Small vocabulary that is indexed by the semantic search index below.
# NOTE(review): 'ardvark' looks like a misspelling of 'aardvark'; it is left untouched
# because changing the string would change the embedding computed for it — confirm intent.
my @content = <apple ardvark bible car cat cherry chocolate cookie cow devil film horse house movie projector raccoon tiger tree>;

Here we specify LLM-access configuration:

In [None]:
# LLM access configuration for 'Gemini'; passed as `e => $conf` to the
# index-creation and embedding calls in this notebook.
my $conf = llm-configuration('Gemini');
# Show the number of entries in the configuration's Hash form.
$conf.Hash.elems

Here we create a semantic search index:

In [None]:
# Build the semantic search index over @content, passing $conf as the `e` option
# and naming the result 'words'.
# NOTE(review): presumably `e` selects the LLM evaluator used for the embeddings — confirm.
my $vdbObjSmall = create-semantic-search-index(@content, e => $conf, name => 'words')

Here we see the dimensions of the obtained vectors:

In [None]:
# Number of elements of each stored vector, obtained with a hyper method call.
# The hyper operator is written in its ASCII form `>>` (equivalent to `»`) so that
# it cannot be corrupted when the file is re-encoded.
$vdbObjSmall.vectors>>.elems

Here we find the embedding of a certain word (using the same LLM model as above):

In [None]:
# Embed the query word using the same configuration ($conf) that was given to the
# index; `.head` takes the first element of what `llm-embedding` returns.
my $vec = llm-embedding("coffee", e => $conf).head;
# Number of elements of the query vector.
$vec.elems

Here we find the three nearest neighbors (NNs) of that word:

In [None]:
# Query the index with $vec, requesting the 'label' property of the results;
# each result is turned into a Slip so that @nns is one flat list.
# NOTE(review): the argument 3 is presumably the number of neighbors to return — confirm.
my @nns = $vdbObjSmall.nearest($vec, 3, prop => 'label' ).map(*.Slip)

Here are the corresponding words:

In [None]:
# Slice the index's items with the values in @nns used as keys.
$vdbObjSmall.items{@nns}

Here we build the corresponding nearest neighbor graphs with 1 and 2 NNs per vertex:

In [None]:
# Build two nearest-neighbor graphs over the small index, one for each value (1 and 2)
# passed as the second argument of `nearest-neighbor-graph`.
my ($gr1, $gr2) = (1, 2).map(-> $nns-count {
        # Edge records with <from> and <to> fields holding vector IDs
        my @edges = nearest-neighbor-graph(
            $vdbObjSmall.vectors.pairs,
            $nns-count,
            method            => 'Scan',
            distance-function => &euclidean-distance,
            format            => 'dataset'
        );

        # Swap the vector IDs at both ends of every edge for the indexed items (in place)
        for @edges -> $edge {
            $edge<from> = $vdbObjSmall.items{$edge<from>};
            $edge<to>   = $vdbObjSmall.items{$edge<to>};
        }

        # The graph over the relabeled edges is the value of the block
        Graph.new(@edges)
}).flat


Here we find the connected components of the 1-NN graph:

In [None]:
# Connected components of $gr1 (the graph built with 1 neighbor per vertex).
my @comps = $gr1.connected-components

In [None]:
#%js

# Plot $gr1 from its edge dataset. The highlighted set consists of the vertices of
# the first connected component plus the edges of `$gr1.subgraph(@comps.head)`.
# The force parameters are given inline instead of using the %force presets.
$gr1.edges(:dataset)
==> js-d3-graph-plot(
        :$background,
        highlight => [|@comps.head, |$gr1.subgraph(@comps.head).edges],
        width => 600,
        vertex-label-color => 'Ivory',
        edge-thickness => 2,
        vertex-size => 3,
        vertex-color => 'Blue',
        edge-color => 'SteelBlue',
        force => { charge => {strength => -200, iterations => 4}, collision => {iterations => 1, radius => 10} }
    )

------

## Ingest vector database

In [None]:
# New vector database object created without arguments; its content is loaded
# by the `$vdbObj.import` call in a later cell.
my $vdbObj = LLM::RetrievalAugmentedGeneration::VectorDatabase.new();

In [None]:
# Among the entries returned by `vector-database-objects`, pick the first one whose
# `id` equals the given UUID, take its `file` value, and feed it to `$vdbObj.import`.
# NOTE(review): the UUID is hard-coded; if no entry matches, `.head` yields no entry
# and the import is attempted without a usable file — confirm the ID exists locally.
vector-database-objects(f=>'hash').grep({ $_<id> eq 'd2effebc-2cef-4b2b-84ca-5dcfa3c1864b'}).head<file>
==> { $vdbObj.import($_) }()

In [None]:
# Number of elements of the first stored vector of the imported database.
$vdbObj.vectors.values.head.elems

-----

## Nearest neighbor graph

In [None]:
# Nearest-neighbor graph edges over all vectors of the imported database,
# with 1 as the neighbors argument and the result requested in 'raku' format.
my @edges = nearest-neighbor-graph(
    $vdbObj.vectors.pairs,
    1,
    method            => 'Scan',
    distance-function => &euclidean-distance,
    format            => 'raku'
)

In [None]:
# Graph object built from the nearest-neighbor edges in @edges.
my $gr = Graph.new(@edges)

In [None]:
# Connected components of $gr, ordered from the largest to the smallest
# (the sort key is the negated element count).
my @comps = $gr.connected-components.sort(-> $component { -$component.elems });
# Print the first 12 components, one per line.
say $_ for @comps.head(12)

In [None]:
#% markdown

# Show the item stored under the key '119.0'; the cell magic renders the result
# as Markdown.
$vdbObj.items<119.0>

In [None]:
#%js

# Plot the nearest-neighbor graph of the imported database from @edges.
# Each of the first six components in @comps (sorted by decreasing size) becomes
# its own highlight group: its vertices plus the edges of the matching subgraph.
# `vertex-color` is specified exactly once: a repeated named argument is silently
# overridden by its last occurrence, so only the effective value 'Ivory' is kept.
@edges 
==> js-d3-graph-plot(
        :$background,
        highlight => @comps.head(6).map({ [|$_, |$gr.subgraph($_).edges] }),
        width => 1200,
        vertex-label-color => 'none',
        vertex-color => 'Ivory',
        vertex-size => 3,
        edge-color => 'Gray',
        edge-thickness => 2,
    )