# Semantic Categories

Pass a list of labeled text segments to the LLM and ask it to return, for each label, a semantic category and a short description of the segment.

In [8]:
import { SystemMessage, HumanMessage } from "@langchain/core/messages";
import { ChatPromptTemplate, MessagesPlaceholder } from "@langchain/core/prompts";
import { JsonOutputParser } from "@langchain/core/output_parsers";
import { readFileSync } from 'node:fs';

import { EXPERIMENTS_DIR, SERVER_DATA_DIR } from '../server/src/util/fileUtils.ts';
import { getNotebookLogger } from '../server/src/Logger.ts';
import { newModel } from '../server/src/agents/agent.ts';

/**
 * Splits `text` into segments separated by blank lines (`'\n\n'`) and records
 * where each segment sits in the original string.
 *
 * When a run of newlines has odd length, the piece after the separator starts
 * with one leftover `'\n'`; that newline is dropped and `start` is advanced so
 * that `text.substring(start, stop) === segment` always holds. Any other
 * newline inside a piece (including a single trailing one) is kept as-is.
 *
 * Pieces that are empty, or that consist only of that leftover newline, are
 * skipped, so no empty segment is ever returned.
 *
 * @param text - The full text to split.
 * @returns The non-empty segments in document order, each with its `start`
 *   (inclusive) and `stop` (exclusive) string index into `text`.
 */
function splitText(text: string): Array<{ segment: string; start: number; stop: number }> {
  const separator = '\n\n';
  const segindices: Array<{ segment: string; start: number; stop: number }> = [];
  let offset = 0;
  for (const piece of text.split(separator)) {
    const stop = offset + piece.length;
    // Strip the single leftover newline of an odd-length newline run.
    const skip = piece.startsWith('\n') ? 1 : 0;
    const segment = skip === 0 ? piece : piece.slice(skip);
    // Test emptiness AFTER stripping so a lone '\n' piece is not emitted as ''.
    if (segment !== '') {
      segindices.push({ segment, start: offset + skip, stop });
    }
    // The separator itself is not part of any piece; step over it.
    offset = stop + separator.length;
  }
  return segindices;
}

// Pipeline: a prompt made solely of the caller-supplied "messages" list,
// piped into the model, whose reply is then parsed as JSON.
const prompt = ChatPromptTemplate.fromMessages([
  new MessagesPlaceholder("messages"),
]);
const llm = newModel("Anthropic");
const parser = new JsonOutputParser();
const chain = prompt.pipe(llm).pipe(parser);

const logger = getNotebookLogger();

// Inputs: the text to annotate and the system prompt for the model.
const lhsText = readFileSync(`${SERVER_DATA_DIR}/AES-md/selected-text.txt`, 'utf-8');
const PROMPT = readFileSync(`${EXPERIMENTS_DIR}/annotateNodePromptCategories8.txt`, 'utf-8');

// Give every segment a label of the form "<index>:<start>-<stop>" so each
// entry in the model's reply can be traced back to its span in lhsText.
const lhsSegIndices = splitText(lhsText);
const lhsSegments = lhsSegIndices.map(({ segment, start, stop }, i) => ({
  text: segment,
  label: `${i}:${start}-${stop}`,
}));

// The labeled segments are sent to the model as pretty-printed JSON.
const userInput = JSON.stringify(lhsSegments, null, 2);
const output = await chain.invoke({
  messages: [new SystemMessage(PROMPT), new HumanMessage(userInput)],
});
logger.info(output);

[
  {
    label: "0:0-29",
    description: "Section header for algorithm specifications",
    category: "Navigation"
  },
  {
    label: "1:31-161",
    description: "Introduction to the general functions CIPHER() and INVCIPHER() for AES algorithms",
    category: "Definition"
  },
  {
    label: "2:163-361",
    description: "Footnote explaining neutral terminology choice for cipher functions",
    category: "Elaboration"
  },
  {
    label: "3:363-651",
    description: "Explanation of rounds and round keys in AES algorithms",
    category: "Definition"
  },
  {
    label: "4:653-964",
    description: "Definition of the KEYEXPANSION() function that generates round keys",
    category: "Definition"
  },
  {
    label: "5:966-1560",
    description: "Explanation of differences between AES-128, AES-192, and AES-256 variants",
    category: "Definition"
  },
  {
    label: "6:1562-1666",
    description: "Cross-reference to implementation issues in another section",
    category: "Navi

In [9]:
// Print the complete `output` value as JSON with 2-space indentation.
console.log(JSON.stringify(output, null, 2));

[
  {
    "label": "0:0-29",
    "description": "Section header for algorithm specifications",
    "category": "Navigation"
  },
  {
    "label": "1:31-161",
    "description": "Introduction to the general functions CIPHER() and INVCIPHER() for AES algorithms",
    "category": "Definition"
  },
  {
    "label": "2:163-361",
    "description": "Footnote explaining neutral terminology choice for cipher functions",
    "category": "Elaboration"
  },
  {
    "label": "3:363-651",
    "description": "Explanation of rounds and round keys in AES algorithms",
    "category": "Definition"
  },
  {
    "label": "4:653-964",
    "description": "Definition of the KEYEXPANSION() function that generates round keys",
    "category": "Definition"
  },
  {
    "label": "5:966-1560",
    "description": "Explanation of differences between AES-128, AES-192, and AES-256 variants",
    "category": "Definition"
  },
  {
    "label": "6:1562-1666",
    "description": "Cross-reference to implementation issues 

In [10]:
// Sanity check: every input segment label must appear exactly once in the
// LLM output, and the output must not contain labels that were never sent.
console.log("SEGMENTS LENGTH", lhsSegments.length);
console.log("LLM OUTPUT LENGTH", output.length);

// Occurrence count per input label. The object has no prototype, so a label
// such as "constructor" or "toString" coming back from the LLM cannot be
// mistaken for a known key through an inherited property.
const counter: Record<string, number> = Object.create(null);
for (const { label } of lhsSegments) {
  counter[label] = 0;
}
for (const { label } of output) {
  const seen = counter[label];
  if (seen === undefined) {
    // The LLM returned a label that was not part of the input.
    console.log("ERROR: LABEL NOT FOUND IN LHS TEXT", label);
    continue;
  }
  if (seen !== 0) {
    console.log("ERROR: LABEL ALREADY EXISTS IN OUTPUT", label, seen);
  }
  counter[label] = seen + 1;
}
for (const { label } of lhsSegments) {
  // Duplicates were already reported above; only report labels the LLM
  // dropped entirely, so the "NOT FOUND" message is never shown for a label
  // that is in fact present.
  if (counter[label] === 0) {
    console.log("ERROR: LABEL NOT FOUND IN OUTPUT", label, counter[label]);
  }
}
console.log("DONE: If no errors above, then everything is ok.");

SEGMENTS LENGTH 62
LLM OUTPUT LENGTH 62
DONE: If no errors above, then everything is ok.


In [11]:
// Build one annotation per LLM result: recover the character span from the
// label, cut the matching text out of lhsText, and print the result.
const annotations = [];
for (const { label, description, category } of output) {
  // Labels have the form "<index>:<start>-<stop>"; only the two offsets are
  // used here.
  const match = typeof label === "string" ? label.match(/(\d+):(\d+)-(\d+)/) : null;
  if (match === null) {
    // Fail with a message that names the offending label instead of the
    // opaque TypeError that indexing a null match would raise.
    throw new Error(`Malformed segment label in LLM output: ${JSON.stringify(label)}`);
  }
  const start = Number.parseInt(match[2], 10);
  const stop = Number.parseInt(match[3], 10);
  const text = lhsText.substring(start, stop);
  annotations.push({ start, stop, category, description, text });
  console.log("LABEL", label);
  console.log("CATEGORY", category);
  console.log("DESCRIPTION", description);
  console.log("TEXT", text);
  console.log("");
}

LABEL 0:0-29
CATEGORY Navigation
DESCRIPTION Section header for algorithm specifications
TEXT # 5. Algorithm Specifications

LABEL 1:31-161
CATEGORY Definition
DESCRIPTION Introduction to the general functions CIPHER() and INVCIPHER() for AES algorithms
TEXT The general function for executing AES-128, AES-192, or AES-256 is denoted by CIPHER(); its inverse is denoted by INVCIPHER().[^1]

LABEL 2:163-361
CATEGORY Elaboration
DESCRIPTION Footnote explaining neutral terminology choice for cipher functions
TEXT [^1]: Informally, these functions are sometimes called "encryption" and "decryption," but neutral terminology is appropriate because there are other applications of block ciphers besides encryption.

LABEL 3:363-651
CATEGORY Definition
DESCRIPTION Explanation of rounds and round keys in AES algorithms
TEXT The core of the algorithms for CIPHER() and INVCIPHER() is a sequence of fixed transformations of the state called a *round*. Each round requires an additional input called the *r

In [12]:
// Print the complete `annotations` value as JSON with 2-space indentation.
console.log(JSON.stringify(annotations, null, 2));

[
  {
    "start": 0,
    "stop": 29,
    "category": "Navigation",
    "description": "Section header for algorithm specifications",
    "text": "# 5. Algorithm Specifications"
  },
  {
    "start": 31,
    "stop": 161,
    "category": "Definition",
    "description": "Introduction to the general functions CIPHER() and INVCIPHER() for AES algorithms",
    "text": "The general function for executing AES-128, AES-192, or AES-256 is denoted by CIPHER(); its inverse is denoted by INVCIPHER().[^1]"
  },
  {
    "start": 163,
    "stop": 361,
    "category": "Elaboration",
    "description": "Footnote explaining neutral terminology choice for cipher functions",
    "text": "[^1]: Informally, these functions are sometimes called \"encryption\" and \"decryption,\" but neutral terminology is appropriate because there are other applications of block ciphers besides encryption."
  },
  {
    "start": 363,
    "stop": 651,
    "category": "Definition",
    "description": "Explanation of rounds an