Commit

some refactoring and kickoff comparePrompts
andrefs committed Mar 6, 2024
1 parent ee6f0b6 commit f8ef107
Showing 9 changed files with 262 additions and 85 deletions.
59 changes: 45 additions & 14 deletions src/lib/dataset-adapters/collection.ts
@@ -1,35 +1,66 @@
import * as ds from "punuy-datasets";
import { DatasetProfile } from "../types";

interface DatasetColPairs {
export interface MultiDatasetScores {
[w1: string]: {
[w2: string]: {
[dataset: string]: number[];
[dataset: string]: number;
};
};
}

const col = {} as DatasetColPairs;
export interface DatasetScores {
[w1: string]: {
[w2: string]: number;
};
}

for (const dataset in ds) {
const d = ds[dataset as keyof typeof ds];
export async function loadDatasetScores(dsId: string) {
const d = (await import(`punuy-datasets/datasets/${dsId}`)) as DatasetProfile;
const res = {} as DatasetScores;
for (const part of d.partitions) {
for (const row of part.data) {
const w1 = row.term1.toLowerCase();
const w2 = row.term2.toLowerCase();

col[w1] = col[w1] || {};
col[w1][w2] = col[w1][w2] || {};
if ("value" in row && row.value !== undefined) {
col[w1][w2][dataset] = [row.value];
res[w1] = res[w1] || {};
res[w1][w2] = row.value;
continue;
}
if ("values" in row && Array.isArray(row.values)) {
col[w1][w2][dataset] = row.values.filter(
v => v !== undefined && v !== null
) as number[];
const vals = row.values.filter(v => typeof v === "number") as number[];
res[w1] = res[w1] || {};
res[w1][w2] = vals.reduce((a, b) => a + b, 0) / vals.length;
}
}
}
return res;
}

export default col;
export async function loadAllDatasetScores() {
const ds = (await import("punuy-datasets")).default;
const res = {} as MultiDatasetScores;

for (const dataset in ds) {
const d = ds[dataset as keyof typeof ds];
for (const part of d.partitions) {
for (const row of part.data) {
const w1 = row.term1.toLowerCase();
const w2 = row.term2.toLowerCase();

res[w1] = res[w1] || {};
res[w1][w2] = res[w1][w2] || {};
if ("value" in row && row.value !== undefined) {
res[w1][w2][dataset] = row.value;
continue;
}
if ("values" in row && Array.isArray(row.values)) {
const vals = row.values.filter(
v => typeof v === "number"
) as number[];
res[w1][w2][dataset] = vals.reduce((a, b) => a + b, 0) / vals.length;
}
}
}
}
return res;
}
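
For context, here is a minimal usage sketch of the two loaders introduced above. The calling module, import path, and example lookups are illustrative and not part of this commit:

// usage-sketch.ts (hypothetical caller, not part of this commit)
import {
  loadDatasetScores,
  loadAllDatasetScores,
  DatasetScores,
  MultiDatasetScores,
} from "./lib/dataset-adapters/collection";

async function main() {
  // Per-dataset scores: scores[w1][w2] is a single (possibly averaged) value.
  const ws353: DatasetScores = await loadDatasetScores("ws353");
  console.log(ws353["tiger"]?.["cat"]);

  // All datasets merged: scores[w1][w2][datasetId] keeps one value per dataset.
  const all: MultiDatasetScores = await loadAllDatasetScores();
  console.log(Object.keys(all).length, "distinct first terms");
}

main().catch(console.error);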
24 changes: 12 additions & 12 deletions src/lib/experiments/compare-mc30.ts
@@ -9,21 +9,14 @@ import oldFs from "fs";
import { Model, ModelIds, gpt35turbo, gpt4, gpt4turbo } from "../models";
import { JsonSyntaxError } from "../validation";
import logger from "../logger";

interface DatasetScores {
[term1: string]: {
[term2: string]: {
[dataset: string]: number;
};
};
}
import { MultiDatasetScores } from "../dataset-adapters/collection";

type ModelsResults = {
[key in ModelIds]: string[];
};

export const loadDatasetScores = async () => {
const pairs: DatasetScores = {};
const pairs: MultiDatasetScores = {};

for (const part of mc30.partitions) {
for (const entry of part.data) {
@@ -117,7 +110,7 @@ export const loadDatasetScores = async () => {
return pairs;
};

const getPairs = (scores: DatasetScores) => {
const getPairs = (scores: MultiDatasetScores) => {
const pairs: [string, string][] = [];

for (const term1 in scores) {
@@ -154,6 +147,7 @@ const resultSchema = {
},
};

/** Run a single trial of the experiment, with a single model */
async function runTrialModel(model: Model, prompt: string) {
const f = {
name: "evaluate_scores",
@@ -165,6 +159,7 @@ async function runTrialModel(model: Model, prompt: string) {
return res;
}

/** Run multiple trials of the experiment, with a single model */
async function runTrialsModel(trials: number, model: Model, prompt: string) {
logger.info(` model ${model.modelId}.`);
logger.debug(`Prompt: ${prompt}`);
@@ -181,6 +176,7 @@ async function runTrialsModel(trials: number, model: Model, prompt: string) {
return results;
}

/** Run multiple trials of the experiment, with multiple models */
async function runTrials(trials: number) {
const scores = await loadDatasetScores();
const pairs = getPairs(scores);
@@ -247,7 +243,7 @@ function unzipResults(results: MC30Results) {

async function validate(
modelsRes: ModelsResults,
humanScores: DatasetScores,
humanScores: MultiDatasetScores,
trials: number
) {
try {
@@ -366,7 +362,11 @@ function calcCorrelation(data: number[][]) {
return corrMatrix;
}

function mergeResults(modelsRes: ModelsResults, humanScores: DatasetScores) {
/** Merge the results from the models and the human scores */
function mergeResults(
modelsRes: ModelsResults,
humanScores: MultiDatasetScores
) {
const res = {} as MC30Results;

try {
127 changes: 127 additions & 0 deletions src/lib/experiments/comparePrompts.ts
@@ -0,0 +1,127 @@
import { ws353 } from "punuy-datasets";
import { Model, gpt4, gpt4turbo, gpt35turbo } from "../models";
import logger from "../logger";
import {
DatasetScores,
loadDatasetScores,
} from "../dataset-adapters/collection";
import { ExperimentData } from ".";
const name = "compare-prompts";
const description = "Compare the results obtained with different prompts";

interface Prompts {
[key: string]: {
type: "relatedness" | "similarity";
text: string;
};
}
const prompts: Prompts = {
simplest: {
type: "relatedness",
text: "Indicate how strongly the words in each pair are related in meaning using integers from 1 to 5, where 1 means very unrelated and 5 means very related.",
},
simpleScale: {
type: "relatedness",
text: "Indicate how strongly the words in each pair are related in meaning using integers from 1 to 5, where the scale means: 1 - not at all related, 2 - vaguely related, 3 - indirectly related, 4 - strongly related, 5 - inseparably related.",
},
adaptedWs353: {
type: "relatedness",
text: 'Hello, we kindly ask you to assist us in a psycholinguistic experiment, aimed at estimating the semantic relatedness of various words in the English language. The purpose of this experiment is to assign semantic relatedness scores to pairs of words, so that machine learning algorithms can be subsequently trained and adjusted using human-assigned scores. Below is a list of pairs of words. For each pair, please assign a numerical relatedness score between 1 and 5 (1 = words are totally unrelated, 5 = words are VERY closely related). By definition, the relatedness of the word to itself should be 5. You may assign fractional scores (for example, 3.5). When estimating relatedness of antonyms, consider them "related" (i.e., belonging to the same domain or representing features of the same concept), rather than "unrelated". Thank you for your assistance!',
},
simlex999: {
type: "similarity",
text: "Two words are synonyms if they have very similar meanings. Synonyms represent the same type or category of thing. Here are some examples of synonym pairs: cup/mug, glasses/spectacles, envy/jealousy. In practice, word pairs that are not exactly synonymous may still be very similar. Here are some very similar pairs - we could say they are nearly synonyms: alligator/crocodile, love / affection, frog/toad. In contrast, although the following word pairs are related, they are not very similar. The words represent entirely different types of thing:car/tyre, car/motorway, car/crash, In this survey, you are asked to compare word pairs and to rate how similar they are by moving a slider. Remember, things that are related are not necessarily similar. If you are ever unsure, think back to the examples of synonymous pairs (glasses/spectacles), and consider how close the words are (or are not) to being synonymous. There is no right answer to these questions. It is perfectly reasonable to use your intuition or gut feeling as a native English speaker, especially when you are asked to rate word pairs that you think are not similar at all.",
},
};

const models = {
gpt35turbo,
gpt4,
gpt4turbo,
};

const resultSchema = {
type: "object",
properties: {
scores: {
type: "array",
items: {
type: "object",
properties: {
words: { type: "array", items: { type: "string" } },
score: { type: "string" },
},
},
},
},
};
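// For reference, a response that validates against resultSchema above would look
// roughly like this (word pairs and values are illustrative, not part of this commit;
// note the schema declares "score" as a string):
const exampleResponse = {
  scores: [
    { words: ["tiger", "cat"], score: "4" },
    { words: ["car", "tyre"], score: "2" },
  ],
};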

async function runTrialModel(model: Model, dsId: string, promptId: string) {
const f = {
name: "evaluate_scores",
description: "Evaluate the word similarity or relatedness scores",
parameters: resultSchema,
};
const res = await model.makeRequest(prompts[promptId].text, { function: f });
return res;
}

async function runTrialsModel(
trials: number,
model: Model,
dsId: string,
promptId: string
) {
const results = [];
logger.info(` model ${model.modelId}.`);
logger.debug(`Prompt ID: ${promptId}`);

for (let i = 0; i < trials; i++) {
logger.info(` trial #${i + 1} of ${trials}`);
const res = await runTrialModel(model, dsId, prompts[promptId].text);
results.push(
res.type === "openai"
? res.data.choices[0].message.tool_calls?.[0].function.arguments || ""
: ""
);
}
return results;
}

async function runTrials(trials: number) {
const datasetIds = ["ws353", "simlex999"];
const datasets: { [key: string]: DatasetScores } = {};
for (const dsId of datasetIds) {
datasets[dsId] = await loadDatasetScores(dsId);
}

logger.info(
`Running experiment ${name} with ${trials} trials on models [gpt35turbo, gpt4, gpt4turbo], datasets ${datasetIds} and prompts ${Object.keys(
prompts
)}.`
);

for (const modelId in models) {
for (const promptId in prompts) {
for (const dsId in datasets) {
const results = await runTrialsModel(
trials,
models[modelId as keyof typeof models],
dsId,
promptId
);
const res: ExperimentData = {
variables: {
modelId,
promptId,
dsId: "",
},
results: {
trial: results,

Check failure on line 120 in src/lib/experiments/comparePrompts.ts (GitHub Actions / build, 18.x and 20.x):
Object literal may only specify known properties, and 'trial' does not exist in type '{ raw: string[]; validation: ValidationResult[]; aggregated: AggregatedValidationResult; }'.
},
};
logger.info(`Results: ${JSON.stringify(res)}`);
}
}
}
}
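
The CI failure above says ExperimentData.results must have the shape { raw: string[]; validation: ValidationResult[]; aggregated: AggregatedValidationResult; }. A possible fix is sketched below; it assumes the collected trial strings belong under raw, reuses the modelId, promptId, dsId and results variables already in scope inside runTrials (passing the loop variable dsId rather than the empty string used in the commit, which may or may not be the intent), and leaves the validation fields as placeholders (ValidationResult and AggregatedValidationResult would also need to be imported). This is a sketch based only on the reported type, not the author's fix:

// Sketch: shape the results object to the type reported by the CI error.
const res: ExperimentData = {
  variables: {
    modelId,
    promptId,
    dsId,
  },
  results: {
    raw: results,                                 // raw trial outputs collected above
    validation: [] as ValidationResult[],         // placeholder, to be computed
    aggregated: {} as AggregatedValidationResult, // placeholder, to be computed
  },
};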
(Diffs for the remaining 6 of the 9 changed files are not shown.)
