Skip to content

Commit d61512a

Browse files
lukstafi and derekchiang committed
A "names" dataset with a bigram use-case helper, sites setup by Claude Sonnet
Co-authored-by: Derek Chiang <me@derekchiang.com>
1 parent f4a9bfc commit d61512a

File tree

7 files changed

+32107
-1
lines changed

7 files changed

+32107
-1
lines changed

arrayjit.opam

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,12 @@ build: [
5252
name
5353
"-j"
5454
jobs
55+
"--promote-install-files=false"
5556
"@install"
5657
"@runtest" {with-test}
5758
"@doc" {with-doc}
5859
]
60+
["dune" "install" "-p" name "--create-install-files" name]
5961
]
6062
dev-repo: "git+https://github.com/lukstafi/ocannl.git"
6163
x-maintenance-intent: ["(latest)"]

bin/dune

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,10 @@
103103
(libraries ocannl)
104104
(preprocess
105105
(pps ppx_here ppx_ocannl)))
106+
107+
; Executable `bigram`, built from the single module `bigram`. It links against
; `ocannl` and the `datasets` library and is preprocessed with the `ppx_here`
; and `ppx_ocannl` rewriters.
(executable
108+
(name bigram)
109+
(modules bigram)
110+
(libraries ocannl datasets)
111+
(preprocess
112+
(pps ppx_here ppx_ocannl)))

datasets/dune

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,13 @@
33
(library
44
(name datasets)
55
(public_name neural_nets_lib.datasets)
6-
(libraries unix zip curl csv bigarray))
6+
(libraries unix zip curl csv bigarray dune-site base stdio))
7+
8+
; Installs `names.txt` (under the same name) into the `data` site of the
; `neural_nets_lib` package.
(install
9+
(package neural_nets_lib)
10+
(section (site (neural_nets_lib data)))
11+
(files (names.txt as names.txt)))
12+
13+
; Generates the `dataset_sites` module for the sites of the `neural_nets_lib`
; package; names.ml reads `Dataset_sites.Sites.data` from it.
(generate_sites_module
14+
(module dataset_sites)
15+
(sites neural_nets_lib))

datasets/names.ml

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
open Base
2+
open Stdio
3+
4+
(** [read_names ()] returns the lines of [names.txt]. The file is looked up in the directories
    listed in [Dataset_sites.Sites.data], in order, and the first directory that contains it wins.

    Raises [Failure] when none of the listed directories contains the file. *)
let read_names () =
  let names_file = "names.txt" in
  (* [Some path] when [dir] holds the file, [None] otherwise. *)
  let candidate dir =
    let path = Stdlib.Filename.concat dir names_file in
    if Stdlib.Sys.file_exists path then Some path else None
  in
  let search_dirs : string list = Dataset_sites.Sites.data in
  match List.find_map search_dirs ~f:candidate with
  | Some path -> In_channel.read_lines path
  | None -> failwith (Printf.sprintf "Could not find %s in any data location" names_file)
17+
18+
(** [bigrams s] lists the pairs of adjacent characters of [s] after framing it with a ['.'] on
    both sides: [('.', s.[0])], then each consecutive pair of [s], then [(last, '.')]. A string of
    length [n] yields [n + 1] pairs; the empty string yields [[('.', '.')]]. *)
let bigrams s =
  let padded = "." ^ s ^ "." in
  (* Pair number [i] is made of the characters at positions [i] and [i + 1] of [padded]. *)
  List.init (String.length s + 1) ~f:(fun i -> (padded.[i], padded.[i + 1]))
23+
24+
(** [get_all_bigrams ()] reads all names with [read_names] and concatenates the [bigrams] of each
    of them, in file order, into a single list. *)
let get_all_bigrams () = List.concat_map (read_names ()) ~f:bigrams
25+
(** The 26 lowercase ASCII letters, from ['a'] to ['z']. *)
let letters = String.to_list "abcdefghijklmnopqrstuvwxyz"
26+
27+
(* Round the number of tokens up to 28 so it's divisible by 4 as we are using the bit-efficient
28+
random number generator. *)
29+
(* TODO: double check if this is necessary. *)
30+
let letters_with_dot = '.' :: ' ' :: letters
31+
32+
(** Hash table mapping each character of [letters_with_dot] to its position in that list. *)
let char_to_index_tbl =
  let table = Hashtbl.create (module Char) in
  letters_with_dot
  |> List.mapi ~f:(fun index ch -> (ch, index))
  |> List.iter ~f:(fun (key, data) -> Hashtbl.set table ~key ~data);
  table
36+
37+
(** [char_index c] is the index stored for [c] in [char_to_index_tbl].

    Raises [Failure] when [c] has no entry in the table. *)
let char_index c =
  Hashtbl.find_and_call char_to_index_tbl c ~if_found:Fn.id ~if_not_found:(fun missing ->
      failwith (Printf.sprintf "Character not found: %c" missing))
41+
42+
(** [bigrams_to_indices bigrams] replaces both characters of every pair by their [char_index].
    Raises [Failure] (from [char_index]) on a character that has no index. *)
let bigrams_to_indices bigrams =
  List.map bigrams ~f:(fun (first, second) -> (char_index first, char_index second))
43+
(** Number of tokens in [letters_with_dot]. *)
let dict_size = letters_with_dot |> List.length
44+
45+
(** [char_to_one_hot c] is a fresh float array of length [dict_size] filled with [0.], except for
    a [1.] at position [char_index c].

    Raises [Failure] (from [char_index]) when [c] has no index. *)
let char_to_one_hot c =
  (* Resolve the index first so that an unknown character fails before any allocation. *)
  let hot = char_index c in
  let encoding = Array.create ~len:dict_size 0. in
  Array.set encoding hot 1.;
  encoding

0 commit comments

Comments
 (0)