Add: CLI, offset_within, write_to

New experimental Python interfaces simplify big-data processing from the command line. The new Python CLI prototypes include: - an over 3x faster `wc` word- and line-counting utility - an over 4x faster `split` dataset-sharding utility
ashvardanian · Feb 20, 2024 · 4c738ea · 4c738ea
1 parent 92e4bc6
commit 4c738ea
Show file tree

Hide file tree

Showing 4 changed files with 271 additions and 3 deletions.
diff --git a/cli/README.md b/cli/README.md
@@ -0,0 +1,48 @@
+# SIMD-accelerated CLI utilities based on StringZilla
+
+## `wc`: Word Count
+
+The `wc` utility on Linux can be used to count the number of lines, words, and bytes in a file.
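+Using StringZilla's SIMD-accelerated character and character-set search, it can be noticeably faster, even with slow SSDs.
+Note that the current prototype counts space characters rather than whitespace-delimited words, so its "word" count differs from the one reported by `wc`.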
+
+```bash
+$ time wc enwik9.txt 
+  13147025  129348346 1000000000 enwik9.txt
+
+real    0m3.562s
+user    0m3.470s
+sys     0m0.092s
+
+$ time cli/wc.py enwik9.txt
+13147025 139132610 1000000000 enwik9.txt
+
+real    0m1.165s
+user    0m1.121s
+sys     0m0.044s
+```
+
+## `split`: Split File into Smaller Ones
+
+The `split` utility on Linux can be used to split a file into smaller ones.
+The current prototype only splits by line counts.
+
+```bash
+$ time split -l 100000 enwik9.txt ...
+
+real    0m6.424s
+user    0m0.179s
+sys     0m0.663s
+
+$ time cli/split.py 100000 enwik9.txt ...
+
+real    0m1.482s
+user    0m1.020s
+sys     0m0.460s
+```
+
+---
+
+What other interfaces should be added?
+
+- Levenshtein distances?
+- Fuzzy search?
diff --git a/cli/split.py b/cli/split.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+
+import sys
+
+from stringzilla import File, Str
+
+
+def split_file(file_path, lines_per_file, output_prefix):
+    try:
+        # 1. Memory-map the large file
+        file_mapped = File(file_path)
+        file_contents = Str(file_mapped)
+
+        # Variables to keep track of the current position and file part number
+        current_position = 0
+        file_part = 0
+        newline_position = (
+            -1
+        )  # Start before file begins to find the first newline correctly
+
+        # Loop until the end of the file
+        while current_position < len(file_contents):
+            # 2. Loop to skip `lines_per_file` lines
+            for _ in range(lines_per_file):
+                newline_position = file_contents.find("\n", newline_position + 1)
+                if newline_position == -1:  # No more newlines
+                    break
+
+            # If no newlines were found and we're not at the start, process the rest of the file
+            if newline_position == -1 and current_position < len(file_contents):
+                newline_position = len(file_contents)
+
+            # 3. Use offset_within to get the length of the current section
+            # Assuming offset_within gives you the length from the current position
+            section_length = (
+                newline_position - current_position if newline_position != -1 else 0
+            )
+
+            # Extract the current section to write out
+            if section_length > 0:  # Prevent creating empty files
+                current_slice = file_contents[current_position : newline_position + 1]
+
+                # 4. Save the current slice to file
+                output_path = f"{output_prefix}{file_part}"
+                current_slice.write_to(output_path)
+
+                # Prepare for the next slice
+                file_part += 1
+                current_position = newline_position + 1
+
+    except FileNotFoundError:
+        print(f"No such file: {file_path}")
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+
+def main():
+    if len(sys.argv) < 4:
+        print(
+            "Usage: python split_file.py <lines_per_file> <input_file> <output_prefix>"
+        )
+        sys.exit(1)
+
+    lines_per_file = int(sys.argv[1])
+    file_path = sys.argv[2]
+    output_prefix = sys.argv[3]
+
+    split_file(file_path, lines_per_file, output_prefix)
+
+
+# Run the command-line entry point only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/cli/wc.py b/cli/wc.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+import sys
+
+from stringzilla import File, Str
+
+
+def wc(file_path):
+    try:
+        mapped_file = File(file_path)
+        mapped_bytes = Str(mapped_file)
+        line_count = mapped_bytes.count("\n")
+        word_count = mapped_bytes.count(" ")
+        char_count = mapped_bytes.__len__()
+
+        return line_count, word_count, char_count
+    except FileNotFoundError:
+        return f"No such file: {file_path}"
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: python wc.py <file>")
+        sys.exit(1)
+
+    file_path = sys.argv[1]
+    counts = wc(file_path)
+
+    if isinstance(counts, tuple):
+        line_count, word_count, char_count = counts
+        print(f"{line_count} {word_count} {char_count} {file_path}")
+    else:
+        print(counts)
+
+
+# Run the command-line entry point only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/python/lib.c b/python/lib.c
@@ -35,6 +35,7 @@ typedef SSIZE_T ssize_t;
 
 #include <Python.h> // Core CPython interfaces
 
+#include <stdio.h>  // `fopen`
 #include <string.h> // `memset`, `memcpy`
 
 #include <stringzilla/stringzilla.h>
@@ -827,6 +828,113 @@ static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) {
     }
 }
 
+/**
+ *  @brief  Saves a StringZilla string to disk.
+ *
+ *  Callable both as a method, `text.write_to(path)`, and as a free function taking
+ *  the text as the first positional argument and the path as the second.
+ *  Keyword arguments are not accepted. The file is opened in binary mode
+ *  and truncated if it already exists.
+ *
+ *  @return `None` on success. On failure returns NULL with a Python exception set:
+ *          `TypeError` for a wrong argument count, keyword arguments, or non-string-like inputs,
+ *          `MemoryError` if the path copy can't be allocated,
+ *          `OSError` if opening, writing, or closing the file fails.
+ */
+static PyObject *Str_write_to(PyObject *self, PyObject *args, PyObject *kwargs) {
+
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs != !is_member + 1) {
+        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
+        return NULL;
+    }
+
+    PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *path_obj = PyTuple_GET_ITEM(args, !is_member + 0);
+
+    // Parse keyword arguments
+    if (kwargs) {
+        PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument");
+        return NULL;
+    }
+
+    sz_string_view_t text;
+    sz_string_view_t path;
+
+    // Validate and convert `text` and `path`
+    if (!export_string_like(text_obj, &text.start, &text.length) ||
+        !export_string_like(path_obj, &path.start, &path.length)) {
+        PyErr_SetString(PyExc_TypeError, "Text and path must be string-like");
+        return NULL;
+    }
+
+    // There is a chance, the path isn't NULL-terminated, so copy it to a new buffer.
+    // Many OSes have fairly low limit for the maximum path length.
+    // On Windows it's 260, but up to __around__ 32,767 characters are supported in extended API.
+    // But it's better to be safe than sorry and use malloc :)
+    //
+    // https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry
+    // https://doc.owncloud.com/server/next/admin_manual/troubleshooting/path_filename_length.html
+    char *path_buffer = (char *)malloc(path.length + 1);
+    if (path_buffer == NULL) {
+        PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for the path");
+        return NULL;
+    }
+    memcpy(path_buffer, path.start, path.length);
+    // `malloc` doesn't zero the memory, so the terminator must be written explicitly,
+    // otherwise `fopen` reads past the end of the copied path.
+    path_buffer[path.length] = '\0';
+
+    FILE *file_pointer = fopen(path_buffer, "wb");
+    if (file_pointer == NULL) {
+        PyErr_SetFromErrnoWithFilename(PyExc_OSError, path_buffer);
+        free(path_buffer);
+        return NULL;
+    }
+
+    setbuf(file_pointer, NULL); // Set the stream to unbuffered
+    // `fwrite` returns a `size_t`; storing it in an `int` would truncate
+    // the count for large strings and make the check below misfire.
+    size_t written_length = fwrite(text.start, 1, text.length, file_pointer);
+    if (written_length != text.length) {
+        // Report the error before `fclose` and `free` get a chance to overwrite `errno`
+        PyErr_SetFromErrnoWithFilename(PyExc_OSError, path_buffer);
+        fclose(file_pointer);
+        free(path_buffer);
+        return NULL;
+    }
+
+    // Closing the stream can fail as well, and must not be reported as a success
+    if (fclose(file_pointer) != 0) {
+        PyErr_SetFromErrnoWithFilename(PyExc_OSError, path_buffer);
+        free(path_buffer);
+        return NULL;
+    }
+
+    free(path_buffer);
+    Py_RETURN_NONE;
+}
+
+/**
+ *  @brief  Given a native StringZilla string, reports its byte offset within another string-like object.
+ *          Very practical when dealing with large files.
+ *
+ *  Callable both as a method, `slice.offset_within(text)`, and as a free function taking
+ *  the slice as the first positional argument and the enclosing text as the second.
+ *  Keyword arguments are not accepted.
+ *
+ *  @return Unsigned integer on success. On failure returns NULL with a `TypeError` set
+ *          for malformed arguments, or a `ValueError` if the slice lies outside of the text.
+ */
+static PyObject *Str_offset_within(PyObject *self, PyObject *args, PyObject *kwargs) {
+
+    int bound_call = self != NULL && PyObject_TypeCheck(self, &StrType);
+
+    // A bound call receives only the text, a free call receives the slice and the text
+    Py_ssize_t expected_count = bound_call ? 1 : 2;
+    if (PyTuple_Size(args) != expected_count) {
+        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
+        return NULL;
+    }
+
+    // Parse keyword arguments
+    if (kwargs) {
+        PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument");
+        return NULL;
+    }
+
+    PyObject *inner_obj = bound_call ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *outer_obj = PyTuple_GET_ITEM(args, bound_call ? 0 : 1);
+
+    sz_string_view_t outer;
+    sz_string_view_t inner;
+
+    // Validate and convert the enclosing text first, then the slice
+    if (!export_string_like(outer_obj, &outer.start, &outer.length)) {
+        PyErr_SetString(PyExc_TypeError, "Text and slice must be string-like");
+        return NULL;
+    }
+    if (!export_string_like(inner_obj, &inner.start, &inner.length)) {
+        PyErr_SetString(PyExc_TypeError, "Text and slice must be string-like");
+        return NULL;
+    }
+
+    // The slice must both begin and end inside of the enclosing text
+    int starts_before = inner.start < outer.start;
+    int ends_after = inner.start + inner.length > outer.start + outer.length;
+    if (starts_before || ends_after) {
+        PyErr_SetString(PyExc_ValueError, "The slice is not within the text bounds");
+        return NULL;
+    }
+
+    return PyLong_FromSize_t((size_t)(inner.start - outer.start));
+}
+
 /**
  *  @brief  Implementation function for all search-like operations, parameterized by a function callback.
  *  @return 1 on success, 0 on failure.
@@ -1531,12 +1639,10 @@ static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) {
 
     // Validate and convert `separator`
     if (separator_obj) {
-        Py_ssize_t len;
-        if (!export_string_like(separator_obj, &separator.start, &len)) {
+        if (!export_string_like(separator_obj, &separator.start, &separator.length)) {
             PyErr_SetString(PyExc_TypeError, "The separator argument must be string-like");
             return NULL;
         }
-        separator.length = (size_t)len;
     }
     else {
         separator.start = " ";
@@ -1726,6 +1832,11 @@ static PyMethodDef Str_methods[] = {
     {"find_last_not_of", Str_find_last_not_of, SZ_METHOD_FLAGS,
      "Finds the last occurrence of a character not present in another string."},
 
+    // Dealing with larger-than-memory datasets
+    {"offset_within", Str_offset_within, SZ_METHOD_FLAGS,
+     "Return the raw byte offset of one binary string within another."},
+    {"write_to", Str_write_to, SZ_METHOD_FLAGS, "Write the raw bytes of the string to a file at the given path."},
+
     {NULL, NULL, 0, NULL}};
 
 static PyTypeObject StrType = {