Skip to content

Commit

Permalink
Add: CLI, offset_within, write_to
Browse files Browse the repository at this point in the history
New experimental Python interfaces simplify big-data processing
from the command line. The new Python CLI prototypes include:

- over 3x faster `wc` word-, and line-counting utility
- over 4x faster `split` dataset sharding utility
  • Loading branch information
ashvardanian committed Feb 20, 2024
1 parent 92e4bc6 commit 4c738ea
Show file tree
Hide file tree
Showing 4 changed files with 271 additions and 3 deletions.
48 changes: 48 additions & 0 deletions cli/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# SIMD-accelerated CLI utilities based on StringZilla

## `wc`: Word Count

The `wc` utility on Linux can be used to count the number of lines, words, and bytes in a file.
Using StringZilla's SIMD-accelerated character and character-set search, it can be noticeably faster, even with slow SSDs.

```bash
$ time wc enwik9.txt
13147025 129348346 1000000000 enwik9.txt

real 0m3.562s
user 0m3.470s
sys 0m0.092s

$ time cli/wc.py enwik9.txt
13147025 139132610 1000000000 enwik9.txt

real 0m1.165s
user 0m1.121s
sys 0m0.044s
```

## `split`: Split File into Smaller Ones

The `split` utility on Linux can be used to split a file into smaller ones.
The current prototype only splits by line counts.

```bash
$ time split -l 100000 enwik9.txt ...

real 0m6.424s
user 0m0.179s
sys 0m0.663s

$ time cli/split.py 100000 enwik9.txt ...

real 0m1.482s
user 0m1.020s
sys 0m0.460s
```

---

What other interfaces should be added?

- Levenshtein distances?
- Fuzzy search?
72 changes: 72 additions & 0 deletions cli/split.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env python3

import sys

from stringzilla import File, Str


def split_file(file_path, lines_per_file, output_prefix):
    """Split a file into consecutive parts of at most `lines_per_file` lines.

    The input is opened through `File` and viewed as a `Str`. Part number N
    (counting from 0) is written to the path `f"{output_prefix}{N}"` with
    `Str.write_to`. Every part except possibly the last ends with a newline;
    the last part holds whatever remains, with or without a trailing newline.

    Args:
        file_path: Path of the file to split.
        lines_per_file: Maximum number of lines per part; must be positive.
        output_prefix: Prefix of the output paths; the part number is appended.

    Errors are reported by printing a message; nothing is raised to the caller.
    """
    try:
        # A non-positive chunk size could never advance through the file.
        if lines_per_file < 1:
            raise ValueError("lines_per_file must be a positive integer")

        # 1. Memory-map the large file
        file_mapped = File(file_path)
        file_contents = Str(file_mapped)
        total_length = len(file_contents)

        current_position = 0  # Offset of the first byte of the part being built
        file_part = 0  # Numeric suffix of the next output file

        while current_position < total_length:
            # 2. Move `end_position` just past up to `lines_per_file` newlines
            end_position = current_position
            for _ in range(lines_per_file):
                newline_position = file_contents.find("\n", end_position)
                if newline_position == -1:
                    # No more newlines: the rest of the file is the final part
                    end_position = total_length
                    break
                end_position = newline_position + 1

            # 3. Take the part, including its terminating newline. The range is
            #    never empty here, so a part made of a single blank line ("\n")
            #    is written out like any other instead of being skipped.
            current_slice = file_contents[current_position:end_position]

            # 4. Save the current slice to file
            output_path = f"{output_prefix}{file_part}"
            current_slice.write_to(output_path)

            # Prepare for the next slice
            file_part += 1
            current_position = end_position

    except FileNotFoundError:
        print(f"No such file: {file_path}")
    except Exception as e:
        print(f"An error occurred: {e}")


def main():
    """Parse the command line and run `split_file`.

    Expects three arguments: `<lines_per_file> <input_file> <output_prefix>`.
    Prints a usage message and exits with status 1 when arguments are missing
    or when `<lines_per_file>` is not a positive integer.
    """
    usage = "Usage: python split.py <lines_per_file> <input_file> <output_prefix>"
    if len(sys.argv) < 4:
        print(usage)
        sys.exit(1)

    # Reject a malformed line count up front instead of crashing with a traceback.
    try:
        lines_per_file = int(sys.argv[1])
    except ValueError:
        print(f"Invalid <lines_per_file>: {sys.argv[1]!r} is not an integer")
        print(usage)
        sys.exit(1)

    if lines_per_file < 1:
        print(f"Invalid <lines_per_file>: {lines_per_file} is not positive")
        print(usage)
        sys.exit(1)

    file_path = sys.argv[2]
    output_prefix = sys.argv[3]

    split_file(file_path, lines_per_file, output_prefix)


# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
37 changes: 37 additions & 0 deletions cli/wc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python3

import sys

from stringzilla import File, Str


def wc(file_path):
    """Count newlines, spaces, and total length of the file at `file_path`.

    Returns a `(line_count, word_count, char_count)` tuple on success, or an
    error-message string when the file does not exist.

    NOTE: the word count is the number of space characters, so it is only an
    approximation of what the Linux `wc` utility reports.
    """
    try:
        # Keep the `File` bound to a local for as long as the `Str` view is used.
        source = File(file_path)
        contents = Str(source)
        totals = (
            contents.count("\n"),
            contents.count(" "),
            len(contents),
        )
        return totals
    except FileNotFoundError:
        return f"No such file: {file_path}"


def main():
    """Print the counts for the file named by the first command-line argument.

    Exits with status 1 after printing a usage line when no file is given.
    On success prints `<lines> <words> <chars> <path>`; otherwise prints the
    error message returned by `wc`.
    """
    arguments = sys.argv[1:]
    if not arguments:
        print("Usage: python wc.py <file>")
        sys.exit(1)

    file_path = arguments[0]
    outcome = wc(file_path)

    # `wc` signals failure by returning a message string instead of a tuple.
    if not isinstance(outcome, tuple):
        print(outcome)
        return

    lines, words, chars = outcome
    print(f"{lines} {words} {chars} {file_path}")


# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
117 changes: 114 additions & 3 deletions python/lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ typedef SSIZE_T ssize_t;

#include <Python.h> // Core CPython interfaces

#include <stdio.h> // `fopen`
#include <string.h> // `memset`, `memcpy`

#include <stringzilla/stringzilla.h>
Expand Down Expand Up @@ -827,6 +828,113 @@ static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) {
}
}

/**
 *  @brief  Saves a StringZilla string to disk.
 *
 *  Callable as a bound method with one positional argument (the path), or as a
 *  free function with two positional arguments (the text, then the path).
 *  Keyword arguments are rejected. The target file is opened in "wb" mode, so
 *  any existing content is replaced.
 *
 *  @return `None` on success. On failure returns NULL with a Python exception set:
 *          `TypeError` for a wrong argument count, keyword arguments, or non-string-like
 *          arguments; `ValueError` for a path with an embedded NUL byte; `MemoryError`
 *          if the path copy can't be allocated; `OSError` if opening, writing,
 *          or closing the file fails.
 */
static PyObject *Str_write_to(PyObject *self, PyObject *args, PyObject *kwargs) {

    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
    Py_ssize_t nargs = PyTuple_Size(args);
    if (nargs != !is_member + 1) {
        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
        return NULL;
    }

    PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
    PyObject *path_obj = PyTuple_GET_ITEM(args, !is_member + 0);

    // Parse keyword arguments
    if (kwargs) {
        PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument");
        return NULL;
    }

    sz_string_view_t text;
    sz_string_view_t path;

    // Validate and convert `text` and `path`
    if (!export_string_like(text_obj, &text.start, &text.length) ||
        !export_string_like(path_obj, &path.start, &path.length)) {
        PyErr_SetString(PyExc_TypeError, "Text and path must be string-like");
        return NULL;
    }

    // `fopen` stops at the first NUL byte, so a path containing one would silently
    // address a different file than the caller asked for.
    if (path.length != 0 && memchr(path.start, '\0', path.length) != NULL) {
        PyErr_SetString(PyExc_ValueError, "The path must not contain embedded NUL bytes");
        return NULL;
    }

    // There is a chance, the path isn't NULL-terminated, so copy it to a new buffer.
    // Many OSes have fairly low limit for the maximum path length.
    // On Windows its 260, but up to __around__ 32,767 characters are supported in extended API.
    // But it's better to be safe than sorry and use malloc :)
    //
    // https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry
    // https://doc.owncloud.com/server/next/admin_manual/troubleshooting/path_filename_length.html
    char *path_buffer = (char *)malloc(path.length + 1);
    if (path_buffer == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for the path");
        return NULL;
    }
    memcpy(path_buffer, path.start, path.length);
    path_buffer[path.length] = '\0'; // `malloc` doesn't zero memory, terminate explicitly

    FILE *file_pointer = fopen(path_buffer, "wb");
    if (file_pointer == NULL) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, path_buffer);
        free(path_buffer);
        return NULL;
    }

    setbuf(file_pointer, NULL); // Set the stream to unbuffered

    // `fwrite` reports the number of items written as `size_t`;
    // keeping it in an `int` would truncate the count for multi-gigabyte strings.
    size_t written = fwrite(text.start, 1, text.length, file_pointer);
    if (written != text.length) {
        // Capture `errno` into the Python exception before `fclose` can overwrite it
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, path_buffer);
        fclose(file_pointer);
        free(path_buffer);
        return NULL;
    }

    // Closing can still report a delayed write error
    if (fclose(file_pointer) != 0) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, path_buffer);
        free(path_buffer);
        return NULL;
    }

    free(path_buffer);
    Py_RETURN_NONE;
}

/**
 *  @brief  Given a string-like slice, reports its byte offset within another string-like text,
 *          judged by comparing the addresses of the two exported buffers.
 *          Very practical when dealing with large files.
 *
 *  Callable as a bound method with one positional argument (the text), or as a
 *  free function with two positional arguments (the slice, then the text).
 *  Keyword arguments are rejected.
 *
 *  @return Unsigned integer on success. On failure returns NULL with `TypeError` set for a wrong
 *          argument count, keyword arguments, or non-string-like arguments, or with `ValueError`
 *          set when the slice doesn't lie within the bounds of the text.
 */
static PyObject *Str_offset_within(PyObject *self, PyObject *args, PyObject *kwargs) {

    // As a bound method the slice is `self` and only the text is passed positionally
    int const bound = self != NULL && PyObject_TypeCheck(self, &StrType);
    Py_ssize_t const expected_count = bound ? 1 : 2;
    if (PyTuple_Size(args) != expected_count) {
        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
        return NULL;
    }

    PyObject *inner_obj = bound ? self : PyTuple_GET_ITEM(args, 0);
    PyObject *outer_obj = PyTuple_GET_ITEM(args, bound ? 0 : 1);

    // No keyword arguments are supported
    if (kwargs != NULL) {
        PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument");
        return NULL;
    }

    // Export the enclosing text first, then the slice
    sz_string_view_t outer;
    sz_string_view_t inner;
    int const exported = export_string_like(outer_obj, &outer.start, &outer.length) &&
                         export_string_like(inner_obj, &inner.start, &inner.length);
    if (!exported) {
        PyErr_SetString(PyExc_TypeError, "Text and slice must be string-like");
        return NULL;
    }

    // The slice must begin at or after the text start and end at or before the text end
    int const contained =
        inner.start >= outer.start && inner.start + inner.length <= outer.start + outer.length;
    if (!contained) {
        PyErr_SetString(PyExc_ValueError, "The slice is not within the text bounds");
        return NULL;
    }

    return PyLong_FromSize_t((size_t)(inner.start - outer.start));
}

/**
* @brief Implementation function for all search-like operations, parameterized by a function callback.
* @return 1 on success, 0 on failure.
Expand Down Expand Up @@ -1531,12 +1639,10 @@ static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) {

// Validate and convert `separator`
if (separator_obj) {
Py_ssize_t len;
if (!export_string_like(separator_obj, &separator.start, &len)) {
if (!export_string_like(separator_obj, &separator.start, &separator.length)) {
PyErr_SetString(PyExc_TypeError, "The separator argument must be string-like");
return NULL;
}
separator.length = (size_t)len;
}
else {
separator.start = " ";
Expand Down Expand Up @@ -1726,6 +1832,11 @@ static PyMethodDef Str_methods[] = {
{"find_last_not_of", Str_find_last_not_of, SZ_METHOD_FLAGS,
"Finds the last occurrence of a character not present in another string."},

// Dealing with larger-than-memory datasets
{"offset_within", Str_offset_within, SZ_METHOD_FLAGS,
"Return the raw byte offset of one binary string within another."},
{"write_to", Str_write_to, SZ_METHOD_FLAGS, "Return the raw byte offset of one binary string within another."},

{NULL, NULL, 0, NULL}};

static PyTypeObject StrType = {
Expand Down

0 comments on commit 4c738ea

Please sign in to comment.