From cd6db04447108ff200ec56e41536828b3f1f334a Mon Sep 17 00:00:00 2001 From: Sylvain Utard Date: Sun, 21 Sep 2025 22:44:41 +0200 Subject: [PATCH 1/2] Migrate away from ExtensionUtil Following https://github.com/duckdb/duckdb/pull/17772 --- src/include/netquack_extension.hpp | 2 +- src/netquack_extension.cpp | 64 ++++++++++++------------------ 2 files changed, 26 insertions(+), 40 deletions(-) diff --git a/src/include/netquack_extension.hpp b/src/include/netquack_extension.hpp index 9976b39..e600ef3 100644 --- a/src/include/netquack_extension.hpp +++ b/src/include/netquack_extension.hpp @@ -10,7 +10,7 @@ namespace duckdb class NetquackExtension : public Extension { public: - void Load (DuckDB &db) override; + void Load (ExtensionLoader &loader) override; std::string Name () override; std::string Version () const override; }; diff --git a/src/netquack_extension.cpp b/src/netquack_extension.cpp index 91ddd9f..d56e63d 100644 --- a/src/netquack_extension.cpp +++ b/src/netquack_extension.cpp @@ -8,7 +8,6 @@ #include "duckdb/common/exception.hpp" #include "duckdb/common/string_util.hpp" #include "duckdb/function/scalar_function.hpp" -#include "duckdb/main/extension_util.hpp" #include "duckdb/parser/parsed_data/create_scalar_function_info.hpp" #include "functions/extract_domain.hpp" #include "functions/extract_extension.hpp" @@ -27,103 +26,100 @@ namespace duckdb { // Load the extension into the database - static void LoadInternal (DatabaseInstance &instance) + static void LoadInternal (ExtensionLoader &loader) { - ExtensionUtil::RegisterExtension ( - instance, - "netquack", - { "Parsing, extracting, and analyzing domains, URIs, and paths with ease." }); - + loader.SetDescription("Parsing, extracting, and analyzing domains, URIs, and paths with ease."); + auto netquack_extract_domain_function = ScalarFunction ( "extract_domain", { LogicalType::VARCHAR }, LogicalType::VARCHAR, ExtractDomainFunction); - ExtensionUtil::RegisterFunction (instance, netquack_extract_domain_function); + loader.RegisterFunction (netquack_extract_domain_function); auto netquack_update_suffixes_function = ScalarFunction ( "update_suffixes", {}, LogicalType::VARCHAR, netquack::UpdateSuffixesFunction); - ExtensionUtil::RegisterFunction (instance, netquack_update_suffixes_function); + loader.RegisterFunction (netquack_update_suffixes_function); auto netquack_extract_path_function = ScalarFunction ( "extract_path", { LogicalType::VARCHAR }, LogicalType::VARCHAR, ExtractPathFunction); - ExtensionUtil::RegisterFunction (instance, netquack_extract_path_function); + loader.RegisterFunction (netquack_extract_path_function); auto netquack_extract_schema_function = ScalarFunction ( "extract_schema", { LogicalType::VARCHAR }, LogicalType::VARCHAR, ExtractSchemaFunction); - ExtensionUtil::RegisterFunction (instance, netquack_extract_schema_function); + loader.RegisterFunction (netquack_extract_schema_function); auto netquack_extract_host_function = ScalarFunction ( "extract_host", { LogicalType::VARCHAR }, LogicalType::VARCHAR, ExtractHostFunction); - ExtensionUtil::RegisterFunction (instance, netquack_extract_host_function); + loader.RegisterFunction (netquack_extract_host_function); auto netquack_extract_query_string_function = ScalarFunction ( "extract_query_string", { LogicalType::VARCHAR }, LogicalType::VARCHAR, ExtractQueryStringFunction); - ExtensionUtil::RegisterFunction (instance, netquack_extract_query_string_function); + loader.RegisterFunction (netquack_extract_query_string_function); auto netquack_extract_tld_function = ScalarFunction ( "extract_tld", { LogicalType::VARCHAR }, LogicalType::VARCHAR, ExtractTLDFunction); - ExtensionUtil::RegisterFunction (instance, netquack_extract_tld_function); + loader.RegisterFunction (netquack_extract_tld_function); auto netquack_extract_subdomain_function = ScalarFunction ( "extract_subdomain", { LogicalType::VARCHAR }, LogicalType::VARCHAR, ExtractSubDomainFunction); - ExtensionUtil::RegisterFunction (instance, netquack_extract_subdomain_function); + loader.RegisterFunction (netquack_extract_subdomain_function); auto netquack_extract_port_function = ScalarFunction ( "extract_port", { LogicalType::VARCHAR }, LogicalType::VARCHAR, ExtractPortFunction); - ExtensionUtil::RegisterFunction (instance, netquack_extract_port_function); + loader.RegisterFunction (netquack_extract_port_function); auto netquack_extract_extension_function = ScalarFunction ( "extract_extension", { LogicalType::VARCHAR }, LogicalType::VARCHAR, ExtractExtensionFunction); - ExtensionUtil::RegisterFunction (instance, netquack_extract_extension_function); + loader.RegisterFunction (netquack_extract_extension_function); auto netquack_update_tranco_function = ScalarFunction ( "update_tranco", { LogicalType::BOOLEAN }, LogicalType::VARCHAR, netquack::UpdateTrancoListFunction); - ExtensionUtil::RegisterFunction (instance, netquack_update_tranco_function); + loader.RegisterFunction (netquack_update_tranco_function); auto get_tranco_rank_function = ScalarFunction ( "get_tranco_rank", { LogicalType::VARCHAR }, LogicalType::VARCHAR, netquack::GetTrancoRankFunction); - ExtensionUtil::RegisterFunction (instance, get_tranco_rank_function); + loader.RegisterFunction (get_tranco_rank_function); auto get_tranco_rank_category_function = ScalarFunction ( "get_tranco_rank_category", { LogicalType::VARCHAR }, LogicalType::VARCHAR, netquack::GetTrancoRankCategoryFunction); - ExtensionUtil::RegisterFunction (instance, get_tranco_rank_category_function); + loader.RegisterFunction (get_tranco_rank_category_function); auto ipcalc_function = TableFunction ( "ipcalc", @@ -133,7 +129,7 @@ namespace duckdb nullptr, netquack::IPCalcFunc::InitLocal); ipcalc_function.in_out_function = netquack::IPCalcFunc::Function; - ExtensionUtil::RegisterFunction (instance, ipcalc_function); + loader.RegisterFunction (ipcalc_function); auto version_function = TableFunction ( "netquack_version", @@ -142,12 +138,12 @@ namespace duckdb netquack::VersionFunc::Bind, netquack::VersionFunc::InitGlobal, netquack::VersionFunc::InitLocal); - ExtensionUtil::RegisterFunction (instance, version_function); + loader.RegisterFunction (version_function); } - void NetquackExtension::Load (DuckDB &db) + void NetquackExtension::Load (ExtensionLoader &loader) { - LoadInternal (*db.instance); + LoadInternal (loader); } std::string NetquackExtension::Name () { @@ -164,20 +160,10 @@ namespace duckdb } } // namespace duckdb -extern "C" -{ - DUCKDB_EXTENSION_API void netquack_init (duckdb::DatabaseInstance &db) - { - duckdb::DuckDB db_wrapper (db); - db_wrapper.LoadExtension (); - } - DUCKDB_EXTENSION_API const char *netquack_version () - { - return duckdb::DuckDB::LibraryVersion (); - } -} +extern "C" { -#ifndef DUCKDB_EXTENSION_MAIN -#error DUCKDB_EXTENSION_MAIN not defined -#endif + DUCKDB_CPP_EXTENSION_ENTRY(netquack, loader) { + duckdb::LoadInternal(loader); + } +} \ No newline at end of file From 5763ae64b8f8324f3801824e4dbbc494bbbd803c Mon Sep 17 00:00:00 2001 From: Sylvain Utard Date: Sun, 21 Sep 2025 23:10:45 +0200 Subject: [PATCH 2/2] Implement ClickHouse's URLPathHierarchy function Very convenient to use. --- src/functions/url_path_hierarchy.cpp | 146 +++++++++++++++++++++++++++ src/functions/url_path_hierarchy.hpp | 21 ++++ src/netquack_extension.cpp | 10 ++ test/sql/url_path_hierarchy.test | 50 +++++++++ 4 files changed, 227 insertions(+) create mode 100644 src/functions/url_path_hierarchy.cpp create mode 100644 src/functions/url_path_hierarchy.hpp create mode 100644 test/sql/url_path_hierarchy.test diff --git a/src/functions/url_path_hierarchy.cpp b/src/functions/url_path_hierarchy.cpp new file mode 100644 index 0000000..e755576 --- /dev/null +++ b/src/functions/url_path_hierarchy.cpp @@ -0,0 +1,146 @@ +// Copyright 2025 Altertable + +#include "url_path_hierarchy.hpp" +#include "extract_path.hpp" + +namespace duckdb +{ + // Scalar function implementation: returns LIST(VARCHAR) + void URLPathHierarchyFunction (DataChunk &args, ExpressionState &state, Vector &result) + { + // Input vector + auto &input_vector = args.data[0]; + + // Result must be a list(vector) + D_ASSERT(result.GetType().id() == LogicalTypeId::LIST); + + // Prepare result as FLAT for writing list entries + result.SetVectorType(VectorType::FLAT_VECTOR); + ListVector::SetListSize(result, 0); + + // Accessors + auto list_entries = FlatVector::GetData (result); + auto &child_entry = ListVector::GetEntry (result); + + // We'll accumulate all child strings linearly, then set list offsets/lengths + idx_t total_children = 0; + + for (idx_t row_idx = 0; row_idx < args.size (); row_idx++) + { + auto value = input_vector.GetValue (row_idx); + + if (value.IsNull ()) + { + // NULL input -> NULL list + FlatVector::Validity (result).SetInvalid (row_idx); + list_entries[row_idx] = { 0, 0 }; + continue; + } + + auto input = value.ToString (); + std::transform (input.begin (), input.end (), input.begin (), ::tolower); + + // Build hierarchy components + auto components = netquack::BuildURLPathHierarchy (input); + + // Ensure child capacity (simple reserve to required size) + auto required_children = total_children + components.size (); + ListVector::Reserve (result, required_children); + + // Write list entry metadata + list_entries[row_idx].offset = total_children; + list_entries[row_idx].length = components.size (); + + // Fill child strings + auto child_data = FlatVector::GetData (child_entry); + for (idx_t k = 0; k < components.size (); k++) + { + child_data[total_children + k] = StringVector::AddString (child_entry, components[k]); + } + + total_children += components.size (); + } + + // Finalize child count + ListVector::SetListSize (result, total_children); + + // Add heap reference to keep strings alive + StringVector::AddHeapReference (ListVector::GetEntry (result), args.data[0]); + + if (args.AllConstant ()) + { + result.SetVectorType (VectorType::CONSTANT_VECTOR); + } + } + + namespace netquack + { + std::vector BuildURLPathHierarchy (const std::string &input) + { + std::vector result; + auto path = ExtractPath (input); + + // Normalize: ensure leading slash if path exists, remove query/fragment handled by regex + if (path.empty () || path == "/") + { + return result; // empty list when only root + } + + // Split into segments ignoring empty + // Keep track if original path ended with '/' + bool ends_with_slash = !path.empty () && path.back () == '/'; + + // Remove leading slash for splitting + std::string trimmed = path; + if (!trimmed.empty () && trimmed.front () == '/') + { + trimmed.erase (trimmed.begin ()); + } + + std::vector segments; + size_t start = 0; + while (start <= trimmed.size ()) + { + auto pos = trimmed.find ('/', start); + if (pos == std::string::npos) + { + auto token = trimmed.substr (start); + if (!token.empty ()) + { + segments.push_back (token); + } + break; + } + auto token = trimmed.substr (start, pos - start); + if (!token.empty ()) + { + segments.push_back (token); + } + start = pos + 1; + } + + // Build hierarchy without carrying trailing slash into next iteration + std::string accum_no_trailing; + for (idx_t i = 0; i < segments.size (); i++) + { + if (!accum_no_trailing.empty ()) + { + accum_no_trailing.push_back ('/'); + } + accum_no_trailing.append (segments[i]); + + std::string out = "/"; + out.append (accum_no_trailing); + if (i < segments.size () - 1 || ends_with_slash) + { + out.push_back ('/'); + } + result.push_back (out); + } + + return result; + } + } // namespace netquack +} // namespace duckdb + + diff --git a/src/functions/url_path_hierarchy.hpp b/src/functions/url_path_hierarchy.hpp new file mode 100644 index 0000000..9dc93d7 --- /dev/null +++ b/src/functions/url_path_hierarchy.hpp @@ -0,0 +1,21 @@ +// Copyright 2025 Altertable + +#pragma once + +#include "duckdb.hpp" +#include + +namespace duckdb +{ + // Function to build URL path hierarchy as LIST(VARCHAR) + void URLPathHierarchyFunction (DataChunk &args, ExpressionState &state, Vector &result); + + namespace netquack + { + // Helper that returns vector of hierarchical path components + // e.g. "/browse/CONV-6788" -> {"/browse/", "/browse/CONV-6788"} + std::vector BuildURLPathHierarchy (const std::string &input); + } // namespace netquack +} // namespace duckdb + + diff --git a/src/netquack_extension.cpp b/src/netquack_extension.cpp index d56e63d..91f3664 100644 --- a/src/netquack_extension.cpp +++ b/src/netquack_extension.cpp @@ -21,6 +21,7 @@ #include "functions/get_tranco.hpp" #include "functions/get_version.hpp" #include "functions/ipcalc.hpp" +#include "functions/url_path_hierarchy.hpp" #include "utils/utils.hpp" namespace duckdb @@ -100,6 +101,15 @@ namespace duckdb ExtractExtensionFunction); loader.RegisterFunction (netquack_extract_extension_function); + // URLPathHierarchy(url) -> LIST(VARCHAR) + auto url_path_hierarchy_type = LogicalType::LIST (LogicalType::VARCHAR); + auto url_path_hierarchy_function_alias = ScalarFunction ( + "url_path_hierarchy", + { LogicalType::VARCHAR }, + url_path_hierarchy_type, + URLPathHierarchyFunction); + loader.RegisterFunction (url_path_hierarchy_function_alias); + auto netquack_update_tranco_function = ScalarFunction ( "update_tranco", { LogicalType::BOOLEAN }, diff --git a/test/sql/url_path_hierarchy.test b/test/sql/url_path_hierarchy.test new file mode 100644 index 0000000..bd43656 --- /dev/null +++ b/test/sql/url_path_hierarchy.test @@ -0,0 +1,50 @@ +# name: test/sql/url_path_hierarchy.test +# description: test netquack URLPathHierarchy function +# group: [netquack] + +require netquack + +statement ok +CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'}); + +# basic example from docs +query I +SELECT url_path_hierarchy('https://example.com/browse/CONV-6788'); +---- +[/browse/, /browse/conv-6788] + +# trailing slash preserves trailing slash on last element +query I +SELECT url_path_hierarchy('https://example.com/a/b/c/'); +---- +[/a/, /a/b/, /a/b/c/] + +# root-only path returns empty list +query I +SELECT url_path_hierarchy('https://example.com/'); +---- +[] + +# host without explicit scheme +query I +SELECT url_path_hierarchy('example.com/a/b'); +---- +[/a/, /a/b] + +# apply to a table of URIs +query I +SELECT url_path_hierarchy(uri) FROM uri_list; +---- +[] +[] +[] +[] +[] +[/a] +[/a] +[] +[] +[] +[/path/] + +