Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions src/functions/url_path_hierarchy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// Copyright 2025 Altertable

#include "url_path_hierarchy.hpp"
#include "extract_path.hpp"

namespace duckdb
{
// Scalar function implementation: returns LIST(VARCHAR)
// Scalar function implementation: url_path_hierarchy(VARCHAR) -> LIST(VARCHAR).
//
// For every row of args.data[0]:
//   * NULL input produces a NULL list entry.
//   * Otherwise the input is lowercased byte-by-byte and handed to
//     netquack::BuildURLPathHierarchy; the returned strings become the
//     elements of that row's list.
//
// All list elements are written back-to-back into the single child vector of
// `result`; each row's list_entry_t records the offset/length of its slice.
//
// `state` is unused.
void URLPathHierarchyFunction (DataChunk &args, ExpressionState &state, Vector &result)
{
    // Input vector
    auto &input_vector = args.data[0];

    // Result must be a list(vector)
    D_ASSERT (result.GetType ().id () == LogicalTypeId::LIST);

    // Prepare result as FLAT for writing list entries
    result.SetVectorType (VectorType::FLAT_VECTOR);
    ListVector::SetListSize (result, 0);

    // Accessors
    auto list_entries = FlatVector::GetData<list_entry_t> (result);
    auto &child_entry = ListVector::GetEntry (result);

    // Child strings are accumulated linearly; this is the running element count
    idx_t total_children = 0;

    for (idx_t row_idx = 0; row_idx < args.size (); row_idx++)
    {
        auto value = input_vector.GetValue (row_idx);

        if (value.IsNull ())
        {
            // NULL input -> NULL list
            FlatVector::Validity (result).SetInvalid (row_idx);
            list_entries[row_idx] = { 0, 0 };
            continue;
        }

        auto input = value.ToString ();

        // Lowercase the input. Each byte is routed through `unsigned char`
        // because calling tolower() with a negative `char` value (any
        // non-ASCII byte on platforms where char is signed) is undefined
        // behaviour.
        std::transform (input.begin (), input.end (), input.begin (),
                        [] (unsigned char c) { return static_cast<char> (::tolower (c)); });

        // Build hierarchy components
        const auto components = netquack::BuildURLPathHierarchy (input);

        // Ensure child capacity (simple reserve to required size)
        const auto required_children = total_children + components.size ();
        ListVector::Reserve (result, required_children);

        // Write list entry metadata
        list_entries[row_idx].offset = total_children;
        list_entries[row_idx].length = components.size ();

        // Fill child strings. The data pointer is fetched after Reserve() on
        // every iteration rather than cached across rows.
        auto child_data = FlatVector::GetData<string_t> (child_entry);
        for (idx_t k = 0; k < components.size (); k++)
        {
            child_data[total_children + k] = StringVector::AddString (child_entry, components[k]);
        }

        total_children += components.size ();
    }

    // Finalize child count
    ListVector::SetListSize (result, total_children);

    // Add heap reference to keep strings alive
    // NOTE(review): the elements above were added through
    // StringVector::AddString on the child vector, so this reference to the
    // input's heap is presumably redundant -- confirm before removing.
    StringVector::AddHeapReference (ListVector::GetEntry (result), args.data[0]);

    // A chunk made only of constant inputs yields a constant result
    if (args.AllConstant ())
    {
        result.SetVectorType (VectorType::CONSTANT_VECTOR);
    }
}

namespace netquack
{
// Expands the path of `input` (as returned by ExtractPath) into its chain of
// ancestors, one entry per non-empty segment.
//
// Every entry starts with '/'. All entries except the last end with '/';
// the last one ends with '/' only when the extracted path itself did.
// An empty path or a bare "/" yields an empty vector.
std::vector<std::string> BuildURLPathHierarchy (const std::string &input)
{
    std::vector<std::string> hierarchy;
    const std::string path = ExtractPath (input);

    // Nothing to expand for an empty path or the root alone
    if (path.empty () || path == "/")
    {
        return hierarchy;
    }

    // Remember whether the final entry must keep its trailing '/'
    const bool trailing_slash = path.back () == '/';

    // Scan the path in place (skipping one leading '/'), collecting every
    // non-empty run of characters between slashes
    std::vector<std::string> parts;
    size_t cursor = (path.front () == '/') ? 1 : 0;
    while (cursor < path.size ())
    {
        const size_t slash = path.find ('/', cursor);
        const size_t stop  = (slash == std::string::npos) ? path.size () : slash;
        if (stop > cursor)
        {
            parts.push_back (path.substr (cursor, stop - cursor));
        }
        cursor = stop + 1;
    }

    // Grow a running prefix "/a", "/a/b", ... and emit it at each step; the
    // trailing '/' is added only to the emitted copy, never to the prefix
    std::string prefix;
    for (size_t idx = 0; idx < parts.size (); ++idx)
    {
        prefix.push_back ('/');
        prefix.append (parts[idx]);

        const bool is_last = (idx + 1 == parts.size ());
        if (!is_last || trailing_slash)
        {
            hierarchy.push_back (prefix + '/');
        }
        else
        {
            hierarchy.push_back (prefix);
        }
    }

    return hierarchy;
}
} // namespace netquack
} // namespace duckdb


21 changes: 21 additions & 0 deletions src/functions/url_path_hierarchy.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Copyright 2025 Altertable

#pragma once

#include "duckdb.hpp"
#include <vector>

namespace duckdb
{
// Scalar function body that builds the URL path hierarchy as LIST(VARCHAR).
//
// args   - input chunk; only args.data[0] is read.
// state  - unused by the implementation.
// result - LIST(VARCHAR) vector receiving one list per input row.
//
// A NULL input row produces a NULL list. Non-NULL input is lowercased before
// being passed to netquack::BuildURLPathHierarchy, so returned elements are
// lowercase.
void URLPathHierarchyFunction (DataChunk &args, ExpressionState &state, Vector &result);

namespace netquack
{
// Helper that returns vector of hierarchical path components
// e.g. "/browse/CONV-6788" -> {"/browse/", "/browse/CONV-6788"}
//
// Operates on the path extracted from `input`; empty segments are ignored.
// Every element starts with '/'. Every element but the last ends with '/';
// the last ends with '/' only if the extracted path did.
// Returns an empty vector when the extracted path is empty or just "/".
// The helper itself does not change the case of `input`.
std::vector<std::string> BuildURLPathHierarchy (const std::string &input);
} // namespace netquack
} // namespace duckdb


2 changes: 1 addition & 1 deletion src/include/netquack_extension.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ namespace duckdb
class NetquackExtension : public Extension
{
public:
void Load (DuckDB &db) override;
void Load (ExtensionLoader &loader) override;
std::string Name () override;
std::string Version () const override;
};
Expand Down
74 changes: 35 additions & 39 deletions src/netquack_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include "duckdb/common/exception.hpp"
#include "duckdb/common/string_util.hpp"
#include "duckdb/function/scalar_function.hpp"
#include "duckdb/main/extension_util.hpp"
#include "duckdb/parser/parsed_data/create_scalar_function_info.hpp"
#include "functions/extract_domain.hpp"
#include "functions/extract_extension.hpp"
Expand All @@ -22,108 +21,115 @@
#include "functions/get_tranco.hpp"
#include "functions/get_version.hpp"
#include "functions/ipcalc.hpp"
#include "functions/url_path_hierarchy.hpp"
#include "utils/utils.hpp"

namespace duckdb
{
// Load the extension into the database
static void LoadInternal (DatabaseInstance &instance)
static void LoadInternal (ExtensionLoader &loader)
{
ExtensionUtil::RegisterExtension (
instance,
"netquack",
{ "Parsing, extracting, and analyzing domains, URIs, and paths with ease." });

loader.SetDescription("Parsing, extracting, and analyzing domains, URIs, and paths with ease.");

auto netquack_extract_domain_function = ScalarFunction (
"extract_domain",
{ LogicalType::VARCHAR },
LogicalType::VARCHAR,
ExtractDomainFunction);
ExtensionUtil::RegisterFunction (instance, netquack_extract_domain_function);
loader.RegisterFunction (netquack_extract_domain_function);

auto netquack_update_suffixes_function = ScalarFunction (
"update_suffixes",
{},
LogicalType::VARCHAR,
netquack::UpdateSuffixesFunction);
ExtensionUtil::RegisterFunction (instance, netquack_update_suffixes_function);
loader.RegisterFunction (netquack_update_suffixes_function);

auto netquack_extract_path_function = ScalarFunction (
"extract_path",
{ LogicalType::VARCHAR },
LogicalType::VARCHAR,
ExtractPathFunction);
ExtensionUtil::RegisterFunction (instance, netquack_extract_path_function);
loader.RegisterFunction (netquack_extract_path_function);

auto netquack_extract_schema_function = ScalarFunction (
"extract_schema",
{ LogicalType::VARCHAR },
LogicalType::VARCHAR,
ExtractSchemaFunction);
ExtensionUtil::RegisterFunction (instance, netquack_extract_schema_function);
loader.RegisterFunction (netquack_extract_schema_function);

auto netquack_extract_host_function = ScalarFunction (
"extract_host",
{ LogicalType::VARCHAR },
LogicalType::VARCHAR,
ExtractHostFunction);
ExtensionUtil::RegisterFunction (instance, netquack_extract_host_function);
loader.RegisterFunction (netquack_extract_host_function);

auto netquack_extract_query_string_function = ScalarFunction (
"extract_query_string",
{ LogicalType::VARCHAR },
LogicalType::VARCHAR,
ExtractQueryStringFunction);
ExtensionUtil::RegisterFunction (instance, netquack_extract_query_string_function);
loader.RegisterFunction (netquack_extract_query_string_function);

auto netquack_extract_tld_function = ScalarFunction (
"extract_tld",
{ LogicalType::VARCHAR },
LogicalType::VARCHAR,
ExtractTLDFunction);
ExtensionUtil::RegisterFunction (instance, netquack_extract_tld_function);
loader.RegisterFunction (netquack_extract_tld_function);

auto netquack_extract_subdomain_function = ScalarFunction (
"extract_subdomain",
{ LogicalType::VARCHAR },
LogicalType::VARCHAR,
ExtractSubDomainFunction);
ExtensionUtil::RegisterFunction (instance, netquack_extract_subdomain_function);
loader.RegisterFunction (netquack_extract_subdomain_function);

auto netquack_extract_port_function = ScalarFunction (
"extract_port",
{ LogicalType::VARCHAR },
LogicalType::VARCHAR,
ExtractPortFunction);
ExtensionUtil::RegisterFunction (instance, netquack_extract_port_function);
loader.RegisterFunction (netquack_extract_port_function);

auto netquack_extract_extension_function = ScalarFunction (
"extract_extension",
{ LogicalType::VARCHAR },
LogicalType::VARCHAR,
ExtractExtensionFunction);
ExtensionUtil::RegisterFunction (instance, netquack_extract_extension_function);
loader.RegisterFunction (netquack_extract_extension_function);

// URLPathHierarchy(url) -> LIST(VARCHAR)
auto url_path_hierarchy_type = LogicalType::LIST (LogicalType::VARCHAR);
auto url_path_hierarchy_function_alias = ScalarFunction (
"url_path_hierarchy",
{ LogicalType::VARCHAR },
url_path_hierarchy_type,
URLPathHierarchyFunction);
loader.RegisterFunction (url_path_hierarchy_function_alias);

auto netquack_update_tranco_function = ScalarFunction (
"update_tranco",
{ LogicalType::BOOLEAN },
LogicalType::VARCHAR,
netquack::UpdateTrancoListFunction);
ExtensionUtil::RegisterFunction (instance, netquack_update_tranco_function);
loader.RegisterFunction (netquack_update_tranco_function);

auto get_tranco_rank_function = ScalarFunction (
"get_tranco_rank",
{ LogicalType::VARCHAR },
LogicalType::VARCHAR,
netquack::GetTrancoRankFunction);
ExtensionUtil::RegisterFunction (instance, get_tranco_rank_function);
loader.RegisterFunction (get_tranco_rank_function);

auto get_tranco_rank_category_function = ScalarFunction (
"get_tranco_rank_category",
{ LogicalType::VARCHAR },
LogicalType::VARCHAR,
netquack::GetTrancoRankCategoryFunction);
ExtensionUtil::RegisterFunction (instance, get_tranco_rank_category_function);
loader.RegisterFunction (get_tranco_rank_category_function);

auto ipcalc_function = TableFunction (
"ipcalc",
Expand All @@ -133,7 +139,7 @@ namespace duckdb
nullptr,
netquack::IPCalcFunc::InitLocal);
ipcalc_function.in_out_function = netquack::IPCalcFunc::Function;
ExtensionUtil::RegisterFunction (instance, ipcalc_function);
loader.RegisterFunction (ipcalc_function);

auto version_function = TableFunction (
"netquack_version",
Expand All @@ -142,12 +148,12 @@ namespace duckdb
netquack::VersionFunc::Bind,
netquack::VersionFunc::InitGlobal,
netquack::VersionFunc::InitLocal);
ExtensionUtil::RegisterFunction (instance, version_function);
loader.RegisterFunction (version_function);
}

void NetquackExtension::Load (DuckDB &db)
void NetquackExtension::Load (ExtensionLoader &loader)
{
LoadInternal (*db.instance);
LoadInternal (loader);
}
std::string NetquackExtension::Name ()
{
Expand All @@ -164,20 +170,10 @@ namespace duckdb
}
} // namespace duckdb

extern "C"
{
DUCKDB_EXTENSION_API void netquack_init (duckdb::DatabaseInstance &db)
{
duckdb::DuckDB db_wrapper (db);
db_wrapper.LoadExtension<duckdb::NetquackExtension> ();
}

DUCKDB_EXTENSION_API const char *netquack_version ()
{
return duckdb::DuckDB::LibraryVersion ();
}
}
extern "C" {

#ifndef DUCKDB_EXTENSION_MAIN
#error DUCKDB_EXTENSION_MAIN not defined
#endif
DUCKDB_CPP_EXTENSION_ENTRY(netquack, loader) {
duckdb::LoadInternal(loader);
}
}
Loading