Skip to content

C++/Rust UDF implementation #5162

@carloea2

Description

@carloea2

Feature Summary

I feel the performance problem can be partially addressed by providing faster UDFs. During the hackathon, an earlier prototype running a small matrix-multiply benchmark showed speedups over Python of 120x for C++ and 170x for Rust.

I am unsure whether this is of interest to apache/texera, but I think it is worth discussing.

Thanks

Proposed Solution or Design

Image

C++ UDF

#include <chrono>

/// Benchmark UDF: for every input tuple, computes a weighted checksum over the
/// product of two square matrices whose entries are generated from their indices,
/// and reports how long that computation took.
class MatrixMultiplyOperator : public texera::UDFOperator {
public:
    /// Reads the `trial`, `matrix_size` and `seed` fields of `tuple` and returns a
    /// single output row holding the label "cpp", the checksum, and the elapsed
    /// time in milliseconds. `port` is not used.
    texera::TupleOutput process_tuple(const texera::Tuple& tuple, int port) override {
        using Clock = std::chrono::high_resolution_clock;
        using Millis = std::chrono::duration<double, std::milli>;

        const int trial = tuple.get("trial").as_int();
        const int size = tuple.get("matrix_size").as_int();
        const long long seed = tuple.get("seed").as_long();

        // The matrices are never materialised; each entry is derived on demand
        // from the seed, the trial number and the entry's indices.
        const auto left = [seed, trial](int row, int inner) -> double {
            return ((seed + trial * 97LL + row * 31LL + inner * 17LL) % 1000LL) / 1000.0;
        };
        const auto right = [seed, trial](int inner, int col) -> double {
            return ((seed + trial * 53LL + inner * 13LL + col * 29LL) % 1000LL) / 1000.0;
        };

        const auto started = Clock::now();

        double checksum = 0.0;
        for (int row = 0; row < size; ++row) {
            for (int col = 0; col < size; ++col) {
                // Dot product of one row of the left matrix with one column of the right.
                double dot = 0.0;
                for (int inner = 0; inner < size; ++inner) {
                    dot += left(row, inner) * right(inner, col);
                }
                // Each cell is scaled by a weight that depends on its position.
                const double weight = (row + 1) * 0.001 + (col + 1) * 0.0001;
                checksum += dot * weight;
            }
        }

        const auto finished = Clock::now();
        const double elapsed_ms = Millis(finished - started).count();

        return { texera::TupleLike{
            texera::Value::string_value("cpp"),
            texera::Value::double_value(checksum),
            texera::Value::double_value(elapsed_ms)
        }};
    }
};

// Exposes the operator under the `TexeraUDFOperator` name.
// NOTE(review): presumably this is the name the Texera UDF host looks up — confirm.
using TexeraUDFOperator = MatrixMultiplyOperator;

Rust UDF

use std::time::Instant;

/// UDF operator for the matrix-multiply benchmark.
///
/// This is a unit struct with no fields, so no state is carried between tuples.
#[derive(Default)]
struct MatrixMultiplyOperator;

impl texera::UDFOperator for MatrixMultiplyOperator {
    /// Computes a weighted checksum over the product of two generated square
    /// matrices and reports how long the computation took.
    ///
    /// Reads the `trial`, `matrix_size` and `seed` fields of `tuple` and returns a
    /// single output row holding the label `"rust"`, the checksum, and the elapsed
    /// time in milliseconds. `_port` is not used.
    ///
    /// # Errors
    ///
    /// Propagates the error from any failed field lookup or conversion.
    fn process_tuple(
        &mut self,
        tuple: &texera::Tuple,
        _port: i32,
    ) -> Result<texera::TupleOutput, String> {
        let trial = tuple.get_by_name("trial")?.as_int()?;
        let size = tuple.get_by_name("matrix_size")?.as_int()?;
        let seed = tuple.get_by_name("seed")?.as_long()?;

        // The matrices are never materialised; each entry is derived on demand
        // from the seed, the trial number and the entry's indices.
        let left = |row: i64, inner: i64| -> f64 {
            ((seed + trial as i64 * 97 + row * 31 + inner * 17) % 1000) as f64 / 1000.0
        };
        let right = |inner: i64, col: i64| -> f64 {
            ((seed + trial as i64 * 53 + inner * 13 + col * 29) % 1000) as f64 / 1000.0
        };

        let started = Instant::now();

        let mut checksum = 0.0_f64;
        for row in 0..size {
            for col in 0..size {
                // Dot product of one row of the left matrix with one column of the
                // right; an explicit fold from 0.0 keeps the summation order fixed.
                let dot = (0..size).fold(0.0_f64, |acc, inner| {
                    acc + left(row as i64, inner as i64) * right(inner as i64, col as i64)
                });
                // Each cell is scaled by a weight that depends on its position.
                let weight = ((row + 1) as f64) * 0.001 + ((col + 1) as f64) * 0.0001;
                checksum += dot * weight;
            }
        }

        let elapsed_ms = started.elapsed().as_secs_f64() * 1000.0;

        let output_row = vec![
            texera::Value::string_value("rust"),
            texera::Value::double_value(checksum),
            texera::Value::double_value(elapsed_ms),
        ];
        Ok(vec![output_row])
    }
}

// Exposes the operator under the `TexeraUDFOperator` name.
// NOTE(review): presumably this is the name the Texera UDF host looks up — confirm.
type TexeraUDFOperator = MatrixMultiplyOperator;

Affected Area

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels
    No fields configured for Feature.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions