Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
280 changes: 234 additions & 46 deletions be/src/vec/functions/function_regexp.cpp

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.Radians;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Random;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RandomBytes;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpCount;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull;
Expand Down Expand Up @@ -863,6 +864,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(Radians.class, "radians"),
scalar(Random.class, "rand", "random"),
scalar(Regexp.class, "regexp"),
scalar(RegexpCount.class, "regexp_count"),
scalar(RegexpExtract.class, "regexp_extract"),
scalar(RegexpExtractAll.class, "regexp_extract_all"),
scalar(RegexpExtractOrNull.class, "regexp_extract_or_null"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.AlwaysNullable;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullLiteral;
import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.BigIntType;
import org.apache.doris.nereids.types.StringType;
import org.apache.doris.nereids.types.VarcharType;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
 * Nereids expression node for the scalar function {@code regexp_count}.
 *
 * <p>The function is binary, accepts either two VARCHAR or two STRING arguments and is
 * declared to return BIGINT. It carries the {@link AlwaysNullable} and
 * {@link PropagateNullLiteral} traits.
 *
 * <p>NOTE(review): presumably arg0 is the text to search and arg1 the regular expression
 * whose matches are counted; confirm against the backend implementation.
 *
 * <p>This class is generated by GenerateFunction.
 */
public class RegexpCount extends ScalarFunction
        implements BinaryExpression, ExplicitlyCastableSignature, AlwaysNullable, PropagateNullLiteral {

    /** Supported overloads: (VARCHAR, VARCHAR) and (STRING, STRING), each returning BIGINT. */
    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
            FunctionSignature.ret(BigIntType.INSTANCE)
                    .args(VarcharType.SYSTEM_DEFAULT, VarcharType.SYSTEM_DEFAULT),
            FunctionSignature.ret(BigIntType.INSTANCE)
                    .args(StringType.INSTANCE, StringType.INSTANCE)
    );

    /** Function name handed to the {@code ScalarFunction} constructor. */
    private static final String NAME = "regexp_count";

    /**
     * Creates the function call from its two operands.
     *
     * @param arg0 first operand
     * @param arg1 second operand
     */
    public RegexpCount(Expression arg0, Expression arg1) {
        super(NAME, arg0, arg1);
    }

    /**
     * Builds a new {@code RegexpCount} over the given children.
     *
     * @param children replacement operands; exactly two are required
     * @return a new node built from the first and second child
     * @throws IllegalArgumentException if {@code children} does not hold exactly two elements
     */
    @Override
    public RegexpCount withChildren(List<Expression> children) {
        final boolean hasTwoChildren = children.size() == 2;
        Preconditions.checkArgument(hasTwoChildren);
        final Expression first = children.get(0);
        final Expression second = children.get(1);
        return new RegexpCount(first, second);
    }

    /**
     * Returns the overloads this function accepts.
     *
     * @return {@link #SIGNATURES}
     */
    @Override
    public List<FunctionSignature> getSignatures() {
        return SIGNATURES;
    }

    /**
     * Dispatches to {@code visitRegexpCount} on the supplied visitor.
     *
     * @param visitor visitor to call back
     * @param context context object passed through to the visitor
     * @return whatever the visitor returns
     */
    @Override
    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
        return visitor.visitRegexpCount(this, context);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.Radians;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Random;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RandomBytes;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpCount;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull;
Expand Down Expand Up @@ -1859,6 +1860,10 @@ default R visitRegexpReplaceOne(RegexpReplaceOne regexpReplaceOne, C context) {
return visitScalarFunction(regexpReplaceOne, context);
}

/**
 * Visits a {@link RegexpCount} node. The default implementation forwards the node and
 * context to {@code visitScalarFunction} and returns its result.
 */
default R visitRegexpCount(RegexpCount regexpCount, C context) {
    return this.visitScalarFunction(regexpCount, context);
}

/**
 * Visits a {@code Repeat} node. The default implementation forwards the node and
 * context to {@code visitScalarFunction} and returns its result.
 */
default R visitRepeat(Repeat repeat, C context) {
    return this.visitScalarFunction(repeat, context);
}
Expand Down
59 changes: 59 additions & 0 deletions regression-test/data/nereids_syntax_p0/regexpCount.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
-- Test Case 1: Basic match (3 lowercase words with spaces)
3

-- Test Case 2: All letters (2 words)
2

-- Test Case 3: Single letters (3 matches)
3

-- Test Case 4: Uppercase letters (no matches)
0

-- Test Case 5: Letter-number combinations (3 matches)
3

-- Test Case 6: Multiple spaces (3 matches)
3

-- Test Case 7: Single letter without spaces (1 match)
1

-- Test Case 8: Newline/tab separators (3 matches)
3

-- Test Case 9: No letters (0 matches)
0

-- Test Case 10: Empty string (0 matches)
0

-- Test Case 11: Whitespace only (0 matches)
0

-- Test Case 12: Trailing whitespace (1 match)
1

-- Test Case 13: Leading whitespace (1 match)
1

-- Test Case 14: Letters with special characters (2 matches)
2

-- Test Case 15: Consecutive letters and numbers (1 match)
1

-- Test Case 16: Words with length ≥3 (1 match)
1

-- Test Case 17: Words with length =2 (2 matches)
2

-- Test Case 18: Letters surrounded by special characters (3 matches)
3

-- Test Case 19: Mixed newline characters (3 matches)
3

-- Test Case 20: String "NULL" (0 matches, uppercase)
0
58 changes: 58 additions & 0 deletions regression-test/suites/nereids_function_p0/regexpCount.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
-- Regression queries for regexp_count: every statement passes a string literal as the
-- first argument and a regular-expression literal as the second, and selects the
-- returned count as `result`. The parenthesised note on each case is the count the
-- author expects for that query.

-- Test Case 1: Basic match (3 lowercase words with spaces)
SELECT regexp_count('1a 2b 14m', '\s*[a-z]+\s*') AS result;

-- Test Case 2: All letters (2 words)
SELECT regexp_count('hello world', '\s*[a-z]+\s*') AS result;

-- Test Case 3: Single letters (3 matches)
SELECT regexp_count('a b c', '\s*[a-z]+\s*') AS result;

-- Test Case 4: Uppercase letters (no matches)
SELECT regexp_count('A B C', '\s*[a-z]+\s*') AS result;

-- Test Case 5: Letter-number combinations (3 matches)
SELECT regexp_count('a1 b2 c3', '\s*[a-z]+\d\s*') AS result;

-- Test Case 6: Multiple spaces (3 matches)
-- NOTE(review): as written, the input has single spaces and no leading or trailing
-- whitespace, so '\s+[a-z]+\s+' would appear to match only ' b '. Confirm the intended
-- input (runs of spaces may have been collapsed) and the expected count.
SELECT regexp_count('a b c', '\s+[a-z]+\s+') AS result;

-- Test Case 7: Single letter without spaces (1 match)
SELECT regexp_count('a', '\s*[a-z]+\s*') AS result;

-- Test Case 8: Newline/tab separators (3 matches)
SELECT regexp_count('a\nc\tb', '\s*[a-z]+\s*') AS result;

-- Test Case 9: No letters (0 matches)
SELECT regexp_count('123', '\s*[a-z]+\s*') AS result;

-- Test Case 10: Empty string (0 matches)
SELECT regexp_count('', '\s*[a-z]+\s*') AS result;

-- Test Case 11: Whitespace only (0 matches)
SELECT regexp_count(' ', '\s*[a-z]+\s*') AS result;

-- Test Case 12: Trailing whitespace (1 match)
SELECT regexp_count('a ', '\s*[a-z]+\s*') AS result;

-- Test Case 13: Leading whitespace (1 match)
SELECT regexp_count(' a', '\s*[a-z]+\s*') AS result;

-- Test Case 14: Letters with special characters (2 matches)
SELECT regexp_count('ab-cd_ef', '\s*[a-z]+[^a-z]\s*') AS result;

-- Test Case 15: Consecutive letters and numbers (1 match)
SELECT regexp_count('xyz123xyz', '\s*[a-z]+\d+\s*') AS result;

-- Test Case 16: Words with length ≥3 (1 match)
SELECT regexp_count('longword', '\s*[a-z]{3,}\s*') AS result;

-- Test Case 17: Words with length =2 (2 matches)
SELECT regexp_count('ab cd', '\s*[a-z]{2}\s*') AS result;

-- Test Case 18: Letters surrounded by special characters (3 matches)
SELECT regexp_count('!@#a$%b^&c', '\s*[a-z]+\s*') AS result;

-- Test Case 19: Mixed newline characters (3 matches)
SELECT regexp_count('a\nb\tc\r', '\s*[a-z]+\s*') AS result;

-- Test Case 20: String "NULL" (0 matches, uppercase)
SELECT regexp_count('NULL', '\s*[a-z]+\s*') AS result;